### First things first

In [1]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import warnings # Ignore warning
import pickle
from zipfile import ZipFile

In [2]:
!ls

Baseline.ipynb
Public_Dev_Predictions.ipynb
README.md
clf.pkl
humor_controversy_Baseline.ipynb
humor_controversy_V1.ipynb
humor_rating_Baseline.ipynb
vectorizer.pkl


#### helper functions

In [2]:
def stemmer(text, stemmer):
    return(' '.join([stemmer.stem(w) for w in word_tokenize(text)]))

def count_words(input):
    """ Returns number of occurences of characters specified in char """     
    return len(input.split())

def remove_punctuation(s_input, include_char = None):
    """ Returns input string without punctuation """
    import string as String
    punct = String.punctuation
    
    if not include_char is None:
        index = String.punctuation.index(include_char)
        punct = String.punctuation[:index] + String.punctuation[(index + 1):]
        
    punct += '\n'
        
    translator = str.maketrans(punct, ' '*len(punct))
    
    return s_input.translate(translator)

def remove_stopwords(text, use_stopwords = None, df = True, exclude_number = True):
    """ Returns input string removing stopwords from it. """
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    
    if use_stopwords is None:
        use_stopwords = set(stopwords.words("english"))
        
    if df:
        new_text = word_tokenize(text)
        if exclude_number:
            new_text = [word for word in new_text if not word.isnumeric()]
        new_text = " ".join([word for word in new_text if word not in use_stopwords])
    else:
        new_text = ""
        for word in text:
            if word not in use_stopwords:
                new_text += word + " "

    return new_text

def sep_upper(text):
    """ Take a text as input and insert space before every uppercase letter. """
    
    new_text = ""
    for letter in text:
        if letter.isupper():
            new_text += " " + letter
        else:
            new_text += letter
    
    return new_text

def remove_space(text):
    return(re.sub(' +',' ',text)) 

#### (basic) pre-process of text columns

In [3]:
def pre_proc(text_col):
    text_col = text_col.apply(remove_punctuation) # removes String.punctuation characters
    #text_col = text_col.apply(remove_stopwords)   # removes english stopwords 
    text_col = text_col.str.replace('[^\w\s]','').str.strip() # and removes whitespaces
    text_col = text_col.apply(sep_upper) # adds space before an uppercase
    text_col = text_col.str.lower() # lowercase
    
    return text_col

# loading the public dev dataset

In [4]:
df = pd.read_csv('../public_dev.csv')
df.head()

Unnamed: 0,id,text
0,8001,What's the difference between a Bernie Sanders...
1,8002,"Vodka, whisky, tequila. I'm calling the shots."
2,8003,French people don't masturbate They Jacque off
3,8004,A lot of Suicide bombers are Muslims - I don't...
4,8005,What happens when you fingerbang a gypsy on he...


# creating df for predictions

In [5]:
final_pred = pd.DataFrame({'id': df.id}) 

# Task 1a

In [7]:
loaded_vec = 
loaded_model = pickle.load(open('1c_clf.pkl', 'rb')) # 1a !!!

In [8]:
# df.text = pre_proc(df.text)
# X_trans = pd.DataFrame(loaded_vec.transform(df.text).toarray()
#                         , columns = loaded_vec.get_feature_names()
#                         , index = df.index)
# X_trans['qtd_words'] = df.text.apply(count_words)

In [8]:
final_pred['is_humor'] = loaded_model.predict(X_trans)
final_pred

Unnamed: 0,id,is_humor
0,8001,1.0
1,8002,1.0
2,8003,0.0
3,8004,0.0
4,8005,0.0
...,...,...
995,8996,1.0
996,8997,0.0
997,8998,0.0
998,8999,0.0


# Task 1b

In [7]:
loaded_vec = 
loaded_model = pickle.load(open('1c_clf.pkl', 'rb')) # 1b

In [8]:
# df.text = pre_proc(df.text)
# X_trans = pd.DataFrame(loaded_vec.transform(df.text).toarray()
#                         , columns = loaded_vec.get_feature_names()
#                         , index = df.index)
# X_trans['qtd_words'] = df.text.apply(count_words)

In [9]:
final_pred['humor_rating'] = loaded_model.predict(X_trans)
final_pred

Unnamed: 0,id,is_humor,humor_rating
0,8001,1.0,1.0
1,8002,1.0,1.0
2,8003,0.0,0.0
3,8004,0.0,0.0
4,8005,0.0,0.0
...,...,...,...
995,8996,1.0,1.0
996,8997,0.0,0.0
997,8998,0.0,0.0
998,8999,0.0,0.0


# Task 1c

#### loading the model

In [6]:
loaded_vec = CountVectorizer(decode_error = "replace", vocabulary = pickle.load(open('1c_vectorizer.pkl', "rb")))
loaded_model = pickle.load(open('1c_clf.pkl', 'rb'))

In [7]:
df.text = pre_proc(df.text)
X_trans = pd.DataFrame(loaded_vec.transform(df.text).toarray()
                        , columns = loaded_vec.get_feature_names()
                        , index = df.index)
X_trans['qtd_words'] = df.text.apply(count_words)

In [10]:
final_pred['humor_controversy'] = loaded_model.predict(X_trans)
final_pred

Unnamed: 0,id,is_humor,humor_rating,humor_controversy
0,8001,1.0,1.0,1.0
1,8002,1.0,1.0,1.0
2,8003,0.0,0.0,0.0
3,8004,0.0,0.0,0.0
4,8005,0.0,0.0,0.0
...,...,...,...,...
995,8996,1.0,1.0,1.0
996,8997,0.0,0.0,0.0
997,8998,0.0,0.0,0.0
998,8999,0.0,0.0,0.0


# Task 2a

#### loading the model

In [7]:
loaded_vec = 
loaded_model = pickle.load(open('2a_clf.pkl', 'rb')) # 2a

In [8]:
# df.text = pre_proc(df.text)
# X_trans = pd.DataFrame(loaded_vec.transform(df.text).toarray()
#                         , columns = loaded_vec.get_feature_names()
#                         , index = df.index)
# X_trans['qtd_words'] = df.text.apply(count_words)

In [11]:
final_pred['offense_rating'] = loaded_model.predict(X_trans)
final_pred

Unnamed: 0,id,is_humor,humor_rating,humor_controversy,offense_rating
0,8001,1.0,1.0,1.0,1.0
1,8002,1.0,1.0,1.0,1.0
2,8003,0.0,0.0,0.0,0.0
3,8004,0.0,0.0,0.0,0.0
4,8005,0.0,0.0,0.0,0.0
...,...,...,...,...,...
995,8996,1.0,1.0,1.0,1.0
996,8997,0.0,0.0,0.0,0.0
997,8998,0.0,0.0,0.0,0.0
998,8999,0.0,0.0,0.0,0.0


# Creating the zipped CSV

In [12]:
final_pred.to_csv('final_pred.csv')
ZipFile('final_pred.zip', mode='w').write("final_pred.csv")

In [None]:
df.iloc[1705].text

In [None]:
X_test.loc[1705].text

#### M