### First things first

In [17]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.model_selection import RepeatedKFold

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, accuracy_score

import warnings # Ignore warning
import pickle

In [18]:
!ls

1c_clf.pkl
1c_vectorizer.pkl
Baseline.ipynb
Public_Dev_Predictions.ipynb
README.md
humor_controversy_Baseline.ipynb
humor_controversy_V1.ipynb
humor_rating_Baseline.ipynb


In [19]:
# Read the training set, then drop the `id` column before previewing it.
df = pd.read_csv('../train.csv')
df = df.drop(columns=['id'])
df.head()

Unnamed: 0,text,is_humor,humor_rating,humor_controversy,offense_rating
0,TENNESSEE: We're the best state. Nobody even c...,1,2.42,1.0,0.2
1,A man inserted an advertisement in the classif...,1,2.5,1.0,1.1
2,How many men does it take to open a can of bee...,1,1.95,0.0,2.4
3,Told my mom I hit 1200 Twitter followers. She ...,1,2.11,1.0,0.0
4,Roses are dead. Love is fake. Weddings are bas...,1,2.78,0.0,0.1


In [20]:
# Count missing values per column, restricted to rows with is_humor == 0.
df.loc[df['is_humor'].eq(0)].isna().sum()

text                    0
is_humor                0
humor_rating         3068
humor_controversy    3068
offense_rating          0
dtype: int64

#### selecting rows where the text is classified as humorous

In [21]:
# Keep only the rows flagged as humorous (4932 rows). The original index is
# kept on purpose -- do not reset it.
X = df.loc[df['is_humor'].eq(1), ['text']]
y = df.loc[df['is_humor'].eq(1), ['humor_controversy']]

In [22]:
# Hold out 20% of the humorous rows; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=21
)
"X_train and y_train shape: {0}, {1}, X_test and y_test shape {2}, {3}".format(
    *(part.shape for part in (X_train, y_train, X_test, y_test))
)

'X_train and y_train shape: (3945, 1), (3945, 1), X_test and y_test shape (987, 1), (987, 1)'

In [23]:
# Class balance of the training labels, printed as fractions in [0, 1].
# `y_train` is a one-column DataFrame: summing the whole frame yields a Series,
# which makes `print` emit the Series repr (column name + dtype) instead of a
# plain number. Selecting the label column first gives a scalar.
total = len(y_train)
positives = y_train['humor_controversy'].sum()
print('% of positives is ', round(positives/total, 4))
print('% of negatives is ', round((total - positives)/total, 4))

% of positives is  humor_controversy    0.5037
dtype: float64
% of negatives is  humor_controversy    0.4963
dtype: float64


#### helper functions

In [24]:
def stemmer(text, stemmer):
    """ Returns `text` with every token replaced by its stem, joined by single spaces.

    text    -- string to process.
    stemmer -- object exposing a `stem(word)` method. Note that inside this
               body the parameter shadows the function's own name.
    """
    # The visible import cell never imports `word_tokenize` (it is only imported
    # locally inside `remove_stopwords`), so import it here to avoid a NameError.
    from nltk.tokenize import word_tokenize

    return ' '.join(stemmer.stem(w) for w in word_tokenize(text))

def count_words(input):
    """ Returns the number of whitespace-separated words in the input string. """
    words = input.split()
    return len(words)

def remove_punctuation(s_input, include_char = None):
    """ Returns input string with punctuation and newlines replaced by spaces.

    s_input      -- string to clean.
    include_char -- optional string of characters to keep untouched. Each of
                    its characters is exempted individually; characters that
                    are not punctuation are simply ignored.

    Every replaced character becomes exactly one space, so the length of the
    string is preserved.
    """
    import string

    punct = string.punctuation

    if include_char is not None:
        # Filter per character. The previous index/slice approach raised
        # ValueError for non-punctuation input and only exempted the first
        # character of a multi-character value.
        punct = ''.join(ch for ch in punct if ch not in include_char)

    punct += '\n'

    translator = str.maketrans(punct, ' ' * len(punct))

    return s_input.translate(translator)

def remove_stopwords(text, use_stopwords = None, df = True, exclude_number = True):
    """ Returns the input with stopwords filtered out.

    text           -- a string when `df` is True, otherwise an iterable of words.
    use_stopwords  -- collection of words to drop; defaults to NLTK's English list.
    df             -- True: tokenize `text` with NLTK and join the kept tokens
                      with single spaces. False: iterate `text` directly and
                      append a space after every kept word (trailing space included).
    exclude_number -- only used when `df` is True: also drop purely numeric tokens.
    """
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize

    blocked = set(stopwords.words("english")) if use_stopwords is None else use_stopwords

    if not df:
        # Pre-split input: every kept word is followed by one space.
        return "".join(token + " " for token in text if token not in blocked)

    tokens = word_tokenize(text)
    if exclude_number:
        tokens = [token for token in tokens if not token.isnumeric()]

    return " ".join(token for token in tokens if token not in blocked)

def sep_upper(text):
    """ Returns `text` with a single space inserted in front of each uppercase letter. """
    return "".join(" " + ch if ch.isupper() else ch for ch in text)

def remove_space(text):
    """ Returns `text` with every run of consecutive spaces collapsed into one space.

    Only the space character is affected; tabs and newlines are left as they are.
    """
    # `re` is not imported by the visible import cell, so import it locally
    # (same style as the other helpers) to avoid a NameError.
    import re

    return re.sub(' +', ' ', text)

#### (basic) pre-processing of text columns

In [25]:
def pre_proc(text_col):
    """ Returns a cleaned copy of a pandas Series of strings.

    Steps, in order:
      1. replace ASCII punctuation and newlines with spaces;
      2. delete any remaining character that is neither a word character nor
         whitespace, then strip leading/trailing whitespace;
      3. insert a space before every uppercase letter;
      4. lowercase everything.

    NOTE(review): step 3 turns an all-caps word into single letters
    ("ABC" -> " A B C"), which CountVectorizer's default token pattern
    presumably drops (it appears to keep only tokens of 2+ characters) -- confirm
    this is intended.
    """
    text_col = text_col.apply(remove_punctuation) # removes String.punctuation characters
    #text_col = text_col.apply(remove_stopwords)   # removes english stopwords (disabled)
    # regex=True must be explicit: since pandas 2.0 `str.replace` treats the
    # pattern as a literal string by default, which would make this a no-op.
    # The raw string avoids the invalid '\w' / '\s' escapes.
    text_col = text_col.str.replace(r'[^\w\s]', '', regex=True).str.strip()
    text_col = text_col.apply(sep_upper) # adds space before an uppercase
    text_col = text_col.str.lower() # lowercase

    return text_col

#### CV Magic

In [15]:
# Repeated K-fold cross-validation on the training split: 5 folds x 5 repeats
# = 25 fits. Each fold is preprocessed and vectorized from scratch, so the
# vocabulary is learned from that fold's training part only.
rkf = RepeatedKFold(n_splits=5, n_repeats=5, random_state = 21)

f1scores = []
for train_index, test_index in rkf.split(X_train):
    print('new fold!')
    # Explicit copies: the columns assigned below must never write back into X_train.
    X_kf_train, X_kf_test = X_train.iloc[train_index].copy(), X_train.iloc[test_index].copy()
    y_kf_train, y_kf_test = y_train.iloc[train_index], y_train.iloc[test_index]

    X_kf_train['text'] = pre_proc(X_kf_train['text'])
    X_kf_test['text'] = pre_proc(X_kf_test['text'])

    X_kf_train['qtd_words'] = X_kf_train['text'].apply(count_words)
    X_kf_test['qtd_words'] = X_kf_test['text'].apply(count_words)

    vectorizer = CountVectorizer() # ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=4
    train_counts = vectorizer.fit_transform(X_kf_train['text'])

    # `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
    # when available and fall back for older releases. Fetched once per fold.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    X_train_trans = pd.DataFrame(train_counts.toarray()
                                 , columns = feature_names
                                 , index = X_kf_train.index)
    X_train_trans['qtd_words'] = X_kf_train['qtd_words']

    X_test_trans = pd.DataFrame(vectorizer.transform(X_kf_test['text']).toarray()
                                , columns = feature_names
                                , index = X_kf_test.index)
    X_test_trans['qtd_words'] = X_kf_test['qtd_words']

    clf = MultinomialNB()
    clf.fit(X_train_trans, y_kf_train.humor_controversy)

    y_kf_pred = clf.predict(X_test_trans)
    f1scores.append(f1_score(y_kf_test.humor_controversy, y_kf_pred))

new fold!
new fold!
new fold!
new fold!
new fold!
new fold!
new fold!
new fold!
new fold!
new fold!
new fold!
new fold!
new fold!
new fold!
new fold!
new fold!
new fold!
new fold!
new fold!
new fold!
new fold!
new fold!
new fold!
new fold!
new fold!


In [26]:
# Average F1 over all cross-validation fits collected in `f1scores`.
mean_f1 = sum(f1scores) / len(f1scores)
mean_f1

0.5174270843173556

### final model

In [36]:
# Final model: fit on the whole training split, score once on the held-out split.
# Note: this overwrites the `text` column of X_train / X_test in place with the
# preprocessed text.
X_train.text = pre_proc(X_train.text)
X_test.text = pre_proc(X_test.text)

vectorizer = CountVectorizer() # ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=4
train_counts = vectorizer.fit_transform(X_train.text)

# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# when available and fall back for older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

X_train_trans = pd.DataFrame(train_counts.toarray()
                             , columns = feature_names
                             , index = X_train.index)
# Word count is appended as one extra feature next to the token counts.
X_train_trans['qtd_words'] = X_train.text.apply(count_words)

X_test_trans = pd.DataFrame(vectorizer.transform(X_test.text).toarray()
                            , columns = feature_names
                            , index = X_test.index)
X_test_trans['qtd_words'] = X_test.text.apply(count_words)

clf = MultinomialNB()
clf.fit(X_train_trans, y_train.humor_controversy)

y_pred = clf.predict(X_test_trans)
print(f1_score(y_test.humor_controversy, y_pred))

0.49115504682622274


#### saving the model

In [37]:
# Persist the artifacts. Only the fitted vocabulary of the CountVectorizer is
# stored, not the vectorizer object itself. The classifier was trained with an
# extra `qtd_words` column, so that feature has to be rebuilt wherever these
# files are loaded.
with open('1c_vectorizer.pkl', 'wb') as vocab_file:
    pickle.dump(vectorizer.vocabulary_, vocab_file) # CountVectorizer vocabulary
with open('1c_clf.pkl', 'wb') as clf_file:
    pickle.dump(clf, clf_file) # fitted MultinomialNB classifier

In [None]:
# Raw text of the row at position 1705 of the full data frame.
df.iloc[1705]['text']

In [None]:
# Same row looked up by index label in the test split, showing the text as it
# currently stands in X_test. NOTE(review): assumes label 1705 landed in the
# test split -- a KeyError here means it did not.
X_test.loc[1705, 'text']

#### M