### First things first

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import ElasticNet
import xgboost as xgb

from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score, accuracy_score

#from matplotlib import pyplot as plt
#from sklearn.model_selection import RepeatedKFold
import warnings # Ignore warning
import pickle

In [2]:
# Read the training set, then discard its `id` column.
df = pd.read_csv('../train.csv')
df = df.drop(columns='id')
df.head()

Unnamed: 0,text,is_humor,humor_rating,humor_controversy,offense_rating
0,TENNESSEE: We're the best state. Nobody even c...,1,2.42,1.0,0.2
1,A man inserted an advertisement in the classif...,1,2.5,1.0,1.1
2,How many men does it take to open a can of bee...,1,1.95,0.0,2.4
3,Told my mom I hit 1200 Twitter followers. She ...,1,2.11,1.0,0.0
4,Roses are dead. Love is fake. Weddings are bas...,1,2.78,0.0,0.1


In [3]:
def stemmer(text, stemmer):
    """Stem every token of `text` and return the stems joined by single spaces.

    text: string that is tokenized with nltk's `word_tokenize`.
    stemmer: object exposing a `stem(word)` method. Inside the body this
        parameter shadows the function's own name.
    """
    # Local import: the notebook has no top-level import of word_tokenize, so
    # without it any call to this function raised NameError.
    from nltk.tokenize import word_tokenize

    return ' '.join(stemmer.stem(w) for w in word_tokenize(text))

def count_words(input):
    """ Returns the number of whitespace-separated tokens in `input`. """
    tokens = input.split()  # split() with no argument also drops empty pieces
    return len(tokens)

def remove_punctuation(s_input, include_char = None):
    """ Returns `s_input` with punctuation and newlines replaced by spaces.

    s_input: string to clean.
    include_char: optional string of punctuation characters that must be kept.
        It may hold several characters; characters that are not part of
        string.punctuation are ignored. None or '' keeps nothing.

    Each removed character becomes exactly one space, so the length of the
    string is preserved.
    """
    import string

    # Treat include_char as a set of characters: the previous str.index() lookup
    # raised ValueError for non-punctuation or multi-character values.
    keep = set(include_char) if include_char else set()
    punct = ''.join(ch for ch in string.punctuation if ch not in keep)

    # Newlines are always replaced, regardless of include_char.
    punct += '\n'

    translator = str.maketrans(punct, ' ' * len(punct))

    return s_input.translate(translator)

def remove_stopwords(text, use_stopwords = None, df = True, exclude_number = True):
    """ Returns the input with stopwords removed, as a single string.

    text: a raw string when `df` is True, otherwise an iterable of tokens.
    use_stopwords: collection of words to drop; defaults to nltk's English
        stopword list. Matching is exact (case-sensitive).
    df: when True, `text` is tokenized with nltk's word_tokenize and the kept
        tokens are joined with single spaces. When False, `text` is iterated
        as-is and every kept token is followed by a space, so the result ends
        with a trailing space.
    exclude_number: when True, drops tokens for which str.isnumeric() holds.
        Only applied when `df` is True.
    """
    # nltk is imported lazily so that callers who pass their own stopwords and
    # pre-tokenized text do not need nltk (or its corpora) at all.
    if use_stopwords is None:
        from nltk.corpus import stopwords
        use_stopwords = set(stopwords.words("english"))

    if df:
        from nltk.tokenize import word_tokenize
        tokens = word_tokenize(text)
        if exclude_number:
            tokens = [word for word in tokens if not word.isnumeric()]
        return " ".join(word for word in tokens if word not in use_stopwords)

    # Pre-tokenized input: one join instead of repeated string concatenation,
    # keeping the historical trailing space after the last token.
    return "".join(word + " " for word in text if word not in use_stopwords)

def sep_upper(text):
    """ Returns `text` with one space inserted in front of each uppercase character. """
    pieces = [(" " + char) if char.isupper() else char for char in text]
    return "".join(pieces)

def remove_space(text):
    """ Returns `text` with every run of consecutive spaces collapsed into one space.

    Only the space character is affected; tabs and newlines are left as they are,
    and leading/trailing spaces are reduced to a single space rather than removed.
    """
    # Local import: `re` is not among the notebook's top-level imports, so the
    # function previously raised NameError when called.
    import re

    return re.sub(' +', ' ', text)

In [4]:
def pre_proc(text_col):
    """ Normalizes a pandas Series of raw texts and returns the cleaned Series.

    Steps, in order:
      1. replace string.punctuation characters and newlines with spaces;
      2. delete any remaining character that is neither a word character nor
         whitespace, then strip leading/trailing whitespace;
      3. insert a space before every uppercase letter;
      4. lowercase everything.
    """
    text_col = text_col.apply(remove_punctuation) # removes String.punctuation characters
    #text_col = text_col.apply(remove_stopwords)   # removes english stopwords (currently disabled)
    # regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
    # literal string by default, which would turn this step into a no-op.
    # The raw string avoids invalid-escape warnings for \w and \s.
    text_col = text_col.str.replace(r'[^\w\s]', '', regex=True).str.strip()
    text_col = text_col.apply(sep_upper) # adds space before an uppercase
    text_col = text_col.str.lower() # lowercase

    return text_col

### 1a => is_humor

In [5]:
# Task 1a uses every row of the dataset. Do not reset the index!
X, y = df[['text']], df[['is_humor']]

In [6]:
# 80/20 split with a fixed seed; the last expression reports the four shapes.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=21
)
"X_train and y_train shape: {0}, {1}, X_test and y_test shape {2}, {3}".format(
    *(part.shape for part in (X_train, y_train, X_test, y_test))
)

'X_train and y_train shape: (6400, 1), (6400, 1), X_test and y_test shape (1600, 1), (1600, 1)'

In [7]:
# Class balance of the training labels.
# The label column is selected explicitly so the shares are scalars; summing the
# one-column DataFrame produced a Series whose repr was printed mid-sentence.
total = len(y_train)
positives = y_train['is_humor'].sum()
print('% of positives is {:.2f}'.format(positives / total))
print('% of negatives is {:.2f}'.format((total - positives) / total))

% of positives is  is_humor    0.61
dtype: float64
% of negatives is  is_humor    0.39
dtype: float64


In [8]:
X_train['text'] = pre_proc(X_train['text'])
X_test['text'] = pre_proc(X_test['text'])

# Bag-of-words counts restricted to the 4,000 most frequent terms.
vectorizer = CountVectorizer(max_features=4_000)
train_counts = vectorizer.fit_transform(X_train['text'])

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# (available since 1.0) and fall back to the old name only on older releases.
feature_names = (vectorizer.get_feature_names_out()
                 if hasattr(vectorizer, 'get_feature_names_out')
                 else vectorizer.get_feature_names())

# Dense frames keep the original row index so features stay aligned with the labels.
X_train_trans = pd.DataFrame(train_counts.toarray()
                             , columns = feature_names
                             , index = X_train.index)
X_train_trans['qtd_words'] = X_train['text'].apply(count_words)  # extra feature: token count

# The test split is only transformed with the vocabulary learned on the training split.
X_test_trans = pd.DataFrame(vectorizer.transform(X_test['text']).toarray()
                            , columns = feature_names
                            , index = X_test.index)
X_test_trans['qtd_words'] = X_test['text'].apply(count_words)

In [9]:
# Multinomial Naive Bayes on the count features, scored with F1 on the held-out split.
clf = MultinomialNB().fit(X_train_trans, y_train['is_humor'])

y_pred = clf.predict(X_test_trans)
print(f1_score(y_test['is_humor'], y_pred))

0.8822947576656776


saving the model

In [10]:
# Persist the fitted vocabulary (only the term -> column mapping, not the
# vectorizer object) and the fitted classifier.
for _path, _payload in (('1a_vectorizer.pkl', vectorizer.vocabulary_),
                        ('1a_clf.pkl', clf)):
    with open(_path, 'wb') as file:
        pickle.dump(_payload, file)

In [30]:
# Drop the 1a data, split and feature matrices; the next task rebinds these names.
del X, y
del X_train, X_test, y_train, y_test
del vectorizer, X_train_trans, X_test_trans

### 1b => humor_rating

In [31]:
# Task 1b is restricted to rows with is_humor == 1. Do not reset the index!
X = df.loc[df['is_humor'] == 1, ['text']]
y = df.loc[df['is_humor'] == 1, ['humor_rating']]

In [32]:
# 80/20 split with a fixed seed; the last expression reports the four shapes.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=21
)
"X_train and y_train shape: {0}, {1}, X_test and y_test shape {2}, {3}".format(
    *(part.shape for part in (X_train, y_train, X_test, y_test))
)

'X_train and y_train shape: (3945, 1), (3945, 1), X_test and y_test shape (987, 1), (987, 1)'

In [33]:
X_train['text'] = pre_proc(X_train['text'])
X_test['text'] = pre_proc(X_test['text'])

# Bag-of-words counts over the full training vocabulary (no size limit).
vectorizer = CountVectorizer()
train_counts = vectorizer.fit_transform(X_train['text'])

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# (available since 1.0) and fall back to the old name only on older releases.
feature_names = (vectorizer.get_feature_names_out()
                 if hasattr(vectorizer, 'get_feature_names_out')
                 else vectorizer.get_feature_names())

# Dense frames keep the original row index so features stay aligned with the targets.
X_train_trans = pd.DataFrame(train_counts.toarray()
                             , columns = feature_names
                             , index = X_train.index)
X_train_trans['qtd_words'] = X_train['text'].apply(count_words)  # extra feature: token count

# The test split is only transformed with the vocabulary learned on the training split.
X_test_trans = pd.DataFrame(vectorizer.transform(X_test['text']).toarray()
                            , columns = feature_names
                            , index = X_test.index)
X_test_trans['qtd_words'] = X_test['text'].apply(count_words)


In [34]:
# l1_ratio = 0.0 leaves only the L2 part of the elastic-net penalty.
# NOTE(review): the recorded run printed a warning fragment under this cell,
# presumably a convergence warning from the solver -- confirm the fit is acceptable.
reg = ElasticNet(l1_ratio = 0.0)
reg.fit(X_train_trans, y_train)

y_pred = reg.predict(X_test_trans)
# RMSE as the square root of the MSE: the `squared=False` shortcut is deprecated
# since scikit-learn 1.4 and no longer accepted by later releases.
mean_squared_error(y_test, y_pred) ** 0.5

  positive)


0.5508491704701347

saving the model

In [35]:
# Persist the fitted vocabulary (only the term -> column mapping, not the
# vectorizer object) and the fitted regressor.
for _path, _payload in (('1b_vectorizer.pkl', vectorizer.vocabulary_),
                        ('1b_reg.pkl', reg)):
    with open(_path, 'wb') as file:
        pickle.dump(_payload, file)

In [18]:
# Drop the 1b data, split and feature matrices; the next task rebinds these names.
del X, y
del X_train, X_test, y_train, y_test
del vectorizer, X_train_trans, X_test_trans

### 1c => humor_controversy

In [19]:
# Task 1c is restricted to rows with is_humor == 1 (4932 rows). Do not reset the index!
X = df.loc[df['is_humor'] == 1, ['text']]
y = df.loc[df['is_humor'] == 1, ['humor_controversy']]

In [20]:
# 80/20 split with a fixed seed; the last expression reports the four shapes.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=21
)
"X_train and y_train shape: {0}, {1}, X_test and y_test shape {2}, {3}".format(
    *(part.shape for part in (X_train, y_train, X_test, y_test))
)

'X_train and y_train shape: (3945, 1), (3945, 1), X_test and y_test shape (987, 1), (987, 1)'

In [21]:
# Class balance of the training labels.
# The label column is selected explicitly so the shares are scalars; summing the
# one-column DataFrame produced a Series whose repr was printed mid-sentence.
total = len(y_train)
positives = y_train['humor_controversy'].sum()
print('% of positives is {:.4f}'.format(positives / total))
print('% of negatives is {:.4f}'.format((total - positives) / total))

% of positives is  humor_controversy    0.5037
dtype: float64
% of negatives is  humor_controversy    0.4963
dtype: float64


In [22]:
X_train['text'] = pre_proc(X_train['text'])
X_test['text'] = pre_proc(X_test['text'])

# min_df=1000 keeps only terms that occur in at least 1000 training documents.
# NOTE(review): with about 3.9k training rows this leaves a very small
# vocabulary -- confirm that min_df (rather than max_features) is intended.
vectorizer = CountVectorizer(min_df=1000)
train_counts = vectorizer.fit_transform(X_train['text'])

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# (available since 1.0) and fall back to the old name only on older releases.
feature_names = (vectorizer.get_feature_names_out()
                 if hasattr(vectorizer, 'get_feature_names_out')
                 else vectorizer.get_feature_names())

# Dense frames keep the original row index so features stay aligned with the labels.
X_train_trans = pd.DataFrame(train_counts.toarray()
                             , columns = feature_names
                             , index = X_train.index)
X_train_trans['qtd_words'] = X_train['text'].apply(count_words)  # extra feature: token count

# The test split is only transformed with the vocabulary learned on the training split.
X_test_trans = pd.DataFrame(vectorizer.transform(X_test['text']).toarray()
                            , columns = feature_names
                            , index = X_test.index)
X_test_trans['qtd_words'] = X_test['text'].apply(count_words)

# Multinomial Naive Bayes, scored with F1 on the held-out split.
clf = MultinomialNB()
clf.fit(X_train_trans, y_train['humor_controversy'])

y_pred = clf.predict(X_test_trans)
print(f1_score(y_test['humor_controversy'], y_pred))

0.549738219895288


saving the model

In [23]:
# Persist the fitted vocabulary (only the term -> column mapping, not the
# vectorizer object) and the fitted classifier.
for _path, _payload in (('1c_vectorizer.pkl', vectorizer.vocabulary_),
                        ('1c_clf.pkl', clf)):
    with open(_path, 'wb') as file:
        pickle.dump(_payload, file)

In [24]:
# Drop the 1c data, split and feature matrices; the next task rebinds these names.
del X, y
del X_train, X_test, y_train, y_test
del vectorizer, X_train_trans, X_test_trans

### 2a => offense_rating

In [25]:
# Task 2a uses every row of the dataset. Do not reset the index!
X, y = df[['text']], df[['offense_rating']]

In [26]:
# 80/20 split with a fixed seed; the last expression reports the four shapes.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=21
)
"X_train and y_train shape: {0}, {1}, X_test and y_test shape {2}, {3}".format(
    *(part.shape for part in (X_train, y_train, X_test, y_test))
)

'X_train and y_train shape: (6400, 1), (6400, 1), X_test and y_test shape (1600, 1), (1600, 1)'

In [27]:
X_train['text'] = pre_proc(X_train['text'])
X_test['text'] = pre_proc(X_test['text'])

# Bag-of-words counts restricted to the 500 most frequent terms.
vectorizer = CountVectorizer(max_features = 500)
train_counts = vectorizer.fit_transform(X_train['text'])

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# (available since 1.0) and fall back to the old name only on older releases.
feature_names = (vectorizer.get_feature_names_out()
                 if hasattr(vectorizer, 'get_feature_names_out')
                 else vectorizer.get_feature_names())

# Dense frames keep the original row index so features stay aligned with the targets.
X_train_trans = pd.DataFrame(train_counts.toarray()
                             , columns = feature_names
                             , index = X_train.index)
X_train_trans['qtd_words'] = X_train['text'].apply(count_words)  # extra feature: token count

# The test split is only transformed with the vocabulary learned on the training split.
X_test_trans = pd.DataFrame(vectorizer.transform(X_test['text']).toarray()
                            , columns = feature_names
                            , index = X_test.index)
X_test_trans['qtd_words'] = X_test['text'].apply(count_words)


# "reg:squarederror" is the current name of the squared-loss objective;
# "reg:linear" is its deprecated alias.
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)
xgb_model.fit(X_train_trans, y_train)

y_pred_xgb = xgb_model.predict(X_test_trans)
mse=mean_squared_error( y_test, y_pred_xgb)

# RMSE on the held-out split.
print((mse)**(1/2))

0.8121392197691781


saving the model

In [28]:
# Persist the fitted vocabulary (only the term -> column mapping, not the
# vectorizer object) and the fitted XGBoost regressor.
for _path, _payload in (('2a_vectorizer.pkl', vectorizer.vocabulary_),
                        ('2a_reg.pkl', xgb_model)):
    with open(_path, 'wb') as file:
        pickle.dump(_payload, file)