In [33]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import GridSearchCV
import os.path
import pickle

In [34]:
train = pd.read_pickle('train_pickle.pkl');
test = pd.read_pickle('test_pickle.pkl');

In [35]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text_final
0,0000997932d777bf,"[explanation, why, the, edits, made, under, my...",0,0,0,0,0,0,"[explanation, edits, make, username, hardcore,..."
1,000103f0d9cfb60f,"[d'aww, !, he, matches, this, background, colo...",0,0,0,0,0,0,"[match, background, colour, seemingly, stick, ..."
2,000113f07ec002fd,"[hey, man, ,, i, 'm, really, not, trying, to, ...",0,0,0,0,0,0,"[hey, man, really, try, edit, war, guy, consta..."
3,0001b41b1c6bb37e,"[``, more, i, ca, n't, make, any, real, sugges...",0,0,0,0,0,0,"[ca, make, real, suggestion, improvement, wond..."
4,0001d958c54c6e35,"[you, ,, sir, ,, are, my, hero, ., any, chance...",0,0,0,0,0,0,"[sir, hero, chance, remember, page]"


In [36]:
test.head()

Unnamed: 0,id,comment_text,comment_text_final
0,00001cee341fdb12,"[yo, bitch, ja, rule, is, more, succesful, the...","[yo, bitch, ja, rule, succesful, ever, whats, ..."
1,0000247867823ef7,"[==, from, rfc, ==, the, title, is, fine, as, ...","[rfc, title, fine, imo]"
2,00013b17ad220c46,"[``, ==, sources, ==, *, zawe, ashton, on, lap...","[source, zawe, ashton, lapland]"
3,00017563c3f7919a,"[:, if, you, have, a, look, back, at, the, sou...","[look, back, source, information, update, corr..."
4,00017695ad8997eb,"[i, do, n't, anonymously, edit, articles, at, ...","[anonymously, edit, article]"


In [37]:
vec = TfidfVectorizer(ngram_range=(1, 2), min_df=3,
                      max_df=0.9, strip_accents='unicode', use_idf=1, smooth_idf=1, sublinear_tf=1)
train['comment_text_final'] = [" ".join(text) for text in train['comment_text_final'].values]
vec = vec.fit(train['comment_text_final'])

In [38]:
X_train, X_test, y_train, y_test = train_test_split(train['comment_text_final'], 
                                                    train['toxic'], test_size=0.33, random_state=42)

In [39]:
train_term_doc = vec.transform(X_train)
test_term_doc = vec.transform(X_test)

In [40]:
scaler = MaxAbsScaler()
train_term_doc = scaler.fit_transform(train_term_doc)
test_term_doc = scaler.fit_transform(test_term_doc)

In [42]:
if os.path.isfile('ridge_lr.sav') and os.path.isfile('lasso_lr.sav'):
    ridge_logistic_regressor_grid_cv = pickle.load(open('ridge_lr.sav', 'rb'))
    lasso_logistic_regressor_grid_cv = pickle.load(open('lasso_lr.sav', 'rb'))
else:
    ridge_logistic_regressor = LogisticRegression(penalty="l2", solver="liblinear")
    lasso_logistic_regressor = LogisticRegression(penalty="l1", solver="liblinear")

    ridge_logistic_regressor_grid_cv = GridSearchCV(estimator=ridge_logistic_regressor, 
                                                 param_grid={'C':np.logspace(-4, 4, 20)}, cv= 5, iid=False)
    lasso_logistic_regressor_grid_cv = GridSearchCV(estimator=lasso_logistic_regressor, 
                                                 param_grid={'C':np.logspace(-4, 4, 20)}, cv= 5, iid=False)

    ridge_logistic_regressor_grid_cv.fit(train_term_doc, y_train)
    lasso_logistic_regressor_grid_cv.fit(train_term_doc, y_train)

    pickle.dump(ridge_logistic_regressor_grid_cv, open('ridge_lr.sav', 'wb'))
    pickle.dump(lasso_logistic_regressor_grid_cv, open('lasso_lr.sav', 'wb'))

In [43]:
ridge_train_pred = ridge_logistic_regressor_grid_cv.predict(train_term_doc)
lasso_train_pred = lasso_logistic_regressor_grid_cv.predict(train_term_doc)

ridge_test_pred = ridge_logistic_regressor_grid_cv.predict(test_term_doc)
lasso_test_pred = lasso_logistic_regressor_grid_cv.predict(test_term_doc)

ridge_train_accuracy = ridge_logistic_regressor_grid_cv.score(train_term_doc, y_train)
ridge_train_f1_score = f1_score(y_train, ridge_train_pred)

lasso_train_accuracy = lasso_logistic_regressor_grid_cv.score(train_term_doc, y_train)
lasso_train_f1_score = f1_score(y_train, lasso_train_pred)

ridge_pred = ridge_logistic_regressor_grid_cv.predict(test_term_doc)
lasso_pred = lasso_logistic_regressor_grid_cv.predict(test_term_doc)

ridge_test_accuracy = ridge_logistic_regressor_grid_cv.score(test_term_doc, y_test)
ridge_test_f1_score = f1_score(y_test, ridge_test_pred)

lasso_test_accuracy = lasso_logistic_regressor_grid_cv.score(test_term_doc, y_test)
lasso_test_f1_score = f1_score(y_test, lasso_test_pred)

print("Ridge Train Accuracy - " + str(ridge_train_accuracy))
print("Lasso Train Accuracy - " + str(lasso_train_accuracy) + '\n')

print("Ridge Train F1 Score - " + str(ridge_train_f1_score))
print("Lasso Train F1 Score - " + str(lasso_train_f1_score) + '\n')

print("Ridge Test Accuracy - " + str(ridge_test_accuracy))
print("Lasso Test Accuracy - " + str(lasso_test_accuracy) + '\n')

print("Ridge Test F1 Score - " + str(ridge_test_f1_score))
print("Lasso Test F1 Score - " + str(lasso_test_f1_score))

Ridge Train Accuracy - 0.998615683926968
Lasso Train Accuracy - 0.976728524393894

Ridge Train F1 Score - 0.9927365528072242
Lasso Train F1 Score - 0.8675327441167074

Ridge Test Accuracy - 0.9560379042518847
Lasso Test Accuracy - 0.9569494293473101

Ridge Test F1 Score - 0.7434334478554804
Lasso Test F1 Score - 0.7611421346538826
