In [35]:
import pandas as pd
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix


In [2]:
stops = stopwords.words("english")
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
train_df['comment_text'].fillna("unknown", inplace=True)
test_df['comment_text'].fillna("unknown", inplace=True)

X = train_df['comment_text'].values
y = train_df[['toxic','severe_toxic','obscene','threat','insult','identity_hate']].values
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

X_test = test_df.drop("id", axis =1).values.ravel()
#X_test.shape

def text_prepare(text):
    text = [i for i in text.split() if i not in stops]
    return " ".join(text)

X_train = [text_prepare(x) for x in X_train]
X_val = [text_prepare(x) for x in X_val]
X_test = [text_prepare(x) for x in X_test]


In [3]:
tfidf_vectorizer = TfidfVectorizer(min_df=5,max_df = 0.9,ngram_range=(1, 3))
tfidf_vectorizer.fit(X_train)
X_train_tfid = tfidf_vectorizer.transform(X_train)
X_val_tfid = tfidf_vectorizer.transform(X_val) #validation
X_test_tfid = tfidf_vectorizer.transform(X_test) #test

X_train_tfid

<111699x140456 sparse matrix of type '<class 'numpy.float64'>'
	with 5172475 stored elements in Compressed Sparse Row format>

In [26]:
##taking c= 4 where it produced the best score from below
model_tfidf = OneVsRestClassifier(LogisticRegression(C = 4)).fit(X_train_tfid, y_train)
predicted_val_tfidf = model_tfidf.predict(X_val_tfid) 

#predicted_test_tfidf = model_tfidf.predict(X_test_tfid) 





In [48]:
#'macro': Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account.
def get_score(y_val, pred):
    accuracy = accuracy_score(y_val, pred)
    f1 = f1_score(y_val, pred,average='macro')
    precision = average_precision_score(y_val, pred,average='macro')
    recall = recall_score(y_val, pred, average='macro')
    #confu_matrix = confusion_matrix(y_val, pred, labels=['toxic','severe_toxic','obscene','threat','insult','identity_hate'])
    roc_auc = roc_auc_score(y_val, pred, average='macro')
    print("accuracy: ", accuracy)
    print("f1: ",f1)
    print("precision: ",precision)
    print("recall", recall)
    #print("confusion matrix:", confu_matrix)
    print("ROC AUC:", roc_auc)
    
###2*2 confusion matrix for each class
labels = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
conf_mat_dict={}
acc_dict = {}
for label_col in range(len(labels)):
    y_val_label = y_val[:, label_col]
    y_pred_label = predicted_tfidf[:, label_col]
    acc_dict[labels[label_col]] = accuracy_score(y_val_label,y_pred_label)
    conf_mat_dict[labels[label_col]] = confusion_matrix(y_pred=y_pred_label, y_true=y_val_label)
    
for label, matrix in conf_mat_dict.items():
    print("For label {}:".format(label))
    print("Confusion Matrix:\n", matrix)
    print("Acc:", acc_dict[label])
    print("\n")

    
### General scores
print("Metrics in general")
get_score(y_val, predicted_tfidf)
#cf = confusion_matrix(y_val.argmax(axis=1),predicted_tfidf.argmax(axis=1))
#print("\n Confusion matrix - y is actual, x is predict, labels are 'toxic','severe_toxic','obscene','threat','insult','identity_hate'")
#print(cf)


For label toxic:
Confusion Matrix:
 [[42955   335]
 [ 1600  2982]]
Acc: 0.9595797125668449


For label severe_toxic:
Confusion Matrix:
 [[47285   101]
 [  366   120]]
Acc: 0.9902448195187166


For label obscene:
Confusion Matrix:
 [[45153   163]
 [  902  1654]]
Acc: 0.9777531751336899


For label threat:
Confusion Matrix:
 [[47727     9]
 [  105    31]]
Acc: 0.9976186497326203


For label insult:
Confusion Matrix:
 [[45180   303]
 [ 1068  1321]]
Acc: 0.9713611296791443


For label identity_hate:
Confusion Matrix:
 [[47397    43]
 [  343    89]]
Acc: 0.9919368315508021


Metrics in general
accuracy:  0.9199740975935828
f1:  0.5295317982291321
precision:  0.3608543319515538
recall 0.4219561099564177
ROC AUC: 0.7092094152073677


In [38]:
#Changing the regularization parameter lambda in the Logit Regression function

lambdas = [10**-5, 10**-3, 10**-2, 10**-1, 1, 10, 100, 1000]

for lambda_val in lambdas:
    
    print("Lambda: ",lambda_val)
    model_lambda = OneVsRestClassifier(LogisticRegression(C = 1/lambda_val)).fit(X_train_tfid, y_train)
    predicted_tfidf_lambda = model_lambda.predict(X_val_tfid)
    get_score(y_val, predicted_tfidf_lambda)
    print("\n")
    
   

Lambda:  1e-05




accuracy:  0.8850058489304813
f1:  0.48295042223947804
precision:  0.2875945405211972
recall 0.4668274513372188
ROC AUC: 0.7267703081295919


Lambda:  0.001




accuracy:  0.9088611296791443
f1:  0.5295034824809032
precision:  0.33946516210398975
recall 0.4565927771486305
ROC AUC: 0.7247690142216304


Lambda:  0.01




accuracy:  0.9161096256684492
f1:  0.5428371624109026
precision:  0.35784428362725706
recall 0.45664671163089415
ROC AUC: 0.725582912326629


Lambda:  0.1




accuracy:  0.9194727606951871
f1:  0.5391209364943597
precision:  0.3646493574753645
recall 0.43838140361143624
ROC AUC: 0.7171229970042269


Lambda:  1




accuracy:  0.9163602941176471
f1:  0.44332867671987813
precision:  0.3037272489789623
recall 0.3351844217713172
ROC AUC: 0.6663836087124374


Lambda:  10


  'precision', 'predicted', average, warn_for)


accuracy:  0.9049339906417112
f1:  0.20649623693823527
precision:  0.15055917207699201
recall 0.12957133728043302
ROC AUC: 0.5645940975773734


Lambda:  100


  'precision', 'predicted', average, warn_for)


accuracy:  0.8983748328877005
f1:  0.0013191496117279918
precision:  0.03744653552176972
recall 0.0006609406019836301
ROC AUC: 0.5003304703009918


Lambda:  1000




accuracy:  0.8983330548128342
f1:  0.0
precision:  0.03683781751336898
recall 0.0
ROC AUC: 0.5




  'precision', 'predicted', average, warn_for)


In [34]:
lambda_vals = [0.55,0.1,0.25,0.5]    

for lambda_val in lambda_vals:
    print("Lambda: ",lambda_val)
    model_lambda = OneVsRestClassifier(LogisticRegression(C = 1/lambda_val)).fit(X_train_tfid, y_train)
    predicted_tfidf_lambda = model_lambda.predict(X_val_tfid)
    get_score(y_val, predicted_tfidf_lambda)
    print("\n")

Lambda:  0.55




accuracy:  0.9178434157754011
f1:  0.48850229733571066
precision:  0.33284801146967463


Lambda:  0.1




accuracy:  0.9194727606951871
f1:  0.5391209364943597
precision:  0.3646493574753645


Lambda:  0.25




accuracy:  0.9199740975935828
f1:  0.5295317982291321
precision:  0.3608543319515538


Lambda:  0.5




accuracy:  0.9183447526737968
f1:  0.49606877726376936
precision:  0.33783332153204376




In [35]:
#l1 regulation
model_l1 = OneVsRestClassifier(LogisticRegression(C= 1/0.25,penalty="l1")).fit(X_train_tfid, y_train)
predicted_tfidf_l1 = model_lambda.predict(X_val_tfid)
get_score(y_val, predicted_tfidf_l1)



accuracy:  0.9183447526737968
f1:  0.49606877726376936
precision:  0.33783332153204376
