### Toxic Comment Classification

In [1]:
#importing libraries
%matplotlib inline
import pandas as pd
import numpy as np
#warnings
import warnings
warnings.filterwarnings('ignore') #to ignore the warnings in jupyter notebooks
#visualisation
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
#nlp
import nltk
from nltk.corpus import stopwords
import re
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
#pickle & joblib & tqdm
import pickle
import joblib
from tqdm import tqdm
#sklearn
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn import metrics
from sklearn.metrics import f1_score,precision_score,recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier  # used by the SGD grid-search cell
from skmultilearn.adapt import MLkNN

### Feature Engineering

In [2]:
# Load the comment frames and the training labels that the previous notebook pickled.
X_train, X_test, y_train = (
    pd.read_pickle(pkl_path)
    for pkl_path in ('X_train.pkl', 'X_test.pkl', 'y_train.pkl')
)

In [3]:
# Report the (rows, columns) shape of each loaded frame.
for caption, frame in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("Train Labels shape:", y_train),
):
    print(caption, frame.shape)

X_train shape: (159571, 2)
X_test shape: (153164, 2)
Train Labels shape: (159571, 6)


In [4]:
# List the column names of each frame as plain Python lists.
print("X_train columns", X_train.columns.tolist())
print("X_test columns", X_test.columns.tolist())
print("y_train columns", y_train.columns.tolist())

X_train columns ['id', 'comments']
X_test columns ['id', 'comments']
y_train columns ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


TF-IDF vectorizer

In [5]:
# TF-IDF weighting is used for the comment text instead of raw bag-of-words counts.
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
from sklearn.feature_extraction.text import TfidfVectorizer

# The on/off options are passed as real booleans: scikit-learn's parameter
# validation expects bool here, and True behaves exactly like the former 1.
tf = TfidfVectorizer(
    ngram_range=(1, 2),   # single words and word pairs
    min_df=3,             # ignore terms found in fewer than 3 comments
    max_df=0.9,           # ignore terms found in more than 90% of comments
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,    # scale term frequency as 1 + log(tf)
)

In [6]:
# Fit the vectorizer on the training comments and build their TF-IDF matrix in one step.
train = tf.fit_transform(X_train['comments'])

In [7]:
# Show the matrix dimensions, then the (row, column) index arrays of the
# non-zero entries of the first training comment.
print("Comments Dimensions:", train.shape)
first_train_row = train[0]
print("Non-Zeros:", first_train_row.nonzero())

Comments Dimensions: (159571, 327390)
Non-Zeros: (array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0], dtype=int32), array([253971, 199520, 274353, 276197, 232043, 186113, 208633,  13623,
       181098, 306059, 186825, 282782, 236474, 169593, 117979, 162351,
        81328, 313779,  93679, 235825, 253501, 198206, 273803, 276036,
       231472, 208374,  13204,  94429,  77936, 325466, 180611, 305977,
       107771,  51574, 303155, 183544, 282620, 236116,  96668, 169592,
       117975, 301326, 161036,  80683, 313712,  93579], dtype=int32))


In [8]:
# Vectorize the test comments with the already-fitted vectorizer (transform only,
# no refit), then inspect the result the same way as the training matrix.
test = tf.transform(X_test['comments'])
print("Comments Dimensions:", test.shape)
first_test_row = test[0]
print("Non-Zeros:", first_test_row.nonzero())

Comments Dimensions: (153164, 327390)
Non-Zeros: (array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0], dtype=int32), array([325407, 323194, 313143, 313125, 313107, 312293, 298241, 293324,
       287253, 283544, 283294, 269029, 255444, 252132, 250796, 240485,
       239545, 237903, 237564, 214734, 187810, 181606, 181500, 175460,
       162667, 162616, 152399, 144776, 144775, 140367, 140365, 118295,
       116194, 109913, 108953, 106203,  94478,  94452,  90068,  78132,
        76590,  66613,  38850,  37740,  34203,  31311,  23560],
      dtype=int32))


In [10]:
# Load the test labels from CSV.
# NOTE(review): y_test is not referenced by any later cell visible in this notebook.
y_test = pd.read_csv('test_labels.csv')

## Classical Models

### Logistic regression LB=(0.975)

##### The one-vs-the-rest (OvR) multiclass/multilabel strategy is used to fit the multilabel problem.

In [None]:
# Grid-search the hyperparameters of a one-vs-rest logistic regression with
# 5-fold cross-validation.
#
# solver='saga' is required because the grid covers both L1 and L2 penalties:
# 'sag' supports only L2, so every 'l1' candidate would fail to fit and be
# scored NaN (silently, because warnings are suppressed in this notebook).
# random_state pins the stochastic solver so the search is reproducible.
model_to_set = OneVsRestClassifier(
    LogisticRegression(class_weight='balanced', solver='saga', random_state=42)
)
# f1_macro: compute F1 per label and take the unweighted mean, so label
# imbalance is not taken into account.
score = 'f1_macro'
parameters = {
    "estimator__C": [2, 4, 6],
    'estimator__penalty': ['l1', 'l2'],
}
model = GridSearchCV(
    model_to_set,
    param_grid=parameters,
    cv=5,          # 5-fold cross-validation
    scoring=score,
    n_jobs=-1,
    verbose=1,
)
model.fit(train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 30.6min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=OneVsRestClassifier(estimator=LogisticRegression(C=1.0,
                                                                        class_weight='balanced',
                                                                        dual=False,
                                                                        fit_intercept=True,
                                                                        intercept_scaling=1,
                                                                        l1_ratio=None,
                                                                        max_iter=100,
                                                                        multi_class='auto',
                                                                        n_jobs=None,
                                                                        penalty='l2',
                                                                        random_stat

In [11]:
# Persist the fitted grid-search object to disk so it can be reloaded without retraining.
filename = 'hyper_one.sav'
joblib.dump(model, filename)

In [39]:
# Reload the saved grid search. Per the original note, the weights loaded here were
# fitted without class balancing.
# NOTE(review): that differs from the class_weight='balanced' search defined above —
# confirm which run produced hyper_one.sav.
loaded_model = joblib.load('hyper_one.sav')

In [14]:
# Display the best estimator stored in the reloaded grid search.
loaded_model.best_estimator_

OneVsRestClassifier(estimator=LogisticRegression(C=4, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [None]:
# Display the winning hyperparameter combination of the in-memory grid search.
model.best_params_

{'estimator__C': 6, 'estimator__penalty': 'l2'}

In [None]:
# Display the best estimator of the in-memory grid search.
model.best_estimator_

OneVsRestClassifier(estimator=LogisticRegression(C=4, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [15]:
# Predict per-label probabilities for the test matrix with the reloaded model.
log_pred = loaded_model.predict_proba(test)

In [16]:
# Wrap the logistic-regression probabilities in a frame with one named column per label.
pred_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
pred_df = pd.DataFrame(data=log_pred, columns=pred_cols)

In [17]:
# Put the comment ids next to the predicted probabilities and write the submission file.
idd = X_test['id']
sub = pd.concat([idd, pred_df], axis='columns')
sub.to_csv('log_sub.csv', index=False)

### MLKNN (model fails) (accuracy not tested on LB)

##### kNN classification method adapted for multi-label classification.
##### MLkNN uses k-nearest neighbours to find the nearest examples to a test instance and uses Bayesian inference to select the assigned labels.
<b>It takes X and y as a scipy.sparse matrix or numpy.ndarray only.</b><br>
<b><i>No hyperparameter tuning was done in this case because it is costlier to compute than the one-vs-rest models.</i></b>

In [None]:
!pip install scikit-multilearn

Collecting scikit-multilearn
[?25l  Downloading https://files.pythonhosted.org/packages/bb/1f/e6ff649c72a1cdf2c7a1d31eb21705110ce1c5d3e7e26b2cc300e1637272/scikit_multilearn-0.2.0-py3-none-any.whl (89kB)
[K     |████████████████████████████████| 92kB 4.6MB/s eta 0:00:011
[?25hInstalling collected packages: scikit-multilearn
Successfully installed scikit-multilearn-0.2.0


In [19]:
y = y_train.to_numpy() # MLkNN is given the labels as a numpy.ndarray instead of a DataFrame

In [None]:
# Fit MLkNN on the TF-IDF training matrix and the label array.
from skmultilearn.adapt import MLkNN
classifier = MLkNN(k=2)# k is kept small: with more neighbours, training and prediction become too expensive
classifier.fit(train, y)

MLkNN(ignore_first_neighbours=0, k=2, s=1.0)

In [None]:
# Persist the fitted MLkNN classifier to mlknn.sav.
filename = 'mlknn.sav'
joblib.dump(classifier, filename)

['mlknn.sav']

In [21]:
# Reload the saved MLkNN classifier; this rebinds loaded_model, replacing the model loaded earlier.
loaded_model = joblib.load('mlknn.sav')

In [None]:
pre = loaded_model.predict_proba(test).toarray() # predict_proba returns a sparse matrix here; convert it to a dense array

In [None]:
# Wrap the MLkNN probabilities in a frame with one named column per label.
pred_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
pred_df = pd.DataFrame(data=pre, columns=pred_cols)

In [None]:
# Put the comment ids next to the MLkNN probabilities and write the submission file.
idd = X_test['id']
sub = pd.concat([idd, pred_df], axis='columns')
sub.to_csv('mlknn_sub.csv', index=False)

In [None]:
sub # the MLkNN model's accuracy is too low and its compute time too high; the output below shows the same probabilities on every displayed row

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.000119,0.000056,0.0001,0.000006,0.000075,0.000038
1,0000247867823ef7,0.000119,0.000056,0.0001,0.000006,0.000075,0.000038
2,00013b17ad220c46,0.000119,0.000056,0.0001,0.000006,0.000075,0.000038
3,00017563c3f7919a,0.000119,0.000056,0.0001,0.000006,0.000075,0.000038
4,00017695ad8997eb,0.000119,0.000056,0.0001,0.000006,0.000075,0.000038
...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,0.000119,0.000056,0.0001,0.000006,0.000075,0.000038
153160,fffd7a9a6eb32c16,0.000119,0.000056,0.0001,0.000006,0.000075,0.000038
153161,fffda9e8d6fafa9e,0.000119,0.000056,0.0001,0.000006,0.000075,0.000038
153162,fffe8f1340a79fc2,0.000119,0.000056,0.0001,0.000006,0.000075,0.000038


### SGD (LB=0.97543)

In [41]:
# Grid-search a one-vs-rest linear classifier trained with SGD, using 5-fold
# cross-validation. loss='log' makes each binary model a logistic regression.
# SGDClassifier comes from sklearn.linear_model and is imported in the notebook's import cell.
# NOTE(review): scikit-learn >= 1.1 names this loss 'log_loss'; update the grid when upgrading.
params = {
    'estimator__alpha': [0.00001, 0.0001],
    'estimator__loss': ['log'],
    'estimator__penalty': ['l2', 'l1'],
}
# class_weight='balanced' reweights the classes; random_state pins SGD's data
# shuffling so repeated runs of the search give the same result.
clf = OneVsRestClassifier(
    SGDClassifier(class_weight='balanced', random_state=42),
    n_jobs=-1,
)
model = GridSearchCV(
    clf,
    param_grid=params,
    scoring='f1_weighted',
    cv=5,
    verbose=1,
    n_jobs=-1,
)
model.fit(train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  5.1min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=OneVsRestClassifier(estimator=SGDClassifier(alpha=0.0001,
                                                                   average=False,
                                                                   class_weight='balanced',
                                                                   early_stopping=False,
                                                                   epsilon=0.1,
                                                                   eta0=0.0,
                                                                   fit_intercept=True,
                                                                   l1_ratio=0.15,
                                                                   learning_rate='optimal',
                                                                   loss='hinge',
                                                                   max_iter=1000,
                                       

In [22]:
# Persist the fitted SGD grid search to disk.
filename = 'sgd.sav'
joblib.dump(model, filename)

In [26]:
# Reload the saved SGD grid search; this rebinds loaded_model again.
loaded_model = joblib.load('sgd.sav')

In [27]:
# Display the predicted per-label probabilities for the test matrix (result is not stored).
loaded_model.predict_proba(test)

array([[0.99966423, 0.92005793, 0.99976254, 0.62116948, 0.99674848,
        0.95145158],
       [0.05019689, 0.02369676, 0.02340761, 0.00894582, 0.05032426,
        0.03814973],
       [0.23344182, 0.03763918, 0.10169536, 0.01126754, 0.11937999,
        0.05837812],
       ...,
       [0.02723207, 0.00893921, 0.03263597, 0.0096157 , 0.0255035 ,
        0.01695951],
       [0.11853469, 0.01440857, 0.10063712, 0.01543782, 0.05585723,
        0.15905622],
       [0.97205806, 0.02028746, 0.88900316, 0.01820378, 0.7827824 ,
        0.07705995]])

In [29]:
# Display the winning hyperparameter combination of the reloaded SGD grid search.
loaded_model.best_params_

{'estimator__alpha': 1e-05,
 'estimator__loss': 'log',
 'estimator__penalty': 'l2'}

In [30]:
# Keep the test-set probabilities for the submission (same predict_proba call as displayed earlier).
test_values=loaded_model.predict_proba(test)

In [24]:
# Wrap the SGD probabilities in a frame with one named column per label.
pred_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
pred_df = pd.DataFrame(data=test_values, columns=pred_cols)

In [None]:
# Put the comment ids next to the SGD probabilities and write the submission file.
idd = X_test['id']
sub = pd.concat([idd, pred_df], axis='columns')
sub.to_csv('sgd1_sub.csv', index=False)

### Naive Bayes (LB=0.94074)

Multinomial NB

In [None]:
'''only alpha hyperparameter has been passed in grid search cv params'''
params = {'estimator__alpha':[0.001,0.01,1]}
clf = OneVsRestClassifier(MultinomialNB(), n_jobs=-1)
model = GridSearchCV(clf, param_grid=params, scoring = 'f1_macro', cv=3, verbose=1,n_jobs=-1)
model.fit(train, y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    8.6s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=OneVsRestClassifier(estimator=MultinomialNB(alpha=1.0,
                                                                   class_prior=None,
                                                                   fit_prior=True),
                                           n_jobs=-1),
             iid='deprecated', n_jobs=-1,
             param_grid={'estimator__alpha': [0.001, 0.01, 1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_macro', verbose=1)

In [None]:
# Persist the fitted Naive Bayes grid search to disk.
joblib.dump(model, 'naive.sav')

['naive.sav']

In [32]:
# Reload the saved Naive Bayes grid search; this rebinds loaded_model again.
loaded_model = joblib.load('naive.sav')

In [33]:
# Display the winning alpha of the reloaded Naive Bayes grid search.
loaded_model.best_params_

{'estimator__alpha': 0.01}

In [34]:
# Display the best estimator stored in the reloaded Naive Bayes grid search.
loaded_model.best_estimator_

OneVsRestClassifier(estimator=MultinomialNB(alpha=0.01, class_prior=None,
                                            fit_prior=True),
                    n_jobs=-1)

In [36]:
# Predict per-label probabilities for the test matrix with the reloaded Naive Bayes model.
nav_pred = loaded_model.predict_proba(test)

In [37]:
# Wrap the Naive Bayes probabilities in a frame with one named column per label.
pred_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
pred_df = pd.DataFrame(data=nav_pred, columns=pred_cols)

In [38]:
# Put the comment ids next to the Naive Bayes probabilities and write the submission file.
idd = X_test['id']
sub = pd.concat([idd, pred_df], axis='columns')
sub.to_csv('naive_sub.csv', index=False)