In [59]:
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.feature_extraction.text import HashingVectorizer

In [8]:
data = pd.read_csv("train.csv")
data = data.drop('id', 1)

In [27]:
x = data['comment_text']
y = data.drop('comment_text', 1)
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=0.2, random_state=0)

In [71]:
yTrain.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
33595,0,0,0,0,0,0
2087,0,0,0,0,0,0
42406,0,0,0,0,0,0
62258,0,0,0,0,0,0
76244,1,0,0,0,0,0


In [49]:
vectorizer = HashingVectorizer(n_features=10000)
vector = vectorizer.transform(xTrain)

In [50]:
print vector.shape
print vector.toarray()

(76680, 10000)
[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


In [53]:
testVector = vectorizer.transform(xTest)

In [60]:
dt = DecisionTreeClassifier()
dt.fit(vector, yTrain)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [62]:
dtPredict = dt.predict(testVector)
print metrics.classification_report(yTest, dtPredict)

             precision    recall  f1-score   support

          0       0.59      0.58      0.59      1841
          1       0.32      0.23      0.27       197
          2       0.62      0.64      0.63      1036
          3       0.27      0.19      0.22        54
          4       0.49      0.53      0.51       934
          5       0.22      0.14      0.17       173

avg / total       0.55      0.54      0.54      4235



In [51]:
mlp = MLPClassifier()
mlp.fit(vector, yTrain)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [57]:
mlpPredict = mlp.predict(testVector)
print metrics.classification_report(yTest, mlpPredict)

             precision    recall  f1-score   support

          0       0.66      0.69      0.68      1841
          1       0.38      0.34      0.36       197
          2       0.70      0.70      0.70      1036
          3       0.52      0.31      0.39        54
          4       0.63      0.60      0.62       934
          5       0.48      0.34      0.39       173

avg / total       0.64      0.64      0.64      4235



In [88]:
adaToxic = AdaBoostClassifier()
adaToxic.fit(vector,yTrain['toxic'])

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [89]:
adaToxicPred = adaToxic.predict(testVector)
print metrics.classification_report(yTest['toxic'], adaToxicPred)

             precision    recall  f1-score   support

          0       0.95      0.99      0.97     17330
          1       0.83      0.49      0.62      1841

avg / total       0.94      0.94      0.94     19171



In [90]:
adaSToxic = AdaBoostClassifier()
adaSToxic.fit(vector,yTrain['severe_toxic'])

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [91]:
adaSToxicPred = adaSToxic.predict(testVector)
print metrics.classification_report(yTest['severe_toxic'], adaSToxicPred)

             precision    recall  f1-score   support

          0       0.99      1.00      0.99     18974
          1       0.49      0.26      0.34       197

avg / total       0.99      0.99      0.99     19171



In [93]:
adaObscene = AdaBoostClassifier()
adaObscene.fit(vector, yTrain['obscene'])

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [94]:
adaObscenePred = adaObscene.predict(testVector)
print metrics.classification_report(yTest['obscene'], adaObscenePred)

             precision    recall  f1-score   support

          0       0.97      0.99      0.98     18135
          1       0.85      0.53      0.66      1036

avg / total       0.97      0.97      0.97     19171



In [95]:
adaThreat = AdaBoostClassifier()
adaThreat.fit(vector, yTrain['threat'])

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [99]:
adaThreatPred = adaThreat.predict(testVector)
print metrics.classification_report(yTest['threat'], adaThreatPred)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00     19117
          1       0.38      0.24      0.30        54

avg / total       1.00      1.00      1.00     19171



In [100]:
adaInsult = AdaBoostClassifier()
adaInsult.fit(vector, yTrain['insult'])

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [101]:
adaInsultPred = adaThreat.predict(testVector)
print metrics.classification_report(yTest['insult'], adaInsultPred)

             precision    recall  f1-score   support

          0       0.95      1.00      0.98     18237
          1       0.50      0.02      0.04       934

avg / total       0.93      0.95      0.93     19171



In [104]:
adaIH = AdaBoostClassifier()
adaIH.fit(vector, yTrain['identity_hate'])

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [105]:
adaIHPred = adaIH.predict(testVector)
print metrics.classification_report(yTest['identity_hate'], adaIHPred)

             precision    recall  f1-score   support

          0       0.99      1.00      1.00     18998
          1       0.50      0.27      0.35       173

avg / total       0.99      0.99      0.99     19171



In [73]:
svcToxic = SVC()
svcToxic.fit(vector,yTrain['toxic'])

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [75]:
svcToxicPred = svcToxic.predict(testVector)
print metrics.classification_report(yTest['toxic'], svcToxicPred)

             precision    recall  f1-score   support

          0       0.90      1.00      0.95     17330
          1       0.00      0.00      0.00      1841

avg / total       0.82      0.90      0.86     19171



  'precision', 'predicted', average, warn_for)


In [76]:
svcSToxic = SVC()
svcSToxic.fit(vector,yTrain['severe_toxic'])

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [77]:
svcSToxicPred = svcSToxic.predict(testVector)
print metrics.classification_report(yTest['severe_toxic'], svcSToxicPred)

             precision    recall  f1-score   support

          0       0.99      1.00      0.99     18974
          1       0.00      0.00      0.00       197

avg / total       0.98      0.99      0.98     19171



In [78]:
svcObscene = SVC()
svcObscene.fit(vector, yTrain['obscene'])

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [79]:
svcObscenePred = svcObscene.predict(testVector)
print metrics.classification_report(yTest['obscene'], svcObscenePred)

             precision    recall  f1-score   support

          0       0.95      1.00      0.97     18135
          1       0.00      0.00      0.00      1036

avg / total       0.89      0.95      0.92     19171



In [80]:
svcThreat = SVC()
svcThreat.fit(vector, yTrain['threat'])

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [82]:
svcThreatPred = svcThreat.predict(testVector)
print metrics.classification_report(yTest['threat'], svcThreatPred)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00     19117
          1       0.00      0.00      0.00        54

avg / total       0.99      1.00      1.00     19171



In [83]:
svcInsult = SVC()
svcInsult.fit(vector, yTrain['insult'])

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [85]:
svcInsultPred = svcInsult.predict(testVector)
print metrics.classification_report(yTest['insult'], svcInsultPred)

             precision    recall  f1-score   support

          0       0.95      1.00      0.98     18237
          1       0.00      0.00      0.00       934

avg / total       0.90      0.95      0.93     19171



In [86]:
svcIH = SVC()
svcIH.fit(vector, yTrain['identity_hate'])

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [87]:
svcIHPred = svcIH.predict(testVector)
print metrics.classification_report(yTest['identity_hate'], svcIHPred)

             precision    recall  f1-score   support

          0       0.99      1.00      1.00     18998
          1       0.00      0.00      0.00       173

avg / total       0.98      0.99      0.99     19171



In [65]:
knn = KNeighborsClassifier()
knn.fit(vector,yTrain)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [68]:
knnPred = knn.predict(testVector)
print metrics.classification_report(yTest, knnPred)

             precision    recall  f1-score   support

          0       0.16      0.73      0.26      1841
          1       0.37      0.22      0.27       197
          2       0.09      0.70      0.16      1036
          3       0.38      0.06      0.10        54
          4       0.08      0.64      0.15       934
          5       0.31      0.05      0.09       173

avg / total       0.14      0.64      0.20      4235

