In [36]:
%load_ext jupyternotify

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd
import urllib
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import roc_auc_score

In [None]:
# Download links (figshare) for the two data files this notebook reads:
# the annotated comments and the annotations given to them.

ANNOTATED_COMMENTS_URL = 'https://ndownloader.figshare.com/files/7554634' 
ANNOTATIONS_URL = 'https://ndownloader.figshare.com/files/7554637' 


def download_file(url, fname):
    """Download the resource at ``url`` and save it to the local path ``fname``.

    Parameters
    ----------
    url : str
        Address of the resource to fetch.
    fname : str
        Path of the local file to write; an existing file is overwritten.

    Returns
    -------
    None
    """
    # A bare `import urllib` does not load the `urllib.request` submodule, so
    # import it explicitly instead of relying on some other package having
    # imported it as a side effect.
    import urllib.request

    urllib.request.urlretrieve(url, fname)

                
# Uncomment on the first run to fetch the two .tsv files read by the next cell:
# download_file(ANNOTATED_COMMENTS_URL, 'attack_annotated_comments.tsv')
# download_file(ANNOTATIONS_URL, 'attack_annotations.tsv')

In [3]:
# Both files are tab-separated. The comments table is indexed by its first
# column so that per-revision labels can later be aligned on that index.
comments = pd.read_csv(
    'attack_annotated_comments.tsv',
    sep='\t',
    index_col=0,
)
annotations = pd.read_csv('attack_annotations.tsv', sep='\t')

In [4]:
# Count the distinct rev_id values present in the annotations table.
len(annotations['rev_id'].unique())

115864

In [5]:
# Label a comment as an attack if the majority of its annotators did so:
# average the `attack` values per rev_id and keep those strictly above 0.5.
labels = annotations.groupby('rev_id')['attack'].mean().gt(0.5)

In [6]:
# Attach the majority-vote label to each comment. Assignment aligns on the
# index: `comments` was loaded with its first column as index and `labels`
# is indexed by rev_id from the groupby.
comments['attack'] = labels

In [7]:
# Replace the dataset's placeholder tokens for newlines and tabs with spaces.
# The vectorised `.str` accessor is used instead of a per-row lambda: it
# propagates missing values instead of raising on them, and `regex=False`
# makes the tokens match literally.
comments['comment'] = (
    comments['comment']
    .str.replace("NEWLINE_TOKEN", " ", regex=False)
    .str.replace("TAB_TOKEN", " ", regex=False)
)

In [8]:
# Preview the first few comments labelled as attacks.
comments.loc[comments['attack'], 'comment'].head()

rev_id
801279             Iraq is not good  ===  ===  USA is bad   
2702703      ____ fuck off you little asshole. If you wan...
4632658         i have a dick, its bigger than yours! hahaha
6545332      == renault ==  you sad little bpy for drivin...
6545351      == renault ==  you sad little bo for driving...
Name: comment, dtype: object

In [46]:
# Inspect the tunable parameters of the estimators used in the pipelines below.
# NOTE(review): the output stored with this cell lists CountVectorizer-style
# parameters (ngram_range, token_pattern, ...), so it appears to come from the
# commented-out `cv` lines rather than from `mnb.get_params()` -- re-run to refresh.
mnb = MultinomialNB()
mnb.get_params()
# cv = CountVectorizer()
# cv.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'preprocessor': None,
 'stop_words': None,
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

In [42]:
# Partition the data using the dataset's own `split` column.
# NOTE(review): this cell prints nothing; the "Test ROC AUC" text stored
# beneath it is stale output from another cell.
train_comments = comments[comments['split'] == 'train']
test_comments = comments[comments['split'] == 'test']

Test ROC AUC: 0.936


### N-gram features

In [47]:
# Unigram + bigram counts (vocabulary capped at 10,000 terms) -> TF-IDF
# weighting -> multinomial naive Bayes, fitted on the training split.
# `fit` returns the fitted pipeline itself, so construction and fitting
# are chained into one expression.
clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 2), max_features=10000)),
    ('tfidf', TfidfTransformer(norm='l2')),
    ('clf', MultinomialNB()),
]).fit(train_comments['comment'], train_comments['attack'])

In [48]:
# ROC AUC is computed from a score rather than a hard label: take column 1
# of predict_proba (presumably the probability of the attack / True class).
attack_proba = clf.predict_proba(test_comments['comment'])[:, 1]
auc = roc_auc_score(test_comments['attack'], attack_proba)
print('Test ROC AUC: %.3f' % auc)

Test ROC AUC: 0.936


In [49]:
# Score the fitted pipeline on both splits to compare training and
# testing accuracy side by side.
train_accuracy = clf.score(train_comments['comment'], train_comments['attack'])
test_accuracy = clf.score(test_comments['comment'], test_comments['attack'])

print('Accuracy on the training subset: %.3f' % train_accuracy)
print('Accuracy on the testing subset: %.3f' % test_accuracy)

Accuracy on the training subset: 0.938
Accuracy on the testing subset: 0.934


In [50]:
# Hard class predictions for the test split, reused by the report and
# confusion-matrix cells that follow.
predicted = clf.predict(test_comments['comment'])

In [51]:
# Per-class precision, recall and F1 on the test split.
print(metrics.classification_report(test_comments['attack'], predicted))

             precision    recall  f1-score   support

      False       0.94      0.99      0.96     20422
       True       0.88      0.51      0.65      2756

avg / total       0.93      0.93      0.93     23178



In [52]:
# Confusion matrix on the test split (scikit-learn convention: rows are the
# true labels, columns are the predicted labels).
metrics.confusion_matrix(test_comments['attack'], predicted)

array([[20235,   187],
       [ 1345,  1411]])

### Hyper-parameter Tuning 

In [53]:
# Unfitted template pipeline for the grid search below. The step names
# ('vect', 'tfidf', 'clf') are the prefixes used in the parameter grid.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer(max_features=10000)),
    ('tfidf', TfidfTransformer(norm='l2')),
    ('clf', MultinomialNB()),
])

In [54]:
from sklearn.model_selection import GridSearchCV

# Candidate values per pipeline step, addressed as `<step name>__<parameter>`.
parameters = {
    'vect__ngram_range': [(1, 2), (1, 3)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

# Exhaustive search over the grid, fitted on the training split; n_jobs=-1
# asks for all available cores.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# pipeline's default score (presumably accuracy), while the rest of the
# notebook reports ROC AUC -- confirm this is intended.
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1).fit(
    train_comments['comment'], train_comments['attack']
)

In [55]:
# Report the best cross-validated score, then the winning value of every
# searched parameter in alphabetical order.
print(gs_clf.best_score_)
for name in sorted(parameters):
    print("%s: %r" % (name, gs_clf.best_params_[name]))

0.934700687513
clf__alpha: 0.01
tfidf__use_idf: True
vect__ngram_range: (1, 2)


### Using best parameters

In [56]:
# Rebuild the pipeline with the values selected by the grid search
# (ngram_range=(1, 2), alpha=0.01, as recorded in its output) and fit it
# on the training split. `fit` returns the fitted pipeline itself.
clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 2), max_features=10000)),
    ('tfidf', TfidfTransformer(norm='l2')),
    ('clf', MultinomialNB(alpha=0.01)),
]).fit(train_comments['comment'], train_comments['attack'])

In [57]:
# Test-set ROC AUC of the tuned pipeline, scored on column 1 of
# predict_proba (presumably the probability of the attack / True class).
attack_proba = clf.predict_proba(test_comments['comment'])[:, 1]
auc = roc_auc_score(test_comments['attack'], attack_proba)
print('Test ROC AUC: %.3f' % auc)

Test ROC AUC: 0.931


In [58]:
# Accuracy of the tuned pipeline on the training and testing splits.
train_accuracy = clf.score(train_comments['comment'], train_comments['attack'])
test_accuracy = clf.score(test_comments['comment'], test_comments['attack'])

print('Accuracy on the training subset: %.3f' % train_accuracy)
print('Accuracy on the testing subset: %.3f' % test_accuracy)

Accuracy on the training subset: 0.941
Accuracy on the testing subset: 0.935


In [59]:
# Hard predictions of the tuned pipeline on the test split, summarised as
# per-class precision / recall / F1.
predicted = clf.predict(test_comments['comment'])
report = metrics.classification_report(test_comments['attack'], predicted)
print(report)

             precision    recall  f1-score   support

      False       0.94      0.99      0.96     20422
       True       0.84      0.55      0.67      2756

avg / total       0.93      0.93      0.93     23178



In [34]:
# Sanity check on a friendly comment: the recorded prediction is False (not an attack).
# NOTE(review): this cell's execution count (34) is lower than that of the cells
# above it, so the stored output may come from an earlier `clf` -- re-run to confirm.
clf.predict(['Thanks for you contribution, you did a great job!'])

array([False], dtype=bool)

In [35]:
# Sanity check on a nasty comment that should be flagged as an attack.
# NOTE(review): the recorded output is False, i.e. the model does NOT flag it --
# this is a misclassification, not a correct classification.
clf.predict(['People as stupid as you should not edit Wikipedia!'])

array([False], dtype=bool)

In [36]:
# Sanity check on a profane comment; the recorded prediction is True (attack).
clf.predict(['What the fuck?!'])

array([ True], dtype=bool)