In [2]:
%load_ext jupyternotify

The jupyternotify extension is already loaded. To reload it, use:
  %reload_ext jupyternotify


In [2]:
import pandas as pd
import urllib
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import roc_auc_score

In [None]:
# download annotated comments and annotations

ANNOTATED_COMMENTS_URL = 'https://ndownloader.figshare.com/files/7554634' 
ANNOTATIONS_URL = 'https://ndownloader.figshare.com/files/7554637' 


def download_file(url, fname):
    urllib.request.urlretrieve(url, fname)

                
# download_file(ANNOTATED_COMMENTS_URL, 'attack_annotated_comments.tsv')
# download_file(ANNOTATIONS_URL, 'attack_annotations.tsv')

In [3]:
comments = pd.read_csv('attack_annotated_comments.tsv', sep = '\t', index_col = 0)
annotations = pd.read_csv('attack_annotations.tsv', sep = '\t')

In [4]:
len(annotations['rev_id'].unique())

115864

In [5]:
labels = annotations.groupby('rev_id')['attack'].mean() > 0.5

In [6]:
comments['attack'] = labels

### clean the comment text  
The text_clean is a pipeline of work that clean the comments text.
* split the entire comment string into words
* remove token "NEWLINE_TOKEN"
* remove token "TAB_TOKEN"
* convert all words to lowercase
* filter out punctuation
* filter out stop words

In [7]:
# import nltk
# nltk.download()
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re

def text_clean(text):
    tokens = word_tokenize(text)
#     tokens = re.split(r'\W+', text)
    tokens = [w.replace("NEWLINE_TOKEN", " ") for w in tokens]
    tokens = [w.replace("TAB_TOKEN", " ") for w in tokens]
    tokens = [w.lower() for w in tokens]
    table = str.maketrans('','', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    tokens = [w for w in tokens if w.isalpha()]
    stop_words_list = set(stopwords.words('english'))
    tokens = [w for w in tokens if w not in stop_words_list]
    filted_text = ' '.join(tokens)
    return filted_text
comments['comment'] = comments['comment'].apply(text_clean)

In [8]:
comments.query('attack')['comment'].head()

rev_id
801279                                     iraq good usa bad
2702703    little asshole want talk human start showing f...
4632658                                   dick bigger hahaha
6545332    renault sad little bpy driving renault clio va...
6545351    renault sad little bo driving renault clio vaa...
Name: comment, dtype: object

In [9]:
train_comments = comments.query("split=='train'")
test_comments = comments.query("split=='test'")

### N gram Feature

In [10]:
clf = Pipeline([
    ('vect', CountVectorizer(max_features = 10000, ngram_range = (1,2))),
    ('tfidf', TfidfTransformer(norm = 'l2')),
    ('clf', KNeighborsClassifier(n_neighbors=5))
])
clf = clf.fit(train_comments['comment'], train_comments['attack'])

In [11]:
predicted = clf.predict(test_comments['comment'])

### Area Under the Receiver Operating Characteristic Curve
The high the score the better the classifier is

In [13]:
auc = roc_auc_score(test_comments['attack'], clf.predict_proba(test_comments['comment'])[:,1])
print('Test ROC AUC: %.3f' %auc)

Test ROC AUC: 0.675


In [12]:
test_accuracy = clf.score(test_comments['comment'], test_comments['attack'])
print('Accuracy on the testing subset: %.3f' %test_accuracy)

Accuracy on the testing subset: 0.884


In [13]:
print(metrics.classification_report(test_comments['attack'], predicted))

             precision    recall  f1-score   support

      False       0.90      0.97      0.94     20422
       True       0.53      0.24      0.33      2756

avg / total       0.86      0.88      0.86     23178

