In [3]:
%load_ext jupyternotify

<IPython.core.display.Javascript object>

In [4]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [None]:
# download annotated comments and annotations

ANNOTATED_COMMENTS_URL = 'https://ndownloader.figshare.com/files/7554634' 
ANNOTATIONS_URL = 'https://ndownloader.figshare.com/files/7554637' 


def download_file(url, fname):
    urllib.request.urlretrieve(url, fname)

                
# download_file(ANNOTATED_COMMENTS_URL, 'attack_annotated_comments.tsv')
# download_file(ANNOTATIONS_URL, 'attack_annotations.tsv')

In [5]:
comments = pd.read_csv('attack_annotated_comments.tsv', sep = '\t', index_col = 0)
annotations = pd.read_csv('attack_annotations.tsv', sep = '\t')

In [6]:
len(annotations['rev_id'].unique())

115864

In [7]:
labels = annotations.groupby('rev_id')['attack'].mean() > 0.5

In [8]:
comments['attack'] = labels

### clean the comment text  
The text_clean is a pipeline of work that clean the comments text.
* split the entire comment string into words
* remove token "NEWLINE_TOKEN"
* remove token "TAB_TOKEN"
* convert all words to lowercase
* filter out punctuation
* filter out stop words

In [9]:
# import nltk
# nltk.download()
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re

def text_clean(text):
    tokens = word_tokenize(text)
#     tokens = re.split(r'\W+', text)
    tokens = [w.replace("NEWLINE_TOKEN", " ") for w in tokens]
    tokens = [w.replace("TAB_TOKEN", " ") for w in tokens]
    tokens = [w.lower() for w in tokens]
    table = str.maketrans('','', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    tokens = [w for w in tokens if w.isalpha()]
#     stop_words_list = set(stopwords.words('english'))
#     tokens = [w for w in tokens if w not in stop_words_list]
    filted_text = ' '.join(tokens)
    return filted_text
comments['comment'] = comments['comment'].apply(text_clean)

In [10]:
comments.query('attack')['comment'].head()

rev_id
801279                           iraq is not good usa is bad
2702703    off you little asshole if you want to talk to ...
4632658           i have a dick its bigger than yours hahaha
6545332    renault sad little bpy for driving a renault c...
6545351    renault sad little bo for driving a renault cl...
Name: comment, dtype: object

### N gram Feature

In [27]:
train_comments = comments.query("split== 'train'")
test_comments = comments.query("split== 'test'")

clf = Pipeline([
    ('vect', CountVectorizer(max_features = 10000, ngram_range = (1,2))),
    ('tfidf', TfidfTransformer(norm = 'l2')),
    ('clf', RandomForestClassifier(
                                    n_estimators=500,
#                                    max_features='sqrt', 
#                                    min_samples_leaf=15, 
#                                    min_samples_split=5, 
#                                    max_depth= 50, 
#                                    class_weight='balanced_subsample',
                                   random_state=0)
                                   ),
])

In [28]:
clf = clf.fit(train_comments['comment'], train_comments['attack'])

### Area Under the Receiver Operating Characteristic Curve
The high the score the better the classifier is

In [29]:
auc = roc_auc_score(test_comments['attack'], clf.predict_proba(test_comments['comment'])[:,1])
print('Test ROC AUC: %.3f' %auc)

Test ROC AUC: 0.946


### Training Accuracy and Testing Accuracy 

In [30]:
train_accuracy = clf.score(train_comments['comment'], train_comments['attack'])
print('Accuracy on the training subset: %.3f' %train_accuracy)
test_accuracy = clf.score(test_comments['comment'], test_comments['attack'])
print('Accuracy on the testing subset: %.3f' %test_accuracy)

Accuracy on the training subset: 0.999
Accuracy on the testing subset: 0.935


### Consufion Matrix

In [31]:
predicted = clf.predict(test_comments['comment'])

In [32]:
%%notify -m "Yay!!!!!!!!!!!!!!"

from sklearn import metrics
# print(test_comments['attack'])
# print(clf.predict_proba(test_comments['comment']))
print(metrics.classification_report(test_comments['attack'], predicted))

             precision    recall  f1-score   support

      False       0.94      0.99      0.96     20422
       True       0.91      0.50      0.64      2756

avg / total       0.93      0.93      0.93     23178



<IPython.core.display.Javascript object>

### Hyper-parameter Tuning

In [218]:
text_clf = Pipeline([
    ('vect', CountVectorizer(max_features = 10000)),
    ('tfidf', TfidfTransformer(norm = 'l2')),
    ('clf', RandomForestClassifier(n_jobs=-1)),
])

In [221]:
param_grid = { 
           "clf__n_estimators" : [300, 500, 700],
           "clf__max_depth" : [100, 200, 300],
           "clf__min_samples_leaf" : [20, 50, 80]}

from pprint import pprint
pprint(param_grid)

{'clf__max_depth': [100, 200, 300],
 'clf__min_samples_leaf': [20, 50, 80],
 'clf__n_estimators': [300, 500, 700]}


In [222]:
from sklearn.model_selection import GridSearchCV

gs_clf = GridSearchCV(estimator=text_clf, param_grid=param_grid, n_jobs=-1)
gs_clf.fit(train_comments['comment'], train_comments['attack'])

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        stri..._jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'clf__n_estimators': [300, 500, 700], 'clf__max_depth': [100, 200, 300], 'clf__min_samples_leaf': [20, 50, 80]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [1]:
print(gs_clf.best_score_)
for param_name in sorted(param_grid.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

NameError: name 'gs_clf' is not defined

# Summary

## Text cleaning methods that I applyed
* split the comment string into words
* remove token "NEWLINE_TOKEN"
* remove token "TAB_TOKEN"
* convert all words to lowercase
* filter out punctuation
* filter out stop words 

## Feature Extraction
* N gram feature
     * Character level
     * Word level
* Embedding derived feature
    * Word2vec
    * Dor2vec
* Syntactic feature

I applyied N-gram feature to all three models

## Models
* KNN 
    
      precision    recall  f1-score   support
      False       0.90      0.97      0.94     20422
      True       0.53      0.24      0.33      2756
      avg / total       0.86      0.88      0.86     23178
* Multinomial Naive Bayes
    
       precision    recall  f1-score   support
       False       0.94      0.99      0.96     20422
       True       0.84      0.55      0.67      2756
       avg / total   0.93      0.93      0.93     23178

* Random Forest
        
      precision    recall  f1-score   support
      False       0.95      0.98      0.97     20422
       True       0.83      0.61      0.70      2756
       avg / total       0.94      0.94      0.93     23178
       
Best model: Random Forest

## Hyper-parameter Tuning

### Multinomial Naive Bayes
    
    parameters = {'vect__ngram_range':[(1,2), (1,3)],
              'tfidf__use_idf':(True, False),
              'clf__alpha': (1e-2, 1e-3),
            }

### Random Forest


    param_grid = { 
           "clf__n_estimators" : [300, 500, 700],
           "clf__max_depth" : [1, 15, 30],
           "clf__min_samples_leaf" : [1, 6, 12]}
           
Somehow, I did not successfully calculate the result due to lack of GPU running time on Google.

## Metrics
* I learn that Random forest has high F1 score comparing to that of Naive Bayes.
* Random forest did a better job on find comments that contains personal attack.
* I did not get to compute the result of cross-validation.

## Result
* The Random Forest model gives the best result, however it has lower auc_roc score than the logistic regression model.
* Random Forest gives almost the same accuracy as Logistic Regression does on testing.

## Interesting thing about this project
* learned about different between parametric model and non-parametric model
* Introduction to sklearn which is a very powerful tool.

## hardest thing
* Tunning Hyper-parater is the hartest part, because it requires you to understand your dataset and model before choosing the range of the parameters, otherwise you will have to wait for a long time for the result of best parameters.


