<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Testing-to-see-how-well-metadata-alone-predicts-toxicity" data-toc-modified-id="Testing-to-see-how-well-metadata-alone-predicts-toxicity-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Testing to see how well metadata alone predicts toxicity</a></span><ul class="toc-item"><li><span><a href="#Testing-the-performance-of-XGBoost" data-toc-modified-id="Testing-the-performance-of-XGBoost-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Testing the performance of XGBoost</a></span></li><li><span><a href="#testing-using-tfidf-vectorizer" data-toc-modified-id="testing-using-tfidf-vectorizer-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>testing using tfidf vectorizer</a></span></li></ul></li></ul></div>

In [210]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy as sp
import re
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge, ElasticNet, RidgeCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report

In [161]:
df = pd.read_csv('cleaned.csv')

In [162]:
def downsample(df):
    return df.drop(df[df.is_toxic == 0].sample(len(df[df.is_toxic == 0]) - df.is_toxic.sum()).index)

In [163]:
X_train.columns

Index(['comment_text_char_space', 'question', 'exclamation', 'words',
       'avg_word_len', 'caps_percentage'],
      dtype='object')

# Testing to see how well metadata alone predicts toxicity

In [211]:
# building a function to peform train test split and simple regression on different target variables
def tester(x_values, y_value):
    
    df = pd.read_csv('cleaned.csv')
    
    print('Predictors: {}'.format(x_values))
    print("Target: '{}'\n".format(y_value))
    
    X_train, X_test, y_train, y_test = train_test_split(df[x_values], df[y_value])
    df = pd.concat([X_train, y_train], axis=1)    
    downsampled = df.drop(df[df[y_value] == 0].sample(len(df[df[y_value] == 0]) - df[y_value].sum()).index)
    X_train = downsampled.drop(y_train.name, axis=1)
    y_train = downsampled[y_train.name]
    
    ss = StandardScaler()
    X_train = ss.fit_transform(X_train)
    X_test = ss.transform(X_test)
    
#     slr = LinearRegression()
#     slr.fit(X_train, y_train)
#     print('Linear Regression score is {}'.format(slr.score(X_test, y_test)))
 
#     ridge = Ridge()
#     ridge.fit(X_train, y_train)
#     print('Ridge Regression score is {}'.format(ridge.score(X_test, y_test)))
    
    logit = LogisticRegression()
    logit.fit(X_train, y_train)
    print('Logistic Regression score is {}'.format(logit.score(X_test, y_test)))
    
    rfc = RandomForestClassifier()
    rfc.fit(X_train, y_train)
    print('Random Forest Classifier score is {}'.format(rfc.score(X_test, y_test)))
    
    xgboost = xgb.XGBClassifier(max_depth=5, silent=False)
    xgboost.fit(X_train, y_train)
    print('XGBoost score is {}'.format(xgboost.score(X_test, y_test)))

    print('Majority class proportion is {}\n'.format(pd.DataFrame(y_test.values)[0].value_counts()[0] / len(y_test)))
    
    # takes the higher scoring model and prints the confusion matrix using it as a predictor
    print(confusion_matrix(y_test, sorted(zip([logit.score(X_test, y_test), rfc.score(X_test, y_test)], [logit, rfc]), reverse=True)[0][1].predict(X_test)), '\n')
    
    print(classification_report(y_test, sorted(zip([logit.score(X_test, y_test), rfc.score(X_test, y_test)], [logit, rfc]), reverse=True)[0][1].predict(X_test)))


In [212]:
for toxicity in ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate', 'total_toxic', 'is_toxic', 'is_toxic_no_profanity']:
    tester(['question', 'exclamation', 'words', 'avg_word_len', 'caps_percentage'], toxicity)
    print('-'*75)

Predictors: ['question', 'exclamation', 'words', 'avg_word_len', 'caps_percentage']
Target: 'toxic'

Logistic Regression score is 0.769624188332038
Random Forest Classifier score is 0.6795447138165317
XGBoost score is 0.7121869280718028
Majority class proportion is 0.9043548023165442

[[29242  6830]
 [ 2359  1456]] 

             precision    recall  f1-score   support

          0       0.93      0.81      0.86     36072
          1       0.18      0.38      0.24      3815

avg / total       0.85      0.77      0.80     39887

---------------------------------------------------------------------------
Predictors: ['question', 'exclamation', 'words', 'avg_word_len', 'caps_percentage']
Target: 'severe_toxic'

Logistic Regression score is 0.8535111690525735
Random Forest Classifier score is 0.762955348860531
XGBoost score is 0.7821846716975456
Majority class proportion is 0.9902725198686289

[[33871  5628]
 [  215   173]] 

             precision    recall  f1-score   support

          

## testing using tfidf vectorizer

In [221]:
def nlpscorer(nlp, model):
    
    df = pd.read_csv('cleaned.csv')

    X_train, X_test, y_train, y_test = train_test_split(df[['comment_text_char_space', 'question', 'exclamation', 'words',
                                                            'avg_word_len', 'caps_percentage']], df['is_toxic_no_profanity'])
    
    df = pd.concat([X_train, y_train], axis=1)    
    downsampled = df.drop(df[df['is_toxic_no_profanity'] == 0].sample(len(df[df['is_toxic_no_profanity'] == 0]) - df['is_toxic_no_profanity'].sum()).index)
    X_train = downsampled.drop(y_train.name, axis=1)
    y_train = downsampled[y_train.name]
    
    nlp.fit(X_train.comment_text_char_space)
    transformed = nlp.transform(X_train.comment_text_char_space)
    X_train.drop('comment_text_char_space', axis=1).shape
    
    model.fit(transformed, y_train)
    
    print('NLP ONLY')
    print(classification_report(y_test, model.predict(nlp.transform(X_test.comment_text_char_space))))
    print(pd.DataFrame(confusion_matrix(y_test, model.predict(nlp.transform(X_test.comment_text_char_space))), index=['true_0', 'true_1'], columns=['predicted_0', 'predicted_1']), '\n')
    print('NLP only score: ', model.score(nlp.transform(X_test.comment_text_char_space), y_test), '\n')
    
    X_train_combined = sp.sparse.hstack([transformed, sp.sparse.csr_matrix(X_train.drop('comment_text_char_space', axis=1))])
    X_test_combined = sp.sparse.hstack([nlp.transform(X_test.comment_text_char_space), sp.sparse.csr_matrix(X_test.drop('comment_text_char_space', axis=1))])
    
    model.fit(X_train_combined, y_train)
    
    print('COMBINED')
    print(classification_report(y_test, model.predict(X_test_combined)))
    print(pd.DataFrame(confusion_matrix(y_test, model.predict(X_test_combined)), index=['true_0', 'true_1'], columns=['predicted_0', 'predicted_1']), '\n')
    print('Combined score: ', model.score(X_test_combined, y_test))

In [237]:
tfidf = TfidfVectorizer(ngram_range=(2,5), max_df=0.7, max_features=10000)
cv = CountVectorizer(ngram_range=(2,5), max_df=0.7, max_features=10000)

In [238]:
xgboost = xgb.XGBClassifier(max_depth=5, silent=False)
logit = LogisticRegression()
rfc = RandomForestClassifier()
nb = BernoulliNB()
svc = SVC()

In [241]:
nlpscorer(tfidf, logit)

NLP ONLY
             precision    recall  f1-score   support

          0       0.97      0.83      0.90     35937
          1       0.34      0.77      0.47      3950

avg / total       0.91      0.83      0.85     39887

        predicted_0  predicted_1
true_0        29977         5960
true_1          909         3041 

NLP only score:  0.8277885025196179 

COMBINED
             precision    recall  f1-score   support

          0       0.97      0.86      0.91     35937
          1       0.37      0.72      0.49      3950

avg / total       0.91      0.85      0.87     39887

        predicted_0  predicted_1
true_0        30995         4942
true_1         1099         2851 

Combined score:  0.8485471456865645


In [242]:
nlpscorer(tfidf, rfc)

NLP ONLY
             precision    recall  f1-score   support

          0       0.95      0.89      0.92     35903
          1       0.37      0.58      0.45      3984

avg / total       0.89      0.86      0.87     39887

        predicted_0  predicted_1
true_0        31897         4006
true_1         1678         2306 

NLP only score:  0.8574974302404292 

COMBINED
             precision    recall  f1-score   support

          0       0.96      0.78      0.86     35903
          1       0.26      0.73      0.39      3984

avg / total       0.89      0.77      0.81     39887

        predicted_0  predicted_1
true_0        27835         8068
true_1         1079         2905 

Combined score:  0.7706771629854339


In [243]:
nlpscorer(tfidf, xgboost)

NLP ONLY
             precision    recall  f1-score   support

          0       0.97      0.62      0.76     35919
          1       0.20      0.84      0.32      3968

avg / total       0.89      0.64      0.71     39887

        predicted_0  predicted_1
true_0        22323        13596
true_1          647         3321 

NLP only score:  0.6429162383734048 

COMBINED
             precision    recall  f1-score   support

          0       0.96      0.81      0.88     35919
          1       0.29      0.72      0.41      3968

avg / total       0.90      0.80      0.83     39887

        predicted_0  predicted_1
true_0        28934         6985
true_1         1123         2845 

Combined score:  0.7967257502444406


In [244]:
nlpscorer(cv, logit)

NLP ONLY
             precision    recall  f1-score   support

          0       0.98      0.79      0.87     35899
          1       0.31      0.83      0.45      3988

avg / total       0.91      0.79      0.83     39887

        predicted_0  predicted_1
true_0        28397         7502
true_1          676         3312 

NLP only score:  0.7949707924887808 

COMBINED
             precision    recall  f1-score   support

          0       0.97      0.80      0.88     35899
          1       0.31      0.81      0.45      3988

avg / total       0.91      0.80      0.83     39887

        predicted_0  predicted_1
true_0        28656         7243
true_1          745         3243 

Combined score:  0.7997342492541429


In [245]:
nlpscorer(cv, rfc)

NLP ONLY
             precision    recall  f1-score   support

          0       0.95      0.87      0.91     35895
          1       0.35      0.61      0.44      3992

avg / total       0.89      0.85      0.86     39887

        predicted_0  predicted_1
true_0        31335         4560
true_1         1557         2435 

NLP only score:  0.8466417629804197 

COMBINED
             precision    recall  f1-score   support

          0       0.97      0.75      0.84     35895
          1       0.25      0.76      0.38      3992

avg / total       0.89      0.75      0.80     39887

        predicted_0  predicted_1
true_0        26884         9011
true_1          966         3026 

Combined score:  0.7498683781683255


In [246]:
nlpscorer(cv, xgboost)

NLP ONLY
             precision    recall  f1-score   support

          0       0.97      0.62      0.76     35907
          1       0.19      0.82      0.31      3980

avg / total       0.89      0.64      0.71     39887

        predicted_0  predicted_1
true_0        22253        13654
true_1          697         3283 

NLP only score:  0.6402085892646727 

COMBINED
             precision    recall  f1-score   support

          0       0.96      0.81      0.88     35907
          1       0.29      0.72      0.42      3980

avg / total       0.90      0.80      0.83     39887

        predicted_0  predicted_1
true_0        28912         6995
true_1         1101         2879 

Combined score:  0.7970266001454108


In [247]:
nlpscorer(cv, nb)

NLP ONLY
             precision    recall  f1-score   support

          0       0.99      0.13      0.23     35823
          1       0.11      0.99      0.21      4064

avg / total       0.90      0.22      0.23     39887

        predicted_0  predicted_1
true_0         4602        31221
true_1           31         4033 

NLP only score:  0.2164865745731692 

COMBINED
             precision    recall  f1-score   support

          0       0.99      0.13      0.23     35823
          1       0.11      0.99      0.21      4064

avg / total       0.90      0.22      0.23     39887

        predicted_0  predicted_1
true_0         4611        31212
true_1           30         4034 

Combined score:  0.21673728282397775


In [248]:
nlpscorer(tfidf, nb)

NLP ONLY
             precision    recall  f1-score   support

          0       0.99      0.13      0.22     35923
          1       0.11      0.99      0.20      3964

avg / total       0.90      0.21      0.22     39887

        predicted_0  predicted_1
true_0         4536        31387
true_1           36         3928 

NLP only score:  0.21219946348434326 

COMBINED
             precision    recall  f1-score   support

          0       0.99      0.13      0.22     35923
          1       0.11      0.99      0.20      3964

avg / total       0.90      0.21      0.22     39887

        predicted_0  predicted_1
true_0         4541        31382
true_1           36         3928 

Combined score:  0.21232481760974753


In [251]:
nlpscorer(tfidf, svc)

NLP ONLY
             precision    recall  f1-score   support

          0       0.91      1.00      0.95     35934
          1       0.87      0.15      0.25      3953

avg / total       0.91      0.91      0.88     39887

        predicted_0  predicted_1
true_0        35847           87
true_1         3378          575 

NLP only score:  0.9131295910948429 

COMBINED
             precision    recall  f1-score   support

          0       0.94      0.18      0.31     35934
          1       0.11      0.90      0.19      3953

avg / total       0.86      0.25      0.29     39887

        predicted_0  predicted_1
true_0         6542        29392
true_1          393         3560 

Combined score:  0.25326547496678115
