In [21]:
import numpy as np
import pandas as pd
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.externals import joblib

from sklearn.model_selection import cross_val_score, cross_val_predict
from scipy.sparse import hstack
from scipy.special import logit, expit

In [15]:
import sklearn
print(sklearn.__version__)

0.21.1


In [16]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train = pd.read_csv('../jigsaw-toxic-comment-classification-challenge/train.csv').fillna(' ')
test = pd.read_csv('../jigsaw-toxic-comment-classification-challenge/test.csv').fillna(' ')

list_sentences_train = train['comment_text']
list_sentences_test = test['comment_text']
all_text = pd.concat([list_sentences_train, list_sentences_test])

# Clean Text

In [11]:
import re

cl_path = './cleaning/clean_letters.txt'
clean_word_dict = {}
with open(cl_path, 'r', encoding='utf-8') as cl:
    for line in cl:
        line = line.strip('\n')
        typo, correct = line.split(',')
        clean_word_dict[typo] = correct

def clean_word(text):
    replace_numbers = re.compile(r'\d+', re.IGNORECASE)
    special_character_removal = re.compile(r'[^a-z\d ]', re.IGNORECASE)

    text = text.lower()
    # Replace links
    text = re.sub(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", "", text)
    text = re.sub(r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}", "", text)

    for typo, correct in clean_word_dict.items():
        text = re.sub(typo, " " + correct + " ", text)

    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"i’m", "i am", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = replace_numbers.sub('', text)
    return text

train_text = []
test_text = []
for text in list_sentences_train:
    train_text.append(clean_word(text))
    
for text in list_sentences_test:
    test_text.append(clean_word(text))

In [18]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=20000)
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

# char_vectorizer = TfidfVectorizer(
#     sublinear_tf=True,
#     strip_accents='unicode',
#     analyzer='char',
#     ngram_range=(1, 6),
#     max_features=30000)
# char_vectorizer.fit(all_text)

In [19]:
word_vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=20000,
                min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents='unicode',
                sublinear_tf=True, token_pattern='\\w{1,}', tokenizer=None,
                use_idf=True, vocabulary=None)

In [None]:
# train_char_features = char_vectorizer.transform(train_text)
# test_char_features = char_vectorizer.transform(test_text)

# Extra Tree Classifier

In [None]:
joblib

In [20]:
train_features = train_word_features
test_features = test_word_features
losses = []
predictions = {'id': test['id']}
model_dict = dict()
for class_name in class_names:
    train_target = train[class_name]
    classifier = ExtraTreesClassifier(n_estimators=30)
    
    cv_loss = np.mean(cross_val_score(classifier, train_features, train_target, cv=3, scoring='roc_auc'))
    losses.append(cv_loss)
    print('CV score for class {} is {}'.format(class_name, cv_loss))
    
    classifier.fit(train_features, train_target)
    model_dict[class_name] = classifier
    predictions[class_name] = classifier.predict_proba(test_features)[:, 1]

CV score for class toxic is 0.9540573174056127
CV score for class severe_toxic is 0.940742843803584
CV score for class obscene is 0.9755727809885308
CV score for class threat is 0.8772669637141552
CV score for class insult is 0.9584597405987246
CV score for class identity_hate is 0.8851517034144708


In [1]:
model_dict_imported

NameError: name 'model_dict_imported' is not defined

In [3]:
# Load the model from the file 
model_dict_imported = joblib.load('models/models.p') 



In [8]:
!ls -lah


total 3246608
drwxr-xr-x  14 jchow  staff   448B May 21 18:02 [34m.[m[m
drwxr-xr-x   8 jchow  staff   256B May 20 14:38 [34m..[m[m
drwxr-xr-x  13 jchow  staff   416B May 20 11:46 [34m.git[m[m
-rw-r--r--   1 jchow  staff    54B May 18 21:29 .gitignore
drwxr-xr-x   7 jchow  staff   224B May 18 21:16 [34m.ipynb_checkpoints[m[m
-rw-r--r--   1 jchow  staff   432K May 17 10:20 EDA_tokenized.ipynb
-rw-r--r--   1 jchow  staff    57K May 17 17:00 EDA_wiki_dataset.ipynb
-rw-r--r--   1 jchow  staff    19K May 14 13:48 ExporatoryDataAnalysis.ipynb
-rw-r--r--   1 jchow  staff   1.5G Mar  5  2015 GoogleNews-vectors-negative300.bin.gz
-rw-r--r--   1 jchow  staff    28K May 21 18:02 Kaggle_Inspired.ipynb
-rw-r--r--   1 jchow  staff   201B May 20 11:41 README.md
drwxr-xr-x@ 16 jchow  staff   512B May 19 00:35 [34mchat_logs[m[m
drwxr-xr-x   3 jchow  staff    96B May 18 20:55 [34mcleaning[m[m
drwxr-xr-x   5 jchow  staff   160B May 21 18:00 [34mmodels[m[m


In [4]:
# joblib.dump(model_dict_imported,'models/models_compressed.p',compress = 9)

['models/models_compressed.p']

In [253]:
# joblib.dump(word_vectorizer, 'word_vectorizer.p') 

['word_vectorizer.p']

In [225]:

def raw_chat_to_model_input(raw_input_string):
    
    cleaned_text = []
    for text in [raw_input_string]:
        cleaned_text.append(clean_word(text))
    #print(cleaned_text)
    return word_vectorizer.transform(cleaned_text)

    
def predict_toxicity(raw_input_string):
    model_input = raw_chat_to_model_input(raw_input_string)
    results = []
    for key,model in model_dict.items():
        results.append(round(model.predict_proba(model_input)[0,1],4))
    return results

In [252]:
raw_chat_to_model_input(chat_input)

<1x50000 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [164]:
model_dict['toxic'].predict_proba(raw_chat_to_model_input(chat_input))

['fuck you this is bullshit gay queer']


array([[0.06666667, 0.93333333]])

In [189]:
len(output_list[0])

6

In [262]:
model_dict

{'toxic': ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=None,
            oob_score=False, random_state=None, verbose=0, warm_start=False),
 'severe_toxic': ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=None,
            oob_score=False, random_state=None, verbose=0, warm_start=False),
 'obscene': ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_node

In [266]:

chat_input = 'trash is garbage'

output_list = [list(model_dict.keys()),predict_toxicity(chat_input)]
for index in range(len(output_list[0])):
    print(output_list[0][index],output_list[1][index])

toxic 0.7
severe_toxic 0.1333
obscene 0.0703
threat 0.0
insult 0.0685
identity_hate 0.1667


In [118]:
for key in model_dict:
    print(key)

toxic
severe_toxic
obscene
threat
insult
identity_hate


In [221]:
forsen_chat = joblib.load('./chat_logs/forsen_chat.p')

In [231]:
forsen_chat.shape

(14019, 4)

In [232]:
forsen_chat['message'][0:1000].apply(lambda msg : predict_toxicity(msg))

0            [0.0, 0.0, 0.0333, 0.0, 0.0, 0.0]
1         [0.027, 0.0, 0.027, 0.0, 0.027, 0.0]
2         [0.027, 0.0, 0.027, 0.0, 0.027, 0.0]
3         [0.0, 0.0, 0.0333, 0.0, 0.0333, 0.0]
4            [0.0, 0.0, 0.0333, 0.0, 0.0, 0.0]
5         [0.0333, 0.0, 0.0333, 0.0, 0.0, 0.0]
6         [0.0, 0.0, 0.0333, 0.0, 0.0333, 0.0]
7      [0.1333, 0.0, 0.0333, 0.0, 0.0667, 0.0]
8         [0.091, 0.0, 0.091, 0.0, 0.027, 0.0]
9         [0.0, 0.0, 0.0333, 0.0, 0.0333, 0.0]
10     [0.2333, 0.0, 0.0333, 0.0333, 0.0, 0.0]
11           [0.0, 0.0, 0.0333, 0.0, 0.0, 0.0]
12        [0.0, 0.0, 0.0333, 0.0, 0.0333, 0.0]
13        [0.091, 0.0, 0.091, 0.0, 0.027, 0.0]
14       [0.027, 0.0, 0.0919, 0.0, 0.027, 0.0]
15           [0.0, 0.0, 0.0333, 0.0, 0.0, 0.0]
16        [0.0, 0.0, 0.0333, 0.0, 0.0333, 0.0]
17           [0.0, 0.0, 0.0333, 0.0, 0.0, 0.0]
18        [0.0333, 0.0, 0.0333, 0.0, 0.0, 0.0]
19     [0.2333, 0.0, 0.0333, 0.0333, 0.0, 0.0]
20     [0.0333, 0.0, 0.0, 0.0, 0.0333, 0.1333]
21        [0.

In [258]:
list(model_dict.keys() )

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [260]:

pred_probs = predict_toxicity(chat_input)

probs = [{'name': list(model_dict.keys())[index], 'prob': pred_probs[index]}
         for index in np.argsort(pred_probs)[::-1]]
probs

[{'name': 'toxic', 'prob': 0.7},
 {'name': 'identity_hate', 'prob': 0.1667},
 {'name': 'severe_toxic', 'prob': 0.1333},
 {'name': 'obscene', 'prob': 0.0703},
 {'name': 'insult', 'prob': 0.0685},
 {'name': 'threat', 'prob': 0.0}]