# Testing Logistic Regression and ExtraTreesClassifier

In [3]:
# Standard data imports
import numpy as np
import pandas as pd
import pickle
from copy import deepcopy

# Model imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier

# Model compression
from sklearn.externals import joblib

# Cross validation
from sklearn.model_selection import cross_val_score, cross_val_predict
# from scipy.sparse import hstack





In [15]:
import sklearn
print(sklearn.__version__)

0.21.1


In [16]:
# The six binary labels of the challenge; one model is trained per label.
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Read both CSV files and replace missing cells with a single space so that
# every value handed to the text pipeline is a string.
train = pd.read_csv('../jigsaw-toxic-comment-classification-challenge/train.csv')
train = train.fillna(' ')
test = pd.read_csv('../jigsaw-toxic-comment-classification-challenge/test.csv')
test = test.fillna(' ')

# Raw comment columns, plus their concatenation (train rows first, then test rows).
list_sentences_train, list_sentences_test = train['comment_text'], test['comment_text']
all_text = pd.concat([list_sentences_train, list_sentences_test])

# Clean Text

In [9]:
import re

# Build the typo -> correction table consumed by clean_word().
# Each non-blank line of the file is expected to hold exactly one comma:
# "<typo>,<correction>". A malformed line still raises ValueError on unpacking.
cl_path = './cleaning/clean_letters.txt'
clean_word_dict = {}
with open(cl_path, 'r', encoding='utf-8') as cl:
    for line in cl:
        line = line.strip('\n')
        if not line.strip():
            # A blank (or whitespace-only) line carries no mapping; without this
            # guard it would crash the two-value unpack below.
            continue
        typo, correct = line.split(',')
        clean_word_dict[typo] = correct

# All fixed patterns are compiled once here instead of on every clean_word() call.

# Removed outright, before any other rewriting: http(s) links, then IPv4 addresses.
_LINK_AND_IP_PATTERNS = (
    re.compile(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"),
    re.compile(r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}"),
)

# Ordered (pattern, replacement) rules. Order matters: e.g. "what's" must be
# expanded before the generic "'s" rule, and "can't" before the generic "n't".
_NORMALIZATION_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"what's", "what is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"i’m", "i am"),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r",", " "),
        (r"\.", " "),
        (r"!", " ! "),
        (r"\/", " "),
        (r"\^", " ^ "),
        (r"\+", " + "),
        (r"\-", " - "),
        (r"\=", " = "),
        (r"'", " "),
        (r"(\d+)(k)", r"\g<1>000"),
        (r":", " : "),
        (r" e g ", " eg "),
        (r" b g ", " bg "),
        (r" u s ", " american "),
        # NOTE(review): in a regex "\0" is the NUL character, so this rule only
        # matches NUL followed by "s" -- possibly meant to be a literal "0s". Confirm.
        (r"\0s", "0"),
        # NOTE(review): the replacement drops the surrounding spaces, which
        # glues the neighbouring words together. Confirm this is intended.
        (r" 9 11 ", "911"),
        (r"e - mail", "email"),
        (r"j k", "jk"),
        (r"\s{2,}", " "),
    )
)

# Runs of digits are deleted as the very last step.
_DIGIT_RUN = re.compile(r'\d+')


def clean_word(text):
    '''Normalize one raw comment string before vectorization.

    Steps, in order:
      1. lowercase the text;
      2. delete http(s) links and IPv4 addresses;
      3. apply every entry of the module-level ``clean_word_dict``: the key is
         used as a regular-expression pattern and each match is replaced by the
         value padded with one space on both sides;
      4. expand common English contractions and pad or drop selected punctuation;
      5. collapse runs of whitespace into a single space;
      6. delete every run of digits.

    Punctuation not covered by an explicit rule is left untouched.

    Args:
        text: the raw comment (str).

    Returns:
        The cleaned, lower-cased string.
    '''
    text = text.lower()

    # Replace links
    for pattern in _LINK_AND_IP_PATTERNS:
        text = pattern.sub("", text)

    # Replace common typos
    # NOTE(review): keys are interpreted as regex patterns, so a key containing
    # regex metacharacters matches more than its literal text -- confirm the
    # table is written with that in mind.
    for typo, correct in clean_word_dict.items():
        text = re.sub(typo, " " + correct + " ", text)

    for pattern, replacement in _NORMALIZATION_RULES:
        text = pattern.sub(replacement, text)

    return _DIGIT_RUN.sub('', text)

# Clean every comment with clean_word(), keeping the original row order.
train_text = [clean_word(comment) for comment in list_sentences_train]
test_text = [clean_word(comment) for comment in list_sentences_test]

NameError: name 'list_sentences_train' is not defined

In [18]:
word_vectorizer = TfidfVectorizer(
    # Sublinear tf uses 1+log(tf) instead of tf to treat duplicate occurrences of words in a less than linear
    # manner
    sublinear_tf=True,
    strip_accents='unicode',  # unicode is more comprehensive for removing accents
    analyzer='word',  # word ngrams
    token_pattern=r'\w{1,}',  # Matches any word that is 1 or more characters
    ngram_range=(1, 1),  # Only look at single word n-gram
    max_features=20000,  # Maximum vocabulary size
)  # the closing parenthesis used to sit inside the comment above, which made the cell a SyntaxError

# NOTE(review): the vocabulary and IDF weights are learned from the raw,
# uncleaned `all_text`, while the matrices below are built from the cleaned
# `train_text` / `test_text`. Confirm this mismatch is intended.
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

# char_vectorizer = TfidfVectorizer(
#     sublinear_tf=True,
#     strip_accents='unicode',
#     analyzer='char',
#     ngram_range=(1, 6),
#     max_features=30000)
# char_vectorizer.fit(all_text)

In [19]:
word_vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=20000,
                min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents='unicode',
                sublinear_tf=True, token_pattern='\\w{1,}', tokenizer=None,
                use_idf=True, vocabulary=None)

In [None]:
# train_char_features = char_vectorizer.transform(train_text)
# test_char_features = char_vectorizer.transform(test_text)

# Logistic Regression
Logistic regression using the Stochastic Average Gradient (SAG) solver. This solver keeps a memory of previous gradient values to achieve a faster convergence rate than standard stochastic gradient descent. However, it only supports L2 regularization.

https://arxiv.org/abs/1309.2388

In [22]:
# Feature matrices used by every per-label model. They are bound here so this
# cell no longer relies on the Extra Trees cell having been executed first.
train_features = train_word_features
test_features = test_word_features

losses = []  # mean 3-fold ROC AUC per label (a score, despite the name)
log_predictions = {'id': test['id']}  # test ids plus one probability column per label
log_models = {}  # label -> fitted LogisticRegression
for class_name in class_names:
    train_target = train[class_name]
    log_classifier = LogisticRegression(solver='sag')
    log_classifier.fit(train_features, train_target)

    # Accuracy on the data the model was just fitted on (not a held-out estimate)
    print('Accuracy of logistic regression classifier on {} set: {:.5f}'.format(class_name,log_classifier.score(train_features, train_target)))

    # Score using ROC_AUC metric
    cv_loss = np.mean(cross_val_score(log_classifier, train_features, train_target, cv=3, scoring='roc_auc'))
    losses.append(cv_loss)
    print('CV score for class {} is {}'.format(class_name, cv_loss))

    log_models[class_name] = log_classifier
    # Column 1 of predict_proba is the probability of the positive class
    log_predictions[class_name] = log_classifier.predict_proba(test_features)[:, 1]



NameError: name 'test' is not defined

In [None]:
# Persist the trained artifacts to disk.
# The dictionary of per-label logistic regression models is written with joblib.
joblib.dump(log_models, 'Logistic_Regression_models.p')

# Store the vectorizer exactly as it was fitted earlier: calling .fit() again
# here would only repeat that work. The context manager guarantees the file
# handle is flushed and closed.
with open("log_word_vectorizer.p", "wb") as vectorizer_file:
    pickle.dump(word_vectorizer, vectorizer_file)
  
# Load the model from the file 
# pickled_models = joblib.load('models.p')

# Extra Tree Classifier

From scikit-learn: Extra Trees is a variant of the random forest. As in a random forest, a random subset of candidate features is considered at each split, but instead of searching for the most discriminative threshold, a threshold is drawn at random for each candidate feature and the best of these randomly generated thresholds is picked as the splitting rule. This usually reduces the variance of the model a bit more, at the expense of a slightly greater increase in bias.

In [20]:
# The word-level TF-IDF matrices are the only features fed to these models.
train_features, test_features = train_word_features, test_word_features

losses = []
model_dict = {}
predictions = {'id': test['id']}
for class_name in class_names:
    train_target = train[class_name]
    classifier = ExtraTreesClassifier(n_estimators=30)

    # Cross-validate first (3 folds, ROC AUC) ...
    fold_scores = cross_val_score(classifier, train_features, train_target, cv=3, scoring='roc_auc')
    cv_loss = np.mean(fold_scores)
    losses.append(cv_loss)
    print('CV score for class {} is {}'.format(class_name, cv_loss))

    # ... then fit on the full training set and keep that model and its
    # positive-class probabilities for the test set.
    classifier.fit(train_features, train_target)
    model_dict[class_name] = classifier
    predictions[class_name] = classifier.predict_proba(test_features)[:, 1]

CV score for class toxic is 0.9540573174056127
CV score for class severe_toxic is 0.940742843803584
CV score for class obscene is 0.9755727809885308
CV score for class threat is 0.8772669637141552
CV score for class insult is 0.9584597405987246
CV score for class identity_hate is 0.8851517034144708


In [4]:
# Restore the saved vectorizer and the dictionary of per-label models from disk.
# From here on, `model_dict` refers to the loaded models.
word_vectorizer = joblib.load('models/log_word_vectorizer.p')
model_dict = model_dict_imported = joblib.load('models/log_models.p')



In [4]:
# joblib.dump(model_dict_imported,'models/models_compressed.p',compress = 9)

['models/models_compressed.p']

In [253]:
# joblib.dump(word_vectorizer, 'word_vectorizer.p') 

['word_vectorizer.p']

In [10]:

def raw_chat_to_model_input(raw_input_string):
    '''Clean one chat message and vectorize it.

    The message goes through clean_word() and is then transformed by the
    module-level `word_vectorizer` as a one-document batch.
    '''
    return word_vectorizer.transform([clean_word(raw_input_string)])

    
def predict_toxicity(raw_input_string):
    '''Score one chat message with every model in `model_dict`.

    Returns a list with one entry per model, in `model_dict` iteration order:
    the value in row 0, column 1 of that model's predict_proba output,
    rounded to 4 decimals.
    '''
    model_input = raw_chat_to_model_input(raw_input_string)
    return [
        round(model.predict_proba(model_input)[0, 1], 4)
        for model in model_dict.values()
    ]

In [11]:
# Sample message for the sanity checks; bound here so the cell also runs on a
# fresh kernel (it was previously first assigned only in a later cell).
chat_input = 'trash is garbage'
raw_chat_to_model_input(chat_input)

<1x20000 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [12]:
model_dict['toxic'].predict_proba(raw_chat_to_model_input(chat_input))

array([[0.10914499, 0.89085501]])

In [14]:
model_dict

{'toxic': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='warn', n_jobs=None, penalty='l2',
                    random_state=None, solver='sag', tol=0.0001, verbose=0,
                    warm_start=False),
 'severe_toxic': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='warn', n_jobs=None, penalty='l2',
                    random_state=None, solver='sag', tol=0.0001, verbose=0,
                    warm_start=False),
 'obscene': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='warn', n_jobs=None, penalty='l2',
                    random_state=None, solver='sag', tol=0.0001, verbose=0,
             

# Example Output

In [15]:
chat_input = 'trash is garbage'

# Pair each label name with its predicted probability and print one pair per line.
output_list = [list(model_dict.keys()),predict_toxicity(chat_input)]
for label, probability in zip(output_list[0], output_list[1]):
    print(label, probability)

toxic 0.8909
severe_toxic 0.0109
obscene 0.061
threat 0.0013
insult 0.0774
identity_hate 0.0191


# Testing model on actual chat from streamer 'Forsen'

In [17]:
forsen_chat = joblib.load('./chat_logs/forsen_chat.p')

In [18]:
forsen_chat.shape

(14019, 4)

Testing classification speed

In [20]:
%%timeit
forsen_chat['message'].apply(predict_toxicity)

13.2 s ± 291 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


13.2 s for ~14,000 messages ≈ 1,060 messages classified per second using Logistic Regression.

In [16]:
# Rank the labels for `chat_input` from most to least probable.
pred_probs = predict_toxicity(chat_input)
label_names = list(model_dict.keys())
descending_order = np.argsort(pred_probs)[::-1]
probs = [
    {'name': label_names[index], 'prob': pred_probs[index]}
    for index in descending_order
]
probs

[{'name': 'toxic', 'prob': 0.8909},
 {'name': 'insult', 'prob': 0.0774},
 {'name': 'obscene', 'prob': 0.061},
 {'name': 'identity_hate', 'prob': 0.0191},
 {'name': 'severe_toxic', 'prob': 0.0109},
 {'name': 'threat', 'prob': 0.0013}]

In [21]:
highest_coeff= np.squeeze(model_dict['toxic'].coef_).max()

In [67]:
# NOTE(review): `toxic_betas` was not defined anywhere in this notebook. It is
# reconstructed here as the squeezed coefficient array of the 'toxic' model,
# the same expression the `highest_coeff` cell uses -- confirm.
toxic_betas = np.squeeze(model_dict['toxic'].coef_)
# Absolute value of each coefficient after rounding to 5 decimals
toxic_betas_abs = [np.abs(round(x,5)) for x in toxic_betas]

In [1]:
def max_n_elements(list_of_betas,n_elements=5):
    '''Find the `n_elements` largest values of `list_of_betas` and their positions.

    Args:
        list_of_betas: sequence of mutually comparable numbers (list, tuple,
            1-D array, ...). It is not modified.
        n_elements: how many of the largest entries to return. If it exceeds
            the number of entries, every entry is returned; values <= 0 give
            an empty result.

    Returns:
        A tuple ``(indexes, values_by_index)`` where ``indexes`` is the set of
        selected positions and ``values_by_index`` maps each selected position
        to its value, inserted from largest to smallest. Equal values are
        taken in order of increasing position.
    '''
    values = list(list_of_betas)

    # Rank positions by value, largest first. sorted() is stable even with
    # reverse=True, so ties keep their original (ascending-position) order.
    # Ranking once, rather than repeatedly overwriting picked entries with 0,
    # keeps the result correct when the remaining values are zero or negative.
    ranked_positions = sorted(range(len(values)), key=values.__getitem__, reverse=True)
    top_positions = ranked_positions[:max(n_elements, 0)]

    highest_terms_dict = {position: values[position] for position in top_positions}
    highest_terms_index_set = set(top_positions)
    return highest_terms_index_set,highest_terms_dict

In [154]:
max_indexes,index_dict = max_n_elements(toxic_betas_abs)

In [155]:
index_dict

{7091: 17.8847, 7103: 14.88584, 16664: 13.3805, 15663: 13.1592, 8480: 12.70755}

In [158]:
# Show the vectorizer's configuration (the attribute name was truncated to `get_`).
word_vectorizer.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': 20000,
 'min_df': 1,
 'ngram_range': (1, 1),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': None,
 'strip_accents': 'unicode',
 'sublinear_tf': True,
 'token_pattern': '\\w{1,}',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}

In [145]:
# Fetch the index -> term list once instead of once per printed term.
feature_names = word_vectorizer.get_feature_names()
# max_indexes is a set, so the terms are not printed in coefficient order.
for x in max_indexes:
    print(feature_names[x])

idiot
shit
fuck
stupid
fucking
