In [1]:
import numpy as np
import pandas as pd
import string
import re

#sklearn imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

#scipy imports
from scipy.sparse import hstack

#Visualization imports
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec 
import bokeh
#! pip install bokeh

# target classes
target_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']



In [8]:
# read frames localy through csv
train_df = pd.read_csv("../data/train.csv")
test_df = pd.read_csv("../data/test.csv")

# Random index generator for splitting training data
# Note: Each rerun of cell will create new splits.
randIndexCut = np.random.rand(len(train_df)) < 0.7

#S plit up data
test_data = test_df["comment_text"]
dev_data, dev_labels = train_df[~randIndexCut]["comment_text"], train_df[~randIndexCut][target_names]
train_data, train_labels = train_df[randIndexCut]["comment_text"], train_df[randIndexCut][target_names]
tiny_data,small_data = train_df[:200]["comment_text"],train_df[:1000]["comment_text"]
tiny_labels,small_labels = train_df[:200][target_names],train_df[:1000][target_names]


print('total training observations:', train_df.shape[0])
print('training data shape:', train_data.shape)
print('training label shape:', train_labels.shape)
print('dev label shape:', dev_labels.shape)
print ('labels names:', target_names)

total training observations: 159571
training data shape: (111829,)
training label shape: (111829, 6)
dev label shape: (47742, 6)
labels names: ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


In [2]:
from nltk.corpus import brown

word_corpus = '../data/words.txt'
word_file = open(word_corpus, 'rt')
large_word_corpus = word_file.read()
word_file.close
large_word_corpus = large_word_corpus.split()
large_word_corpus = [ word.lower() for word in large_word_corpus]
large_word_corpus = set(large_word_corpus)

good_words_list = brown.words()
good_word_set = set([word.lower() for word in good_words_list])
#punctuation = re.sub("[\'\-]",'',string.punctuation)
punctuation = "[\!\?\"\#\$\%\&\(\)\*\+\,\.\/\:\;\<\=\>\?\@\[\]\^\_\`\{\|\}\~\']"
print('Size of corpus ' + str(len(large_word_corpus)))

Size of corpus 462986


In [3]:
# from http://norvig.com/spell-correct.html
import re
from collections import Counter

def words(text): return re.findall(r'\w+', text.lower())

WORDS = Counter(words(open('../data/big.txt').read()))

def norvig_P(word, N=sum(WORDS.values())): 
    "Probability of `word`."
    return WORDS[word] / N

def norvig_correction(word): 
    "Most probable spelling correction for word."
    return max(norvig_candidates(word), key=norvig_P)

def norvig_candidates(word): 
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

def known(words): 
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)

def edits1(word):
    "All edits that are one edit away from `word`."
    letters    = 'abcdefghijklmnopqrstuvwxyz'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word): 
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

In [4]:
# Functions to support finding and correcting spellings
# using pyenchant for spell checking
from enchant import DictWithPWL
from enchant.checker import SpellChecker
import difflib
# import splitter # not useful, does a worse job than my implementation

# mywords.txt currently contains:
# - list of firstnames and surnames gathered from internet searches
# http://www.birkenhoerdt.net/surnames-all.php?tree=1
my_dict=DictWithPWL('en_US', "../data/mywords.txt")
my_checker = SpellChecker(my_dict)

# list of swear words correctly spelt courtesy of https://www.noswearing.com/

def my_preprocessor(textblock):
    # u -> you
    # c -> see
    # k -> okay
    return_words = textblock

#     return_words = re.sub(r"[^A-Za-z0-9,!?*.;’´'\/]", " ", return_words)
    return_words = re.sub(r"[^A-Za-z0-9]", " ", return_words)
    return_words = re.sub(r","," ",return_words)
    return_words = re.sub(r"\.\.+"," ",return_words)
    return_words = re.sub(r"\."," ",return_words)
    return_words = re.sub(r"\("," ", return_words)
    return_words = re.sub(r"\)"," ", return_words)
    return_works = re.sub(r"\;", " ", return_words)
    return_words = re.sub(r":"," ", return_words)
    return_words = re.sub(r"´", "'", return_words)
    return_words = re.sub(r"`", "'", return_words)
    return_words = re.sub(r"''+", "'", return_words)
    return_words = re.sub(r" '", " ", return_words)
    return_words = re.sub(r"' ", " ", return_words)
    return_words = re.sub(r"\"", " ", return_words)
    return_words = re.sub(r"\/", " ", return_words)
    return_words = re.sub(r"\!\!+", "!!", return_words)
    return_words = re.sub(r"\?\?+", "?!", return_words)
    return_words = re.sub(r"\!", " !", return_words)
    return_words = re.sub(r"\?", " ?", return_words)
    return_words = re.sub(r"\b\d+\b", "999", return_words)
    # slang and abbreviations, need to be aware of capitolization and spaces
    return_words = re.sub(r"[Ww]on't", "will not", return_words)
    return_words = re.sub(r"n't", " not", return_words)
    return_words = re.sub(r"'s\b", " is", return_words)
    return_words = re.sub(r"\b[Aa]bt\b", "about", return_words)
    return return_words

def trysplit(word, verbose=False):
    split_candidates = []
    max_proba = 0.0
    for i in range(1,len(word)):
        # I will only allow single letters of 'a' and 'i', all others ignored.  Pyenchant allows for
        # any single letter to be a legitimate word, and so too does norvig.  The dictionary defines
        # them as nouns that represent the letter, however even though several can be used in slang
        # (e.g. k->okay, c->see, u->you) using them in conjoined words would make the splitting far
        # too difficult and also human understanding much more difficult #howucthisk, u c?
        if (len(word[:i]) != 1 or (word[:i].lower() == 'a' or word[:i].lower() == 'i')) and (
            len(word[i:]) != 1 or (word[i:].lower() == 'a' or word[i:].lower() == 'i')):
            if my_checker.check(word[:i]) and my_checker.check(word[i:]):
                norvig_score = norvig_P(word[:i]) + norvig_P(word[i:])
                if norvig_score > max_proba:
                    max_proba = norvig_score
                    split_candidates = [word[:i],word[i:]]
    for i in range(1,len(word)):
        for j in range(i+1,len(word)):        
            if (len(word[:i]) != 1 or (word[:i].lower() == 'a' or word[:i].lower() == 'i')) and (
                len(word[i:j]) != 1 or (word[i:j].lower() == 'a' or word[i:j].lower() == 'i')) and (
                len(word[i:]) != 1 or (word[i:].lower() == 'a' or word[i:].lower() == 'i')):
                
                if my_checker.check(word[:i]) and my_checker.check(word[i:j]) and my_checker.check(word[j:]):
                    norvig_score = norvig_P(word[:i]) + norvig_P(word[i:j]) + norvig_P(word[j:])
                    if norvig_score > max_proba:
                        max_proba = norvig_score
                        split_candidates = [word[:i],word[i:j],word[j:]]
    for i in range(1,len(word)):
        for j in range(i+1,len(word)):
            for k in range(j+1,len(word)):
                if (len(word[:i]) != 1 or (word[:i].lower() == 'a' or word[:i].lower() == 'i')) and (
                    len(word[i:j]) != 1 or (word[i:j].lower() == 'a' or word[i:j].lower() == 'i')) and (
                    len(word[j:k]) != 1 or (word[j:k].lower() == 'a' or word[j:k].lower() == 'i')) and (
                    len(word[k:]) != 1 or (word[k:].lower() == 'a' or word[k:].lower() == 'i')):
                    verbose and print("making it here with i=%s j=%s k=%s %s  max_proba=%d" %(word[:i],word[i:j],word[j:k],word[k:], max_proba))
                    verbose and print("lengths are %d %d %d %d" % (len(word[:i]), len(word[i:j]),len(word[j:k]),len(word[k:])))
                    if my_checker.check(word[:i]) and my_checker.check(word[i:j]) and my_checker.check(word[j:k]) and my_checker.check(word[k:]):
                        verbose and print('found words ' + word[i:] + ' ' + word[k:])
                        norvig_score = norvig_P(word[:i]) + norvig_P(word[i:j]) + norvig_P(word[j:k]) + norvig_P(word[k:])
                        if norvig_score > max_proba:
                            verbose and print("found higher probability %d with %s %s %s %s" % (norvig_score, word[:i], word[i:j], word[j:k], word[k:]))
                            max_proba = norvig_score
                            split_candidates = [word[:i],word[i:j],word[j:k],word[k:]]
    return split_candidates

def get_best_candidates(word):
    best_words = []
    best_ratio = 0
    a = set(my_checker.suggest(word))
    for b in a:
        if not '-' in b:
            tmp = difflib.SequenceMatcher(None, word, b).ratio()
            if tmp > best_ratio:
                best_words=[b]
                best_ratio=tmp
            elif tmp == best_ratio:
                best_words.append(b)
    return best_words
    
def fix_spellings(textinput, verbose=False):
    words = textinput.split()
    return_list = []
    for word in words:
        if my_checker.check(word) or my_checker.check(word.lower()) or word in punctuation or\
            any(i.isdigit() for i in word) or (word[-1].lower() == 's' and my_checker.check(word[:-1].lower())):
            return_list.append(word)
            # continue
        else:            
            candidates = get_best_candidates(word)
            if len(candidates) == 1:
                return_list.append(candidates.pop())
            elif len(candidates) > 1:
                # try another spell checker
                nv_candidates = norvig_candidates(word)
                tmp_set = set(nv_candidates).intersection(set(candidates))
                if len(tmp_set) == 1:
                    # only 1 overlap, should be correct
                    return_list.append(tmp_set.pop())
                elif len(nv_candidates) == 1 and next(iter(nv_candidates)) == word:
                        # this is suspicious, pyenchants' "suggest" method always returns something, however if
                        # norvigs method cannot find a suitable match within a short distance then it simply
                        # returns the orignal word.  This section is for potentially conjoined words
                        tmp_list=trysplit(word)

                        # If we get back a list of split words then use these
                        if len(tmp_list) != 0:
                            return_list.extend(tmp_list)
                            continue
                else:
                    # arbitrary now, just going to use the first one found from pyenchant, even though
                    # I have seen norvig get the correct word sometimes when pyenchant gets it wrong
                    return_list.append(candidates[0])
    return return_list

# myword='In a long discussion about thisismessedup what should I do askd'
# print(fix_spellings(myword))

In [311]:
myword2='mylove'
# trysplit(myword)
#norvig_P('is')
#fix_spellings('alit')
# help(enchant)
#myword2="I'm"
print(get_best_candidates(myword2))
print(my_checker.check(myword2))
print(my_checker.suggest(myword2))
print(norvig_candidates(myword2))
print(fix_spellings(myword2))


['my love']
False
['my love', 'my-love', 'ladylove', 'Mylo', 'lovely', 'Lovejoy', 'Melville', 'Malvin', 'mylo', 'malone', 'milone', 'love']
{'move', 'glove', 'love'}
['my love']


In [264]:
index=44
print(train_data[index])
print(fix_spellings(my_preprocessor(train_data[index]), verbose=True))

I'm Sorry 

I'm sorry I screwed around with someones talk page.  It was very bad to do.  I know how having the templates on their talk page helps you assert your dominance over them.  I know I should bow down to the almighty administrators.  But then again, I'm going to go play outside....with your mom.   76.122.79.82
["I'm", 'Sorry', "I'm", 'sorry', 'I', 'screwed', 'around', 'with', 'someones', 'talk', 'page', 'It', 'was', 'very', 'bad', 'to', 'do', 'I', 'know', 'how', 'having', 'the', 'templates', 'on', 'their', 'talk', 'page', 'helps', 'you', 'assert', 'your', 'dominance', 'over', 'them', 'I', 'know', 'I', 'should', 'bow', 'down', 'to', 'the', 'almighty', 'administrators', 'But', 'then', 'again', "I'm", 'going', 'to', 'go', 'play', 'outside', 'with', 'your', 'mom', '999', '999', '999', '999']


In [237]:
# Calculate a flag for any level of toxix or a unique number for each combination
def bernoulli_toxic_labels (label_vector):
    return [1 if (label_vector['toxic'][x] + label_vector['severe_toxic'][x] +
                label_vector['obscene'][x] + label_vector['threat'][x] +
                label_vector['insult'][x] + label_vector['identity_hate'][x]) > 0 else 0 
            for x in label_vector.index.values]

def binarize_toxic_labels (label_vector):
    return [(label_vector['toxic'][x]*32 + label_vector['severe_toxic'][x]*16 +
                label_vector['obscene'][x]*8 + label_vector['threat'][x]*4 +
                label_vector['insult'][x]*2 + label_vector['identity_hate'][x]) 
            for x in label_vector.index.values]

binary_train_labels = binarize_toxic_labels(train_labels)
binary_dev_labels = binarize_toxic_labels(dev_labels)

bernoulli_train_labels = bernoulli_toxic_labels(train_labels)
bernoulli_dev_labels = bernoulli_toxic_labels(dev_labels)

In [321]:
count_vect_plain = CountVectorizer(preprocessor=my_preprocessor)
X_train_counts_plain = count_vect_plain.fit_transform(train_data)
X_dev_counts_plain = count_vect_plain.transform(dev_data)

tfidf_vect_plain = TfidfVectorizer(preprocessor=my_preprocessor)
X_train_tfidf_plain = tfidf_vect_plain.fit_transform(train_data)
X_dev_tfidf_plain = tfidf_vect_plain.transform(dev_data)

count_vect_stop_words = CountVectorizer(stop_words='english',preprocessor=my_preprocessor)
X_train_counts_stop_words = count_vect_stop_words.fit_transform(train_data)
X_dev_counts_stop_words = count_vect_stop_words.transform(dev_data)

tfidf_vect_stop_words = TfidfVectorizer(stop_words='english',preprocessor=my_preprocessor)
X_train_tfidf_stop_words = tfidf_vect_stop_words.fit_transform(train_data)
X_dev_tfidf_stop_words = tfidf_vect_stop_words.transform(dev_data)

count_vect_stop_words_max10k = CountVectorizer(stop_words='english', max_features=10000,preprocessor=my_preprocessor)
X_train_counts_stop_words_max10k = count_vect_stop_words_max10k.fit_transform(train_data)
X_dev_counts_stop_words_max10k = count_vect_stop_words_max10k.transform(dev_data)

tfidf_vect_stop_words_max10k = TfidfVectorizer(stop_words='english', max_features=10000,preprocessor=my_preprocessor)
X_train_tfidf_stop_words_max10k = tfidf_vect_stop_words_max10k.fit_transform(train_data)
X_dev_tfidf_stop_words_max10k = tfidf_vect_stop_words_max10k.transform(dev_data)

count_vect_stop_words_max5k = CountVectorizer(stop_words='english', max_features=5000)
X_train_counts_stop_words_max5k = count_vect_stop_words_max5k.fit_transform(train_data)
X_dev_counts_stop_words_max5k = count_vect_stop_words_max5k.transform(dev_data)

tfidf_vect_stop_words_max5k = TfidfVectorizer(max_features=5000)
X_train_tfidf_stop_words_max5k = tfidf_vect_stop_words_max5k.fit_transform(train_data)
X_dev_tfidf_stop_words_max5k = tfidf_vect_stop_words_max5k.transform(dev_data)

count_vect_max5k = CountVectorizer(max_features=5000)
X_train_counts_max5k = count_vect_max5k.fit_transform(train_data)
X_dev_counts_max5k = count_vect_max5k.transform(dev_data)

tfidf_vect_max5k = TfidfVectorizer(max_features=5000)
X_train_tfidf_max5k = tfidf_vect_max5k.fit_transform(train_data)
X_dev_tfidf_max5k = tfidf_vect_max5k.transform(dev_data)

count_vect_max4k = CountVectorizer(max_features=4000)
X_train_counts_max4k = count_vect_max4k.fit_transform(train_data)
X_dev_counts_max4k = count_vect_max4k.transform(dev_data)

tfidf_vect_max4k = TfidfVectorizer(max_features=4000)
X_train_tfidf_max4k = tfidf_vect_max4k.fit_transform(train_data)
X_dev_tfidf_max4k = tfidf_vect_max4k.transform(dev_data)

count_vect_max6k = CountVectorizer(max_features=6000)
X_train_counts_max6k = count_vect_max6k.fit_transform(train_data)
X_dev_counts_max6k = count_vect_max6k.transform(dev_data)

tfidf_vect_max6k = TfidfVectorizer(max_features=6000)
X_train_tfidf_max6k = tfidf_vect_max6k.fit_transform(train_data)
X_dev_tfidf_max6k = tfidf_vect_max6k.transform(dev_data)


print('done')

done


In [5]:
def score_f1_auc_on_train_dev(dev_vector, train_vector, name):
    multinomial_nb_class = MultinomialNB().fit(train_vector, train_labels[name])
    predicted_labels_dev = multinomial_nb_class.predict(dev_vector)
    fpr, tpr, thresholds = metrics.roc_curve(dev_labels[name], predicted_labels_dev)
    predicted_labels_train = multinomial_nb_class.predict(train_vector)
    fpr1, tpr1, thresholds1 = metrics.roc_curve(train_labels[name], predicted_labels_train)
    f1scoredev = metrics.f1_score(dev_labels[name],predicted_labels_dev,average='micro')
    f1scoretrain = metrics.f1_score(train_labels[name],predicted_labels_train,average='micro')
    aucdev = metrics.auc(fpr,tpr)
    auctrain = metrics.auc(fpr1,tpr1)
    return f1scoredev,aucdev,f1scoretrain,auctrain

In [322]:
scores_all=pd.DataFrame(columns=['set','label','f1dev','aucdev','f1train','auctrain'])

for name in target_names:
    tmpf1dev,tmpaucdev,tmpf1train,tmpauctrain = score_f1_auc_on_train_dev(train_vector=X_train_counts_plain,
                                                   dev_vector=X_dev_counts_plain,name=name)
    scores_all.loc[scores_all.shape[0]] = ['CountPlain',name,tmpf1dev,tmpaucdev,tmpf1train,tmpauctrain]

    tmpf1dev,tmpaucdev,tmpf1train,tmpauctrain = score_f1_auc_on_train_dev(train_vector=X_train_tfidf_plain,
                                                   dev_vector=X_dev_tfidf_plain,name=name)
    scores_all.loc[scores_all.shape[0]] = ['TfidfPlain',name,tmpf1dev,tmpaucdev,tmpf1train,tmpauctrain]

    tmpf1dev,tmpaucdev,tmpf1train,tmpauctrain = score_f1_auc_on_train_dev(train_vector=X_train_counts_stop_words,
                                                   dev_vector=X_dev_counts_stop_words,name=name)
    scores_all.loc[scores_all.shape[0]] = ['CountStopWords',name,tmpf1dev,tmpaucdev,tmpf1train,tmpauctrain]
    tmpf1dev,tmpaucdev,tmpf1train,tmpauctrain = score_f1_auc_on_train_dev(train_vector=X_train_tfidf_stop_words,
                                                   dev_vector=X_dev_tfidf_stop_words,name=name)
    scores_all.loc[scores_all.shape[0]] = ['TfidfStopWords',name,tmpf1dev,tmpaucdev,tmpf1train,tmpauctrain]
    tmpf1dev,tmpaucdev,tmpf1train,tmpauctrain = score_f1_auc_on_train_dev(train_vector=X_train_counts_stop_words_max10k,
                                                   dev_vector=X_dev_counts_stop_words_max10k,name=name)
    scores_all.loc[scores_all.shape[0]] = ['CountStopWords10k',name,tmpf1dev,tmpaucdev,tmpf1train,tmpauctrain]
    tmpf1dev,tmpaucdev,tmpf1train,tmpauctrain = score_f1_auc_on_train_dev(train_vector=X_train_tfidf_stop_words_max10k,
                                                   dev_vector=X_dev_tfidf_stop_words_max10k,name=name)
    scores_all.loc[scores_all.shape[0]] = ['TfidfStopWords10k',name,tmpf1dev,tmpaucdev,tmpf1train,tmpauctrain]
    tmpf1dev,tmpaucdev,tmpf1train,tmpauctrain = score_f1_auc_on_train_dev(train_vector=X_train_counts_stop_words_max5k,
                                                   dev_vector=X_dev_counts_stop_words_max5k,name=name)
    scores_all.loc[scores_all.shape[0]] = ['CountStopWords5k',name,tmpf1dev,tmpaucdev,tmpf1train,tmpauctrain]
    tmpf1dev,tmpaucdev,tmpf1train,tmpauctrain = score_f1_auc_on_train_dev(train_vector=X_train_tfidf_stop_words_max5k,
                                                   dev_vector=X_dev_tfidf_stop_words_max5k,name=name)
    scores_all.loc[scores_all.shape[0]] = ['TfidfStopWords5k',name,tmpf1dev,tmpaucdev,tmpf1train,tmpauctrain]
    tmpf1dev,tmpaucdev,tmpf1train,tmpauctrain = score_f1_auc_on_train_dev(train_vector=X_train_counts_max5k,
                                                   dev_vector=X_dev_counts_max5k,name=name)
    scores_all.loc[scores_all.shape[0]] = ['Count5k',name,tmpf1dev,tmpaucdev,tmpf1train,tmpauctrain]
    tmpf1dev,tmpaucdev,tmpf1train,tmpauctrain = score_f1_auc_on_train_dev(train_vector=X_train_tfidf_max5k,
                                                   dev_vector=X_dev_tfidf_max5k,name=name)
    scores_all.loc[scores_all.shape[0]] = ['TfidfS5k',name,tmpf1dev,tmpaucdev,tmpf1train,tmpauctrain]
    tmpf1dev,tmpaucdev,tmpf1train,tmpauctrain = score_f1_auc_on_train_dev(train_vector=X_train_counts_max4k,
                                                   dev_vector=X_dev_counts_max4k,name=name)
    scores_all.loc[scores_all.shape[0]] = ['Count4k',name,tmpf1dev,tmpaucdev,tmpf1train,tmpauctrain]
    tmpf1dev,tmpaucdev,tmpf1train,tmpauctrain = score_f1_auc_on_train_dev(train_vector=X_train_tfidf_max4k,
                                                   dev_vector=X_dev_tfidf_max4k,name=name)
    scores_all.loc[scores_all.shape[0]] = ['TfidfS5k',name,tmpf1dev,tmpaucdev,tmpf1train,tmpauctrain]
    tmpf1dev,tmpaucdev,tmpf1train,tmpauctrain = score_f1_auc_on_train_dev(train_vector=X_train_counts_max6k,
                                                   dev_vector=X_dev_counts_max6k,name=name)
    scores_all.loc[scores_all.shape[0]] = ['Count6k',name,tmpf1dev,tmpaucdev,tmpf1train,tmpauctrain]
    tmpf1dev,tmpaucdev,tmpf1train,tmpauctrain = score_f1_auc_on_train_dev(train_vector=X_train_tfidf_max6k,
                                                   dev_vector=X_dev_tfidf_max6k,name=name)
    scores_all.loc[scores_all.shape[0]] = ['TfidfS6k',name,tmpf1dev,tmpaucdev,tmpf1train,tmpauctrain]
    

    # not measuring here for each name
print(scores_all)

                  set          label     f1dev    aucdev   f1train  auctrain
0          CountPlain          toxic  0.942498  0.773851  0.947786  0.818918
1          TfidfPlain          toxic  0.921717  0.599574  0.923317  0.616814
2      CountStopWords          toxic  0.943746  0.776646  0.951295  0.827418
3      TfidfStopWords          toxic  0.924466  0.611925  0.927531  0.635766
4   CountStopWords10k          toxic  0.946748  0.812008  0.948961  0.822992
5   TfidfStopWords10k          toxic  0.946621  0.760780  0.947991  0.770859
6    CountStopWords5k          toxic  0.947805  0.823325  0.948730  0.828803
7    TfidfStopWords5k          toxic  0.949559  0.747556  0.949246  0.750520
8             Count5k          toxic  0.938714  0.839973  0.941132  0.848808
9            TfidfS5k          toxic  0.949559  0.747556  0.949246  0.750520
10            Count4k          toxic  0.939052  0.833640  0.940838  0.841222
11           TfidfS5k          toxic  0.947868  0.738899  0.947973  0.743093

In [12]:
import datetime
count_vect_plain_pre = CountVectorizer(preprocessor=my_preprocessor)
print(str(datetime.datetime.now().time()))
X_train_counts_plain_pre = count_vect_plain_pre.fit_transform(tiny_data)
print(str(datetime.datetime.now().time()))
count_vect_plain_pre_token6k = CountVectorizer(tokenizer=fix_spellings, max_features=10000, strip_accents='ascii', lowercase=True)
print(str(datetime.datetime.now().time()))
X_train_counts_plain_pre_token6k = count_vect_plain_pre_token6k.fit_transform(train_data)
print(str(datetime.datetime.now().time()))
X_dev_counts_plain_pre_token6k = count_vect_plain_pre_token6k.transform(dev_data)
print(str(datetime.datetime.now().time()))
#X_dev_counts_plain_pre = count_vect_plain_pre.transform(dev_data)

11:50:33.894302
11:50:34.625534
11:50:34.952166


KeyboardInterrupt: 

In [11]:
scores_all=pd.DataFrame(columns=['set','label','f1dev','aucdev','f1train','auctrain'])
for name in target_names:
    tmpf1dev,tmpaucdev,tmpf1train,tmpauctrain = score_f1_auc_on_train_dev(train_vector=X_train_counts_plain_pre_token6k,
                                                   dev_vector=X_dev_counts_plain_pre_token6k,name=name)
    scores_all.loc[scores_all.shape[0]] = ['CountPlainPreTok6k',name,tmpf1dev,tmpaucdev,tmpf1train,tmpauctrain]

print(scores_all)

                  set          label     f1dev    aucdev   f1train  auctrain
0  CountPlainPreTok6k          toxic  0.940576  0.820492  0.942385  0.827347
1  CountPlainPreTok6k   severe_toxic  0.975849  0.828395  0.977018  0.840311
2  CountPlainPreTok6k        obscene  0.958318  0.847900  0.958472  0.853475
3  CountPlainPreTok6k         threat  0.981128  0.794024  0.981355  0.797693
4  CountPlainPreTok6k         insult  0.955092  0.825678  0.956559  0.838370
5  CountPlainPreTok6k  identity_hate  0.969649  0.764698  0.970875  0.786283


In [342]:
import pickle

def save_model(model_name, model):
    filename='saved/' + model_name + '.sav'
    print('Saving model %s to %s' % (model_name, filename))
    pickle.dump(model, open(filename, 'wb'))
    print('Finished saving %s' % (filename))

def save_csv_results(name, data):
    filename='saved/' + name + '.csv'
    print('Saving to %s' % (filename))
    pd.DataFrame(data).to_csv(filename)
    print('Finished saving %s' % (filename))

for name in target_names:
    multinomial_nb_class = MultinomialNB().fit(X_train_counts_plain_pre_token, train_labels[name])
    predicted_labels_dev = multinomial_nb_class.predict(X_dev_counts_plain_pre_token)
    predicted_labels_proba_dev = multinomial_nb_class.predict_proba(X_dev_counts_plain_pre_token)
    predicted_labels_log_proba_dev = multinomial_nb_class.predict_log_proba(X_dev_counts_plain_pre_token)
    model_name = 'counts_plain_pre_token_' + name
    model_name_predict = model_name + '_predict_dev'
    model_name_predict_proba = model_name_predict + '_proba'
    model_name_predict_log_proba = model_name_predict_proba + '_log'
    save_model(model_name=model_name, model=multinomial_nb_class)
    save_csv_results(name=model_name_predict, data=predicted_labels_dev)
    save_csv_results(name=model_name_predict_proba, data=predicted_labels_proba_dev)
    save_csv_results(name=model_name_predict_log_proba, data=predicted_labels_log_proba_dev)

Saving model counts_plain_pre_token_toxic to saved/counts_plain_pre_token_toxic.sav
Finished saving saved/counts_plain_pre_token_toxic.sav
Saving to saved/counts_plain_pre_token_toxic_predict_dev.csv
Finished saving saved/counts_plain_pre_token_toxic_predict_dev.csv
Saving to saved/counts_plain_pre_token_toxic_predict_dev_proba.csv
Finished saving saved/counts_plain_pre_token_toxic_predict_dev_proba.csv
Saving to saved/counts_plain_pre_token_toxic_predict_dev_proba_log.csv
Finished saving saved/counts_plain_pre_token_toxic_predict_dev_proba_log.csv
Saving model counts_plain_pre_token_severe_toxic to saved/counts_plain_pre_token_severe_toxic.sav
Finished saving saved/counts_plain_pre_token_severe_toxic.sav
Saving to saved/counts_plain_pre_token_severe_toxic_predict_dev.csv
Finished saving saved/counts_plain_pre_token_severe_toxic_predict_dev.csv
Saving to saved/counts_plain_pre_token_severe_toxic_predict_dev_proba.csv
Finished saving saved/counts_plain_pre_token_severe_toxic_predict_dev

In [None]:
print(X_train_counts_plain_pre_token.vocabulary_)