A number of things were tested for this model

* A variety of different parameters to CountVectorizer and TfidfVectorizer including
  * a number of different preprocessing steps
  * Replacement of the tokenizer with something that will do spelling and error corrections
  * ngrams from (1,10) - code not here as it did not have any success
  
Interesting observations

* Spell checking can be very intensive, so care has to be given to it.  The initial attempt
was very slow and needed to be optimized.  It went from 10 minutes per 1000 messages (in our
data set) to 3.5 minutes with some rearrangement.  Even with that, the total time on my laptop
for a single use of it was 8 hours for transforming the dev and training sets.


In [9]:
import numpy as np
import pandas as pd

#sklearn imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer

#scipy imports
from scipy.sparse import hstack

#Visualization imports
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec 
import bokeh
#! pip install bokeh

# Label columns predicted for every comment (one binary target per class).
target_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [11]:
# Read the train/test frames locally from CSV.
train_df = pd.read_csv("../data/train.csv")
test_df = pd.read_csv("../data/test.csv")

# Boolean mask that sends roughly 70% of the rows to the training split.
# A dedicated, seeded generator keeps the split (and every score computed from
# it) identical across reruns of this cell without touching numpy's global
# random state.  Change SPLIT_SEED to draw a different split.
SPLIT_SEED = 42
randIndexCut = np.random.RandomState(SPLIT_SEED).rand(len(train_df)) < 0.7

# Split up data: mask rows go to training, the remaining rows to dev.
test_data = test_df["comment_text"]
dev_data, dev_labels = train_df.loc[~randIndexCut, "comment_text"], train_df.loc[~randIndexCut, target_names]
train_data, train_labels = train_df.loc[randIndexCut, "comment_text"], train_df.loc[randIndexCut, target_names]

print('total training observations:', train_df.shape[0])
print('training data shape:', train_data.shape)
print('training label shape:', train_labels.shape)
print('dev label shape:', dev_labels.shape)
print('labels names:', target_names)

total training observations: 159571
training data shape: (111853,)
training label shape: (111853, 6)
dev label shape: (47718, 6)
labels names: ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


In [46]:
# Imports etc. used in this analysis
import string
import datetime
import re
from collections import Counter

from enchant import DictWithPWL
from enchant.checker import SpellChecker
import difflib

from sklearn import metrics

# Characters that fix_spellings accepts as-is (it uses the substring test
# `word in punctuation`).  The value is unchanged: every symbol except the two
# quote characters is preceded by a backslash.  Raw-string pieces are used so
# the literal no longer relies on invalid escape sequences such as "\!".
punctuation = (
    r"[\!\?" '"' r"\#\$\%\&\(\)\*\+\,\.\/\:\;\<\=\>\?\@\[\]\^\_\`\{\|\}\~" "'" "]"
)

In [15]:
# from http://norvig.com/spell-correct.html
# This is the Norvig spell checker and requires the storage of a "big.txt"
# file with a corpus of words that it uses for predictions

def words(text):
    "Lower-case `text` and return every run of word characters found in it."
    lowered = text.lower()
    return re.findall(r'\w+', lowered)

# Word-frequency table built from the corpus file.  The `with` block closes
# the file handle as soon as the text has been read.
with open('../data/big.txt') as corpus_file:
    WORDS = Counter(words(corpus_file.read()))

def norvig_P(word, N=sum(WORDS.values())):
    """Relative frequency of `word` in WORDS.

    N defaults to the total of all counts in WORDS; the default is evaluated
    once, when this function is defined.
    """
    occurrences = WORDS[word]
    return occurrences / N

def norvig_correction(word):
    "Return the candidate correction for `word` with the highest norvig_P."
    options = norvig_candidates(word)
    return max(options, key=norvig_P)

def norvig_candidates(word):
    """Generate possible spelling corrections for `word`.

    Preference order: the word itself if known, then known words one edit
    away, then known words two edits away, and finally `[word]` unchanged.
    """
    exact = known([word])
    if exact:
        return exact

    one_edit_away = known(edits1(word))
    if one_edit_away:
        return one_edit_away

    two_edits_away = known(edits2(word))
    if two_edits_away:
        return two_edits_away

    return [word]

def known(words):
    "Return the set of entries from `words` that are keys of WORDS."
    return {candidate for candidate in words if candidate in WORDS}

def edits1(word):
    """Return the set of strings exactly one simple edit away from `word`.

    An edit is a single-character deletion, an adjacent transposition, a
    replacement by a lowercase ASCII letter, or an insertion of one.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    results = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]

        # insertions are possible at every position, including the end
        for ch in alphabet:
            results.add(head + ch + tail)

        if not tail:
            continue

        # deletion and replacement of the character right after the cut
        results.add(head + tail[1:])
        for ch in alphabet:
            results.add(head + ch + tail[1:])

        # swap the two characters right after the cut
        if len(tail) > 1:
            results.add(head + tail[1] + tail[0] + tail[2:])
    return results

def edits2(word):
    "Lazily yield every string two edits away from `word` (duplicates included)."
    return (
        second_edit
        for first_edit in edits1(word)
        for second_edit in edits1(first_edit)
    )

In [None]:
# My own spell checking and corrections, using a combination of Norvig
# and pyenchant for spelling, plus a custom word splitter that uses
# both to verify words

# Personal word list for pyenchant, stored in ../data/mywords.txt, because the
# stock en_US dictionary is missing many words.  mywords.txt currently contains:
# - a list of first names and surnames gathered from several internet searches
#         http://www.birkenhoerdt.net/surnames-all.php?tree=1
# - a list of swear words from: https://www.noswearing.com/dictionary
# - custom entries that were flagged as misspelled but are known

# en_US dictionary extended with the personal word list above
my_dict = DictWithPWL('en_US', pwl="../data/mywords.txt")
# checker used for every check()/suggest() call in this notebook
my_checker = SpellChecker(lang=my_dict)


def trysplit(word):
    """Split a conjoined token such as 'likethisforexample' into 2 to 4 words.

    Every way of cutting `word` into two, three or four non-empty pieces is
    examined.  A cut is acceptable when each piece passes my_checker.check and
    no piece is a single letter other than 'a' or 'i'.  Acceptable cuts are
    scored by the sum of the norvig_P probabilities of their pieces and the
    highest scoring cut wins.  Ties keep the cut found first (fewer pieces
    first, then left-most cut positions), and a cut must score above zero.

    Single letters other than 'a' and 'i' are rejected because dictionaries
    list every letter as a noun; allowing them inside conjoined words would
    make the splits far too ambiguous (e.g. 'howucthisk').

    No spelling corrections are attempted here: if the words are both
    misspelled and conjoined we give up.

    Args:
        word (string): a word that is suspected to be conjoined
    Returns:
        list of strings: the pieces of the best split (2 to 4 entries), or an
        empty list when no acceptable split scores above zero
    """
    # Local import: itertools is not among the notebook-level imports.
    from itertools import combinations

    # The same substring shows up in many different cuts, so the (slow)
    # dictionary lookup is done at most once per distinct piece.
    piece_is_valid = {}

    def is_valid_piece(piece):
        """True when `piece` may take part in a split."""
        if piece not in piece_is_valid:
            if len(piece) == 1 and piece.lower() not in ('a', 'i'):
                piece_is_valid[piece] = False
            else:
                piece_is_valid[piece] = bool(my_checker.check(piece))
        return piece_is_valid[piece]

    split_candidates = []
    max_proba = 0.0
    for n_pieces in (2, 3, 4):
        # combinations() yields cut positions in the same order as nested
        # `for i ... for j ... for k ...` loops over increasing indices.
        for cuts in combinations(range(1, len(word)), n_pieces - 1):
            bounds = (0,) + cuts + (len(word),)
            pieces = [word[start:end] for start, end in zip(bounds, bounds[1:])]

            # Every piece, including the last one, goes through the same
            # single-letter filter and dictionary check.
            if not all(is_valid_piece(piece) for piece in pieces):
                continue

            # WORDS is built from lower-cased text, so score case-insensitively
            # (the single-letter filter above is case-insensitive as well).
            norvig_score = 0.0
            for piece in pieces:
                norvig_score += norvig_P(piece.lower())

            if norvig_score > max_proba:
                max_proba = norvig_score
                split_candidates = pieces

    return split_candidates


def get_best_candidates(word):
    """Return the pyenchant suggestion(s) that most resemble `word`.

    Suggestions containing a hyphen are ignored.  Similarity is the
    difflib.SequenceMatcher ratio between `word` and the suggestion.

    Args:
        word (string): single word that needs to be corrected
    Returns:
        list of the suggestions sharing the highest similarity ratio
        (empty when there is no usable suggestion)
    """
    suggestions = [s for s in set(my_checker.suggest(word)) if '-' not in s]
    if not suggestions:
        return []

    ratios = [difflib.SequenceMatcher(None, word, s).ratio() for s in suggestions]
    top_ratio = max(ratios)
    return [s for s, ratio in zip(suggestions, ratios) if ratio == top_ratio]

    
def fix_spellings(textinput):
    """Tokenize `textinput` on whitespace and correct misspelled or conjoined words.

    A token is kept unchanged when the checker accepts it (as written or
    lower-cased), when it is found in `punctuation`, when it contains a digit,
    or when it ends in 's' and the checker accepts the rest of it.  Otherwise:

    1. a single best pyenchant candidate replaces the token;
    2. with several candidates, one that is also a Norvig candidate is used
       when exactly one such overlap exists;
    3. if Norvig can only return the token itself, the token is tried as a
       run of conjoined words via trysplit;
    4. failing all of that, the first pyenchant candidate is used.

    Every input token yields at least one output token: a token with no
    usable suggestion is kept as written instead of being dropped.

    Args:
        textinput (string): a message to be split and checked for errors
    Returns:
        List of the split and corrected words
    """
    return_list = []
    for word in textinput.split():
        is_acceptable = (
            my_checker.check(word)
            or my_checker.check(word.lower())
            or word in punctuation
            or any(ch.isdigit() for ch in word)
            # Trailing-'s' form of a known word.  The length guard keeps an
            # empty string from ever being handed to the checker.
            or (len(word) > 1 and word[-1].lower() == 's'
                and my_checker.check(word[:-1].lower()))
        )
        if is_acceptable:
            return_list.append(word)
            continue

        candidates = get_best_candidates(word)
        if not candidates:
            # Nothing usable was suggested: keep the token as written.
            return_list.append(word)
            continue
        if len(candidates) == 1:
            return_list.append(candidates[0])
            continue

        # Several equally good pyenchant candidates: consult the second checker.
        nv_candidates = norvig_candidates(word)
        overlap = set(nv_candidates).intersection(candidates)
        if len(overlap) == 1:
            # only 1 overlap, should be correct
            return_list.append(overlap.pop())
            continue

        if len(nv_candidates) == 1 and next(iter(nv_candidates)) == word:
            # pyenchant's suggest() offered several candidates, while
            # norvig_candidates falls back to the original word when it finds
            # no close match.  That combination hints at conjoined words.
            pieces = trysplit(word)
            if pieces:
                return_list.extend(pieces)
                continue

        # Arbitrary fallback: take the first pyenchant candidate.  This also
        # covers the case where no split was found, so the token is not lost.
        return_list.append(candidates[0])
    return return_list

In [None]:
# Preprocessing functions:

# Ordered (pattern, replacement) rules applied by my_preprocessor_eng.
# Order matters: each rule sees the output of the rules before it.
_ENG_SUBSTITUTIONS = (
    # NOTE(review): as written this only matches the literal sequence !'`:´
    # (optionally preceded by one non-alphanumeric character).  A character
    # class may have been intended; left unchanged -- TODO confirm.
    (r"[^A-Za-z0-9]?!'`:´", " "),
    # separators become spaces
    (r",", " "),
    (r"\.\.+", " "),
    (r"\.", " "),
    (r"\(", " "),
    (r"\)", " "),
    (r"\;", " "),
    (r":", " "),
    # normalise quote characters, then drop quotes that touch a space
    (r"´", "'"),
    (r"`", "'"),
    (r"''+", "'"),
    (r" '", " "),
    (r"' ", " "),
    (r"\"", " "),
    (r"\/", " "),
    # collapse runs of ! and ?, then split them off as their own tokens
    (r"\!\!+", "!!"),
    (r"\?\?+", "?!"),
    (r"\!", " !"),
    (r"\?", " ?"),
    # every stand-alone number becomes the same placeholder
    (r"\b\d+\b", "999"),
    # slang and abbreviations, need to be aware of capitalization and spaces
    (r"[Ww]on't", "will not"),
    (r"n't", " not"),
    (r"'s\b", " is"),
    (r"\b[Aa]bt\b", "about"),
)


def my_preprocessor_eng(textblock):
    """Normalise English comment text before it is tokenized.

    Applies the _ENG_SUBSTITUTIONS rules in order: punctuation is replaced by
    spaces, quote characters are normalised, runs of '!' and '?' are collapsed
    and separated from words, stand-alone numbers become '999', and a few
    contractions/abbreviations are expanded.  Case is not changed.

    Args:
        textblock (string): raw text of one comment
    Returns:
        string: the text after all substitutions
    """
    return_words = textblock
    for pattern, replacement in _ENG_SUBSTITUTIONS:
        return_words = re.sub(pattern, replacement, return_words)
    return return_words

In [34]:
# Calculation of scores on dev set and training set
def score_f1_auc_on_train_dev(dev_vector, train_vector, name, ctype='multi'):
    """Fit a Naive Bayes classifier for one label and score it on train and dev.

    The classifier is fitted on `train_vector` against the notebook-level
    `train_labels[name]`, then used to predict both `train_vector` and
    `dev_vector`; dev predictions are compared with `dev_labels[name]`.

    Args:
        dev_vector: the processed vector of dev data
        train_vector: the processed vector of training data
        name (string): the label name to test
        ctype: 'multi', 'bern' or 'gaus', choosing between the multinomial,
            Bernoulli and Gaussian Naive Bayes classifiers
    Returns:
        f1scoredev: the F1 score for dev
        aucdev: the AUC score for dev
        f1scoretrain: the F1 score for training
        auctrain: the AUC score for training
    Raises:
        ValueError: if `ctype` is not one of the supported values
    """
    # Validate first so a typo fails with a clear message instead of a
    # NameError on the unassigned classifier further down.
    if ctype not in ('multi', 'bern', 'gaus'):
        raise ValueError("ctype = %s, error (expected 'multi', 'bern' or 'gaus')" % (ctype,))

    if ctype == 'multi':
        classifier = MultinomialNB()
    elif ctype == 'bern':
        classifier = BernoulliNB()
    else:
        # GaussianNB is not among the notebook-level imports.
        # NOTE(review): GaussianNB presumably needs dense input, while the
        # vectorizers in this notebook produce sparse matrices -- verify.
        from sklearn.naive_bayes import GaussianNB
        classifier = GaussianNB()
    nb_class = classifier.fit(train_vector, train_labels[name])

    # NOTE(review): the ROC curves below are built from hard 0/1 predictions,
    # not from predicted probabilities, so each curve has a single operating
    # point.  Kept as is so previously recorded scores stay comparable.
    predicted_labels_dev = nb_class.predict(dev_vector)
    fpr, tpr, thresholds = metrics.roc_curve(dev_labels[name], predicted_labels_dev)

    predicted_labels_train = nb_class.predict(train_vector)
    fpr1, tpr1, thresholds1 = metrics.roc_curve(train_labels[name], predicted_labels_train)

    # NOTE(review): with a single binary label, micro-averaged F1 should equal
    # plain accuracy -- confirm that this is the intended metric.
    f1scoredev = metrics.f1_score(dev_labels[name], predicted_labels_dev, average='micro')
    f1scoretrain = metrics.f1_score(train_labels[name], predicted_labels_train, average='micro')

    aucdev = metrics.auc(fpr, tpr)
    auctrain = metrics.auc(fpr1, tpr1)

    return f1scoredev, aucdev, f1scoretrain, auctrain

In [None]:
# Grid over vectorizer settings.  Each row of vectors_all stores the fitted
# train/dev matrices together with the settings that produced them.
vectors_all=pd.DataFrame(columns=['vectortrain', 'vectordata','type','preprocessor', 'tokenizer',
                                  'max_features', 'stop_words', 'lowercase', 'strip_accents' ])

print(str(datetime.datetime.now().time()))
for max_features in None, 1000, 4000, 5000, 6000, 10000:
    print('%s: Doing i = %s' %(str(datetime.datetime.now().time()), max_features))
    for stop_words in None, 'english':
        for strip_accents in None, 'ascii', 'unicode':
            for lowercase in True, False:
                # Row order per setting: count, count+preprocessor, tfidf, tfidf+preprocessor.
                for vect_type, vect_class in (('count', CountVectorizer), ('tfidf', TfidfVectorizer)):
                    for use_preprocessor in 0, 1:
                        # Per the scikit-learn documentation, a custom preprocessor
                        # overrides the built-in strip_accents/lowercase stage, so
                        # those two settings only affect the rows without one.
                        extra_args = {'preprocessor': my_preprocessor_eng} if use_preprocessor else {}
                        # `lowercase` is passed through so the recorded setting
                        # matches what the vectorizer actually used.
                        vect = vect_class(max_features=max_features, stop_words=stop_words,
                                          strip_accents=strip_accents, lowercase=lowercase,
                                          **extra_args)
                        vect_train = vect.fit_transform(train_data)
                        vect_dev = vect.transform(dev_data)
                        # Rows are appended one at a time because two cells of each
                        # row hold whole sparse matrices.
                        vectors_all.loc[vectors_all.shape[0]] = [
                            vect_train, vect_dev, vect_type, use_preprocessor, 0,
                            max_features, stop_words, lowercase, strip_accents]

print(str(datetime.datetime.now().time()))

19:58:44.270299
19:58:44.272527: Doing i = None


In [None]:
# Show (rows, columns) of the vectorizer grid as a quick sanity check.
print(vectors_all.shape)

In [None]:
# Score every vectorizer configuration on every label with both the
# multinomial and the Bernoulli Naive Bayes classifier.  Records are collected
# in a list and turned into a frame once at the end.
score_records = []

for index,row in vectors_all.iterrows():
    print('%s: testint row %d' % (str(datetime.datetime.now().time()), index))
    for name in target_names:
        for ctype in ('multi', 'bern'):
            # Matrices are looked up by column label, not by position.
            tmpf1dev,tmpaucdev,tmpf1train,tmpauctrain = score_f1_auc_on_train_dev(
                train_vector=row['vectortrain'],
                dev_vector=row['vectordata'],name=name, ctype=ctype)
            score_records.append([index,name,ctype,tmpf1dev,tmpaucdev,tmpf1train,tmpauctrain])

# 'vector' holds the index of the vectors_all row each score belongs to.
scores_all=pd.DataFrame(score_records,
                        columns=['vector', 'label', 'model', 'f1dev','aucdev','f1train','auctrain'])

    

In [108]:
# Combine each score row with the vectorizer settings it was computed from
# (looked up through the 'vector' index) and save the flat table to CSV.
vector_setting_columns = ['type', 'preprocessor', 'tokenizer', 'max_features',
                          'stop_words', 'lowercase', 'strip_accents']
score_value_columns = ['f1dev', 'aucdev', 'f1train', 'auctrain']

data_all = pd.DataFrame(columns=['label', 'model'] + vector_setting_columns + score_value_columns)
for index, row in scores_all.iterrows():
    df_row = vectors_all.loc[int(row['vector'])]
    combined = [row['label'], row['model']]
    combined.extend(df_row[column] for column in vector_setting_columns)
    combined.extend(row[column] for column in score_value_columns)
    data_all.loc[data_all.shape[0]] = combined

data_all.to_csv('results.csv')


In [107]:
# For every label, print the configuration with the highest dev AUC.
for label in target_names:
    df_tmp = data_all[data_all['label'] == label]
    best_index = df_tmp['aucdev'].idxmax()
    print(df_tmp.loc[best_index])

label                toxic
model                multi
preprocessor             0
tokenizer                0
max_features         10000
stop_words            None
lowercase             True
strip_accents        ascii
f1dev            0.9397712
aucdev           0.8491642
f1train           0.943381
auctrain         0.8659385
Name: 1488, dtype: object
label            severe_toxic
model                    bern
preprocessor                0
tokenizer                   0
max_features            10000
stop_words            english
lowercase                True
strip_accents           ascii
f1dev               0.9569974
aucdev              0.9350653
f1train             0.9568541
auctrain            0.9431089
Name: 1635, dtype: object
label              obscene
model                multi
preprocessor             0
tokenizer                0
max_features         10000
stop_words            None
lowercase             True
strip_accents      unicode
f1dev            0.9599313
aucdev           0.87

In [None]:
# Timing experiment: a plain CountVectorizer with a custom preprocessor versus
# one that uses fix_spellings as its tokenizer.  A timestamp is printed before
# and after each step so the step durations can be read off the output.
# NOTE(review): `my_preprocessor` is not defined in the visible cells (only
# my_preprocessor_eng is) -- presumably that is the function meant; confirm.
count_vect_plain_pre = CountVectorizer(preprocessor=my_preprocessor)
print(str(datetime.datetime.now().time()))
# NOTE(review): `tiny_data` is not defined in the visible cells -- presumably a
# small sample of the training comments; confirm before a fresh-kernel run.
X_train_counts_plain_pre = count_vect_plain_pre.fit_transform(tiny_data)
print(str(datetime.datetime.now().time()))
# Spell-correcting tokenizer.  NOTE(review): the "6k" suffix in the variable
# names does not match max_features=10000 -- confirm which one is intended.
count_vect_plain_pre_token6k = CountVectorizer(tokenizer=fix_spellings, max_features=10000, strip_accents='ascii', lowercase=True)
print(str(datetime.datetime.now().time()))
X_train_counts_plain_pre_token6k = count_vect_plain_pre_token6k.fit_transform(train_data)
print(str(datetime.datetime.now().time()))
X_dev_counts_plain_pre_token6k = count_vect_plain_pre_token6k.transform(dev_data)
print(str(datetime.datetime.now().time()))