In [1]:
import numpy as np
import pandas as pd
import nltk
import string
import re
import matplotlib.pyplot as plt
import unicodedata
import sys
import collections
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression

In [2]:
stopwords = nltk.corpus.stopwords.words('english')
unicodePuncs = [chr(i) for i in range(sys.maxunicode)
                      if unicodedata.category(chr(i)).startswith('P')]
stopwords += ([i for i in string.punctuation] + ['--', '\'\'', '``', '...'])
stopwords += unicodePuncs

In [3]:
class BOWExtractor():      
    def __init__(self, ngrams_to_take = {1: 100, 2: 50}, stopwords=[], tokenizer = nltk.word_tokenize, unicodeNormalize = True, binary = False):
        self.ngrams_to_take = ngrams_to_take
        self.stopwords = stopwords
        self.tokenizer = tokenizer
        self.unicodeNormalize = unicodeNormalize
        self.binary = binary
    
    def hasnum(self,s):
        return any(i.isdigit() for i in s)
    
    def hasPunc(self, s):
        return any(i in string.punctuation for i in s)
    
    def preprocess(self, series):
        if self.unicodeNormalize:
            series = series.apply(lambda x: unicodedata.normalize('NFKD', x).encode('ascii','ignore').decode('utf-8'))
        toks = series.apply(lambda x: list(filter(lambda z: z not in self.stopwords, map(lambda y: y.lower(),self.tokenizer(x)))))
        toks = toks.apply(lambda x: [i for i in x if not self.hasPunc(i) and not self.hasnum(i)])
        return toks
    
    def getngrams(self, lst, n=2):
        ans = []
        for i in range(len(lst) - n+1):
            ans.append([lst[i+j] for j in range(n)])
        return ans
    
    def flatten(self, x):
        if isinstance(x, (np.ndarray, list, tuple)):
            lst = []
            for i in x:
                lst += self.flatten(i)
            return lst
        else:
            return [x]
    
    def fit(self, series): #input is a pandas series of strings
        assert(isinstance(series.iloc[0], str))
        toks = self.preprocess(series)
        self.most_common_ngrams = {}
        for i in self.ngrams_to_take.keys():
            if i == 1:
                self.most_common_ngrams[i] = {i[0]: i[1] for i in nltk.FreqDist(self.flatten(toks.values)).most_common(self.ngrams_to_take[i])}
            else:
                ngrams = [j for i in toks for j in self.getngrams(i)]
                self.most_common_ngrams[i] = {i[0]: i[1] for i in collections.Counter(map(tuple, ngrams)).most_common(self.ngrams_to_take[i])}
        self.features = [j for i in self.most_common_ngrams.keys() for j in self.most_common_ngrams[i].keys()]
        return self
            
    def transform(self, series): #input is a pandas series of strings, output is a 2d numpy array, in the order of self.features
        ngrams = {}
        toks = self.preprocess(series)
        for i in self.ngrams_to_take.keys():
            if i == 1:
                ngrams[1] = toks
            else:
                ngrams[i] = toks.apply(lambda x: self.getngrams(x, i))
        ans = np.zeros(shape = (len(series), len(self.features),), dtype='int16')
        # loop through each feature
        for count, i in enumerate(self.features):
            if isinstance(i, str):
                ans[:, count] = ngrams[1].apply(lambda x: x.count(i)).values
            else: #tuple
                ans[:, count] = ngrams[len(i)].apply(lambda x: x.count(i)).values
        if self.binary:
            ans = (ans>0).astype(int)
        return ans

#### Load Data

In [4]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [5]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [6]:
output_names = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

In [7]:
train[output_names].apply(pd.value_counts)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,144277,157976,151122,159093,151694,158166
1,15294,1595,8449,478,7877,1405


In [8]:
#traindf, valdf = train_test_split(train, test_size = 0.1, random_state = 42, stratify = train.threat)

#### Feature Extraction

In [9]:
bow= BOWExtractor(ngrams_to_take = {1:2400, 2:1600, 3:400},stopwords = stopwords).fit(train['comment_text'])

In [10]:
train_mat = bow.transform(train['comment_text']).astype('int16')

#### Model

In [12]:
final_models = []
gs_objs = []
for i in output_names:
    lr = LogisticRegression(random_state = 10)
    param_grid = {'C': 10**np.linspace(-2,2,5),
        'penalty': ['l2']
    }
    gs = GridSearchCV(estimator = lr, param_grid = param_grid, scoring = 'roc_auc', 
                      n_jobs = 2, cv = 3, refit = True, verbose = 10, return_train_score = True).fit(train_mat, train[i])
    final_models.append(gs.best_estimator_)
    gs_objs.append(gs)
    print('Completed calculation for', i)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] C=0.01, penalty=l2 ..............................................
[CV] C=0.01, penalty=l2 ..............................................
[CV] ..... C=0.01, penalty=l2, score=0.9351442743376353, total=  10.6s
[CV] C=0.01, penalty=l2 ..............................................


[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:   15.5s


[CV] ..... C=0.01, penalty=l2, score=0.9283337526834519, total=  13.2s
[CV] C=0.1, penalty=l2 ...............................................
[CV] ..... C=0.01, penalty=l2, score=0.9279195451917107, total=  11.1s
[CV] C=0.1, penalty=l2 ...............................................
[CV] ....... C=0.1, penalty=l2, score=0.938509175007358, total=  13.0s
[CV] C=0.1, penalty=l2 ...............................................


[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:   35.4s


[CV] ........ C=0.1, penalty=l2, score=0.93414299720488, total=  11.9s
[CV] C=1.0, penalty=l2 ...............................................
[CV] ...... C=0.1, penalty=l2, score=0.9343704671806133, total=  12.5s
[CV] C=1.0, penalty=l2 ...............................................
[CV] ...... C=1.0, penalty=l2, score=0.9357168132062554, total=  13.8s
[CV] C=1.0, penalty=l2 ...............................................
[CV] ...... C=1.0, penalty=l2, score=0.9346370972570652, total=  11.9s
[CV] C=10.0, penalty=l2 ..............................................
[CV] ...... C=1.0, penalty=l2, score=0.9343512440210793, total=  13.2s
[CV] C=10.0, penalty=l2 ..............................................


[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:  1.2min


[CV] ..... C=10.0, penalty=l2, score=0.9369514462453202, total=  13.1s
[CV] C=10.0, penalty=l2 ..............................................
[CV] ..... C=10.0, penalty=l2, score=0.9347963031951282, total=  13.6s
[CV] C=100.0, penalty=l2 .............................................
[CV] ...... C=10.0, penalty=l2, score=0.934340423091259, total=  13.5s
[CV] C=100.0, penalty=l2 .............................................
[CV] ..... C=100.0, penalty=l2, score=0.937217644556969, total=  13.4s
[CV] C=100.0, penalty=l2 .............................................
[CV] .... C=100.0, penalty=l2, score=0.9341022463907691, total=  12.0s
[CV] ..... C=100.0, penalty=l2, score=0.934403798744312, total=  10.4s


[Parallel(n_jobs=2)]: Done  15 out of  15 | elapsed:  1.9min finished


Completed calculation for toxic
Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] C=0.01, penalty=l2 ..............................................
[CV] C=0.01, penalty=l2 ..............................................
[CV] ..... C=0.01, penalty=l2, score=0.9584774189789976, total=   9.3s
[CV] C=0.01, penalty=l2 ..............................................


[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:   14.1s


[CV] ..... C=0.01, penalty=l2, score=0.9440713709585876, total=  12.9s
[CV] C=0.1, penalty=l2 ...............................................
[CV] ..... C=0.01, penalty=l2, score=0.9483735040715775, total=  12.3s
[CV] C=0.1, penalty=l2 ...............................................
[CV] ...... C=0.1, penalty=l2, score=0.9615086968260965, total=  12.2s
[CV] C=0.1, penalty=l2 ...............................................


[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:   34.4s


[CV] ...... C=0.1, penalty=l2, score=0.9453514897309929, total=  12.6s
[CV] C=1.0, penalty=l2 ...............................................
[CV] ....... C=0.1, penalty=l2, score=0.950106929560532, total=  13.3s
[CV] C=1.0, penalty=l2 ...............................................
[CV] ...... C=1.0, penalty=l2, score=0.9607178945483688, total=  12.3s
[CV] C=1.0, penalty=l2 ...............................................
[CV] ...... C=1.0, penalty=l2, score=0.9425002966311695, total=  13.0s
[CV] C=10.0, penalty=l2 ..............................................
[CV] ...... C=1.0, penalty=l2, score=0.9505076248333506, total=  11.4s
[CV] C=10.0, penalty=l2 ..............................................


[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:  1.2min


[CV] ..... C=10.0, penalty=l2, score=0.9626602397293867, total=  12.6s
[CV] C=10.0, penalty=l2 ..............................................
[CV] ..... C=10.0, penalty=l2, score=0.9453513112525517, total=  12.6s
[CV] C=100.0, penalty=l2 .............................................
[CV] ..... C=10.0, penalty=l2, score=0.9488049202690079, total=  11.1s
[CV] C=100.0, penalty=l2 .............................................
[CV] .... C=100.0, penalty=l2, score=0.9609239300610096, total=  11.5s
[CV] C=100.0, penalty=l2 .............................................
[CV] .... C=100.0, penalty=l2, score=0.9430053192286818, total=  11.8s
[CV] .... C=100.0, penalty=l2, score=0.9435557192097476, total=   9.7s


[Parallel(n_jobs=2)]: Done  15 out of  15 | elapsed:  1.8min finished


Completed calculation for severe_toxic
Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] C=0.01, penalty=l2 ..............................................
[CV] C=0.01, penalty=l2 ..............................................
[CV] ..... C=0.01, penalty=l2, score=0.9528148688139306, total=  13.3s
[CV] C=0.01, penalty=l2 ..............................................


[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:   17.9s


[CV] ..... C=0.01, penalty=l2, score=0.9462545690902504, total=  11.4s
[CV] C=0.1, penalty=l2 ...............................................
[CV] ..... C=0.01, penalty=l2, score=0.9478386611329076, total=  12.0s
[CV] C=0.1, penalty=l2 ...............................................
[CV] ...... C=0.1, penalty=l2, score=0.9542545155914978, total=  11.4s


[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:   33.6s


[CV] C=0.1, penalty=l2 ...............................................
[CV] ...... C=0.1, penalty=l2, score=0.9488509789107026, total=  10.4s
[CV] C=1.0, penalty=l2 ...............................................
[CV] ...... C=0.1, penalty=l2, score=0.9493056497061074, total=  10.7s
[CV] C=1.0, penalty=l2 ...............................................
[CV] ...... C=1.0, penalty=l2, score=0.9561278865185325, total=  11.6s
[CV] C=1.0, penalty=l2 ...............................................
[CV] ....... C=1.0, penalty=l2, score=0.948494532910872, total=  10.0s
[CV] C=10.0, penalty=l2 ..............................................
[CV] ....... C=1.0, penalty=l2, score=0.950596780400784, total=  11.5s
[CV] C=10.0, penalty=l2 ..............................................


[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:  1.2min


[CV] ..... C=10.0, penalty=l2, score=0.9564008817876152, total=  11.2s
[CV] C=10.0, penalty=l2 ..............................................
[CV] ..... C=10.0, penalty=l2, score=0.9519414206451651, total=  11.7s
[CV] C=100.0, penalty=l2 .............................................
[CV] ..... C=10.0, penalty=l2, score=0.9511728619359012, total=  11.9s
[CV] C=100.0, penalty=l2 .............................................
[CV] .... C=100.0, penalty=l2, score=0.9567445800055274, total=  11.4s
[CV] C=100.0, penalty=l2 .............................................
[CV] .... C=100.0, penalty=l2, score=0.9477733999964358, total=  11.1s
[CV] ..... C=100.0, penalty=l2, score=0.951419433771751, total=   9.2s


[Parallel(n_jobs=2)]: Done  15 out of  15 | elapsed:  1.8min finished


Completed calculation for obscene
Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] C=0.01, penalty=l2 ..............................................
[CV] C=0.01, penalty=l2 ..............................................
[CV] ..... C=0.01, penalty=l2, score=0.8915188757519187, total=   9.1s
[CV] C=0.01, penalty=l2 ..............................................


[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:   13.9s


[CV] ..... C=0.01, penalty=l2, score=0.9032004420340826, total=   9.5s
[CV] C=0.1, penalty=l2 ...............................................
[CV] ..... C=0.01, penalty=l2, score=0.8584277690194024, total=  10.7s
[CV] C=0.1, penalty=l2 ...............................................
[CV] ...... C=0.1, penalty=l2, score=0.9329137674190568, total=  13.3s
[CV] C=0.1, penalty=l2 ...............................................


[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:   31.7s


[CV] ...... C=0.1, penalty=l2, score=0.9246530657456912, total=  13.2s
[CV] C=1.0, penalty=l2 ...............................................
[CV] ...... C=0.1, penalty=l2, score=0.9171881072528006, total=  14.0s
[CV] C=1.0, penalty=l2 ...............................................
[CV] ...... C=1.0, penalty=l2, score=0.9335706944994437, total=  12.7s
[CV] C=1.0, penalty=l2 ...............................................
[CV] ...... C=1.0, penalty=l2, score=0.9252319368438705, total=  13.2s
[CV] C=10.0, penalty=l2 ..............................................
[CV] ...... C=1.0, penalty=l2, score=0.9190346598032314, total=  13.2s
[CV] C=10.0, penalty=l2 ..............................................


[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:  1.2min


[CV] ..... C=10.0, penalty=l2, score=0.9329591418227074, total=  12.5s
[CV] C=10.0, penalty=l2 ..............................................
[CV] ..... C=10.0, penalty=l2, score=0.9245819076512624, total=  12.3s
[CV] C=100.0, penalty=l2 .............................................
[CV] ....... C=10.0, penalty=l2, score=0.91873638879075, total=  14.0s
[CV] C=100.0, penalty=l2 .............................................
[CV] .... C=100.0, penalty=l2, score=0.9336669825196583, total=  13.5s
[CV] C=100.0, penalty=l2 .............................................
[CV] .... C=100.0, penalty=l2, score=0.9246530657456911, total=  11.7s
[CV] .... C=100.0, penalty=l2, score=0.9185544612626602, total=  10.9s


[Parallel(n_jobs=2)]: Done  15 out of  15 | elapsed:  1.9min finished


Completed calculation for threat
Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] C=0.01, penalty=l2 ..............................................
[CV] C=0.01, penalty=l2 ..............................................
[CV] ..... C=0.01, penalty=l2, score=0.9283851766734301, total=  11.0s
[CV] C=0.01, penalty=l2 ..............................................


[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:   15.6s


[CV] ..... C=0.01, penalty=l2, score=0.9319315836154275, total=  14.1s
[CV] C=0.1, penalty=l2 ...............................................
[CV] ..... C=0.01, penalty=l2, score=0.9247071434221975, total=  15.4s
[CV] C=0.1, penalty=l2 ...............................................
[CV] ...... C=0.1, penalty=l2, score=0.9313195581475405, total=  11.7s


[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:   34.7s


[CV] C=0.1, penalty=l2 ...............................................
[CV] ...... C=0.1, penalty=l2, score=0.9332372334282922, total=  11.9s
[CV] C=1.0, penalty=l2 ...............................................
[CV] ...... C=0.1, penalty=l2, score=0.9237015192438814, total=  11.2s
[CV] C=1.0, penalty=l2 ...............................................
[CV] ...... C=1.0, penalty=l2, score=0.9309818208847789, total=  10.7s
[CV] C=1.0, penalty=l2 ...............................................
[CV] ....... C=1.0, penalty=l2, score=0.933243371230307, total=  11.5s
[CV] C=10.0, penalty=l2 ..............................................
[CV] ...... C=1.0, penalty=l2, score=0.9242169621903029, total=  10.9s
[CV] C=10.0, penalty=l2 ..............................................


[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:  1.2min


[CV] ..... C=10.0, penalty=l2, score=0.9312815188371403, total=  10.9s
[CV] C=10.0, penalty=l2 ..............................................
[CV] ..... C=10.0, penalty=l2, score=0.9324729678773048, total=  11.6s
[CV] C=100.0, penalty=l2 .............................................
[CV] ..... C=10.0, penalty=l2, score=0.9244806770109356, total=  11.8s
[CV] C=100.0, penalty=l2 .............................................
[CV] .... C=100.0, penalty=l2, score=0.9313696395995623, total=  11.3s
[CV] C=100.0, penalty=l2 .............................................
[CV] .... C=100.0, penalty=l2, score=0.9329905239114834, total=  11.7s
[CV] .... C=100.0, penalty=l2, score=0.9235701251784632, total=   9.2s


[Parallel(n_jobs=2)]: Done  15 out of  15 | elapsed:  1.8min finished


Completed calculation for insult
Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] C=0.01, penalty=l2 ..............................................
[CV] C=0.01, penalty=l2 ..............................................
[CV] ..... C=0.01, penalty=l2, score=0.8826133036066639, total=  14.3s
[CV] C=0.01, penalty=l2 ..............................................


[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:   19.0s


[CV] ..... C=0.01, penalty=l2, score=0.9046365438194276, total=  12.4s
[CV] C=0.1, penalty=l2 ...............................................
[CV] ..... C=0.01, penalty=l2, score=0.8643686023480037, total=  13.8s
[CV] C=0.1, penalty=l2 ...............................................
[CV] ...... C=0.1, penalty=l2, score=0.8933829527353883, total=  13.8s


[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:   37.4s


[CV] C=0.1, penalty=l2 ...............................................
[CV] ...... C=0.1, penalty=l2, score=0.9154300155921871, total=  16.0s
[CV] C=1.0, penalty=l2 ...............................................
[CV] ...... C=0.1, penalty=l2, score=0.8666108505928694, total=  14.5s
[CV] C=1.0, penalty=l2 ...............................................
[CV] ...... C=1.0, penalty=l2, score=0.8958248151849961, total=  11.8s
[CV] C=1.0, penalty=l2 ...............................................
[CV] ...... C=1.0, penalty=l2, score=0.9154217882737287, total=  12.3s
[CV] C=10.0, penalty=l2 ..............................................
[CV] ...... C=1.0, penalty=l2, score=0.8613375447476961, total=  12.3s
[CV] C=10.0, penalty=l2 ..............................................


[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:  1.3min


[CV] ...... C=10.0, penalty=l2, score=0.895865500085778, total=  13.0s
[CV] C=10.0, penalty=l2 ..............................................
[CV] ..... C=10.0, penalty=l2, score=0.9154137230699199, total=  13.1s
[CV] C=100.0, penalty=l2 .............................................
[CV] ..... C=10.0, penalty=l2, score=0.8626577659239547, total=  13.0s
[CV] C=100.0, penalty=l2 .............................................
[CV] .... C=100.0, penalty=l2, score=0.8958683310430888, total=  12.3s
[CV] C=100.0, penalty=l2 .............................................
[CV] ..... C=100.0, penalty=l2, score=0.915461871120799, total=  11.9s
[CV] .... C=100.0, penalty=l2, score=0.8614776928621245, total=   9.7s


[Parallel(n_jobs=2)]: Done  15 out of  15 | elapsed:  2.0min finished


Completed calculation for identity_hate


In [13]:
for a,b in zip(output_names, gs_objs):
    print(a,b.best_score_)

toxic 0.935674230897
severe_toxic 0.952322399807
obscene 0.953171741693
threat 0.925945811499
insult 0.929480784076
identity_hate 0.89180794951


#### Submission

In [14]:
test_mat = bow.transform(test['comment_text'])

In [18]:
pred = [model.predict_proba(test_mat)[:,1] for model in final_models]

In [19]:
for count,i in enumerate(output_names):
    test[i] = pred[count].flatten()

In [20]:
test[['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].to_csv('data/answers/linear_baseline1.csv', index = False)