In [37]:
import numpy as np
import pandas as pd
import nltk
import string
import re
import matplotlib.pyplot as plt
import unicodedata
import sys
import collections
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

In [2]:
# Base list: the English stopwords shipped with NLTK's stopwords corpus.
stopwords = nltk.corpus.stopwords.words('english')
# Every code point whose Unicode general category starts with 'P' (punctuation).
unicodePuncs = [ch for ch in map(chr, range(sys.maxunicode))
                if unicodedata.category(ch).startswith('P')]
# Append, in order: ASCII punctuation characters, a few multi-character
# punctuation tokens, then the Unicode punctuation characters.
stopwords.extend(string.punctuation)
stopwords.extend(['--', '\'\'', '``', '...'])
stopwords.extend(unicodePuncs)

In [3]:
class BOWExtractor():
    """Bag-of-n-grams count featurizer for a pandas Series of strings.

    ``fit`` learns the most frequent n-grams for every order listed in
    ``ngrams_to_take``; ``transform`` returns, for each document, how often
    each learned n-gram occurs (or a 0/1 indicator when ``binary`` is set).

    Parameters
    ----------
    ngrams_to_take : dict[int, int], optional
        Maps n-gram order ``n`` to the number of most common n-grams to keep.
        ``None`` (the default) means ``{1: 100, 2: 50}``.
    stopwords : iterable of str, optional
        Lower-cased tokens to drop. ``None`` (the default) means no stopwords.
    tokenizer : callable, optional
        ``str -> list[str]``. ``None`` (the default) means
        ``nltk.word_tokenize``, resolved when the extractor is constructed.
    unicodeNormalize : bool
        If true, NFKD-normalize each text and drop non-ASCII characters.
    binary : bool
        If true, ``transform`` returns presence indicators instead of counts.
    """

    def __init__(self, ngrams_to_take=None, stopwords=None, tokenizer=None, unicodeNormalize=True, binary=False):
        # None sentinels avoid shared mutable defaults and avoid needing nltk
        # at class-definition time; the effective defaults are unchanged.
        self.ngrams_to_take = {1: 100, 2: 50} if ngrams_to_take is None else ngrams_to_take
        self.stopwords = [] if stopwords is None else stopwords
        self.tokenizer = nltk.word_tokenize if tokenizer is None else tokenizer
        self.unicodeNormalize = unicodeNormalize
        self.binary = binary

    def hasnum(self, s):
        """Return True if any character of ``s`` is a digit."""
        return any(ch.isdigit() for ch in s)

    def hasPunc(self, s):
        """Return True if any character of ``s`` is ASCII punctuation."""
        return any(ch in string.punctuation for ch in s)

    def preprocess(self, series):
        """Tokenize every text in ``series`` and return a Series of token lists.

        Tokens are lower-cased; stopwords and tokens containing a digit or an
        ASCII punctuation character are dropped.
        """
        if self.unicodeNormalize:
            series = series.apply(
                lambda text: unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8'))
        try:
            # Set membership is O(1); the stopword list can hold hundreds of entries.
            stop = frozenset(self.stopwords)
        except TypeError:
            # Unhashable entries: fall back to the container as given.
            stop = self.stopwords

        def clean(text):
            lowered = (tok.lower() for tok in self.tokenizer(text))
            return [tok for tok in lowered
                    if tok not in stop and not self.hasPunc(tok) and not self.hasnum(tok)]

        return series.apply(clean)

    def getngrams(self, lst, n=2):
        """Return all contiguous length-``n`` windows of ``lst`` as lists."""
        return [list(lst[start:start + n]) for start in range(len(lst) - n + 1)]

    def flatten(self, x):
        """Recursively flatten nested arrays/lists/tuples into one flat list."""
        if isinstance(x, (np.ndarray, list, tuple)):
            flat = []
            for item in x:
                flat += self.flatten(item)
            return flat
        return [x]

    def fit(self, series):
        """Learn the most common n-grams from a pandas Series of strings.

        Sets ``most_common_ngrams`` (``{n: {ngram: count}}``; unigrams are
        ``str`` keys, higher orders are ``tuple`` keys) and ``features`` (the
        column order used by ``transform``). Returns ``self``.
        """
        if len(series) > 0:  # an empty series has no first element to inspect
            assert isinstance(series.iloc[0], str), "fit expects a pandas Series of strings"
        toks = self.preprocess(series)
        self.most_common_ngrams = {}
        for n, top_k in self.ngrams_to_take.items():
            if n == 1:
                counts = collections.Counter(tok for doc in toks for tok in doc)
            else:
                # Pass the order explicitly; the previous code always used the
                # getngrams default (n=2), so e.g. trigrams were never counted.
                counts = collections.Counter(
                    tuple(gram) for doc in toks for gram in self.getngrams(doc, n))
            self.most_common_ngrams[n] = dict(counts.most_common(top_k))
        self.features = [feat for grams in self.most_common_ngrams.values() for feat in grams]
        return self

    def transform(self, series):
        """Count learned n-grams in each text of a pandas Series of strings.

        Returns a 2-D numpy array of shape ``(len(series), len(self.features))``
        with columns in the order of ``self.features``; dtype is int16 for
        counts, or the platform default integer type when ``binary`` is set.
        """
        toks = self.preprocess(series)
        column_of = {feat: col for col, feat in enumerate(self.features)}
        ans = np.zeros(shape=(len(series), len(self.features)), dtype='int16')
        max_count = int(np.iinfo(ans.dtype).max)  # saturate rather than wrap around
        for row, doc in enumerate(toks):
            for n in self.ngrams_to_take:
                # n-gram features are tuples, so windows must be tuples too;
                # comparing list windows with tuple features never matched.
                items = doc if n == 1 else map(tuple, self.getngrams(doc, n))
                for item, cnt in collections.Counter(items).items():
                    col = column_of.get(item)
                    if col is not None:
                        ans[row, col] = min(cnt, max_count)
        if self.binary:
            ans = (ans > 0).astype(int)
        return ans

#### Load Data

In [4]:
# Both paths are relative to the notebook's working directory.
train = pd.read_csv(filepath_or_buffer='data/train.csv')
test = pd.read_csv(filepath_or_buffer='data/test.csv')

In [5]:
# Display the first rows of the training frame (ids, comment text, label columns).
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [6]:
# Label columns of the training frame; each one is modelled as its own binary target.
output_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [7]:
# Class balance per label (rows: label value, columns: label name).
# The top-level pd.value_counts function is deprecated in recent pandas;
# the Series method gives the same table.
train[output_names].apply(pd.Series.value_counts)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,144277,157976,151122,159093,151694,158166
1,15294,1595,8449,478,7877,1405


In [8]:
#traindf, valdf = train_test_split(train, test_size = 0.1, random_state = 42, stratify = train.threat)

#### Feature Extraction

In [9]:
# bow= BOWExtractor(ngrams_to_take = {1:2400, 2:1600, 3:400},stopwords = stopwords).fit(train['comment_text'])

In [10]:
# train_mat = bow.transform(train['comment_text']).astype('int16')

In [25]:
# tf = TfidfVectorizer(input = 'content', analyzer = 'word', 
#                      tokenizer = nltk.word_tokenize, ngram_range = (1, 4), 
#                      stop_words = stopwords, max_features = 200000, dtype='int16').fit(train['comment_text'].values)

In [26]:
# train_mat = tf.transform(train['comment_text'].values)

In [27]:
# train_mat.shape

(159571, 100000)

#### Model

In [40]:
# Word-level TF-IDF over 1- to 4-grams, capped at the 100000 most frequent terms.
# TF-IDF weights are fractional, so an integer dtype is not usable for the
# output: scikit-learn converts a non-float dtype to float64 (and newer
# releases warn about it). Request float64 explicitly to get the same matrix
# without relying on that conversion.
tf = TfidfVectorizer(input = 'content', analyzer = 'word', 
                     tokenizer = nltk.word_tokenize, ngram_range = (1, 4), 
                     stop_words = stopwords, max_features = 100000, dtype=np.float64)

In [41]:
# Raw text -> TF-IDF features -> logistic regression. Step names are the
# prefixes used by the grid search parameters ('lr__C', 'lr__penalty').
classifier = LogisticRegression(random_state = 10)
pipe = Pipeline([('tfidf', tf), ('lr', classifier)])

In [44]:
# The search space is identical for every label, so build it once.
param_grid = {
    'lr__C': 10**np.linspace(-2,2,5),
    'lr__penalty': ['l2'],
}

# One independent grid search per label; results are stored in the same
# order as output_names.
final_models = []
gs_objs = []
for label in output_names:
    gs = GridSearchCV(
        estimator = pipe,
        param_grid = param_grid,
        scoring = 'roc_auc',
        n_jobs = 2,
        cv = 3,
        refit = True,
        verbose = 10,
        return_train_score = True,
    )
    gs.fit(train['comment_text'].values, train[label])
    final_models.append(gs.best_estimator_)
    gs_objs.append(gs)
    print('Completed calculation for', label)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] lr__C=0.01, lr__penalty=l2 ......................................
[CV] lr__C=0.01, lr__penalty=l2 ......................................
[CV]  lr__C=0.01, lr__penalty=l2, score=0.9419268169262449, total= 3.0min
[CV]  lr__C=0.01, lr__penalty=l2, score=0.9356197359826907, total= 3.0min


[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:  4.3min


[CV] lr__C=0.01, lr__penalty=l2 ......................................
[CV] lr__C=0.1, lr__penalty=l2 .......................................
[CV]  lr__C=0.1, lr__penalty=l2, score=0.9566178468115633, total= 2.8min
[CV] lr__C=0.1, lr__penalty=l2 .......................................
[CV]  lr__C=0.01, lr__penalty=l2, score=0.93669610851465, total= 2.8min


[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:  8.5min


[CV] lr__C=0.1, lr__penalty=l2 .......................................
[CV]  lr__C=0.1, lr__penalty=l2, score=0.9508638360919786, total= 2.9min
[CV] lr__C=1.0, lr__penalty=l2 .......................................
[CV]  lr__C=0.1, lr__penalty=l2, score=0.9518373220974693, total= 2.9min
[CV] lr__C=1.0, lr__penalty=l2 .......................................
[CV]  lr__C=1.0, lr__penalty=l2, score=0.9689821274177841, total= 2.9min
[CV] lr__C=1.0, lr__penalty=l2 .......................................
[CV]  lr__C=1.0, lr__penalty=l2, score=0.9649394144582369, total= 2.9min
[CV] lr__C=10.0, lr__penalty=l2 ......................................
[CV]  lr__C=1.0, lr__penalty=l2, score=0.9660830007491525, total= 2.9min


[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed: 21.2min


[CV] lr__C=10.0, lr__penalty=l2 ......................................
[CV]  lr__C=10.0, lr__penalty=l2, score=0.9660302040662568, total= 2.9min
[CV] lr__C=10.0, lr__penalty=l2 ......................................
[CV]  lr__C=10.0, lr__penalty=l2, score=0.9644823372405714, total= 2.8min
[CV] lr__C=100.0, lr__penalty=l2 .....................................
[CV]  lr__C=10.0, lr__penalty=l2, score=0.9645844467647287, total= 2.9min
[CV] lr__C=100.0, lr__penalty=l2 .....................................
[CV]  lr__C=100.0, lr__penalty=l2, score=0.9544443045189588, total= 2.9min
[CV] lr__C=100.0, lr__penalty=l2 .....................................
[CV]  lr__C=100.0, lr__penalty=l2, score=0.9546502315736085, total= 2.9min
[CV]  lr__C=100.0, lr__penalty=l2, score=0.9534901630446966, total= 2.7min


[Parallel(n_jobs=2)]: Done  15 out of  15 | elapsed: 33.5min finished


Completed calculation for toxic
Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] lr__C=0.01, lr__penalty=l2 ......................................
[CV] lr__C=0.01, lr__penalty=l2 ......................................
[CV]  lr__C=0.01, lr__penalty=l2, score=0.9792203262100446, total= 2.8min


[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:  4.1min


[CV]  lr__C=0.01, lr__penalty=l2, score=0.9766206627775501, total= 2.8min
[CV] lr__C=0.01, lr__penalty=l2 ......................................
[CV] lr__C=0.1, lr__penalty=l2 .......................................
[CV]  lr__C=0.01, lr__penalty=l2, score=0.9825358696299805, total= 2.9min
[CV]  lr__C=0.1, lr__penalty=l2, score=0.9835815183146724, total= 2.9min


[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:  8.3min


[CV] lr__C=0.1, lr__penalty=l2 .......................................
[CV] lr__C=0.1, lr__penalty=l2 .......................................
[CV]  lr__C=0.1, lr__penalty=l2, score=0.9810698090580522, total= 2.8min
[CV] lr__C=1.0, lr__penalty=l2 .......................................
[CV]  lr__C=0.1, lr__penalty=l2, score=0.9844761159653033, total= 2.8min
[CV] lr__C=1.0, lr__penalty=l2 .......................................
[CV]  lr__C=1.0, lr__penalty=l2, score=0.9858571541369804, total= 2.8min
[CV] lr__C=1.0, lr__penalty=l2 .......................................
[CV]  lr__C=1.0, lr__penalty=l2, score=0.981896663980923, total= 2.8min
[CV] lr__C=10.0, lr__penalty=l2 ......................................
[CV]  lr__C=1.0, lr__penalty=l2, score=0.985450387709513, total= 2.9min


[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed: 21.0min


[CV] lr__C=10.0, lr__penalty=l2 ......................................
[CV]  lr__C=10.0, lr__penalty=l2, score=0.9813480391002002, total= 3.0min
[CV] lr__C=10.0, lr__penalty=l2 ......................................
[CV]  lr__C=10.0, lr__penalty=l2, score=0.9754322105325983, total= 2.9min
[CV] lr__C=100.0, lr__penalty=l2 .....................................
[CV]  lr__C=10.0, lr__penalty=l2, score=0.9808930690804515, total= 2.9min
[CV] lr__C=100.0, lr__penalty=l2 .....................................
[CV]  lr__C=100.0, lr__penalty=l2, score=0.9655424166866206, total= 3.0min
[CV] lr__C=100.0, lr__penalty=l2 .....................................
[CV]  lr__C=100.0, lr__penalty=l2, score=0.9594174827771873, total= 3.0min
[CV]  lr__C=100.0, lr__penalty=l2, score=0.9675236731725645, total= 2.8min


[Parallel(n_jobs=2)]: Done  15 out of  15 | elapsed: 33.7min finished


Completed calculation for severe_toxic
Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] lr__C=0.01, lr__penalty=l2 ......................................
[CV] lr__C=0.01, lr__penalty=l2 ......................................
[CV]  lr__C=0.01, lr__penalty=l2, score=0.9678850371038618, total= 2.9min


[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:  4.2min


[CV]  lr__C=0.01, lr__penalty=l2, score=0.966516190429677, total= 2.9min
[CV] lr__C=0.01, lr__penalty=l2 ......................................
[CV] lr__C=0.1, lr__penalty=l2 .......................................
[CV]  lr__C=0.01, lr__penalty=l2, score=0.9682195924484853, total= 2.9min
[CV]  lr__C=0.1, lr__penalty=l2, score=0.9760739931552668, total= 2.9min


[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:  8.5min


[CV] lr__C=0.1, lr__penalty=l2 .......................................
[CV] lr__C=0.1, lr__penalty=l2 .......................................
[CV]  lr__C=0.1, lr__penalty=l2, score=0.9746049055902756, total= 2.9min
[CV]  lr__C=0.1, lr__penalty=l2, score=0.9747484272189476, total= 2.9min
[CV] lr__C=1.0, lr__penalty=l2 .......................................
[CV] lr__C=1.0, lr__penalty=l2 .......................................
[CV]  lr__C=1.0, lr__penalty=l2, score=0.9818800632187108, total= 2.9min
[CV]  lr__C=1.0, lr__penalty=l2, score=0.98168962495759, total= 2.9min
[CV] lr__C=1.0, lr__penalty=l2 .......................................
[CV] lr__C=10.0, lr__penalty=l2 ......................................
[CV]  lr__C=1.0, lr__penalty=l2, score=0.9812191949107044, total= 2.9min


[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed: 21.3min


[CV]  lr__C=10.0, lr__penalty=l2, score=0.9790409871188712, total= 2.9min
[CV] lr__C=10.0, lr__penalty=l2 ......................................
[CV] lr__C=10.0, lr__penalty=l2 ......................................
[CV]  lr__C=10.0, lr__penalty=l2, score=0.9799273451627283, total= 2.9min
[CV] lr__C=100.0, lr__penalty=l2 .....................................
[CV]  lr__C=10.0, lr__penalty=l2, score=0.9786732598120603, total= 2.9min
[CV] lr__C=100.0, lr__penalty=l2 .....................................
[CV]  lr__C=100.0, lr__penalty=l2, score=0.968988554184103, total= 3.0min
[CV] lr__C=100.0, lr__penalty=l2 .....................................
[CV]  lr__C=100.0, lr__penalty=l2, score=0.9713249157664308, total= 3.0min
[CV]  lr__C=100.0, lr__penalty=l2, score=0.9693199484334452, total= 2.7min


[Parallel(n_jobs=2)]: Done  15 out of  15 | elapsed: 34.0min finished


Completed calculation for obscene
Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] lr__C=0.01, lr__penalty=l2 ......................................
[CV] lr__C=0.01, lr__penalty=l2 ......................................
[CV]  lr__C=0.01, lr__penalty=l2, score=0.9624854448341535, total= 2.9min


[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:  4.2min


[CV]  lr__C=0.01, lr__penalty=l2, score=0.9492669471007168, total= 2.9min
[CV] lr__C=0.01, lr__penalty=l2 ......................................
[CV] lr__C=0.1, lr__penalty=l2 .......................................
[CV]  lr__C=0.01, lr__penalty=l2, score=0.9407525846102358, total= 2.9min
[CV]  lr__C=0.1, lr__penalty=l2, score=0.9787338419980766, total= 2.9min


[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:  8.5min


[CV] lr__C=0.1, lr__penalty=l2 .......................................
[CV] lr__C=0.1, lr__penalty=l2 .......................................
[CV]  lr__C=0.1, lr__penalty=l2, score=0.9741450028813099, total= 2.9min
[CV] lr__C=1.0, lr__penalty=l2 .......................................
[CV]  lr__C=0.1, lr__penalty=l2, score=0.9746426944534281, total= 2.9min
[CV] lr__C=1.0, lr__penalty=l2 .......................................
[CV]  lr__C=1.0, lr__penalty=l2, score=0.9845026376081916, total= 2.9min
[CV] lr__C=1.0, lr__penalty=l2 .......................................
[CV]  lr__C=1.0, lr__penalty=l2, score=0.9778570834740188, total= 2.9min
[CV] lr__C=10.0, lr__penalty=l2 ......................................
[CV]  lr__C=1.0, lr__penalty=l2, score=0.9831433590107317, total= 2.8min


[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed: 21.1min


[CV] lr__C=10.0, lr__penalty=l2 ......................................
[CV]  lr__C=10.0, lr__penalty=l2, score=0.9861732406516942, total= 2.8min
[CV] lr__C=10.0, lr__penalty=l2 ......................................
[CV]  lr__C=10.0, lr__penalty=l2, score=0.9751831994790279, total= 2.8min
[CV] lr__C=100.0, lr__penalty=l2 .....................................
[CV]  lr__C=10.0, lr__penalty=l2, score=0.9849271738412408, total= 2.8min
[CV] lr__C=100.0, lr__penalty=l2 .....................................
[CV]  lr__C=100.0, lr__penalty=l2, score=0.9796454550168769, total= 2.8min
[CV] lr__C=100.0, lr__penalty=l2 .....................................
[CV]  lr__C=100.0, lr__penalty=l2, score=0.9638513915380454, total= 2.8min
[CV]  lr__C=100.0, lr__penalty=l2, score=0.9793437539618752, total= 2.6min


[Parallel(n_jobs=2)]: Done  15 out of  15 | elapsed: 33.4min finished


Completed calculation for threat
Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] lr__C=0.01, lr__penalty=l2 ......................................
[CV] lr__C=0.01, lr__penalty=l2 ......................................
[CV]  lr__C=0.01, lr__penalty=l2, score=0.9568362801184394, total= 2.9min
[CV]  lr__C=0.01, lr__penalty=l2, score=0.9607439814332619, total= 2.9min


[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:  4.2min


[CV] lr__C=0.01, lr__penalty=l2 ......................................
[CV] lr__C=0.1, lr__penalty=l2 .......................................
[CV]  lr__C=0.1, lr__penalty=l2, score=0.9689370659905596, total= 2.8min
[CV] lr__C=0.1, lr__penalty=l2 .......................................
[CV]  lr__C=0.01, lr__penalty=l2, score=0.9580298951635079, total= 2.8min


[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:  8.4min


[CV] lr__C=0.1, lr__penalty=l2 .......................................
[CV]  lr__C=0.1, lr__penalty=l2, score=0.9650742722995574, total= 2.8min
[CV] lr__C=1.0, lr__penalty=l2 .......................................
[CV]  lr__C=0.1, lr__penalty=l2, score=0.9669659799367892, total= 2.8min
[CV] lr__C=1.0, lr__penalty=l2 .......................................
[CV]  lr__C=1.0, lr__penalty=l2, score=0.9760583472262294, total= 2.8min
[CV] lr__C=1.0, lr__penalty=l2 .......................................
[CV]  lr__C=1.0, lr__penalty=l2, score=0.9728641597473303, total= 2.8min
[CV] lr__C=10.0, lr__penalty=l2 ......................................
[CV]  lr__C=1.0, lr__penalty=l2, score=0.9744750151623026, total= 2.8min


[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed: 20.8min


[CV] lr__C=10.0, lr__penalty=l2 ......................................
[CV]  lr__C=10.0, lr__penalty=l2, score=0.9710713717927254, total= 2.9min
[CV] lr__C=10.0, lr__penalty=l2 ......................................
[CV]  lr__C=10.0, lr__penalty=l2, score=0.9689168827888425, total= 2.9min
[CV] lr__C=100.0, lr__penalty=l2 .....................................
[CV]  lr__C=10.0, lr__penalty=l2, score=0.9704982803500326, total= 2.8min
[CV] lr__C=100.0, lr__penalty=l2 .....................................
[CV]  lr__C=100.0, lr__penalty=l2, score=0.9550570179214029, total= 2.9min
[CV] lr__C=100.0, lr__penalty=l2 .....................................
[CV]  lr__C=100.0, lr__penalty=l2, score=0.9544310073021769, total= 2.9min
[CV]  lr__C=100.0, lr__penalty=l2, score=0.9539012585652884, total= 2.8min


[Parallel(n_jobs=2)]: Done  15 out of  15 | elapsed: 33.4min finished


Completed calculation for insult
Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] lr__C=0.01, lr__penalty=l2 ......................................
[CV] lr__C=0.01, lr__penalty=l2 ......................................
[CV]  lr__C=0.01, lr__penalty=l2, score=0.9532056102456065, total= 2.9min


[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:  4.2min


[CV] lr__C=0.01, lr__penalty=l2 ......................................
[CV]  lr__C=0.01, lr__penalty=l2, score=0.9468457474247277, total= 2.9min
[CV] lr__C=0.1, lr__penalty=l2 .......................................
[CV]  lr__C=0.01, lr__penalty=l2, score=0.9568186556350892, total= 2.8min
[CV] lr__C=0.1, lr__penalty=l2 .......................................
[CV]  lr__C=0.1, lr__penalty=l2, score=0.9646360856951808, total= 2.8min


[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:  8.4min


[CV] lr__C=0.1, lr__penalty=l2 .......................................
[CV]  lr__C=0.1, lr__penalty=l2, score=0.9619614794518062, total= 2.8min
[CV] lr__C=1.0, lr__penalty=l2 .......................................
[CV]  lr__C=0.1, lr__penalty=l2, score=0.9671131790455793, total= 2.8min
[CV] lr__C=1.0, lr__penalty=l2 .......................................
[CV]  lr__C=1.0, lr__penalty=l2, score=0.9716598525524194, total= 2.8min
[CV] lr__C=1.0, lr__penalty=l2 .......................................
[CV]  lr__C=1.0, lr__penalty=l2, score=0.9701293018338086, total= 2.8min
[CV] lr__C=10.0, lr__penalty=l2 ......................................
[CV]  lr__C=1.0, lr__penalty=l2, score=0.9745540793395578, total= 2.8min


[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed: 20.8min


[CV] lr__C=10.0, lr__penalty=l2 ......................................
[CV]  lr__C=10.0, lr__penalty=l2, score=0.9665712067861445, total= 2.9min
[CV] lr__C=10.0, lr__penalty=l2 ......................................
[CV]  lr__C=10.0, lr__penalty=l2, score=0.9662059246744008, total= 2.8min
[CV] lr__C=100.0, lr__penalty=l2 .....................................
[CV]  lr__C=10.0, lr__penalty=l2, score=0.96982592453174, total= 2.8min
[CV] lr__C=100.0, lr__penalty=l2 .....................................
[CV]  lr__C=100.0, lr__penalty=l2, score=0.9522193856029968, total= 2.9min
[CV] lr__C=100.0, lr__penalty=l2 .....................................
[CV]  lr__C=100.0, lr__penalty=l2, score=0.952219685938532, total= 2.9min
[CV]  lr__C=100.0, lr__penalty=l2, score=0.9546624497404057, total= 2.7min


[Parallel(n_jobs=2)]: Done  15 out of  15 | elapsed: 33.1min finished


Completed calculation for identity_hate


In [45]:
# Best mean cross-validated ROC AUC found for each label.
for label, search in zip(output_names, gs_objs):
    print(label, search.best_score_)

toxic 0.966668195376
severe_toxic 0.984401388795
obscene 0.981596296141
threat 0.982094563551
insult 0.974465840597
identity_hate 0.972114408393


#### Submission

In [46]:
# test_mat = bow.transform(test['comment_text'])

In [47]:
# test_mat = tf.transform(test['comment_text'].values)

In [48]:
# Positive-class probability (column 1 of predict_proba) from each fitted
# pipeline, in the same order as final_models.
test_texts = test['comment_text'].values
pred = []
for model in final_models:
    pred.append(model.predict_proba(test_texts)[:,1])

In [49]:
# Write each label's predicted probabilities into a same-named column of `test`;
# pred is indexed in output_names order.
for position, label in enumerate(output_names):
    label_scores = pred[position]
    test[label] = label_scores.flatten()

In [50]:
# Submission file: the id column followed by one probability column per label.
submission_columns = ['id', *output_names]
test[submission_columns].to_csv('data/answers/linear_baseline1.csv', index = False)

#### Save model

In [54]:
import os

# sklearn.externals.joblib was deprecated and later removed from scikit-learn;
# prefer the standalone joblib package and fall back only for old environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# joblib.dump does not create missing directories.
os.makedirs('weights/logit', exist_ok=True)

# One file per label model, named by its position in final_models
# (which follows the order in which the models were appended).
for count, model in enumerate(final_models):
    joblib.dump(model, 'weights/logit/'+str(count))