In [21]:
import numpy as np
import pandas as pd
import nltk
import string
import re
import matplotlib.pyplot as plt
import unicodedata
import sys
import collections
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.externals import joblib

In [2]:
stopwords = nltk.corpus.stopwords.words('english')
unicodePuncs = [chr(i) for i in range(sys.maxunicode)
                      if unicodedata.category(chr(i)).startswith('P')]
stopwords += ([i for i in string.punctuation] + ['--', '\'\'', '``', '...'])
stopwords += unicodePuncs

In [3]:
class BOWExtractor():      
    def __init__(self, ngrams_to_take = {1: 100, 2: 50}, stopwords=[], tokenizer = nltk.word_tokenize, unicodeNormalize = True, binary = False):
        self.ngrams_to_take = ngrams_to_take
        self.stopwords = stopwords
        self.tokenizer = tokenizer
        self.unicodeNormalize = unicodeNormalize
        self.binary = binary
    
    def hasnum(self,s):
        return any(i.isdigit() for i in s)
    
    def hasPunc(self, s):
        return any(i in string.punctuation for i in s)
    
    def preprocess(self, series):
        if self.unicodeNormalize:
            series = series.apply(lambda x: unicodedata.normalize('NFKD', x).encode('ascii','ignore').decode('utf-8'))
        toks = series.apply(lambda x: list(filter(lambda z: z not in self.stopwords, map(lambda y: y.lower(),self.tokenizer(x)))))
        toks = toks.apply(lambda x: [i for i in x if not self.hasPunc(i) and not self.hasnum(i)])
        return toks
    
    def getngrams(self, lst, n=2):
        ans = []
        for i in range(len(lst) - n+1):
            ans.append([lst[i+j] for j in range(n)])
        return ans
    
    def flatten(self, x):
        if isinstance(x, (np.ndarray, list, tuple)):
            lst = []
            for i in x:
                lst += self.flatten(i)
            return lst
        else:
            return [x]
    
    def fit(self, series): #input is a pandas series of strings
        assert(isinstance(series.iloc[0], str))
        toks = self.preprocess(series)
        self.most_common_ngrams = {}
        for i in self.ngrams_to_take.keys():
            if i == 1:
                self.most_common_ngrams[i] = {i[0]: i[1] for i in nltk.FreqDist(self.flatten(toks.values)).most_common(self.ngrams_to_take[i])}
            else:
                ngrams = [j for i in toks for j in self.getngrams(i)]
                self.most_common_ngrams[i] = {i[0]: i[1] for i in collections.Counter(map(tuple, ngrams)).most_common(self.ngrams_to_take[i])}
        self.features = [j for i in self.most_common_ngrams.keys() for j in self.most_common_ngrams[i].keys()]
        return self
            
    def transform(self, series): #input is a pandas series of strings, output is a 2d numpy array, in the order of self.features
        ngrams = {}
        toks = self.preprocess(series)
        for i in self.ngrams_to_take.keys():
            if i == 1:
                ngrams[1] = toks
            else:
                ngrams[i] = toks.apply(lambda x: self.getngrams(x, i))
        ans = np.zeros(shape = (len(series), len(self.features),), dtype='int16')
        # loop through each feature
        for count, i in enumerate(self.features):
            if isinstance(i, str):
                ans[:, count] = ngrams[1].apply(lambda x: x.count(i)).values
            else: #tuple
                ans[:, count] = ngrams[len(i)].apply(lambda x: x.count(i)).values
        if self.binary:
            ans = (ans>0).astype(int)
        return ans

#### Load Data

In [4]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [5]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [27]:
output_names = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

In [7]:
train[output_names].apply(pd.value_counts)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,144277,157976,151122,159093,151694,158166
1,15294,1595,8449,478,7877,1405


In [8]:
#traindf, valdf = train_test_split(train, test_size = 0.1, random_state = 42, stratify = train.threat)

#### Feature Extraction

In [9]:
# bow= BOWExtractor(ngrams_to_take = {1:2400, 2:1600, 3:400},stopwords = stopwords).fit(train['comment_text'])

In [10]:
# train_mat = bow.transform(train['comment_text']).astype('int16')

In [11]:
# tf = TfidfVectorizer(input = 'content', analyzer = 'word', 
#                      tokenizer = nltk.word_tokenize, ngram_range = (1, 4), 
#                      stop_words = stopwords, max_features = 200000, dtype='int16').fit(train['comment_text'].values)

In [12]:
# train_mat = tf.transform(train['comment_text'].values)

In [13]:
# train_mat.shape

#### Model

In [15]:
tf = TfidfVectorizer(input = 'content', analyzer = 'word', 
                     tokenizer = nltk.word_tokenize, ngram_range = (1, 4), 
                     stop_words = stopwords, max_features = 100000, dtype='int16')

In [16]:
pipe = Pipeline(steps = [
    ('tfidf', tf),
    ('svm', SVC(kernel='linear', random_state = 10, probability=True))    
])

In [18]:
final_models = []
gs_objs = []
output_names = ['severe_toxic','obscene','threat','insult','identity_hate']
for i in output_names:
    param_grid = {'svm__C': 10**np.linspace(-1,1,3),
    }
    gs = GridSearchCV(estimator = pipe, param_grid = param_grid, scoring = 'roc_auc', 
                      n_jobs = 4, cv = 3, refit = True, verbose = 10, return_train_score = True).fit(train['comment_text'].values, train[i])
    final_models.append(gs.best_estimator_)
    gs_objs.append(gs)
    print('Completed calculation for', i)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] svm__C=0.1 ......................................................
[CV] svm__C=0.1 ......................................................
[CV] svm__C=0.1 ......................................................
[CV] svm__C=1.0 ......................................................
[CV] ............. svm__C=0.1, score=0.9353129697998771, total=23.8min
[CV] svm__C=1.0 ......................................................
[CV] ............. svm__C=0.1, score=0.9371224020856562, total=25.5min
[CV] svm__C=1.0 ......................................................
[CV] ............. svm__C=1.0, score=0.9360261518034818, total=26.4min


[Parallel(n_jobs=4)]: Done   3 out of   9 | elapsed: 29.6min remaining: 59.2min


[CV] svm__C=10.0 .....................................................
[CV] ............. svm__C=0.1, score=0.9446079019368059, total=26.4min


[Parallel(n_jobs=4)]: Done   4 out of   9 | elapsed: 29.6min remaining: 37.0min


[CV] svm__C=10.0 .....................................................
[CV] ............. svm__C=1.0, score=0.9347505842313297, total=24.1min


[Parallel(n_jobs=4)]: Done   5 out of   9 | elapsed: 53.7min remaining: 43.0min


[CV] svm__C=10.0 .....................................................
[CV] .............. svm__C=1.0, score=0.944916935126062, total=26.3min


[Parallel(n_jobs=4)]: Done   6 out of   9 | elapsed: 58.0min remaining: 29.0min


[CV] ............ svm__C=10.0, score=0.9192984205229078, total=31.6min


[Parallel(n_jobs=4)]: Done   7 out of   9 | elapsed: 64.2min remaining: 18.3min


[CV] ............ svm__C=10.0, score=0.9273358223222844, total=32.4min
[CV] ............ svm__C=10.0, score=0.9392404306823285, total=32.3min


[Parallel(n_jobs=4)]: Done   9 out of   9 | elapsed: 88.6min remaining:    0.0s
[Parallel(n_jobs=4)]: Done   9 out of   9 | elapsed: 88.6min finished


Completed calculation for severe_toxic
Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] svm__C=0.1 ......................................................
[CV] svm__C=0.1 ......................................................
[CV] svm__C=0.1 ......................................................
[CV] svm__C=1.0 ......................................................
[CV] ............. svm__C=0.1, score=0.9785381138928173, total=74.9min
[CV] svm__C=1.0 ......................................................
[CV] ............. svm__C=0.1, score=0.9779083351417759, total=76.6min
[CV] svm__C=1.0 ......................................................
[CV] ............. svm__C=0.1, score=0.9767456858775903, total=78.0min


[Parallel(n_jobs=4)]: Done   3 out of   9 | elapsed: 86.0min remaining: 171.9min


[CV] svm__C=10.0 .....................................................
[CV] .............. svm__C=1.0, score=0.973568213842813, total=89.1min


[Parallel(n_jobs=4)]: Done   4 out of   9 | elapsed: 96.7min remaining: 120.8min


[CV] svm__C=10.0 .....................................................
[CV] ............. svm__C=1.0, score=0.9730005637377869, total=81.9min


[Parallel(n_jobs=4)]: Done   5 out of   9 | elapsed: 173.8min remaining: 139.0min


[CV] svm__C=10.0 .....................................................
[CV] .............. svm__C=1.0, score=0.975131256130282, total=85.1min


[Parallel(n_jobs=4)]: Done   6 out of   9 | elapsed: 175.4min remaining: 87.7min


[CV] ........... svm__C=10.0, score=0.9578877472543711, total=120.0min


[Parallel(n_jobs=4)]: Done   7 out of   9 | elapsed: 213.2min remaining: 60.9min


[CV] ........... svm__C=10.0, score=0.9615232041601548, total=138.2min
[CV] ............ svm__C=10.0, score=0.958478626746933, total=125.7min


[Parallel(n_jobs=4)]: Done   9 out of   9 | elapsed: 305.1min remaining:    0.0s
[Parallel(n_jobs=4)]: Done   9 out of   9 | elapsed: 305.1min finished


Completed calculation for obscene
Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] svm__C=0.1 ......................................................
[CV] svm__C=0.1 ......................................................
[CV] svm__C=0.1 ......................................................
[CV] svm__C=1.0 ......................................................
[CV] ............. svm__C=0.1, score=0.9327357357966368, total=10.0min
[CV] svm__C=1.0 ......................................................
[CV] ............. svm__C=0.1, score=0.9643519238747148, total=11.2min
[CV] svm__C=1.0 ......................................................
[CV] ............. svm__C=0.1, score=0.9650860437747995, total=12.9min


[Parallel(n_jobs=4)]: Done   3 out of   9 | elapsed: 15.0min remaining: 30.1min


[CV] svm__C=10.0 .....................................................
[CV] ............. svm__C=1.0, score=0.9651716095302747, total=13.6min


[Parallel(n_jobs=4)]: Done   4 out of   9 | elapsed: 15.8min remaining: 19.8min


[CV] svm__C=10.0 .....................................................
[CV] ............. svm__C=1.0, score=0.9332707260699183, total=13.0min


[Parallel(n_jobs=4)]: Done   5 out of   9 | elapsed: 27.3min remaining: 21.8min


[CV] svm__C=10.0 .....................................................
[CV] ............. svm__C=1.0, score=0.9650639847655265, total=14.5min


[Parallel(n_jobs=4)]: Done   6 out of   9 | elapsed: 30.0min remaining: 15.0min


[CV] ............ svm__C=10.0, score=0.9534515778506911, total=15.3min


[Parallel(n_jobs=4)]: Done   7 out of   9 | elapsed: 32.5min remaining:  9.3min


[CV] ............ svm__C=10.0, score=0.9284033938141558, total=15.0min
[CV] ............ svm__C=10.0, score=0.9610206632432508, total=13.6min


[Parallel(n_jobs=4)]: Done   9 out of   9 | elapsed: 42.7min remaining:    0.0s
[Parallel(n_jobs=4)]: Done   9 out of   9 | elapsed: 42.7min finished


Completed calculation for threat
Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] svm__C=0.1 ......................................................
[CV] svm__C=0.1 ......................................................
[CV] svm__C=0.1 ......................................................
[CV] svm__C=1.0 ......................................................
[CV] ............. svm__C=0.1, score=0.9679747904279509, total=60.8min
[CV] svm__C=1.0 ......................................................
[CV] ............. svm__C=0.1, score=0.9667754510078693, total=61.6min
[CV] svm__C=1.0 ......................................................
[CV] ............. svm__C=0.1, score=0.9618376398486892, total=63.9min


[Parallel(n_jobs=4)]: Done   3 out of   9 | elapsed: 71.4min remaining: 142.8min


[CV] svm__C=10.0 .....................................................
[CV] ............. svm__C=1.0, score=0.9632630822354764, total=80.5min


[Parallel(n_jobs=4)]: Done   4 out of   9 | elapsed: 88.6min remaining: 110.7min


[CV] svm__C=10.0 .....................................................
[CV] ............. svm__C=1.0, score=0.9570361728914145, total=80.4min


[Parallel(n_jobs=4)]: Done   5 out of   9 | elapsed: 156.6min remaining: 125.3min


[CV] svm__C=10.0 .....................................................
[CV] ............. svm__C=1.0, score=0.9631608635543452, total=86.2min


[Parallel(n_jobs=4)]: Done   6 out of   9 | elapsed: 163.5min remaining: 81.7min


[CV] ........... svm__C=10.0, score=0.9402228504118241, total=149.6min


[Parallel(n_jobs=4)]: Done   7 out of   9 | elapsed: 229.1min remaining: 65.4min


[CV] ........... svm__C=10.0, score=0.9376114415859357, total=156.4min
[CV] ........... svm__C=10.0, score=0.9387510255743783, total=150.2min


[Parallel(n_jobs=4)]: Done   9 out of   9 | elapsed: 312.9min remaining:    0.0s
[Parallel(n_jobs=4)]: Done   9 out of   9 | elapsed: 312.9min finished


Completed calculation for insult
Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] svm__C=0.1 ......................................................
[CV] svm__C=0.1 ......................................................
[CV] svm__C=0.1 ......................................................
[CV] svm__C=1.0 ......................................................
[CV] ............. svm__C=0.1, score=0.9382748582923875, total=26.9min
[CV] svm__C=1.0 ......................................................
[CV] ............. svm__C=0.1, score=0.9431100787650235, total=26.9min
[CV] svm__C=1.0 ......................................................
[CV] ............. svm__C=0.1, score=0.9439159912159798, total=28.8min


[Parallel(n_jobs=4)]: Done   3 out of   9 | elapsed: 32.0min remaining: 64.1min


[CV] svm__C=10.0 .....................................................
[CV] ............. svm__C=1.0, score=0.9366083546079775, total=30.2min


[Parallel(n_jobs=4)]: Done   4 out of   9 | elapsed: 33.7min remaining: 42.1min


[CV] svm__C=10.0 .....................................................
[CV] ............. svm__C=1.0, score=0.9433020225099433, total=28.4min


[Parallel(n_jobs=4)]: Done   5 out of   9 | elapsed: 61.7min remaining: 49.3min


[CV] svm__C=10.0 .....................................................
[CV] ............. svm__C=1.0, score=0.9421698948556807, total=29.6min


[Parallel(n_jobs=4)]: Done   6 out of   9 | elapsed: 63.1min remaining: 31.6min


[CV] ............ svm__C=10.0, score=0.9296927813913134, total=37.7min


[Parallel(n_jobs=4)]: Done   7 out of   9 | elapsed: 74.7min remaining: 21.3min


[CV] ............ svm__C=10.0, score=0.9227244097838209, total=39.5min
[CV] ............ svm__C=10.0, score=0.9277373544899435, total=34.6min


[Parallel(n_jobs=4)]: Done   9 out of   9 | elapsed: 99.0min remaining:    0.0s
[Parallel(n_jobs=4)]: Done   9 out of   9 | elapsed: 99.0min finished


Completed calculation for identity_hate


In [20]:
for a,b in zip(output_names, final_models):
    print(a,b.best_score_)

severe_toxic 0.939014354501
obscene 0.977730716697
threat 0.954502173652
insult 0.965529278143
identity_hate 0.941766954207


#### Submission

In [18]:
# test_mat = bow.transform(test['comment_text'])

In [19]:
# test_mat = tf.transform(test['comment_text'].values)

In [28]:
pred = [model.predict_proba(test['comment_text'].values)[:,1] for model in final_models]

In [29]:
for count,i in enumerate(output_names):
    test[i] = pred[count].flatten()

In [30]:
test[['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].to_csv('data/answers/linear_baseline2.csv', index = False)

#### Save model

In [32]:
for count,i in enumerate(final_models):
    joblib.dump(i, 'weights/svm/'+str(count))

#### Ensembling

In [43]:
pred_svm = pd.read_csv('data/answers/linear_baseline2.csv')
pred_logit = pd.read_csv('data/answers/linear_baseline1.csv')
ensemble_pred = pred_svm[['id']]

In [44]:
weight = 0.75
for i in output_names:
    ensemble_pred[i] = weight*pred_logit[i]+(1-weight)*pred_svm[i]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [45]:
ensemble_pred.to_csv('data/answers/linear_ensemble_'+str(weight)+'.csv', index = False)