# Naive Bayes SVM Simple Implementation

Based on the kernel by Dmitri B

https://www.kaggle.com/onemoresunday/toxic-comments-nb-svm-strong-linear-baseline

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from helpers import clean

Using TensorFlow backend.


In [2]:
# Read the training set, the test set and the sample submission (in that
# order) from the local ./input directory.
train_df, test_df, subm = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./input/train.csv",
        "./input/test.csv",
        './input/sample_submission.csv',
    )
)


In [3]:
# Label columns that the notebook fits one model for.
cols_target = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
# Summary statistics of the training frame.
train_df.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [4]:
# Complement of the row-wise maximum over the label columns: 1 when every
# label in `cols_target` is 0 for that row, 0 otherwise (for 0/1 labels).
train_df['none'] = 1 - train_df[cols_target].max(axis=1)

In [5]:
# Name of the column holding the comment text.
COMMENT = 'comment_text'

# Run the project's `clean` helper (imported from `helpers`) on both frames.
train_df = clean(train_df)
test_df = clean(test_df)

# Replace missing comments with the placeholder "unknown".
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on the column selection: with pandas
# Copy-on-Write the in-place call acts on a temporary object and the
# DataFrame itself is left unchanged.
train_df[COMMENT] = train_df[COMMENT].fillna("unknown")
test_df[COMMENT] = test_df[COMMENT].fillna("unknown")

clean
Text is clean
clean
Text is clean


In [6]:
import re, string
# Matches one ASCII punctuation character or one of the listed typographic
# symbols; the capture group lets `tokenize` re-insert the matched character.
re_tok = re.compile(
    f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])'
)


def tokenize(s):
    """Split `s` on whitespace after padding every `re_tok` match with spaces,
    so each matched punctuation character becomes a token of its own."""
    padded = re_tok.sub(r' \1 ', s)
    return padded.split()

In [7]:
# Number of rows in the training frame.
n = train_df.shape[0]

# TF-IDF over unigrams and bigrams, tokenised with the notebook's `tokenize`.
vec = TfidfVectorizer(
    ngram_range=(1, 2),
    tokenizer=tokenize,
    min_df=3,
    max_df=0.9,
    strip_accents='unicode',
    # These three are boolean options. Pass real booleans rather than the
    # integer 1: scikit-learn's parameter validation expects `bool` here and
    # recent releases reject integer flags.
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)
# Fit the vectoriser on the training comments only, then apply the fitted
# vectoriser to the test comments.
trn_term_doc = vec.fit_transform(train_df[COMMENT])
test_term_doc = vec.transform(test_df[COMMENT])

In [8]:
def pr(y_i, y, features=None):
    """Return the add-one smoothed per-column feature total for one class.

    Parameters
    ----------
    y_i : label value selecting the class of interest.
    y : array of labels, one per row of `features`; compared with `y_i`
        element-wise to build the row mask.
    features : optional row-indexable matrix supporting boolean-mask
        indexing and `.sum(0)`. When omitted, the notebook-level training
        matrix `x` is used, which keeps existing `pr(y_i, y)` calls working.

    Returns
    -------
    (column sums over the rows where ``y == y_i``, plus 1) divided by
    (number of such rows, plus 1).
    """
    if features is None:
        # Backward-compatible default: fall back to the global training matrix.
        features = x
    in_class = y == y_i
    feature_sums = features[in_class].sum(0)
    return (feature_sums + 1) / (in_class.sum() + 1)

In [9]:
# Short aliases for the train/test term-document matrices; the modelling
# functions read the global `x`.
x, test_x = trn_term_doc, test_term_doc

In [10]:
def get_model(y):
    """Fit a logistic regression on ratio-scaled features for one label.

    Parameters
    ----------
    y : object exposing `.values` (the loop below passes a DataFrame column)
        with one label per row of the global training matrix `x`.

    Returns
    -------
    (fitted LogisticRegression, r), where `r` is the element-wise log of
    ``pr(1, y) / pr(0, y)``. The same `r` must be used to scale any matrix
    passed to the fitted model.
    """
    y = y.values
    # Log of the ratio between the smoothed class-1 and class-0 feature totals.
    r = np.log(pr(1, y) / pr(0, y))
    # The dual formulation is only implemented by the liblinear solver, so
    # name it explicitly instead of relying on the library's default solver
    # (the default in current scikit-learn does not support dual=True).
    m = LogisticRegression(C=4, dual=True, solver='liblinear')
    # Scale every feature column by its log-ratio before fitting.
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

In [11]:
# One row per test comment, one column per target label.
preds = np.zeros((len(test_df), len(cols_target)))

for col_idx, label in enumerate(cols_target):
    print('fit', label)
    m, r = get_model(train_df[label])
    # Scale the test features with the ratio returned for this label and
    # keep the second column of predict_proba.
    test_nb = test_x.multiply(r)
    preds[:, col_idx] = m.predict_proba(test_nb)[:, 1]

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate


In [12]:
# Build the submission frame: the "id" column of the sample submission
# followed by one prediction column per target label, then write it to CSV.
submid = pd.DataFrame({'id': subm["id"]})
submission = pd.concat(
    [submid, pd.DataFrame(preds, columns=cols_target)],
    axis=1,
)
submission.to_csv('submissions/nbsvm_02.csv', index=False)