In [19]:
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [21]:
# Read the training and held-out splits from disk, in that order.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [35]:
# Names of the six label columns; one model is fitted per entry further down.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# 'none' is 1 minus the row-wise maximum over the label columns.
# assumes the labels are 0/1, in which case 'none' is 1 only for rows with no label set — TODO confirm
train['none'] = 1-train[label_cols].max(axis=1)
# Bare expression: shown as this cell's output.
test.shape

(31915, 8)

In [25]:
# Name of the free-text column that gets vectorized.
COMMENT = 'comment_text'
# Replace missing comments with a placeholder string so the vectorizer only
# ever sees text. The filled column is assigned back explicitly:
# `frame[col].fillna(..., inplace=True)` operates on a column selection, which
# under pandas Copy-on-Write no longer updates the parent frame (and warns on
# recent 2.x releases), silently leaving the NaNs in place.
train[COMMENT] = train[COMMENT].fillna("unknown")
test[COMMENT] = test[COMMENT].fillna("unknown")

In [36]:
import re, string
# Matches a single punctuation character (ASCII punctuation plus a handful of
# typographic symbols) and captures it so it can be re-emitted padded by spaces.
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')

def tokenize(s):
    """Split `s` on whitespace after isolating each punctuation character.

    Every character matched by `re_tok` is surrounded with spaces first, so
    punctuation marks come back as separate tokens.
    """
    spaced = re_tok.sub(r' \1 ', s)
    return spaced.split()

In [37]:
# Number of training rows.
# NOTE(review): `n` is not referenced again in the visible cells.
n = train.shape[0]
# Term-count vectorizer with scikit-learn's default settings.
# NOTE(review): the `tokenize` helper defined earlier is not passed in here,
# so it is unused by this vectorizer — confirm that is intended.
vec = CountVectorizer()
# Fit the vocabulary on the training comments, then apply that same fitted
# vectorizer to the test comments.
trn_term_doc = vec.fit_transform(train[COMMENT])
test_term_doc = vec.transform(test[COMMENT])

In [38]:
# Display both term-document matrices as the cell output.
trn_term_doc, test_term_doc

(<127656x166430 sparse matrix of type '<class 'numpy.int64'>'
 	with 5559667 stored elements in Compressed Sparse Row format>,
 <31915x166430 sparse matrix of type '<class 'numpy.int64'>'
 	with 1365662 stored elements in Compressed Sparse Row format>)

In [39]:
def pr(y_i, y):
    """Return the add-one-smoothed column sums of `x` over rows where y == y_i.

    Reads the module-level matrix `x` (it is not a parameter). The column sums
    of the selected rows are incremented by 1 and divided by the number of
    selected rows plus 1.
    """
    in_class = (y == y_i)
    column_totals = x[in_class].sum(0)
    n_in_class = in_class.sum()
    return (column_totals + 1) / (n_in_class + 1)

In [40]:
# Short aliases for the train/test term-document matrices. `pr` and `get_mdl`
# read `x` as a global; the prediction loop reads `test_x`.
x = trn_term_doc
test_x = test_term_doc

In [41]:
def get_mdl(y):
    """Fit a logistic regression for one label on ratio-scaled features.

    Parameters
    ----------
    y : pandas.Series
        Target values for a single label column (anything exposing `.values`).
        `pr` is evaluated for the class values 1 and 0.

    Returns
    -------
    tuple
        `(model, r)`: the fitted `LogisticRegression` and the per-feature log
        ratio `r` used to scale the module-level training matrix `x`. Callers
        must scale new data by the same `r` before predicting.
    """
    y = y.values
    # Log of the ratio between the smoothed per-feature statistics of class 1
    # and class 0, as computed by pr().
    r = np.log(pr(1,y) / pr(0,y))
    # dual=True is only implemented by the liblinear solver. Name it explicitly:
    # it used to be the default, but newer scikit-learn releases default to
    # lbfgs, which rejects dual=True with a ValueError.
    m = LogisticRegression(C=4, dual=True, solver='liblinear')
    # Element-wise scaling of every row of x by r.
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

In [42]:
# One row per test example, one column per label.
preds = np.zeros((len(test), len(label_cols)))

# Fit a separate model for each label and store its predictions in column i.
for i, j in enumerate(label_cols):
    print('fit', j)
    # m: fitted classifier; r: the feature scaling returned alongside it.
    m,r = get_mdl(train[j])
    # Scale the test matrix by the same r before predicting. Column 1 of
    # predict_proba is kept — presumably the positive class, which holds when
    # the model's classes are [0, 1]; confirm.
    preds[:,i] = m.predict_proba(test_x.multiply(r))[:,1]

fit toxic
fit severe_toxic
fit obscene
fit threat


  np.exp(prob, prob)


fit insult
fit identity_hate


In [44]:
# Store the predictions under the label column names of `test`
# (this replaces any values already held in columns with those names).
test[label_cols] = preds
# NOTE(review): no `index=False` is passed, so pandas' default also writes the
# row index as the first CSV column; the 'submissions' directory presumably
# has to exist already — confirm both are intended.
test.to_csv('submissions/test_nblr_count.csv')