In [19]:
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [21]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [35]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train['none'] = 1-train[label_cols].max(axis=1)
test.shape

(31915, 8)

In [25]:
COMMENT = 'comment_text'
train[COMMENT].fillna("unknown", inplace=True)
test[COMMENT].fillna("unknown", inplace=True)

In [36]:
import re, string
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): return re_tok.sub(r' \1 ', s).split()

In [37]:
n = train.shape[0]
vec = CountVectorizer()
trn_term_doc = vec.fit_transform(train[COMMENT])
test_term_doc = vec.transform(test[COMMENT])

In [38]:
trn_term_doc, test_term_doc

(<127656x166430 sparse matrix of type '<class 'numpy.int64'>'
 	with 5559667 stored elements in Compressed Sparse Row format>,
 <31915x166430 sparse matrix of type '<class 'numpy.int64'>'
 	with 1365662 stored elements in Compressed Sparse Row format>)

In [39]:
def pr(y_i, y):
    p = x[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)

In [40]:
x = trn_term_doc
test_x = test_term_doc

In [41]:
def get_mdl(y):
    y = y.values
    r = np.log(pr(1,y) / pr(0,y))
    m = LogisticRegression(C=4, dual=True)
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

In [46]:
preds = np.zeros((len(test), len(label_cols)))
models = []
for i, j in enumerate(label_cols):
    print('fit', j)
    m,r = get_mdl(train[j])
    models.append(m)
    preds[:,i] = m.predict_proba(test_x.multiply(r))[:,1]

fit toxic
fit severe_toxic
fit obscene
fit threat


  np.exp(prob, prob)


fit insult
fit identity_hate


In [44]:
test[label_cols] = preds
test.to_csv('submissions/test_nblr_count.csv')

In [49]:
vec.get_feature_names()[:10]

['00',
 '000',
 '0000',
 '00000',
 '000000',
 '0000000',
 '0000000027',
 '00000001',
 '00000003',
 '00000050']

In [52]:
m.coef_

array([[  3.76333693e-01,   2.59927309e-01,  -3.05758603e-03, ...,
         -5.16859312e-15,  -5.16859312e-15,  -5.16859312e-15]])

In [53]:
import operator
z = zip(vec.get_feature_names(),m.coef_)
print(max(z, key=operator.itemgetter(1)))

('00', array([  3.76333693e-01,   2.59927309e-01,  -3.05758603e-03, ...,
        -5.16859312e-15,  -5.16859312e-15,  -5.16859312e-15]))


In [54]:
np.argmax(m.coef_)

13262

In [55]:
vec.get_feature_names()[13262]

'am'

In [57]:
def show_most_informative_features(vectorizer, clf, n=20):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print(coef_1, fn_1, coef_2, fn_2)

In [61]:
for i,model in enumerate(models):
    print("-------" + label_cols[i] + "-------")
    show_most_informative_features(vec,model,6)

-------toxic-------
-4.81100497862 cool 5.4144244455 ridiculous
-4.69301935516 dx927 4.91646980887 adam
-4.18750522834 lynched 4.67419596748 frequent
-3.77651297659 ashabat 4.34417605697 sockpuppet
-3.43786656481 weve 4.17720999885 inapproriate
-3.29687980137 estogen 4.00900963812 nonsense
-------severe_toxic-------
-3.88895691635 californiaalibaba 2.79427777169 is
-3.32797956268 queer 2.42731940507 retardeds
-3.0000375956 misterwiki 2.41084356349 sexless
-2.7382442052 motherfuck 2.40628696009 create
-2.68680034606 sasayama 2.40023103133 newcomer
-2.52222552783 departed 2.37819913813 mostly
-------obscene-------
-8.62199752141 eats 4.17412177765 worldwide
-6.6181044257 halberg 4.15694617783 participation
-5.86517952942 youngamerican 3.96074324882 von
-5.64126250564 bradbury 3.23802475873 sex
-5.33606739784 vbutt 3.17642358834 xxx
-4.53479789813 abysmal 2.99603877707 american
-------threat-------
-4.95369112375 bleachanhero 2.56629255169 killing
-2.39871051721 nigga 2.31429595135 earthq

In [62]:
def print_top10(vectorizer, clf, class_labels):
    """Prints features with the highest coefficient values, per class"""
    feature_names = vectorizer.get_feature_names()
    for i, class_label in enumerate(class_labels):
        top10 = np.argsort(clf.coef_[i])[-10:]
        print("%s: %s" % (class_label,
              " ".join(feature_names[j] for j in top10)))

In [65]:
for i,model in enumerate(models):
    print("-------" + label_cols[i] + "-------")
    print_top10(vec,model,[0])

-------toxic-------
0: 24hrs transferred bluesband swift nonsense inapproriate sockpuppet frequent adam ridiculous
-------severe_toxic-------
0: garbage bandwidth condescending duded mostly newcomer create sexless retardeds is
-------obscene-------
0: muthafuckaa tawker wouldn infinitely american xxx sex von participation worldwide
-------threat-------
0: trying cut 6ft shoulder hot bowel hang sitush earthquake killing
-------insult-------
0: title82 sloppy edjohnston protestant input theories reversion gg retardedyour am
-------identity_hate-------
0: shanghai communist israeli filled barack country wikipeida sexual super am
