# Naive Bayes + Logistic Regression
This model is based on [Jeremy Howard's take on NB+SVM](https://www.youtube.com/watch?v=37sFIak42Sc&feature=youtu.be&t=3745). The notebook preprocesses the data, trains one model per label, and then outputs the top 20 most informative word features for each label.

In [1]:
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
# Read the training and held-out splits from the local data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [3]:
# The six binary target columns, one classifier is fitted per column later on.
label_cols = [
    'toxic', 'severe_toxic', 'obscene',
    'threat', 'insult', 'identity_hate',
]
# 'none' is 1 when the row-wise maximum over the label columns is 0, else 0.
train['none'] = 1 - train[label_cols].max(axis='columns')
test.shape

(31915, 8)

In [4]:
# Name of the column holding the raw comment text.
COMMENT = 'comment_text'
# Replace missing comments with a placeholder string.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the in-place form operates on an intermediate object,
# which emits a chained-assignment warning on recent pandas and leaves the
# DataFrame unchanged when Copy-on-Write is enabled.
train[COMMENT] = train[COMMENT].fillna("unknown")
test[COMMENT] = test[COMMENT].fillna("unknown")

In [5]:
import re, string

# Character class of every symbol that should become its own token.
# string.punctuation is passed through re.escape because it contains
# characters that are special inside a regex set ('\\', ']', '^', '-');
# interpolated raw, the sequence '\]' is read as an escaped ']' and the
# backslash itself is silently dropped from the set.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Split `s` into whitespace-separated tokens, isolating punctuation.

    Each character matched by `re_tok` is padded with a space on both sides
    so that it becomes a separate token, then the string is split on
    whitespace.

    Args:
        s: The text to tokenize.

    Returns:
        A list of token strings (empty for an empty or all-whitespace input).
    """
    return re_tok.sub(r' \1 ', s).split()

In [6]:
# Number of training rows.
n = len(train)
# Bag-of-words counts with the vectorizer's default settings.
vec = CountVectorizer()
# Learn the vocabulary on the training comments only, then reuse it for test.
trn_term_doc = vec.fit_transform(train[COMMENT])
test_term_doc = vec.transform(test[COMMENT])

In [7]:
# Display both document-term matrices (shape, dtype and sparsity summary).
(trn_term_doc, test_term_doc)

(<127656x166430 sparse matrix of type '<class 'numpy.int64'>'
 	with 5559667 stored elements in Compressed Sparse Row format>,
 <31915x166430 sparse matrix of type '<class 'numpy.int64'>'
 	with 1365662 stored elements in Compressed Sparse Row format>)

In [8]:
def pr(y_i, y):
    """Add-one-smoothed per-feature sum for the rows whose label equals `y_i`.

    Reads the module-level matrix `x`, which must be defined before the
    first call.

    Args:
        y_i: The label value selecting which rows of `x` to aggregate.
        y: Array of labels, one per row of `x`.

    Returns:
        The column-wise sum of the selected rows plus one, divided by the
        number of selected rows plus one.
    """
    selected = y == y_i
    feature_totals = x[selected].sum(0)
    return (feature_totals + 1) / (selected.sum() + 1)

In [9]:
# Short aliases used by pr() and get_mdl() (both read `x` as a global).
x, test_x = trn_term_doc, test_term_doc

In [10]:
def get_mdl(y):
    """Fit a logistic regression on Naive-Bayes-scaled features for one label.

    Reads the module-level training matrix `x` and the helper `pr`.

    Args:
        y: Label column for the training rows; its `.values` array is used.

    Returns:
        A tuple `(m, r)` of the fitted `LogisticRegression` and the log-ratio
        `r` the features were multiplied by. The same `r` must be applied to
        any matrix passed to the model at prediction time.
    """
    y = y.values
    # Log of the ratio between the smoothed feature statistics of the
    # positive (1) and negative (0) rows.
    r = np.log(pr(1,y) / pr(0,y))
    # The dual formulation is only implemented by the liblinear solver.
    # Name it explicitly: scikit-learn >= 0.22 defaults to lbfgs, which
    # raises a ValueError for dual=True. On older releases liblinear was the
    # default, so the fitted model is unchanged there.
    m = LogisticRegression(C=4, dual=True, solver='liblinear')
    # Scale every feature column by its log-ratio before fitting.
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

In [11]:
# One row per test comment, one column of predicted probabilities per label.
preds = np.zeros((test.shape[0], len(label_cols)))
# Fitted classifiers, in the same order as label_cols.
models = []
for i, j in enumerate(label_cols):
    print(f'fit {j}')
    m, r = get_mdl(train[j])
    models += [m]
    # Apply the same log-ratio scaling used for training, then keep the
    # probability of the positive class.
    preds[:, i] = m.predict_proba(test_x.multiply(r))[:, 1]

fit toxic
fit severe_toxic
fit obscene
fit threat


  np.exp(prob, prob)


fit insult
fit identity_hate


In [12]:
# Store the predicted probabilities in the label columns of the test frame,
# then write the whole frame (index included) to disk.
test[label_cols] = preds
test.to_csv(path_or_buf='submissions/test_nblr_count.csv')

In [13]:
def show_most_informative_features(vectorizer, clf, n=20):
    """Print the `n` lowest and `n` highest coefficients with their features.

    Each printed line holds one entry from the low end and one from the high
    end of the sorted coefficients:
    `<low coef> <low feature> <high coef> <high feature>`.

    Args:
        vectorizer: Fitted vectorizer exposing `get_feature_names_out()` (or
            the older `get_feature_names()`).
        clf: Fitted linear model; only the first row of `clf.coef_` is used.
        n: Number of features to show from each end.
    """
    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); fall back so older releases keep working.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    # Ascending by coefficient (ties broken by feature name).
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    # Pair the i-th smallest with the i-th largest coefficient.
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print(coef_1, fn_1, coef_2, fn_2)

# Most informative 6 features with weight coefficients

In [14]:
# models is ordered like label_cols, so index i identifies the label.
for i, model in enumerate(models):
    print(f"-------{label_cols[i]}-------")
    show_most_informative_features(vec, model, n=6)

-------toxic-------
-4.80350473962 cool 5.4182601344 ridiculous
-4.6892747789 dx927 4.91705302848 adam
-4.18697595893 lynched 4.68495943295 frequent
-3.78003258938 ashabat 4.34131818615 sockpuppet
-3.42302211904 weve 4.17585825183 inapproriate
-3.29427655492 estogen 4.01281062899 nonsense
-------severe_toxic-------
-3.88351460731 californiaalibaba 2.8153360603 is
-3.33863164384 queer 2.42564482972 newcomer
-3.00727719151 misterwiki 2.42274167774 retardeds
-2.72933222774 motherfuck 2.41115701695 sexless
-2.71613389699 sasayama 2.40606018073 create
-2.50919108452 departed 2.36978975341 mostly
-------obscene-------
-8.63580867538 eats 4.17340457977 worldwide
-6.64699268959 halberg 4.14657783591 participation
-5.87609082306 youngamerican 3.97799818308 von
-5.65082153384 bradbury 3.2320449831 sex
-5.3461866163 vbutt 3.17670460022 xxx
-4.52363487647 abysmal 3.00664667134 american
-------threat-------
-4.95100074607 bleachanhero 2.56818026989 killing
-2.41702893175 nigga 2.31558795795 earthqu

In [32]:
def print_top10(vectorizer, clf, class_labels, n=20):
    """Print the `n` features with the highest coefficient values, per class.

    Despite the function's name, 20 features are printed by default; pass
    `n` to change that. One comma-separated line is printed per entry of
    `class_labels`, ordered from the largest coefficient downwards.

    Args:
        vectorizer: Fitted vectorizer exposing `get_feature_names_out()` (or
            the older `get_feature_names()`).
        clf: Fitted linear model; row `i` of `clf.coef_` is used for the
            `i`-th entry of `class_labels`.
        class_labels: Sequence with one entry per coefficient row to report.
            Only the positions are used, not the values.
        n: Number of top features to print for each class.
    """
    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); fall back so older releases keep working.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    for i, class_label in enumerate(class_labels):
        # argsort is ascending: reverse it and keep the first n indices.
        # (Slicing after the reversal also behaves correctly for n == 0.)
        top = np.argsort(clf.coef_[i])[::-1][:n]
        print(", ".join(feature_names[j] for j in top))

# 20 Most informative features for each label
This output is used to create the word features table in the paper.

In [33]:
# Each per-label model is binary, so only coefficient row 0 is reported.
for i, model in enumerate(models):
    print(f"-------{label_cols[i]}-------")
    print_top10(vec, model, class_labels=[0])

-------toxic-------
ridiculous, adam, frequent, sockpuppet, inapproriate, nonsense, swift, bluesband, transferred, 24hrs, sarcastic, abstain, awkward, implement, expire, nambla, bespoke, rubbish, saxmunds, silly
-------severe_toxic-------
is, newcomer, retardeds, sexless, create, mostly, duded, condescending, bandwidth, garbage, hanging, uneducated, words, tasty, indrian, infinite, wolfkeeper, stick, insurance, tempted
-------obscene-------
worldwide, participation, von, sex, xxx, american, wouldn, infinitely, tawker, muthafuckaa, tuesday, danger, moron, stories, reversion, value, puss, fegget, 2014, horseshit
-------threat-------
killing, earthquake, sitush, hang, bowel, hot, shoulder, 6ft, cut, trying, introducing, children, happen, dies, dammed, ll, jews, wh, 210, bw
-------insult-------
am, retardedyour, gg, reversion, theories, input, protestant, edjohnston, sloppy, title82, rubbish, warmongering, revandalising, quotemining, fucky, browsed, participation, worldwide, infinitely, re