In [63]:
# Building off code from: https://www.kaggle.com/jhoward/nb-svm-strong-linear-baseline
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [64]:
# Load the training set, the test set and the submission template, in that order.
train, test, subm = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv', 'sub.csv')
)

In [65]:
def _fill_and_trim(frame):
    """Return a copy of ``frame`` with gaps filled and 'match'/'slot' removed.

    A missing 'text' entry becomes the empty string rather than the integer 0:
    a numeric 0 in the text column is not a valid document for the text
    vectoriser used later in the notebook. Every other missing value is
    filled with 0, as before.

    Args:
        frame: DataFrame containing at least the 'match' and 'slot' columns.

    Returns:
        A new DataFrame; the input is not modified.

    Raises:
        KeyError: if 'match' or 'slot' is absent (e.g. when the cell is re-run
            on frames that were already trimmed).
    """
    if 'text' in frame.columns:
        frame = frame.assign(text=frame['text'].fillna(''))
    return frame.fillna(0).drop(columns=['match', 'slot'])


train = _fill_and_trim(train)
test = _fill_and_trim(test)
subm = _fill_and_trim(subm)

In [66]:
# Cast the id to a 32-bit integer and every label flag to boolean.
train = train.astype({
    'id': 'int32',
    **dict.fromkeys(
        ['Positivity', 'Team_Work', 'Sportsmanship', 'Spam', 'Toxic', 'Report_Misuse', 'Neutral'],
        'bool',
    ),
})

In [67]:
# Target columns; one model is fitted per entry and the prediction matrix
# keeps this column order.
label_cols = [
    'Positivity',
    'Team_Work',
    'Sportsmanship',
    'Spam',
    'Toxic',
    'Report_Misuse',
    'Neutral',
]


In [68]:

# 'none' is 1 minus the row-wise maximum over the label columns, i.e. 1 for
# rows where no label flag is set and 0 otherwise.
any_label = train[label_cols].max(axis=1)
train['none'] = 1 - any_label
train.describe()

Unnamed: 0,id,none
count,71.0,71.0
mean,35.0,0.211268
std,20.639767,0.411113
min,0.0,0.0
25%,17.5,0.0
50%,35.0,0.0
75%,52.5,0.0
max,70.0,1.0


In [69]:
import re, string

# Characters that become stand-alone tokens: ASCII punctuation plus a few
# typographic symbols. re.escape keeps '\', ']', '^' and '-' literal inside the
# character class; unescaped, the '\' from string.punctuation merely escaped
# the ']' that follows it, so a backslash in the text was never split off.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Split ``s`` into tokens, treating each punctuation mark as its own token.

    Every character matched by ``re_tok`` is padded with spaces and the result
    is split on whitespace.

    Args:
        s: the text to tokenise (must be a ``str``).

    Returns:
        A list of token strings; empty for an empty or all-whitespace input.
    """
    return re_tok.sub(r' \1 ', s).split()

In [70]:
n = train.shape[0]  # number of training rows (not used further in this cell)

# TF-IDF over unigrams and bigrams produced by `tokenize`.
# The three flags are real booleans: recent scikit-learn releases validate
# constructor arguments and warn about or reject the integer 1 where a bool
# is expected.
vec = TfidfVectorizer(
    ngram_range=(1, 2),
    tokenizer=tokenize,
    min_df=3,
    max_df=0.9,
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)

# Fit on the training text only, then apply the fitted vectoriser to the test
# text.
trn_term_doc = vec.fit_transform(train['text'])
test_term_doc = vec.transform(test['text'])

In [71]:
def pr(y_i, y, features=None):
    """Smoothed per-column average of the feature rows whose label equals ``y_i``.

    Args:
        y_i: label value selecting the rows to aggregate.
        y: array of labels, one per row of the feature matrix.
        features: matrix to aggregate. When omitted, the notebook-level
            training matrix ``x`` is used, which is the original behaviour.

    Returns:
        ``(column sums over the selected rows + 1) / (number of selected rows + 1)``.
    """
    if features is None:
        # Backward-compatible default: fall back to the global training matrix.
        features = x
    selected = (y == y_i)
    p = features[selected].sum(0)
    return (p + 1) / (selected.sum() + 1)

In [72]:
# Short aliases for the train/test term-document matrices; `x` is read as a
# global by pr() and get_mdl(), `test_x` by the prediction loop.
x, test_x = trn_term_doc, test_term_doc

In [80]:
def get_mdl(y):
    """Fit a logistic regression on ratio-scaled features for one label.

    Args:
        y: label column exposing ``.values`` (e.g. a pandas Series), aligned
            with the rows of the notebook-level matrix ``x``.

    Returns:
        A ``(model, r)`` pair: the fitted ``LogisticRegression`` and the
        per-feature log ratio ``log(pr(1, y) / pr(0, y))`` that the features
        were multiplied by. The same ``r`` must be applied to any matrix
        passed to the model at prediction time.
    """
    labels = y.values
    # Element-wise log of the ratio between the smoothed feature averages of
    # the rows labelled 1 and the rows labelled 0.
    log_ratio = np.log(pr(1, labels) / pr(0, labels))
    weighted_x = x.multiply(log_ratio)
    clf = LogisticRegression(C=4, dual=False)
    clf.fit(weighted_x, labels)
    return clf, log_ratio

In [81]:
# Prediction matrix: one row per test example, one column per label.
preds = np.zeros((len(test), len(label_cols)))

for col_idx, label in enumerate(label_cols):
    print('fit', label)
    m, r = get_mdl(train[label])
    # Scale the test features by the ratio returned with the model, then keep
    # the probability in column 1 of predict_proba.
    test_nb = test_x.multiply(r)
    preds[:, col_idx] = m.predict_proba(test_nb)[:, 1]

fit text
fit Positivity
fit Team_Work
fit Sportsmanship
fit Spam
fit Toxic
fit Report_Misuse
fit Neutral


In [83]:
# Display the prediction matrix: one row per test example, one column per
# entry of label_cols.
preds

array([[0.01408451, 0.05424096, 0.04945858, 0.02961455, 0.01535369,
        0.11622213, 0.0081897 , 0.18840897],
       [0.01408451, 0.05424096, 0.04945858, 0.02961455, 0.01535369,
        0.11622213, 0.0081897 , 0.18840897],
       [0.01408451, 0.05339643, 0.04208792, 0.026214  , 0.36119571,
        0.12137051, 0.0311682 , 0.06315162],
       [0.01408451, 0.03380008, 0.05133115, 0.00982607, 0.91638545,
        0.11938583, 0.01336602, 0.28346502],
       [0.01408451, 0.28926162, 0.09977249, 0.09496045, 0.91769936,
        0.1168101 , 0.02027541, 0.16122884],
       [0.01408451, 0.06138966, 0.05211071, 0.01792415, 0.01331234,
        0.11161479, 0.00704896, 0.18581922],
       [0.01408451, 0.05424096, 0.04945858, 0.02961455, 0.01535369,
        0.11622213, 0.0081897 , 0.18840897],
       [0.01408451, 0.05424096, 0.04945858, 0.02961455, 0.01535369,
        0.11622213, 0.0081897 , 0.18840897],
       [0.01408451, 0.05180439, 0.42678365, 0.02138549, 0.01378561,
        0.59044276, 0.007782

In [82]:

# Put the ids from the submission template next to the predicted
# probabilities (one column per label) and write the result to disk.
label_frame = pd.DataFrame(preds, columns=label_cols)
submid = pd.DataFrame({'id': subm["id"]})
submission = pd.concat([submid, label_frame], axis=1)
submission.to_csv('submission.csv', index=False)