In [1]:
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
subm = pd.read_csv('sample_submission.csv')

In [3]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
lens = train.comment_text.str.len()
lens.mean(), lens.std(), lens.max()

(394.0732213246768, 590.7202819048923, 5000)

In [5]:
lens.hist()

<matplotlib.axes._subplots.AxesSubplot at 0x1a1a115780>

In [6]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train['none'] = 1-train[label_cols].max(axis=1)
train['Sum'] = train[label_cols].sum(axis=1)
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,none,Sum
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,1,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,1,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,1,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,1,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,1,0


In [7]:
len(train),len(test)

(159571, 153164)

In [8]:
COMMENT = 'comment_text'
train[COMMENT].fillna("unknown", inplace=True)
test[COMMENT].fillna("unknown", inplace=True)

In [9]:
len(train),len(test)

(159571, 153164)

In [10]:
import re, string
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): return re_tok.sub(r' \1 ', s).split()

In [11]:
n = train.shape[0]
vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
                strip_accents='unicode', use_idf=1,
                sublinear_tf=1 )
trn_term_doc = vec.fit_transform(train[COMMENT])
test_term_doc = vec.transform(test[COMMENT])

In [12]:
def pr(y_i, y):
    p = x[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)

In [13]:
x = trn_term_doc
test_x = test_term_doc
print(test_x.shape)
print(x.shape)



(159571, 2237630)
(159571, 2237630)


In [15]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, C, dual=False, n_jobs=-1):
        self.Cs = C
        self.dual = dual
        self.n_jobs = n_jobs

    def predict(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict(x.multiply(self._r))

    def predict_proba(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict_proba(x.multiply(self._r))

    def fit(self, x, y):
        # Check that X and y have correct shape
        y = y.values
        x, y = check_X_y(x, y, accept_sparse=True)

        def pr(x, y_i, y):
            p = x[y == y_i].sum(0)
            return (p+1) / ((y == y_i).sum()+1)

        self._r = sparse.csr_matrix(np.log(pr(x, 1, y) / pr(x, 0, y)))
        x_nb = x.multiply(self._r)
        self._clf = LogisticRegressionCV(Cs=self.Cs, dual=self.dual, n_jobs=self.n_jobs, solver='liblinear',
                                         cv=10, scoring='neg_log_loss').fit(x_nb, y)
        return self

In [23]:
def get_mdl(y):
    y = y.values
    r = np.log(pr(1,y) / pr(0,y))
    m = LogisticRegression(C=0.1, dual=True, n_jobs=-1)
#     m = NbSvmClassifier(C=0.1, dual=True, n_jobs=-1)
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

In [24]:
preds = np.zeros((len(train), len(label_cols)))

for i, j in enumerate(label_cols):
    print('fit', j)
    m,r = get_mdl(train[j])
#     m.fit(x, train[j])
    preds[:,i] = m.predict_proba(test_x.multiply(r))[:,1]

fit toxic


  " = {}.".format(effective_n_jobs(self.n_jobs)))


fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate


In [32]:
predictions = pd.DataFrame(preds, columns = label_cols)
predictions['Average'] = predictions.mean(axis=1)
predictions['Original_probability'] = train['Sum'].values
predictions['Original_probability'] = predictions['Original_probability'].apply(lambda x: x/6)
predictions['Correctness'] = np.where(((predictions['Average']>=0.5) & (predictions['Original_probability']<0.5))|((predictions['Average']<0.5) & (predictions['Original_probability']>=0.5)),0.0,1.0)
accu = predictions['Correctness'].sum()
print(accu/159571*100)


97.1392044920443


In [42]:
submid = pd.DataFrame({'id': train["id"]})
submission = pd.concat([submid, predictions], axis=1)
submission.to_csv('submission.csv', index=False)

In [20]:
len(preds.mean(axis=1))

159571

In [27]:
vader = pd.read_csv('Neg.csv')
# vader.head

<bound method NDFrame.head of           Neg
0       0.000
1       0.099
2       0.083
3       0.022
4       0.000
5       0.000
6       0.531
7       0.129
8       0.109
9       0.000
10      0.019
11      0.000
12      0.130
13      0.120
14      0.113
15      0.070
16      0.000
17      0.000
18      0.099
19      0.000
20      0.230
21      0.000
22      0.085
23      0.000
24      0.055
25      0.000
26      0.047
27      0.097
28      0.000
29      0.000
...       ...
159541  0.275
159542  0.000
159543  0.000
159544  0.508
159545  0.000
159546  0.216
159547  0.000
159548  0.000
159549  0.000
159550  0.144
159551  0.000
159552  0.000
159553  0.012
159554  0.325
159555  0.000
159556  0.000
159557  0.023
159558  0.083
159559  0.000
159560  0.017
159561  0.054
159562  0.070
159563  0.059
159564  0.073
159565  0.068
159566  0.110
159567  0.306
159568  0.180
159569  0.000
159570  0.183

[159571 rows x 1 columns]>

In [35]:
predictions['Vader_Values'] = vader.values
predictions['New_probab'] = predictions[['Average','Vader_Values']].mean(axis=1)
predictions['Correctness_new'] = np.where(((predictions['New_probab']>=0.5) & (predictions['Original_probability']<0.5))|((predictions['New_probab']<0.5) & (predictions['Original_probability']>=0.5)),0.0,1.0)
accur = predictions['Correctness_new'].sum()
print(accur/159571*100)

96.75755619755469


In [36]:
submid = pd.DataFrame({'id': train["id"]})
submission = pd.concat([submid, predictions], axis=1)
submission.to_csv('submission1.csv', index=False)