## Toxic Comment
Logistic regression on word and character n-gram TF-IDF features; leaderboard (LB) score 0.9788.

### Import libraries

In [2]:
import os

import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from tqdm import tqdm


In [3]:
import string
import nltk
import gc
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scipy import sparse
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import log_loss,confusion_matrix,classification_report,roc_curve,auc
%matplotlib inline


### Read the dataset

In [4]:
# Directory holding train.csv and test.csv. Set the TOXIC_DATA_DIR environment
# variable to point somewhere else; the default is the original local path.
DATA_DIR = os.environ.get('TOXIC_DATA_DIR',
                          '/Users/huizi/Downloads/Toxic_Comment_Classification')
train_data_path = os.path.join(DATA_DIR, 'train.csv')
test_data_path = os.path.join(DATA_DIR, 'test.csv')

# Load both splits and report their shapes as a quick sanity check.
train = pd.read_csv(train_data_path)
test = pd.read_csv(test_data_path)
print('Number of rows and columns in the train data set:',train.shape)
print('Number of rows and columns in the test data set:',test.shape)

('Number of rows and columns in the train data set:', (159571, 8))
('Number of rows and columns in the test data set:', (153164, 2))


In [5]:
# Preview the first five training rows.
train.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [32]:
# Show only the first ten rows instead of rendering the whole test frame.
test.head(10)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.
5,0001ea8717f6de06,Thank you for understanding. I think very high...
6,00024115d4cbde0f,Please do not add nonsense to Wikipedia. Such ...
7,000247e83dcc1211,:Dear god this site is horrible.
8,00025358d4737918,""" \n Only a fool can believe in such numbers. ..."
9,00026d1092fe71cc,== Double Redirects == \n\n When fixing double...


### Missing values

In [8]:
# Per-column null counts, indexed by the train columns. The test counts are
# aligned on that index, so columns that only exist in train come out as NaN.
k = train.isnull().sum().to_frame('train')
k['test'] = test.isnull().sum()
k

Unnamed: 0,train,test
id,0,0.0
comment_text,0,0.0
toxic,0,
severe_toxic,0,
obscene,0,
threat,0,
insult,0,
identity_hate,0,


In [9]:
# Number of missing comment texts in the test split.
pd.isnull(test['comment_text']).sum()

0

In [10]:
# Label columns of the training data; each one is modelled separately.
class_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [11]:
# Raw comment text of each split, plus both stacked (train first, then test).
train_text, test_text = train['comment_text'], test['comment_text']
all_text = pd.concat([train_text, test_text], axis=0)

In [12]:
# Word-level TF-IDF: unigrams only, vocabulary capped at 15000 terms,
# fitted on the combined train + test text.
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=15000,
    strip_accents='unicode',
    sublinear_tf=True,
)
word_vectorizer.fit(all_text)

# Project each split onto the fitted vocabulary (train first, then test).
train_word_features, test_word_features = (
    word_vectorizer.transform(split_text) for split_text in (train_text, test_text)
)


In [None]:
# Reference notes on two TfidfVectorizer options used in this notebook
# (sublinear_tf and analyzer). This cell is a bare string and defines nothing.
'''
sublinear_tf : boolean, default=False   
Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf). 
    
    
analyzer : string, {‘word’, ‘char’} or callable
Whether the feature should be made of word or character n-grams.
If a callable is passed it is used to extract the sequence of features out of the raw, unprocessed input.

'''

In [13]:
# Character-level TF-IDF over 1- to 5-grams, vocabulary capped at 20000
# n-grams, fitted on the combined train + test text.
char_tfidf_options = dict(
    analyzer='char',
    ngram_range=(1, 5),
    max_features=20000,
    strip_accents='unicode',
    sublinear_tf=True,
)
char_vectorizer = TfidfVectorizer(**char_tfidf_options).fit(all_text)

# Project each split onto the fitted character n-gram vocabulary.
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)

In [16]:
# Concatenate char and word features column-wise. Requesting CSR here does the
# format conversion once, instead of leaving a COO matrix that has to be
# converted again whenever rows are sliced for cross-validation or fitting.
train_features = hstack([train_char_features, train_word_features], format='csr')
test_features = hstack([test_char_features, test_word_features], format='csr')

In [18]:
# Fixed seed for the classifier: the 'sag' solver shuffles the data, so
# without it repeated runs give slightly different scores and predictions.
RANDOM_STATE = 42

# NOTE: despite its name, `losses` collects per-class ROC AUC scores
# (scoring='roc_auc'), so higher is better.
losses = []
predictions = {'id': test['id']}

for class_name in tqdm(class_names):
    train_target = train[class_name]
    classifier = LogisticRegression(solver='sag', random_state=RANDOM_STATE)

    # Mean ROC AUC over 3 cross-validation folds for this label.
    cv_score = np.mean(cross_val_score(classifier, train_features, train_target, cv=3, scoring='roc_auc'))
    losses.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    # Refit on the full training set and keep the second column of
    # predict_proba as this label's prediction for the test set.
    classifier.fit(train_features, train_target)
    predictions[class_name] = classifier.predict_proba(test_features)[:, 1]

print('Total CV score is {}'.format(np.mean(losses)))


  0%|          | 0/6 [00:00<?, ?it/s]

CV score for class toxic is 0.97815568733


 17%|█▋        | 1/6 [03:55<19:35, 235.15s/it]

CV score for class severe_toxic is 0.988542843251


 33%|███▎      | 2/6 [08:04<16:08, 242.05s/it]

CV score for class obscene is 0.989945546171


 50%|█████     | 3/6 [11:14<11:14, 224.72s/it]

CV score for class threat is 0.988919052879


 67%|██████▋   | 4/6 [16:12<08:06, 243.13s/it]

CV score for class insult is 0.982464874521


 83%|████████▎ | 5/6 [20:01<04:00, 240.31s/it]

CV score for class identity_hate is 0.982741778006


100%|██████████| 6/6 [23:48<00:00, 238.07s/it]

Total CV score is 0.985128297026





In [19]:
# Set the column order explicitly (id first, then the labels in class_names
# order) rather than relying on dict ordering, which older Python/pandas
# versions do not preserve.
submission = pd.DataFrame(predictions, columns=['id'] + class_names)
submission.to_csv('LRpredictions.csv', index=False)