# Toxic Comments Classification Challenge

Task: Finding probabilities of comments being 'toxic', 'severely toxic', 'obscene', 'threat', 'insult' or 'identity hate'.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
train_set = pd.read_csv("train.csv")
test_set = pd.read_csv("test.csv")
test_labels = pd.read_csv("test_labels.csv")

In [2]:
train_set = train_set.drop("id", 1)
train_set["comment_text"] = train_set["comment_text"].str.lower()
train_set["comment_text"] = train_set["comment_text"].apply(lambda x: re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"," ip ", x))
train_set["comment_text"] = train_set["comment_text"].apply(lambda x: re.sub(r'(\w)\1{2,}', r'\1\1', x))
train_set["comment_text"] = train_set["comment_text"].apply(lambda x: re.sub(r'(!|\?)', " \\1 ", x))
train_set["comment_text"] = train_set["comment_text"].apply(lambda x: re.sub(r'\n', "", x))  
train_set.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,explanationwhy the edits made under my usernam...,0,0,0,0,0,0
1,d'aww ! he matches this background colour i'm...,0,0,0,0,0,0
2,"hey man, i'm really not trying to edit war. it...",0,0,0,0,0,0
3,"""morei can't make any real suggestions on impr...",0,0,0,0,0,0
4,"you, sir, are my hero. any chance you remember...",0,0,0,0,0,0


In [3]:
col = test_set["id"]
test_set = test_set.drop("id", 1)
test_set["comment_text"] = test_set["comment_text"].str.lower()
test_set["comment_text"] = test_set["comment_text"].apply(lambda x: re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"," ip ", x))
test_set["comment_text"] = test_set["comment_text"].apply(lambda x: re.sub(r'(\w)\1{2,}', r'\1\1', x))
test_set["comment_text"] = test_set["comment_text"].apply(lambda x: re.sub(r'(!|\?)', " \\1 ", x))
test_set["comment_text"] = test_set["comment_text"].apply(lambda x: re.sub(r'\n', "", x))  
test_set.head()

Unnamed: 0,comment_text
0,yo bitch ja rule is more succesful then you'll...
1,"== from rfc == the title is fine as it is, imo."
2,""" == sources == * zawe ashton on lapland — ..."
3,":if you have a look back at the source, the in..."
4,i don't anonymously edit articles at all.


In [4]:
tfidf_vectorizer = TfidfVectorizer(strip_accents='unicode',
    analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1, 2),
    max_features=20000)
comments = pd.concat([train_set["comment_text"], test_set["comment_text"]])
features = tfidf_vectorizer.fit(comments)
train_data_features = features.transform(train_set["comment_text"])
test_data_features = features.transform(test_set["comment_text"])
prob = pd.DataFrame([])
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
for label in labels:    
    target = train_set[label]
    classifier = LogisticRegression(solver = 'sag')
    classifier.fit(train_data_features, target)
    prob[label] = classifier.predict_proba(test_data_features)[:, 1] 
prob.insert(0, column='id', value=col)
prob.to_csv("toxicity_probabilities.csv", index=False)
prob.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.997365,0.126312,0.991899,0.042788,0.926212,0.236965
1,0000247867823ef7,0.005259,0.001731,0.003029,0.000549,0.004462,0.002556
2,00013b17ad220c46,0.009484,0.0019,0.00843,0.000754,0.007081,0.001878
3,00017563c3f7919a,0.004174,0.001924,0.00461,0.001252,0.003913,0.001425
4,00017695ad8997eb,0.031695,0.00407,0.012635,0.001751,0.009446,0.001949
