In [1]:
# Import Dependencies
import nltk
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

% matplotlib inline

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Import text blobs
from textblob import TextBlob, Word



In [2]:
# Load the training CSV from the local `csvs/` folder into a DataFrame
df = pd.read_csv(filepath_or_buffer='csvs/train.csv')

In [3]:
# Sentiment polarity helper
def detect_sentiment(text):
    """Return the sentiment polarity that TextBlob reports for ``text``.

    Args:
        text: The text to score; it is handed directly to ``TextBlob``.

    Returns:
        The ``polarity`` attribute of the blob's ``sentiment`` result.
    """
    blob = TextBlob(text)
    sentiment = blob.sentiment
    return sentiment.polarity

In [4]:
# Features: the comment text column, shared by every per-label classifier
X = df['comment_text']

# Targets: one label column per classifier, unpacked in the order
# y (toxic), y2 (severe_toxic), y3 (obscene), y4 (threat), y5 (insult), y6 (identity_hate)
y, y2, y3, y4, y5, y6 = (
    df[label_column]
    for label_column in (
        'toxic',
        'severe_toxic',
        'obscene',
        'threat',
        'insult',
        'identity_hate',
    )
)

# Create a train/test split per label. Every call receives the same X and the
# same random_state, and the default test size is used throughout.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X2_train, X2_test, y2_train, y2_test = train_test_split(X, y2, random_state=42)
X3_train, X3_test, y3_train, y3_test = train_test_split(X, y3, random_state=42)
X4_train, X4_test, y4_train, y4_test = train_test_split(X, y4, random_state=42)
X5_train, X5_test, y5_train, y5_test = train_test_split(X, y5, random_state=42)
X6_train, X6_test, y6_train, y6_test = train_test_split(X, y6, random_state=42)


In [5]:
# TF-IDF vectorizer for the 'toxic' classifier.
# NOTE(review): lowercase=False combined with stop_words='english' presumably
# leaves capitalised stop words in the vocabulary -- confirm this is intended.
vect = TfidfVectorizer(
    strip_accents='unicode',
    decode_error='replace',
    stop_words='english',
    lowercase=False,
)

# Fit the vectorizer on the training comments only, then reuse it to
# transform the held-out comments.
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

print(X_train_dtm.shape)

# Train a default-configured logistic regression on the 'toxic' label
# (fit() returns the estimator, so construction and fitting are chained).
log = LogisticRegression().fit(X_train_dtm, y_train)

# Predict on the held-out split and report accuracy
y_pred = log.predict(X_test_dtm)

print(f"Logistic Regression Score: {accuracy_score(y_test, y_pred)}")

(119678, 196246)
Logistic Regression Score: 0.9531998094903867


In [6]:
# TF-IDF vectorizer for the 'severe_toxic' classifier.
# NOTE(review): lowercase=False combined with stop_words='english' presumably
# leaves capitalised stop words in the vocabulary -- confirm this is intended.
vect2 = TfidfVectorizer(
    strip_accents='unicode',
    decode_error='replace',
    stop_words='english',
    lowercase=False,
)

# Fit the vectorizer on the training comments only, then reuse it to
# transform the held-out comments.
X2_train_dtm = vect2.fit_transform(X2_train)
X2_test_dtm = vect2.transform(X2_test)

print(X2_train_dtm.shape)

# Train a default-configured logistic regression on the 'severe_toxic' label
# (fit() returns the estimator, so construction and fitting are chained).
log2 = LogisticRegression().fit(X2_train_dtm, y2_train)

# Predict on the held-out split and report accuracy
y2_pred = log2.predict(X2_test_dtm)

print(f"Logistic Regression Score: {accuracy_score(y2_test, y2_pred)}")

(119678, 196246)
Logistic Regression Score: 0.9903241170130098


In [7]:
# TF-IDF vectorizer for the 'obscene' classifier.
# NOTE(review): lowercase=False combined with stop_words='english' presumably
# leaves capitalised stop words in the vocabulary -- confirm this is intended.
vect3 = TfidfVectorizer(
    strip_accents='unicode',
    decode_error='replace',
    stop_words='english',
    lowercase=False,
)

# Fit the vectorizer on the training comments only, then reuse it to
# transform the held-out comments.
X3_train_dtm = vect3.fit_transform(X3_train)
X3_test_dtm = vect3.transform(X3_test)

print(X3_train_dtm.shape)

# Train a default-configured logistic regression on the 'obscene' label
# (fit() returns the estimator, so construction and fitting are chained).
log3 = LogisticRegression().fit(X3_train_dtm, y3_train)

# Predict on the held-out split and report accuracy
y3_pred = log3.predict(X3_test_dtm)

print(f"Logistic Regression Score: {accuracy_score(y3_test, y3_pred)}")

(119678, 196246)
Logistic Regression Score: 0.9736294588022961


In [8]:
# TF-IDF vectorizer for the 'threat' classifier.
# NOTE(review): lowercase=False combined with stop_words='english' presumably
# leaves capitalised stop words in the vocabulary -- confirm this is intended.
vect4 = TfidfVectorizer(
    strip_accents='unicode',
    decode_error='replace',
    stop_words='english',
    lowercase=False,
)

# Fit the vectorizer on the training comments only, then reuse it to
# transform the held-out comments.
X4_train_dtm = vect4.fit_transform(X4_train)
X4_test_dtm = vect4.transform(X4_test)

print(X4_train_dtm.shape)

# Train a default-configured logistic regression on the 'threat' label
# (fit() returns the estimator, so construction and fitting are chained).
log4 = LogisticRegression().fit(X4_train_dtm, y4_train)

# Predict on the held-out split and report accuracy
y4_pred = log4.predict(X4_test_dtm)

print(f"Logistic Regression Score: {accuracy_score(y4_test, y4_pred)}")

(119678, 196246)
Logistic Regression Score: 0.997443160454215


In [9]:
# TF-IDF vectorizer for the 'insult' classifier.
# NOTE(review): lowercase=False combined with stop_words='english' presumably
# leaves capitalised stop words in the vocabulary -- confirm this is intended.
vect5 = TfidfVectorizer(
    strip_accents='unicode',
    decode_error='replace',
    stop_words='english',
    lowercase=False,
)

# Fit the vectorizer on the training comments only, then reuse it to
# transform the held-out comments.
X5_train_dtm = vect5.fit_transform(X5_train)
X5_test_dtm = vect5.transform(X5_test)

print(X5_train_dtm.shape)

# Train a default-configured logistic regression on the 'insult' label
# (fit() returns the estimator, so construction and fitting are chained).
log5 = LogisticRegression().fit(X5_train_dtm, y5_train)

# Predict on the held-out split and report accuracy
y5_pred = log5.predict(X5_test_dtm)

print(f"Logistic Regression Score: {accuracy_score(y5_test, y5_pred)}")

(119678, 196246)
Logistic Regression Score: 0.9681147068407991


In [10]:
# TF-IDF vectorizer for the 'identity_hate' classifier.
# NOTE(review): lowercase=False combined with stop_words='english' presumably
# leaves capitalised stop words in the vocabulary -- confirm this is intended.
vect6 = TfidfVectorizer(
    strip_accents='unicode',
    decode_error='replace',
    stop_words='english',
    lowercase=False,
)

# Fit the vectorizer on the training comments only, then reuse it to
# transform the held-out comments.
X6_train_dtm = vect6.fit_transform(X6_train)
X6_test_dtm = vect6.transform(X6_test)

print(X6_train_dtm.shape)

# Train a default-configured logistic regression on the 'identity_hate' label
# (fit() returns the estimator, so construction and fitting are chained).
log6 = LogisticRegression().fit(X6_train_dtm, y6_train)

# Predict on the held-out split and report accuracy
y6_pred = log6.predict(X6_test_dtm)

print(f"Logistic Regression Score: {accuracy_score(y6_test, y6_pred)}")

(119678, 196246)
Logistic Regression Score: 0.9914521344596796


In [11]:
# Persist each fitted TF-IDF vectorizer under the `pickle/` folder.
# The files are opened through a context manager so every handle is flushed
# and closed as soon as its dump completes (the bare `open(...)` calls used
# previously were never explicitly closed).
for _fitted_vect, _sav_path in (
    (vect, "pickle/vect.sav"),
    (vect2, "pickle/vect2.sav"),
    (vect3, "pickle/vect3.sav"),
    (vect4, "pickle/vect4.sav"),
    (vect5, "pickle/vect5.sav"),
    (vect6, "pickle/vect6.sav"),
):
    with open(_sav_path, 'wb') as _sav_file:
        pickle.dump(_fitted_vect, _sav_file)


In [12]:
# Persist each fitted logistic regression model under the `pickle/` folder,
# one file per label.
# The files are opened through a context manager so every handle is flushed
# and closed as soon as its dump completes (the bare `open(...)` calls used
# previously were never explicitly closed).
for _fitted_model, _sav_path in (
    (log, "pickle/toxic.sav"),
    (log2, "pickle/severetoxic.sav"),
    (log3, "pickle/obscene.sav"),
    (log4, "pickle/threat.sav"),
    (log5, "pickle/insult.sav"),
    (log6, "pickle/identityhate.sav"),
):
    with open(_sav_path, 'wb') as _sav_file:
        pickle.dump(_fitted_model, _sav_file)