In [2]:
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.snowball import EnglishStemmer
from nltk.tokenize import word_tokenize
from scipy import sparse
from scipy.sparse import hstack
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss,confusion_matrix,classification_report,roc_curve,auc
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score # validate training data
from sklearn.pipeline import make_union # combine vectors

# Logistic Regression with TFIDF

In [67]:
# Names of the six binary label columns in the training file.
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Read both splits, then keep the raw comment text of each as a Series.
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ('PROJ/train.csv', 'PROJ/test.csv'))

train, test = train_data['comment_text'], test_data['comment_text']

FileNotFoundError: File b'PROJ/train.csv' does not exist

In [None]:
# Number of training comments flagged with each label (a comment counts when the column equals 1).
count_array = [int((train_data[label] == 1).sum()) for label in class_names]
toxic, severe_toxic, obscene, threat, insult, identity_hate = count_array

score_df = pd.DataFrame(index=class_names)
score_df['class'] = count_array

# `kind` has to be a keyword: recent pandas versions reject positional arguments to Series.plot.
freq_plot = score_df['class'].plot(kind='bar')
plt.title("Frequency of Each Toxicity")
# Vertical bars: the label names are on the x axis and the counts on the y axis
# (the two axis labels were swapped before).
plt.xlabel("Toxicity")
plt.ylabel("Frequency")
plt.xticks(rotation = 360)


# ----------------------------------

In [None]:
#Stemming using Snowball-English
# EnglishStemmer has to be imported before this cell runs; originally its only
# import sat in a much later cell, so a fresh top-to-bottom run failed here.
stemmer = EnglishStemmer()
# Analyzer of a default-configured TfidfVectorizer; it does the tokenising for stemmed_words.
analyzer = TfidfVectorizer().build_analyzer()
def stemmed_words(doc):
    """Return a generator of the Snowball stem of every token `analyzer` extracts from `doc`."""
    return (stemmer.stem(w) for w in analyzer(doc))

#Vectorizer
# With a callable analyzer scikit-learn does not use strip_accents, token_pattern or
# ngram_range, so those arguments were dead (and the pattern '[a-z]{3, 15}' was not a
# valid quantifier because of the space). They are dropped; the features are unchanged.
tfidf_vectorizer = TfidfVectorizer(
    analyzer=stemmed_words,
    max_features=10000)

vectorizer = tfidf_vectorizer

# fit_transform learns the vocabulary and vectorises the training text in one pass
# instead of stemming the whole corpus twice.
train_vec = vectorizer.fit_transform(train)
test_vec = vectorizer.transform(test)

In [None]:
# One logistic-regression model per label, fitted on the training vectors.
# The submission keeps the second predict_proba column for every test comment.
submission = pd.DataFrame({'id': test_data['id']})
for class_name in class_names:
    train_target = train_data[class_name]
    classifier = LogisticRegression(max_iter=5000, solver='lbfgs')
    classifier.fit(train_vec, train_target)
    class_proba = classifier.predict_proba(test_vec)
    submission[class_name] = class_proba[:, 1]

In [None]:
# Write the `submission` frame to disk without the index column.
submission.to_csv('PROJ/submission_1.csv', index=False)
# Result: 0.97609

# ----------------------------------

In [None]:
# Word-level TF-IDF: accents stripped, English stop words removed,
# unigrams and bigrams, vocabulary capped at 10000 terms.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    strip_accents='unicode',
    stop_words='english',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    max_features=10000,
)
vectorizer = tfidf_vectorizer

# Learn the vocabulary from the training text only, then vectorise both splits.
train_vec = vectorizer.fit(train).transform(train)
test_vec = vectorizer.transform(test)

In [None]:
# Per-label logistic regression on the current train_vec / test_vec.
submission2 = pd.DataFrame({'id': test_data['id']})
for class_name in class_names:
    train_target = train_data[class_name]
    classifier = LogisticRegression(max_iter=5000, solver='lbfgs')
    classifier.fit(train_vec, train_target)
    # Keep the second probability column for each test comment.
    submission2[class_name] = classifier.predict_proba(test_vec)[:, 1]

submission2.to_csv('PROJ/submission_2.csv', index=False)
# Result: 0.97215

In [None]:
# TF-IDF with library defaults, except that the token pattern also keeps
# single-character tokens.
tfidf_vectorizer = TfidfVectorizer(token_pattern=r'\w{1,}')
vectorizer = tfidf_vectorizer

# Vocabulary comes from the training text; both splits are then vectorised with it.
vectorizer.fit(train)
train_vec, test_vec = vectorizer.transform(train), vectorizer.transform(test)

In [None]:
# Per-label logistic regression on the current train_vec / test_vec.
submission3 = pd.DataFrame({'id': test_data['id']})
for class_name in class_names:
    train_target = train_data[class_name]
    classifier = LogisticRegression(max_iter=5000, solver='lbfgs')
    classifier.fit(train_vec, train_target)
    # Keep the second probability column for each test comment.
    submission3[class_name] = classifier.predict_proba(test_vec)[:, 1]

submission3.to_csv('PROJ/submission_3.csv', index=False)
# Result: 0.97375

# -----------------------------------

In [None]:
# NOTE(review): `all_comments` is not read by any cell visible here; it is kept
# only in case a cell outside this view depends on it.
all_comments = pd.concat([train,test])

# Word-level TF-IDF: accents stripped, English stop words removed, unigrams only.
tfidf_vectorizer_word = TfidfVectorizer(
    strip_accents='unicode',
    analyzer= 'word',
    token_pattern=r'\w{1,}',
    stop_words= 'english',
    ngram_range=(1, 1),
    max_features=10000)

# Character-level TF-IDF over 2- to 6-character n-grams.
# token_pattern and stop_words apply only to analyzer='word'; scikit-learn ignores
# them for 'char' and warns, so they are dropped. The features are unchanged.
tfidf_vectorizer_char = TfidfVectorizer(
    strip_accents='unicode',
    analyzer= 'char',
    ngram_range=(2, 6),
    max_features=10000)

# Combining two vectors: the union concatenates both feature sets column-wise.
vectorizer = make_union(tfidf_vectorizer_word,tfidf_vectorizer_char,n_jobs=3)

# fit_transform fits both vectorizers and vectorises the training text in one pass.
train_vec = vectorizer.fit_transform(train)
test_vec = vectorizer.transform(test)

In [None]:
# Per-label logistic regression on the combined word + character TF-IDF features.
submission4 = pd.DataFrame({'id': test_data['id']})
for class_name in class_names:
    train_target = train_data[class_name]
    classifier = LogisticRegression(max_iter=5000, solver='lbfgs')
    classifier.fit(train_vec, train_target)
    # Keep the second probability column for each test comment.
    submission4[class_name] = classifier.predict_proba(test_vec)[:, 1]

submission4.to_csv('PROJ/submission_4.csv', index=False)
#Result: 0.97624


# ---------------------------------------------

# Random Forest with TFIDF

In [None]:
#Stemming
# EnglishStemmer has to be imported before this cell runs; originally its only
# import sat in a later cell, so a fresh top-to-bottom run failed here.
stemmer = EnglishStemmer()
# Analyzer of a default-configured TfidfVectorizer; it does the tokenising for stemmed_words.
analyzer = TfidfVectorizer().build_analyzer()
def stemmed_words(doc):
    """Return a generator of the Snowball stem of every token `analyzer` extracts from `doc`."""
    return (stemmer.stem(w) for w in analyzer(doc))

#Vectorizer
# With a callable analyzer scikit-learn does not use strip_accents, token_pattern or
# ngram_range, so those arguments were dead (and '[a-z]{3, 15}' was not a valid
# quantifier because of the space). They are dropped; the features are unchanged.
tfidf_vectorizer = TfidfVectorizer(
    analyzer=stemmed_words,
    max_features=10000)

vectorizer = tfidf_vectorizer

# One pass over the training text instead of fit followed by transform.
train_vec = vectorizer.fit_transform(train)
test_vec = vectorizer.transform(test)

In [None]:
# One random forest per label on the stemmed TF-IDF features.
# random_state=None is passed explicitly, so the forests are not seeded.
submission5 = pd.DataFrame({'id': test_data['id']})
for class_name in class_names:
    train_target = train_data[class_name]
    classifier = RandomForestClassifier(
        n_estimators=10,
        max_features=1000,
        criterion='gini',
        bootstrap=True,
        random_state=None,
    )
    classifier.fit(train_vec, train_target)
    # Keep the second probability column for each test comment.
    submission5[class_name] = classifier.predict_proba(test_vec)[:, 1]

submission5.to_csv('PROJ/submission_5.csv', index=False)
# Test Result: 0.90573

# Logistic Regression with CountVectorizer & Stemming


In [3]:
# Load both splits from the working directory.
train_df, test_df = pd.read_csv("train.csv"), pd.read_csv("test.csv")

In [46]:
# Label columns that the models below predict.
columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']   

# Add every label column to the test frame, initialised to the empty string.
for i in columns:
    test_df[i] = ''

In [4]:
# Raw comment text of each split; these Series feed the vectorizers below.
comment_train = train_df["comment_text"]
comment_test = test_df["comment_text"]
#comments = pd.concat([comment_train, comment_test])

In [10]:
# stemming
from nltk.stem.snowball import EnglishStemmer
# Snowball stemmer plus the analyzer of a default-configured CountVectorizer.
stemmer = EnglishStemmer()
analyzer = CountVectorizer().build_analyzer()

def stemmed_words(doc):
    """Return a generator of the stem of each token `analyzer` produces for `doc`."""
    return (stemmer.stem(token) for token in analyzer(doc))

In [54]:
# Counts of stemmed words, vocabulary capped at 30000 terms.
# A callable analyzer replaces scikit-learn's own preprocessing and tokenising, so
# lowercase, stop_words, ngram_range and token_pattern had no effect here (stop words
# were never removed). They are dropped; the features are unchanged.
vectorizer = CountVectorizer(
    analyzer = stemmed_words,
    max_features=30000)

# Word n-gram vector
# fit_transform stems the training corpus once instead of twice (fit, then transform).
train_dt = vectorizer.fit_transform(comment_train)
test_dt = vectorizer.transform(comment_test)

In [49]:
# Counts of 2- to 6-character n-grams, vocabulary capped at 30000.
# stop_words applies only to analyzer='word', so it was ignored here and is dropped.
char_vectorizer = CountVectorizer(
    lowercase = True,
    analyzer='char',
    ngram_range=(2, 6),
    max_features=30000)

# Character n-gram vector
# fit_transform extracts the character n-grams of the training text once instead of twice.
train_char_dt = char_vectorizer.fit_transform(comment_train)

In [50]:
# Character n-gram counts of the test comments, using the already fitted char_vectorizer.
test_char_dt = char_vectorizer.transform(comment_test)

In [58]:

# Put the word-count and character-count matrices side by side (column-wise) for each split.
train_stack = hstack([train_dt, train_char_dt])
test_stack = hstack([test_dt, test_char_dt])

In [59]:
# stemmed words + 'char': per-label logistic regression (C=0.1) on the stacked features.
pred = pd.DataFrame({'id': test_df['id']})
for i in columns:
    train_target = train_df[i]
    classifier = LogisticRegression(max_iter=5000, solver='lbfgs', C=0.1)
    classifier.fit(train_stack, train_target)
    # Keep the second probability column for each test comment.
    pred[i] = classifier.predict_proba(test_stack)[:, 1]

KeyboardInterrupt: 

In [None]:
# Write the `pred` frame to disk without the index column.
pred.to_csv('pred.csv', index=False)

In [60]:
# Counts of stemmed words, vocabulary capped at 30000 terms.
# A callable analyzer replaces scikit-learn's own preprocessing and tokenising, so
# lowercase, stop_words, ngram_range and token_pattern had no effect here.
# They are dropped; the features are unchanged.
vectorizer2 = CountVectorizer(
    analyzer = stemmed_words,
    max_features=30000)

# Word n-gram vector
# fit_transform stems the training corpus once instead of twice (fit, then transform).
train_dt2 = vectorizer2.fit_transform(comment_train)
test_dt2 = vectorizer2.transform(comment_test)

In [64]:
# stemmed words only: per-label logistic regression (C=0.1) on the stemmed word counts.
# `scores` is created empty here and is not filled in this cell.
scores = []
pred2 = pd.DataFrame({'id': test_df['id']})
for i in columns:
    train_target = train_df[i]
    classifier = LogisticRegression(max_iter=5000, solver='lbfgs', C=0.1)
    classifier.fit(train_dt2, train_target)
    # Keep the second probability column for each test comment.
    pred2[i] = classifier.predict_proba(test_dt2)[:, 1]

#result is 0.9529310500879543

Total CV score is 0.9529310500879543


In [62]:
# Write the `pred2` frame to disk without the index column.
pred2.to_csv('pred2.csv', index=False)

In [None]:
# Plain (unstemmed) word counts: lowercased, English stop words removed,
# unigrams, single-character tokens allowed, vocabulary capped at 30000.
vectorizer3 = CountVectorizer(
    analyzer='word',
    lowercase=True,
    stop_words='english',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=30000,
)

# Word n-gram vector: vocabulary from the training comments, applied to both splits.
train_dt3 = vectorizer3.fit(comment_train).transform(comment_train)
test_dt3 = vectorizer3.transform(comment_test)

In [None]:
# Put the plain word counts and the character n-gram counts side by side for each split.
train_stack2 = sparse.hstack([train_dt3, train_char_dt])
test_stack2 =sparse.hstack([test_dt3, test_char_dt])

In [None]:
# 'word' + 'char': per-label logistic regression (C=0.1) on the stacked count features.
pred3 = pd.DataFrame({'id': test_df['id']})
for i in columns:
    train_target = train_df[i]
    classifier = LogisticRegression(max_iter=5000, solver='lbfgs', C=0.1)
    classifier.fit(train_stack2, train_target)
    # Keep the second probability column for each test comment.
    pred3[i] = classifier.predict_proba(test_stack2)[:, 1]

pred3.to_csv('pred3.csv', index=False)

# KNN Method

Reference note: the ideas and some parts of the code in this section were inspired by and adapted from an attempt to implement Alexander Burmistrov's model (3rd place in the Kaggle Toxic Comment Classification Challenge). https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/discussion/52644

In [None]:
# Load in data science packages to be used.

import numpy as np
# Let's set the seed.
np.random.seed(32)

import pandas as pd
import string
import re

import gensim
from collections import Counter
import pickle

import tensorflow as tf
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Dense, Dropout, Conv1D, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, LSTM,Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers import CuDNNLSTM, CuDNNGRU
from keras.preprocessing import text, sequence

from keras.callbacks import Callback
from keras import optimizers
from keras.layers import Lambda

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import os
os.environ['OMP_NUM_THREADS'] = '4'

import gc
from keras import backend as K
from sklearn.model_selection import KFold

from unidecode import unidecode
import time

In [None]:
# NLTK's English stop-word list, stored as a set.
eng_stopwords = set(stopwords.words("english"))

In [None]:
# Open the given data.
#train = pd.read_csv("train.csv")
#test = pd.read_csv("test.csv")

# Later, work with the processed data.
# NOTE: this rebinds `train` / `test`, which earlier in the notebook held the
# comment_text columns of the raw data, to full DataFrames read from the processed files.
train = pd.read_csv("train_processed.csv")
test = pd.read_csv("test_processed.csv")

In [None]:
# Compiled once and reused for every comment. The pattern matches any character
# that is not a letter, a space or one of . - ? ! , # @ %
spe_char_remove = re.compile(r'[^A-Za-z\.\-\?\!\,\#\@\% ]',re.IGNORECASE)

def clean_text(x):
    """Pass `x` through unidecode, then delete every character `spe_char_remove` matches."""
    return spe_char_remove.sub('', unidecode(x))

# Replace missing comments with "something" BEFORE converting to str: str(NaN) is the
# text 'nan', so the fillna that used to run after cleaning could never trigger.
train['clean_text'] = train['comment_text'].fillna("something").apply(lambda x: clean_text(str(x)))
test['clean_text'] = test['comment_text'].fillna("something").apply(lambda x: clean_text(str(x)))

# Cleaned training text and the six label columns as arrays.
# The name was misspelled `trianX`, while the tokenizer cell reads `trainX`.
trainX = train['clean_text'].values
trainY = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values

X_test = test['clean_text'].values

In [None]:
def descriptions(df):
    """Add simple text-statistics columns to `df` (in place) and return it.

    `df` must have a 'comment_text' column; it is converted to str first.
    Columns written: total_length, capitals, caps_vs_length, num_words,
    num_unique_words and words_vs_unique. The two ratio columns are NaN for an
    empty comment (0 / 0) instead of raising ZeroDivisionError.
    """
    df['comment_text'] = df['comment_text'].apply(str)
    df['total_length'] = df['comment_text'].apply(len)
    df['capitals'] = df['comment_text'].apply(lambda comment: sum(1 for c in comment if c.isupper()))
    # Vectorised division: same values as the former row-wise float division,
    # but a zero-length comment yields NaN rather than an exception.
    df['caps_vs_length'] = df['capitals'] / df['total_length']
    # Raw string so that \S is a regex class, not a string escape.
    df['num_words'] = df.comment_text.str.count(r'\S+')
    df['num_unique_words'] = df['comment_text'].apply(lambda comment: len(set(comment.split())))
    df['words_vs_unique'] = df['num_unique_words'] / df['num_words']

    return df

# Add the text-statistics columns to both frames.
train, test = descriptions(train), descriptions(test)

# The two ratio columns are the extra (non-text) model inputs; missing ratios become 0.
ratio_columns = ['caps_vs_length', 'words_vs_unique']
features = train[ratio_columns].fillna(0)
test_features = test[ratio_columns].fillna(0)

In [None]:
# Vocabulary is capped at 10000 words and every comment is padded/truncated to 500 tokens.
maxim_f = 10000
maxim_l = 500

# Fit the tokenizer on the training and test text together.
tokenizer = text.Tokenizer(num_words=maxim_f)
tokenizer.fit_on_texts(list(trainX) + list(X_test))

# The integer sequences must exist before they can be padded
# (these two statements used to come after the padding calls).
X_train_sequence = tokenizer.texts_to_sequences(trainX)
X_test_sequence = tokenizer.texts_to_sequences(X_test)

# pad_sequences takes the target length as `maxlen`; `maxim_l=` is not a valid keyword.
x_train = sequence.pad_sequences(X_train_sequence, maxlen=maxim_l)
x_test = sequence.pad_sequences(X_test_sequence, maxlen=maxim_l)

In [None]:
# Let's load our Fasttext and GloVe files to be used for vectorizing.
# NOTE(review): both are absolute paths on one user's Windows machine, so the
# notebook only runs elsewhere after these two constants are edited.
EMBEDDING_FILE_FASTTEXT="C:\\Users\\avang\\Documents\\STA141C\\Project\\toxic\\wiki-news-300d-1M.vec\\wiki-news-300d-1M.vec"
EMBEDDING_FILE_TWITTER="C:\\Users\\avang\\Documents\\STA141C\\Project\\toxic\\glove.twitter.27B\\glove.twitter.27B.200d.txt"

def get_coefs(word, *arr):
    """Return `(word, vector)`, where vector is `arr` converted to a float32 numpy array."""
    vector = np.asarray(arr, dtype='float32')
    return word, vector

In [None]:
# Note: takes a while to run.
# Each line is split on single spaces into a word and its coefficients.
# The `with` block closes the file once the dict is built (the handle used to stay open).
# NOTE(review): this file is later also loaded with load_word2vec_format; if it starts
# with a "count dim" header line, that line ends up here as one extra entry. Confirm.
with open(EMBEDDING_FILE_FASTTEXT, encoding='utf-8') as ft_file:
    embedFT = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in ft_file)

In [None]:
# Word -> float32 vector for the GloVe Twitter file (whitespace-separated lines).
# The `with` block closes the file once the dict is built (the handle used to stay open).
with open(EMBEDDING_FILE_TWITTER, encoding='utf-8') as tw_file:
    embedTW = dict(get_coefs(*o.strip().split()) for o in tw_file)

In [None]:
# Load the FastText file a second time, now as gensim KeyedVectors; the cells below
# take the spell checker's word ordering from it.
spell_model = gensim.models.KeyedVectors.load_word2vec_format(EMBEDDING_FILE_FASTTEXT)

In [None]:
# Vocabulary of the loaded vectors, in the order the model stores it.
words = spell_model.index2word

# Map every word to its position in that list.
w_rank = {word: position for position, word in enumerate(words)}

# The spell-checker functions look words up through this name.
WORDS = w_rank

In [None]:
# Use fast text as vocabulary
def words(text):
    """Return every run of word characters in `text`, lowercased, in order of appearance."""
    lowered = text.lower()
    return re.findall(r'\w+', lowered)

def P(word):
    """Score `word` as the negated value stored for it in WORDS; higher means more probable."""
    # Words missing from WORDS are treated as if their stored value were 0.
    rank = WORDS.get(word, 0)
    return -rank

def correction(word):
    """Return the candidate for `word` with the highest P score."""
    options = candidates(word)
    return max(options, key=P)

def candidates(word):
    """Return the first non-empty option: the word itself if known, known one-edit
    variants, known two-edit variants, or `[word]` as a last resort."""
    exact = known([word])
    if exact:
        return exact
    one_edit = known(complete_A(word))
    if one_edit:
        return one_edit
    two_edits = known(complete_B(word))
    if two_edits:
        return two_edits
    return [word]

def known(words):
    """Return the set of items in `words` that are keys of WORDS."""
    return {entry for entry in words if entry in WORDS}

def complete_A(word):
    """Return the set of strings one edit away from `word`.

    An edit is deleting one character, swapping two adjacent characters,
    replacing one character with a lowercase letter, or inserting one
    lowercase letter.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    edits = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # Insertions are possible at every cut, including the end of the word.
        for ch in alphabet:
            edits.add(head + ch + tail)
        if not tail:
            continue
        # Deletion and replacement act on the first character of the tail.
        edits.add(head + tail[1:])
        for ch in alphabet:
            edits.add(head + ch + tail[1:])
        # A transposition needs two characters in the tail.
        if len(tail) > 1:
            edits.add(head + tail[1] + tail[0] + tail[2:])
    return edits

def complete_B(word):
    """Return a generator over every string reachable from `word` by two successive
    single edits. The generator may yield the same string more than once."""
    return (second for first in complete_A(word) for second in complete_A(first))

In [None]:
# Vocabulary learned by the tokenizer (word -> integer index).
word_index = tokenizer.word_index
# Number of rows of the embedding matrix: the vocabulary size, capped at maxim_f.
nb_words = min(maxim_f, len(word_index))
# One 501-wide row per word index: slots 0-299, 300-499 and 500 are filled by embed_word.
# NOTE(review): apply_model declares its Embedding layer with maxim_f rows, so this
# shape only matches when the vocabulary has at least maxim_f words. Confirm.
embedding_matrix = np.zeros((nb_words,501))

In [None]:
# Vectors of the word "something" in each embedding; .get returns None if the word is absent.
something_tw = embedTW.get("something")
something_ft = embedFT.get("something")

In [None]:
# Fallback row written for words that get no embedding of their own: the FastText
# vector of "something" in slots 0-299, its GloVe-Twitter vector in slots 300-499,
# and 0 in slot 500 (the slot embed_word uses for its all-caps flag).
something = np.zeros((501,))
something[:300,] = something_ft
something[300:500,] = something_tw 
something[500,] = 0

In [None]:
def all_caps(word):
    """Return True when `word` has at least two characters and `str.isupper()` holds for it."""
    if len(word) <= 1:
        return False
    return word.isupper()

def embed_word(embedding_matrix,i,word):
    """Write the embedding of `word` into row `i` of `embedding_matrix`, in place.

    Slots 0-299 receive the FastText vector, slot 500 is 1 when the word is all
    caps and 0 otherwise, and slots 300-499 receive the GloVe-Twitter vector when
    one exists. Nothing is written if the word has no FastText vector.
    """
    # The lookup used the undefined name `embedFD`; the FastText dict is `embedFT`.
    embedding_vector_ft = embedFT.get(word)
    if embedding_vector_ft is not None:
        embedding_matrix[i,:300] = embedding_vector_ft
        # NOTE(review): if the tokenizer lowercases its vocabulary this flag is always 0. Confirm.
        embedding_matrix[i,500] = 1 if all_caps(word) else 0
        embedding_vector_tw = embedTW.get(word)
        if embedding_vector_tw is not None:
            embedding_matrix[i,300:500] = embedding_vector_tw

In [None]:
def singlify(word):
    """Collapse every run of a repeated character into one character ("coool" -> "col").

    NOTE(review): `singlify` was called below but never defined in this notebook;
    this definition follows the spell-checker approach the notebook cites as its
    source. Confirm it matches the intended behaviour.
    """
    return "".join(letter for i, letter in enumerate(word) if i == 0 or letter != word[i-1])

# Fill one embedding row per tokenizer word. Words without a FastText vector are
# spell-corrected first, and fall back to the "something" vector as a last resort.
for word, i in word_index.items():

    # Only indices that have a row in the matrix can be filled. The matrix has
    # min(maxim_f, vocabulary size) rows, so this also covers a small vocabulary.
    if i >= embedding_matrix.shape[0]: continue

    if embedFT.get(word) is not None:
        embed_word(embedding_matrix,i,word)
        continue

    # Tokens longer than 25 characters are not spell-corrected.
    if len(word) > 25:
        embedding_matrix[i] = something
        continue

    # First try the corrected word, then the corrected de-duplicated word.
    word2 = correction(word)
    if embedFT.get(word2) is None:
        word2 = correction(singlify(word))

    if embedFT.get(word2) is not None:
        embed_word(embedding_matrix,i,word2)
    else:
        embedding_matrix[i] = something

In [None]:
class get_ROC_AUC(Callback):
    """Keras callback that tracks ROC-AUC on held-out data.

    Every `interval` epochs it predicts on the validation inputs and computes
    roc_auc_score. A new best score saves the model weights to "best_weights.h5";
    more than 3 scored epochs without improvement stop training.

    Parameters
    ----------
    validation_data : a pair (X_val, y_val).
    interval : score every `interval`-th epoch.
    """
    def __init__(self, validation_data=(), interval=1):
        # super(Callback, self) started the lookup *after* Callback and therefore
        # skipped Callback.__init__; name this class so the base initialiser runs.
        super(get_ROC_AUC, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data
        self.max_score = 0
        self.not_better_count = 0

    def on_epoch_end(self, epoch, logs=None):
        # `logs` is not used; None replaces the shared mutable default {}.
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=1)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))
            if (score > self.max_score):
                print("*** New High Score (previous: %.6f) \n" % self.max_score)
                # Save the model this callback is attached to, not whatever the
                # global name `model` happens to point at.
                self.model.save_weights("best_weights.h5")
                self.max_score=score
                self.not_better_count = 0
            else:
                self.not_better_count += 1
                if self.not_better_count > 3:
                    print("Epoch %05d: early stopping, high score = %.6f" % (epoch,self.max_score))
                    self.model.stop_training = True

In [None]:
# Set the clipvalue to 1.0 and the number of filters to 40.
def apply_model(features,clipvalue=1.0,num_filters=40,dropout=0.5,embed_size=501):
    """Build and compile the two-input recurrent classifier.

    Parameters
    ----------
    features : only `features.shape[1]` is read, to size the extra-feature input.
    clipvalue : gradient clip value passed to the Adam optimizer.
    num_filters : number of units of the LSTM and GRU layers.
    dropout : rate of the SpatialDropout1D layer.
    embed_size : width of one embedding row.

    Reads the globals `maxim_l`, `maxim_f` and `embedding_matrix`.
    Returns a compiled Model with inputs [token sequence, extra features] and
    6 sigmoid outputs.
    """
    ftrInput = Input(shape=(features.shape[1],))
    inp = Input(shape=(maxim_l, ))

    # Frozen embedding layer initialised from the precomputed embedding matrix.
    x = Embedding(maxim_f, embed_size, weights=[embedding_matrix], trainable=False)(inp)

    # Spatial dropout, then a bidirectional LSTM followed by a bidirectional GRU.
    x = SpatialDropout1D(dropout)(x)
    x = Bidirectional(LSTM(num_filters, return_sequences=True))(x)
    # With return_state=True the wrapped GRU returns the sequence plus two final
    # states. NOTE(review): presumably forward then backward; only the first one
    # is used below, as before. Confirm against the Keras documentation.
    x, forward_state, backward_state = Bidirectional(GRU(num_filters, return_sequences=True, return_state = True))(x)

    # Pool the GRU sequence over time, then join the pooled vectors with the
    # first final state and the extra features.
    maxim_p = GlobalMaxPooling1D()(x)
    avg_p = GlobalAveragePooling1D()(x)
    x = concatenate([avg_p, forward_state, maxim_p,ftrInput])

    # Dense Layer: one sigmoid output per label.
    dense_l = Dense(6, activation="sigmoid")(x)

    model = Model(inputs= [inp,ftrInput], outputs = dense_l)
    # Use the Adam class itself; the lowercase `optimizers.adam` alias is not
    # available in every Keras release.
    opt_a = optimizers.Adam(clipvalue = clipvalue)
    model.compile(loss = 'binary_crossentropy',
                  optimizer = opt_a,
                  metrics = ['accuracy'])
    return model

In [None]:
# NOTE(review): K.clear_session() is called a few lines below, after this model is
# built, and the fold loop builds its own model for every fold.
model = apply_model(features)

batch_size = 32 

# Up to 80 epochs per fold; the ROC-AUC callback stops training early once more
# than 3 scored epochs pass without a new best score.
epochs = 80
gc.collect()
K.clear_session()

num_folds = 10

# Accumulator for the fold-averaged test predictions: one row per test row, six columns.
predict = np.zeros((test.shape[0],6))

# Shuffled K-fold splitter with a fixed random_state.
kf = KFold(n_splits=num_folds, shuffle=True, random_state = 200)

In [None]:
# Best validation ROC-AUC of every fold. Initialised here so the cell does not
# depend on a `scores` list left over from an unrelated earlier cell.
scores = []

# Positional row access for the extra features (`features` / `test_features` are
# DataFrames, where [] with an integer array selects columns, not rows).
features_array = np.asarray(features)
test_features_array = np.asarray(test_features)

for train_index, test_index in kf.split(x_train):

    # The network consumes the padded integer sequences (x_train), not the raw
    # text array trainX that was sliced here before.
    k_x_trn = x_train[train_index]
    k_X_ftrs = features_array[train_index]

    kfold_X_valid = x_train[test_index]
    kfold_X_valid_features = features_array[test_index]

    k_y_trn,kfold_y_test = trainY[train_index], trainY[test_index]

    gc.collect()
    K.clear_session()

    # Fresh model for every fold.
    model = apply_model(features)

    # To determine ROC and AUC on this fold's held-out rows after every epoch.
    ra_val = get_ROC_AUC(validation_data=([kfold_X_valid,kfold_X_valid_features], kfold_y_test), interval = 1)

    model.fit([k_x_trn,k_X_ftrs], k_y_trn, batch_size=batch_size, epochs=epochs, verbose=1,
             callbacks = [ra_val])
    gc.collect()

    # Restore the weights of the best epoch before predicting on the test set.
    model.load_weights("best_weights.h5")

    predict += model.predict([x_test,test_features_array], batch_size=batch_size,verbose=1) / num_folds

    gc.collect()
    # `cv_score` was never assigned; the fold's score is the callback's best ROC-AUC.
    cv_score = ra_val.max_score
    scores.append(cv_score)
    print('score: ',cv_score)

print('CV score: {}'.format(np.mean(scores)))