In [2]:
%matplotlib inline
import gc
import os
import pathlib
import pickle
import re
import string
import shutil

from matplotlib import pyplot as plt

import pandas as pd
import numpy as np
import scipy as sp

import sklearn as sk
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import mutual_info_classif, SelectKBest, chi2, f_classif
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils.class_weight import compute_sample_weight

from gensim.models import KeyedVectors

import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout, LSTM, Input, GlobalMaxPooling1D, Conv1D, MaxPooling1D, Embedding, Flatten, Concatenate
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from tqdm import tqdm, tqdm_notebook, tnrange, trange

In [3]:
# Set tqdm's class-level monitor_interval to 0 — presumably to switch off its
# background monitor thread; TODO confirm against the installed tqdm version.
tqdm.monitor_interval = 0
# Call .pandas() on a fresh tqdm_notebook instance — presumably this registers
# the progress_* helpers on pandas objects; verify against the tqdm docs.
tqdm_notebook().pandas()




In [4]:
# Load the training frame from a gzip-compressed pickle (path relative to the working directory).
# NOTE: unpickling can execute code stored in the file; only load files produced by a trusted step.
train = pd.read_pickle('CLEAN/train.P.gz', compression='gzip')

In [5]:
# Load the test frame from a gzip-compressed pickle (path relative to the working directory).
# NOTE: unpickling can execute code stored in the file; only load files produced by a trusted step.
test = pd.read_pickle('CLEAN/test.P.gz', compression='gzip')

In [6]:
# Preview the first five training rows
train.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,cleaned_comments
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,False,False,False,False,False,False,explanation \n why the edit make under my user...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,False,False,False,False,False,False,daww he match this background colour I be seem...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",False,False,False,False,False,False,hey man I be really not try to edit war its ju...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",False,False,False,False,False,False,more \n I can not make any real suggestion on ...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",False,False,False,False,False,False,you sir be my hero any chance you remember wha...


In [7]:
# Print the frame's dimensions and its column labels, one per line
print(train.shape, train.columns, sep='\n')

(159571, 9)
Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate', 'cleaned_comments'],
      dtype='object')


In [8]:
# Label columns, listed in the order they appear in the training frame
cats = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]
# Label name -> position of that label in `cats`
cat_col_i = {label: position for position, label in enumerate(cats)}

In [9]:
# Label matrix: the training frame restricted to the label columns named in `cats`
Y_train = train[cats]

In [10]:
# Class-imbalance sample weights.
# cat_counts / cat_weight are keyed by column POSITION (0..n-1) because the
# counts are taken from the bare value matrix, not from the labelled frame.
total = Y_train.shape[0]
cat_counts = pd.DataFrame(Y_train.values).sum().to_dict()
# Weight of a label = fraction of rows that do NOT carry it, so rarer labels weigh more.
cat_weight = {position: (total - count) / total for position, count in cat_counts.items()}
# Rows without any label are weighted by the fraction of rows that carry at least one label.
abnorm_total = (Y_train.sum(axis=1) > 0).sum()
norm_weight = abnorm_total / total


def calc_weight_for(s_obj):
    """Return the sample weight for one row of label flags.

    Parameters
    ----------
    s_obj : one row of the label matrix (pandas Series or 1-D array), with
        entries in the same column order as ``cat_weight``'s positional keys.

    Returns
    -------
    ``norm_weight`` when no label is set, otherwise the largest
    ``cat_weight`` among the labels that are set.
    """
    # Work on a plain array so that integer indexing is always positional.
    # Indexing a string-labelled Series with an int relies on a pandas
    # fallback that is deprecated.
    flags = np.asarray(s_obj)
    if flags.sum() == 0:
        return norm_weight
    return max(cat_weight[i] for i in range(flags.shape[0]) if flags[i])


samp_weight = Y_train.apply(calc_weight_for, axis=1)

In [11]:
def to_preds(prob):
    """Threshold one probability at 0.5: return 1 if ``prob >= 0.5``, else 0."""
    return int(prob >= 0.5)

# Element-wise version of to_preds for arrays of probabilities.
# otypes is given explicitly so empty inputs are accepted: without it
# np.vectorize has to call the function once to infer the output type and
# raises ValueError on size-0 input.
round_preds = np.vectorize(to_preds, otypes=[int])

def custom_metric(y_true, y_pred):
    """Mean of Keras' binary accuracy computed on the rounded predictions.

    Built entirely from ``keras.backend`` / ``keras.metrics`` calls, so the
    result is whatever tensor type those return.
    """
    backend = keras.backend
    rounded_pred = backend.round(y_pred)
    accuracy = keras.metrics.binary_accuracy(y_true, rounded_pred)
    return backend.mean(accuracy)

def test_model(model, train_data, cat):
    """Print a classification report for ``model`` on the training labels.

    Parameters
    ----------
    model : object exposing ``predict(data, batch_size=..., verbose=...)``.
    train_data : input passed straight to ``model.predict``.
    cat : column key (or keys) selecting the reference labels from ``Y_train``.
    """
    probabilities = model.predict(train_data, batch_size=128, verbose=1)
    # Threshold the predicted probabilities into hard 0/1 labels
    hard_labels = pd.DataFrame(round_preds(probabilities))
    report = classification_report(Y_train[cat], hard_labels)
    print(report)

In [12]:
# Two vectorizers over the cleaned comments, both using 1- to 3-grams and
# lowercase=False: one produces raw counts, the other TF-IDF weights.
cvect = CountVectorizer(
    lowercase=False,
    ngram_range=(1, 3),
)
tfidf_vect = TfidfVectorizer(
    lowercase=False,
    ngram_range=(1, 3),
)
all_terms_count = cvect.fit_transform(train['cleaned_comments'])
all_terms_tfidf = tfidf_vect.fit_transform(train['cleaned_comments'])

In [13]:
# Rescale the raw count matrix with a default-configured Normalizer
# (presumably unit L2 norm per row — confirm against the scikit-learn defaults).
# The fitted object is kept so the same transform can be applied to the test counts.
norm = sk.preprocessing.Normalizer()
all_terms_count_normed = norm.fit_transform(all_terms_count)

In [14]:
# Dimensions of the n-gram count matrix
all_terms_count.shape

(159571, 7225774)

In [15]:
# Dimensions of the TF-IDF matrix
all_terms_tfidf.shape

(159571, 7225774)

In [16]:
# Largest TF-IDF weight in the matrix
all_terms_tfidf.max()

1.0

In [17]:
# Stack the normalised counts and the TF-IDF weights side by side
# (count columns first, TF-IDF columns second).
# format='csr' builds the stacked matrix in compressed-row form once, so the
# feature selectors fitted on it do not each have to convert it again.
all_terms = sp.sparse.hstack((all_terms_count_normed, all_terms_tfidf), format='csr')
all_terms.shape

(159571, 14451548)

In [18]:
def multi_label_f_classif_scores(X, y):
    """Compute ``f_classif`` scores of every feature against each label column.

    Parameters
    ----------
    X : feature matrix accepted by ``f_classif``.
    y : 2-D label container (ndarray or DataFrame) whose columns are ordered
        as described by ``cat_col_i``.

    Returns
    -------
    list with one 1-D numpy array per entry of ``cats`` (same order); each
    array holds one score per column of ``X``.
    """
    # Positional column slicing needs an ndarray; this also lets callers pass
    # a DataFrame directly.
    labels = np.asarray(y)
    per_label_scores = []
    for cat in cats:
        # f_classif returns (scores, p-values); only the scores are used.
        f_scores, _pvalues = f_classif(X, labels[:, cat_col_i[cat]])
        # Keep the ndarray as-is: expanding millions of scores into a Python
        # list of floats multiplies the memory footprint for no benefit.
        per_label_scores.append(np.asarray(f_scores))
    return per_label_scores

def multi_label_f_classif_mean(X, y):
    """Score each feature by the mean of its per-label scores."""
    per_label = multi_label_f_classif_scores(X, y)
    # axis=0 reduces across labels, leaving one value per feature
    return np.mean(per_label, axis=0)

def multi_label_f_classif_max(X, y):
    """Score each feature by the maximum of its per-label scores."""
    per_label = multi_label_f_classif_scores(X, y)
    # axis=0 reduces across labels, leaving one value per feature
    return np.max(per_label, axis=0)

In [19]:
# Shared feature set: keep the 200000 columns of all_terms that score highest
# under multi_label_f_classif_max (best score across the label columns).
all_selector = SelectKBest(multi_label_f_classif_max, k=200000)
all_selected = all_selector.fit_transform(all_terms, Y_train)

In [20]:
# Per-label feature sets: for every label fit a separate selector that keeps
# the 200000 columns of all_terms scoring highest under f_classif for that label.
selectors, X_trains = {}, {}
for cat in tqdm_notebook(cats):
    selector = SelectKBest(score_func=f_classif, k=200000)
    selectors[cat] = selector
    selected = selector.fit_transform(all_terms, Y_train[cat])
    X_trains[cat] = selected




In [21]:
# Run a full garbage collection; the displayed value is gc.collect()'s return.
gc.collect()

15

In [None]:
# Smaller shared feature set: keep the 50000 columns of all_terms that score
# highest under multi_label_f_classif_max.
small_selector = SelectKBest(multi_label_f_classif_max, k=50000)
# Fit small_selector itself. Fitting all_selector here would ignore k=50000
# and re-fit the 200000-feature selector that is applied to the test data.
small_selected = small_selector.fit_transform(all_terms, Y_train)

In [None]:
# Truncated SVD of the selected features.
# random_state pins the solver's random initialisation so reruns reproduce
# the same decomposition.
# NOTE(review): n_components=50000 produces a dense (rows, 50000) array and
# cannot exceed the number of columns in small_selected — confirm this size
# is really intended before running.
svd = TruncatedSVD(n_components=50000, n_iter=8, random_state=0)
all_decomp = svd.fit_transform(small_selected)

In [None]:
# t-SNE embedding of the SVD output.
# n_components is 2 because scikit-learn's default Barnes-Hut solver only
# accepts fewer than 4 output dimensions; a value such as 5000 is rejected
# with a ValueError.
# random_state makes the stochastic embedding reproducible.
# NOTE(review): the result name keeps its original spelling ("tnse") in case
# other cells refer to it.
tsne = TSNE(n_components=2, perplexity=30.0, n_iter=5000, n_iter_without_progress=300, random_state=0)
tnse_data = tsne.fit_transform(all_decomp)

## Serialize the bag-of-words components

In [58]:
# Pickle every fitted component and selected matrix into the WORDBAG/ directory.
targdir = pathlib.Path("WORDBAG")
targdir.mkdir(parents=True, exist_ok=True)

# File name inside WORDBAG/ -> object to pickle
wordbag_artifacts = {
    "X_trains.P": X_trains,
    "Y_train.P": Y_train,
    "vectorizer_count.P": cvect,
    "vectorizer_tfidf.P": tfidf_vect,
    "selectors.P": selectors,
    "normalizer.P": norm,
    "all_selected.P": all_selected,
    "all_selector.P": all_selector,
}
for artifact_name, artifact in wordbag_artifacts.items():
    with open(targdir/artifact_name, 'wb') as f:
        pickle.dump(artifact, f)

## Transform the test data

In [78]:
# Preview the first five test rows
test.head(5)

Unnamed: 0,id,comment_text,cleaned_comments
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,yo bitch ja rule be more succesful then you wi...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,from rfc the title be fine as it be imo
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",source zawe ashton on lapland
3,00017563c3f7919a,":If you have a look back at the source, the in...",if you have a look back at the source the info...
4,00017695ad8997eb,I don't anonymously edit articles at all.,I do not anonymously edit article at all


In [79]:
# Count n-grams in the test comments with the vectorizer fitted on the training data (transform only, no refit)
test_vec_count = cvect.transform(test['cleaned_comments'])

In [80]:
# TF-IDF weights for the test comments, using the vectorizer fitted on the training data (transform only, no refit)
test_vec_tfidf = tfidf_vect.transform(test['cleaned_comments'])

In [81]:
# Apply the same Normalizer that was used on the training counts
test_vec_count_norm = norm.transform(test_vec_count)

In [82]:
# Stack in the same column order as the training matrix all_terms: normalised counts first, TF-IDF second
test_vec_combined = sp.sparse.hstack((test_vec_count_norm, test_vec_tfidf))

In [83]:
# Keep only the columns chosen by all_selector (transform only; the selector is not refitted here)
test_selected = all_selector.transform(test_vec_combined)

In [84]:
# Pickle the selected test features next to the training artifacts in targdir
test_selected_path = targdir/"test_selected.P"
with test_selected_path.open('wb') as f:
    pickle.dump(test_selected, f)