In [18]:
import pandas as pd
import json
import numpy as np
from skmultilearn.problem_transform import BinaryRelevance, ClassifierChain, LabelPowerset
from skmultilearn.adapt import MLkNN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from nltk.corpus import stopwords
from sklearn.metrics import hamming_loss
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from snapy import MinHash, LSH
from collections import Counter
from blooms_filter import BloomsFilter
from collections import Counter
from nltk.tokenize import word_tokenize
import string
import time
stop = stopwords.words('english')
punctuations = string.punctuation

In [2]:
df = pd.read_csv("F:\Datasets\jigsaw-toxic-comment-classification-challenge\\train.csv")
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [3]:
# performing TF-IDF on the data
text = list(df['comment_text'])
vectorizer = TfidfVectorizer(stop_words = stop)
X = vectorizer.fit_transform(text)
Y = df[["toxic","severe_toxic","obscene","threat","insult","identity_hate"]]

In [10]:
#Preprocessing text
def preprocess_text(inp):
    text = inp.lower()
    words =  word_tokenize(text)
    imp_words = []
    for word in words:
        word = word.translate(str.maketrans('', '', string.punctuation))
        if word not in stop and not word.isdigit() and len(word)>3:
            imp_words.append(word)
    return ' '.join(imp_words)

In [11]:
df['comment_text'] = df['comment_text'].apply(preprocess_text)
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation edits made username hardcore metal...,0,0,0,0,0,0
1,000103f0d9cfb60f,daww matches background colour seemingly stuck...,0,0,0,0,0,0
2,000113f07ec002fd,really trying edit constantly removing relevan...,0,0,0,0,0,0
3,0001b41b1c6bb37e,make real suggestions improvement wondered sec...,0,0,0,0,0,0
4,0001d958c54c6e35,hero chance remember page,0,0,0,0,0,0


In [14]:
labels = df.columns[2:]
drop_indices = df[df['comment_text'].map(len) < 2].index
df.drop(drop_indices,inplace=True)

In [36]:
split_point = int(len(df) * 0.8)
X_train_text = text[:split_point]
X_test_text = text[split_point:]
shuffled_df = df.sample(frac=1)
train_df = shuffled_df[:split_point]
test_df = shuffled_df[split_point:]
X_test = X[split_point:]
y_test = Y[split_point:]

In [16]:
# generating all the word occurences for each class label and removing less occuring words
label_words_dict = {}
for label in labels:
    comment_words = []
    comments = train_df['comment_text'][df[label]==1]
    for comment in comments:
        comment_words.extend(word_tokenize(comment))
    co = Counter(comment_words)
    word_features = []
    for i in co:
        if co[i] > 2:
            word_features.append(i)
    label_words_dict[label] = word_features

In [19]:
data = json.dumps(label_words_dict)
with open('train_jigsaw.json','w') as f:
    f.write(data)

In [20]:
with open('train_jigsaw.json','r') as f:
    data = json.loads(f.read())

In [22]:
start = time.time()
minhash = MinHash(X_train_text, n_gram=2, permutations=32, hash_bits=64, seed=3)

In [23]:
labels = [i for i in range(len(X_train_text))]
test_labels = [i for i in range(len(labels), len(labels) + len(X_test_text))]

In [24]:
lsh = LSH(minhash, labels, no_of_bands=16)

In [25]:
bloom_dict = {}
for key in data:
    bloom = BloomsFilter(false_positive = 0.001,n_items = len(data[key]))
    for word in set(data[key]):
        bloom.hash_functions(word)
    bloom_dict[key] = bloom
time.time() - start

4821.5509724617

In [26]:
def bloom_predict(bloom_dict,text):
    threshold = 0.95
    bloom_count = []
    word_features = word_tokenize(text)
    prediction = dict()
    for key in bloom_dict:
        prediction[key] = 0
        for word in word_features:
            pred = bloom_dict[key].predict(word)
            prediction[key] += pred
        prediction[key] = prediction[key]/len(word_features)
    pred_labels = []
    for key in prediction:
        if prediction[key] > threshold:
            pred_labels.append(1)
        else:
            pred_labels.append(0)
    return pred_labels

In [27]:
start = time.time()
new_minhash = MinHash(X_test_text, n_gram=2, permutations=32, hash_bits=64, seed=3)
lsh.update(new_minhash, test_labels)

In [28]:
predi = []
count =0 
sim_list = list(np.arange(0.3, 0.1, -0.05))
for i in range(len(labels), len(labels) + len(X_test_text)):
    predicted_labels = []
    for sim in sim_list:
        pred = lsh.query(i, min_jaccard = sim)
        if len(pred)>0:
            break
    for p in pred:
        predicted_labels.append(Y.values[p])
    P = np.array(predicted_labels)
    if P.shape[0] == 0:
        predi.append(bloom_predict(bloom_dict,text[i]))
    else:
        predi.append(np.sum(P,axis = 0))

In [29]:
predi = np.array(predi)
pred_cpy = np.copy(predi)
for pr in pred_cpy:
    #pr[pr>1] = 1
    pr[pr>2] = 1
    pr[pr<=2] = 0
time.time() - start

14905.908553600311

In [37]:
hamming_loss(y_test.values,pred_cpy)

0.036522937586139004

In [38]:
correct = 0
for i,j in zip(y_test.values,pred_cpy):
    if (i==j).sum() == 6:
        correct += 1
correct/len(pred_cpy)

0.8989026020455747

In [35]:
len(labels) - len(labels) + len(X_test_text)

32167