In [1]:
import pandas as pd
import numpy as np
from skmultilearn.problem_transform import BinaryRelevance, ClassifierChain, LabelPowerset
from skmultilearn.adapt import MLkNN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from nltk.corpus import stopwords
from sklearn.metrics import hamming_loss
from sklearn.model_selection import train_test_split
from snapy import MinHash, LSH
from collections import Counter
import time
stop = stopwords.words('english')

In [2]:
df = pd.read_csv("F:\Datasets\jigsaw-toxic-comment-classification-challenge\\train.csv")
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [3]:
text = list(df['comment_text'])
vectorizer = TfidfVectorizer(stop_words = stop)
X = vectorizer.fit_transform(text)
Y = df[["toxic","severe_toxic","obscene","threat","insult","identity_hate"]]

In [4]:
split_point = int(X.shape[0] * 0.75)
X_train_text = text[:split_point]
X_test_text = text[split_point:]
X_train = X[:split_point]
y_train = Y[:split_point]
X_test = X[split_point:]
y_test = Y[split_point:]

In [5]:
classifier = BinaryRelevance(classifier=DecisionTreeClassifier(), require_dense=[False,True])

In [15]:
start = time.time()
classifier.fit(X_train, y_train)
time.time() - start

5794.881060838699

In [16]:
start = time.time()
classifier.score(X_test,y_test)
time.time() - start

2.256603956222534

In [17]:
hamming_loss(y_test,classifier.predict(X_test))

0.0244570893807602

In [18]:
classifier = ClassifierChain(classifier=DecisionTreeClassifier(), require_dense=[False,True])
start = time.time()
classifier.fit(X_train, y_train)
time.time() - start

3349.4093067646027

In [19]:
start = time.time()
classifier.score(X_test,y_test)
time.time() - start

2.14054274559021

In [20]:
hamming_loss(y_test,classifier.predict(X_test))

0.02506705437044093

In [21]:
classifier = MLkNN(k = 9)
start = time.time()
classifier.fit(X_train, y_train.values)
time.time() - start

2282.6337480545044

In [22]:
start = time.time()
classifier.score(X_test,y_test)
time.time() - start

847.9049696922302

In [23]:
hamming_loss(y_test,classifier.predict(X_test))

0.03432097527552871

In [24]:
start = time.time()
minhash = MinHash(X_train_text, n_gram=2, permutations=16, hash_bits=64, seed=3)

In [25]:
labels = [i for i in range(len(X_train_text))]
test_labels = [i for i in range(len(labels), len(labels) + len(X_test_text))]

In [26]:
lsh = LSH(minhash, labels, no_of_bands=4)
time.time() - start

1847.653974056244

In [27]:
start = time.time()
new_minhash = MinHash(X_test_text, n_gram=2, permutations=16, hash_bits=64, seed=3)

In [28]:
lsh.update(new_minhash, test_labels)

In [29]:
predi = []
count =0 
default = np.array([0,0,0,0,0,0])
sim_list = list(np.arange(0.4, 0.1, -0.05))
for i in range(len(labels), len(labels) + len(X_test_text)):
    predicted_labels = []
    for sim in sim_list:
        pred = lsh.query(i, min_jaccard = sim)
        if len(pred)>0:
            break
    for p in pred:
        predicted_labels.append(Y.values[p])
    P = np.array(predicted_labels)
    if P.shape[0] == 0:
        predi.append(default)
    else:
        predi.append(np.sum(P,axis = 0))

In [30]:
predi = np.array(predi)
pred_cpy = np.copy(predi)
for pr in pred_cpy:
    #pr[pr>1] = 1
    pr[pr>2] = 1
    pr[pr<=2] = 0
time.time() - start

1529.3059828281403

In [31]:
hamming_loss(y_test,pred_cpy)

0.03669816759832552

In [23]:
correct = 0
for i,j in zip(y_test.values,pred_cpy):
    if (i==j).sum() == 6:
        correct += 1
correct/len(pred_cpy)

0.8981274910385281

In [101]:
(pred_cpy[0]==y_test.values[0]).sum()

6