In [8]:
import pandas as pd
import numpy as np
import time
from skmultilearn.problem_transform import BinaryRelevance, ClassifierChain, LabelPowerset
from skmultilearn.adapt import MLkNN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from nltk.corpus import stopwords
from sklearn.metrics import hamming_loss
from sklearn.model_selection import train_test_split
from snapy import MinHash, LSH
from collections import Counter
stop = stopwords.words('english')

In [2]:
df = pd.read_csv("delicious_data.csv")
df.head()

Unnamed: 0,text,TAG_.imported,TAG_.net,TAG_2.0,TAG_2007,TAG_3d,TAG_??,TAG_???,TAG_????,TAG_academia,...,TAG_words,TAG_work,TAG_world,TAG_wp,TAG_writing,TAG_xhtml,TAG_xml,TAG_xp,TAG_yahoo,TAG_youtube
0,greasemonkey python,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,closer complaints linux mac os sleeping,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,und zur,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,mac os somehow,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,against anyway article auf care carefully comp...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [3]:
text = list(df['text'])
vectorizer = TfidfVectorizer(stop_words = stop)
X = vectorizer.fit_transform(text)
Y = df[df.columns[1:]]

In [26]:
split_point = int(X.shape[0] * 0.75)
X_train_text = text[:split_point]
X_test_text = text[split_point:]
X_train = X[:split_point]
y_train = Y[:split_point]
X_test = X[split_point:]
y_test = Y[split_point:]

In [27]:
classifier = BinaryRelevance(classifier=DecisionTreeClassifier(), require_dense=[False,True])

In [28]:
start = time.time()
classifier.fit(X_train, y_train)
time.time() - start

4243.154811143875

In [29]:
start = time.time()
classifier.score(X_test,y_test)
time.time() - start

7.279276609420776

In [30]:
hamming_loss(y_test,classifier.predict(X_test))

0.024003540698454305

In [31]:
classifier = ClassifierChain(classifier=DecisionTreeClassifier(), require_dense=[False,True])
start = time.time()
classifier.fit(X_train, y_train)
time.time() - start

2645.1130669116974

In [32]:
start = time.time()
classifier.score(X_test,y_test)
time.time() - start

16.10744833946228

In [33]:
hamming_loss(y_test,classifier.predict(X_test))

0.02656256433872985

In [15]:
classifier = MLkNN(k = 9)
start = time.time()
classifier.fit(X_train, y_train.values)
time.time() - start

1707.5402476787567

In [16]:
start = time.time()
classifier.score(X_test,y_test.values)
time.time() - start

355.47704458236694

In [17]:
hamming_loss(y_test,classifier.predict(X_test))

0.018883472471296875

In [94]:
start = time.time()
minhash = MinHash(X_train_text, n_gram=2, permutations=64, hash_bits=64, seed=3)

In [95]:
labels = [i for i in range(len(X_train_text))]
test_labels = [i for i in range(len(labels), len(labels) + len(X_test_text))]

In [96]:
lsh = LSH(minhash, labels, no_of_bands=16)
time.time() - start

124.9205412864685

In [97]:
start = time.time()
new_minhash = MinHash(X_test_text, n_gram=2, permutations=64, hash_bits=64, seed=3)

In [98]:
lsh.update(new_minhash, test_labels)

In [99]:
predi = []
count =0 
default = np.zeros(983)
sim_list = list(np.arange(0.4, 0.1, -0.05))
for i in range(len(labels), len(labels) + len(X_test_text)):
    predicted_labels = []
    for sim in sim_list:
        pred = lsh.query(i, min_jaccard = sim)
        if len(pred)>0:
            break
    for p in pred:
        predicted_labels.append(Y.values[p])
    P = np.array(predicted_labels)
    if P.shape[0] == 0:
        predi.append(default)
    else:
        predi.append(np.sum(P,axis = 0))

In [100]:
predi = np.array(predi)
pred_cpy = np.copy(predi)
for pr in pred_cpy:
    #pr[pr>1] = 1
    pr[pr>2] = 1
    pr[pr<=2] = 0
time.time() - start

44.44212889671326

In [101]:
hamming_loss(y_test,pred_cpy)

0.0193066081669989

In [102]:
correct = 0
for i,j in zip(y_test.values,pred_cpy):
    if (i==j).sum() == 983:
        correct += 1
correct/len(pred_cpy)

0.0009932952570151478

In [104]:
correct

4