In [1]:
import pandas as pd
import json
import numpy as np
from skmultilearn.problem_transform import BinaryRelevance, ClassifierChain, LabelPowerset
from skmultilearn.adapt import MLkNN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from nltk.corpus import stopwords
from sklearn.metrics import hamming_loss
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from snapy import MinHash, LSH
from collections import Counter
from blooms_filter import BloomsFilter
from collections import Counter
from nltk.tokenize import word_tokenize
import string
import time
stop = stopwords.words('english')
punctuations = string.punctuation

In [2]:
df = pd.read_csv("delicious_data.csv")
df.head()

Unnamed: 0,text,TAG_.imported,TAG_.net,TAG_2.0,TAG_2007,TAG_3d,TAG_??,TAG_???,TAG_????,TAG_academia,...,TAG_words,TAG_work,TAG_world,TAG_wp,TAG_writing,TAG_xhtml,TAG_xml,TAG_xp,TAG_yahoo,TAG_youtube
0,greasemonkey python,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,closer complaints linux mac os sleeping,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,und zur,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,mac os somehow,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,against anyway article auf care carefully comp...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [3]:
#Preprocessing text
def preprocess_text(inp):
    text = inp.lower()
    words =  word_tokenize(text)
    imp_words = []
    for word in words:
        word = word.translate(str.maketrans('', '', string.punctuation))
        if word not in stop and not word.isdigit() and len(word)>3:
            imp_words.append(word)
    return ' '.join(imp_words)

In [4]:
df['text'] = df['text'].apply(preprocess_text)
df.head()

Unnamed: 0,text,TAG_.imported,TAG_.net,TAG_2.0,TAG_2007,TAG_3d,TAG_??,TAG_???,TAG_????,TAG_academia,...,TAG_words,TAG_work,TAG_world,TAG_wp,TAG_writing,TAG_xhtml,TAG_xml,TAG_xp,TAG_yahoo,TAG_youtube
0,greasemonkey python,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,closer complaints linux sleeping,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,somehow,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,anyway article care carefully complaints deep ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [5]:
labels = df.columns[1:]
drop_indices = df[df['text'].map(len) < 2].index
df.drop(drop_indices,inplace=True)

In [6]:
text = list(df['text'])
vectorizer = TfidfVectorizer(stop_words = stop)
X = vectorizer.fit_transform(text)
Y = df[df.columns[1:]]

In [7]:
split_point = int(len(df) * 0.8)
X_train_text = text[:split_point]
X_test_text = text[split_point:]
shuffled_df = df.sample(frac=1)
train_df = shuffled_df[:split_point]
test_df = shuffled_df[split_point:]

In [8]:
label_words_dict = {}
for label in labels:
    comment_words = []
    comments = train_df['text'][df[label]==1]
    for comment in comments:
        comment_words.extend(word_tokenize(comment))
    #label_words_dict[label] = list(set(comment_words))
    co = Counter(comment_words)
    word_features = []
    for i in co:
        if co[i] > 1:
            word_features.append(i)
    label_words_dict[label] = word_features

In [15]:
data = json.dumps(label_words_dict)
with open('train_delicious.json','w') as f:
    f.write(data)

In [16]:
with open('train_delicious.json','r') as f:
    data = json.loads(f.read())

In [17]:
start = time.time()
bloom_dict = {}
for key in data:
    bloom = BloomsFilter(false_positive = 0.1,n_items = len(data[key]))
    for word in set(data[key]):
        bloom.hash_functions(word)
    bloom_dict[key] = bloom

In [18]:
minhash = MinHash(X_train_text, n_gram=2, permutations=32, hash_bits=64, seed=3)

In [19]:
labels = [i for i in range(len(X_train_text))]
test_labels = [i for i in range(len(labels), len(labels) + len(X_test_text))]

In [20]:
lsh = LSH(minhash, labels, no_of_bands=16)
time.time() - start

86.85732388496399

In [21]:
def bloom_predict(bloom_dict,text):
    threshold = 0.95
    bloom_count = []
    word_features = word_tokenize(text)
    prediction = dict()
    for key in bloom_dict:
        prediction[key] = 0
        for word in word_features:
            pred = bloom_dict[key].predict(word)
            prediction[key] += pred
        prediction[key] = prediction[key]/len(word_features)
    pred_labels = []
    for key in prediction:
        if prediction[key] > threshold:
            pred_labels.append(1)
        else:
            pred_labels.append(0)
    return pred_labels

In [22]:
start = time.time()
new_minhash = MinHash(X_test_text, n_gram=2, permutations=32, hash_bits=64, seed=3)
lsh.update(new_minhash, test_labels)

In [23]:
predi = []
count =0 
sim_list = list(np.arange(0.3, 0.1, -0.05))
for i in range(len(labels), len(labels) + len(X_test_text)):
    predicted_labels = []
    for sim in sim_list:
        pred = lsh.query(i, min_jaccard = sim)
        if len(pred)>0:
            break
    for p in pred:
        predicted_labels.append(Y.values[p])
    P = np.array(predicted_labels)
    if P.shape[0] == 0:
        predi.append(bloom_predict(bloom_dict,text[i]))
    else:
        predi.append(np.sum(P,axis = 0))

In [24]:
predi = np.array(predi)
pred_cpy = np.copy(predi)
for pr in pred_cpy:
    #pr[pr>1] = 1
    pr[pr>2] = 1
    pr[pr<=2] = 0
time.time() - start

62.316367626190186

In [30]:
hamming_loss(test_df[df.columns[1:]].values,pred_cpy)

0.019270972223593977

In [31]:
correct = 0
for i,j in zip(y_test.values,pred_cpy):
    if (i==j).sum() == 983:
        correct += 1
correct/len(pred_cpy)

NameError: name 'y_test' is not defined