In [1]:
import pandas as pd
import numpy as np
import json
import time
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from nltk.corpus import stopwords
from sklearn.metrics import hamming_loss
from sklearn.model_selection import train_test_split
from snapy import MinHash, LSH
from blooms_filter import BloomsFilter
from collections import Counter
from nltk.tokenize import word_tokenize
import string
from collections import Counter
# English stop-word list from NLTK; preprocess_text tests tokens against it.
stop = stopwords.words('english')
# ASCII punctuation characters. NOTE(review): the visible cells never read this
# name (preprocess_text uses string.punctuation directly) -- confirm before removing.
punctuations = string.punctuation

In [2]:
# Load the dataset from the working directory: a free-text column named
# 'text' followed by 0/1 'TAG_*' indicator columns (see the preview below).
df = pd.read_csv("delicious_data.csv")
# Preview the first rows.
df.head()

Unnamed: 0,text,TAG_.imported,TAG_.net,TAG_2.0,TAG_2007,TAG_3d,TAG_??,TAG_???,TAG_????,TAG_academia,...,TAG_words,TAG_work,TAG_world,TAG_wp,TAG_writing,TAG_xhtml,TAG_xml,TAG_xp,TAG_yahoo,TAG_youtube
0,greasemonkey python,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,closer complaints linux mac os sleeping,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,und zur,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,mac os somehow,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,against anyway article auf care carefully comp...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [3]:
#Preprocessing text
def preprocess_text(inp):
    """Reduce a raw document to a space-separated string of content words.

    The text is lower-cased and split with ``word_tokenize``. ASCII
    punctuation is removed from every token, and a token is kept only when
    it is longer than three characters, does not consist solely of digits,
    and is not in the module-level ``stop`` list.

    Parameters
    ----------
    inp : str
        Raw text of a single document.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; an empty string when
        no token survives.
    """
    # One translation table serves every token of the document.
    strip_punct = str.maketrans('', '', string.punctuation)
    cleaned = (token.translate(strip_punct) for token in word_tokenize(inp.lower()))
    return ' '.join(
        token
        for token in cleaned
        if len(token) > 3 and not token.isdigit() and token not in stop
    )

In [4]:
# Replace every document's text with its cleaned form, then preview the result.
df['text'] = df['text'].map(preprocess_text)
df.head()

Unnamed: 0,text,TAG_.imported,TAG_.net,TAG_2.0,TAG_2007,TAG_3d,TAG_??,TAG_???,TAG_????,TAG_academia,...,TAG_words,TAG_work,TAG_world,TAG_wp,TAG_writing,TAG_xhtml,TAG_xml,TAG_xp,TAG_yahoo,TAG_youtube
0,greasemonkey python,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,closer complaints linux sleeping,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,somehow,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,anyway article care carefully complaints deep ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [5]:
# Every column except the first one is treated as a label column.
labels = df.columns[1:]
# Rows whose cleaned text is shorter than two characters are removed from df in place.
drop_indices = df.index[df['text'].map(len) < 2]
df.drop(index=drop_indices, inplace=True)

In [6]:
# 75/25 train/test split over a shuffled copy of df.
# The shuffle is seeded so that the split -- and every metric computed from
# it further down -- is the same on each Restart & Run All.
split_point = int(len(df) * 0.75)
shuffled_df = df.sample(frac=1, random_state=42)
# Positional slicing: the first 75% of shuffled rows train, the rest test.
train_df = shuffled_df.iloc[:split_point]
test_df = shuffled_df.iloc[split_point:]

In [28]:
# Per-label vocabulary: for each label, the words that occur more than once
# across the training documents carrying that label.

# Tokenize every training document once instead of once per label it carries.
train_tokens = train_df['text'].map(word_tokenize)

label_words_dict = {}
for label in labels:
    # Build the mask from train_df itself, so selection does not depend on
    # aligning a mask computed over the full df (test rows included).
    has_label = train_df[label] == 1
    co = Counter(word for tokens in train_tokens[has_label] for word in tokens)
    # Words seen only once for this label are left out of its vocabulary.
    label_words_dict[label] = [word for word, count in co.items() if count > 1]

In [29]:
# Serialize the per-label vocabularies to JSON and save them next to the notebook.
data = json.dumps(label_words_dict)
with open('train_delicious.json', mode='w') as out_file:
    out_file.write(data)

In [30]:
# Read the saved vocabularies back; `data` maps each label name to its word list.
with open('train_delicious.json', 'r') as in_file:
    data = json.load(in_file)

In [48]:
# Build one BloomsFilter per label from that label's vocabulary and time it.
start = time.time()
bloom_dict = {}
for key, words in data.items():
    # The filter is sized for the label's word count with a 1e-6 false-positive target.
    bloom = BloomsFilter(false_positive=1e-6, n_items=len(words))
    for word in set(words):
        # NOTE(review): hash_functions(word) presumably inserts the word into
        # the filter -- confirm against the blooms_filter module.
        bloom.hash_functions(word)
    bloom_dict[key] = bloom
# Elapsed wall-clock seconds, shown as the cell output.
time.time() - start

142.5594756603241

In [49]:
# Predict labels for every test document and time the pass.
# For each label, the score is the per-token mean of that label's filter
# predict() output; the label is predicted (1) only when the score is
# strictly greater than `threshold`.
start = time.time()
threshold = 0.95
bloom_count = []
for text in test_df['text']:
    word_features = word_tokenize(text)
    n_words = len(word_features)
    pred_labels = []
    # bloom_dict preserves insertion order, so each row of bloom_count lists
    # its predictions in the same order as the keys of bloom_dict.
    for key, bloom in bloom_dict.items():
        if n_words:
            hits = sum(bloom.predict(word) for word in word_features)
            score = hits / n_words
        else:
            # A document without tokens matches no label. This explicit guard
            # replaces a bare `except:` that also swallowed KeyboardInterrupt
            # and genuine errors raised inside this long-running loop.
            score = 0
        pred_labels.append(1 if score > threshold else 0)
    bloom_count.append(pred_labels)
# Elapsed wall-clock seconds, shown as the cell output.
time.time() - start

2351.35782122612

In [50]:
# Hamming loss between the true test labels and the Bloom-filter predictions.
hamming_loss(y_true=test_df[labels].values, y_pred=bloom_count)

0.32845056799353906

In [42]:
# Share of test rows whose entire predicted label vector equals the true one.
(np.asarray(bloom_count) == test_df[labels].values).all(axis=1).mean()

0.0

In [18]:
# Sanity check: (rows whose text is shorter than two characters, total rows in df).
len(df.index[df['text'].map(len) < 2]), len(df)

(0, 12920)