In [1]:
import pandas as pd 
import json 
import re


In [2]:
with open("sephora_desc.json", "r") as f:
    data = f.readlines()

In [65]:
li = []
for line in data:
    try:
        li.append(json.loads(line.strip()[:-1]))
    except Exception as e:
        print (e)


Expecting value: line 1 column 1 (char 0)
Expecting ',' delimiter: line 1 column 316 (char 315)
Expecting value: line 1 column 1 (char 0)
Expecting ',' delimiter: line 1 column 2009 (char 2008)
Expecting value: line 1 column 1 (char 0)


In [193]:
df = pd.DataFrame(li)

def cleanhtml(raw_html):
  cleanr = re.compile('<.*?>')
  cleantext = re.sub(cleanr, '', raw_html)
  return cleantext.replace("\n", " ").replace("\r", " ").replace("&#10004;", " ")

df["long_desc"] = df["long_desc"].fillna("")
df["long_desc"] = df["long_desc"].apply(lambda x: cleanhtml(x))
df["category"]  = df["category"].apply(lambda x: x.replace(" ", "_"))
df["feature"] = df.apply(lambda x: x["long_desc"] + " "+ x["name"], axis=1)

In [81]:
train = df[["feature", "category"]][:14000].to_dict(orient="records")  # [0]["category"]
test = df[["feature", "category"]][14000:].to_dict(orient="records")  # [0]["category"]

In [112]:
import fasttext as ft

from os import path
TRAIN_FILE = 'fasttext_train.txt'
TEST_FILE = 'fasttext_test.txt'
MODEL_FILE = 'category_classifier'
# Info to save the model

model_dir = path.join(path.dirname("__file__"), 'models')
taxonomy_output = path.join(model_dir, 'taxonomy')

with open(TRAIN_FILE, 'w') as fp:
    for i, row in enumerate(train):
        fp.write('__label__%s %s\n' % (row['category'],
                                       row['feature']))
# Storing test samples
with open(TEST_FILE, 'w') as fp:
    for i, row in enumerate(test):
        fp.write('__label__%s %s\n' % (row['category'],
                                       row['feature']))


In [128]:
test_set = df[["feature", "category"]][14000:]

In [113]:
cooking_model = ft.supervised(TRAIN_FILE, MODEL_FILE, lr=1.0, epoch=10, silent=0)

In [129]:
result = [i[0] for i in cooking_model.predict(df["feature"][14000:])]

In [130]:
test_set["prediction"] = result

In [136]:
accuracy = len(test_set[test_set["prediction"] ==  test_set["category"]]) / len(test_set)
print(accuracy)

0.2804709141274238


In [195]:
# TF-IDF
test = df[14000:]
df = df[:14000]
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(df.feature)

from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [196]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, df["level1"])

In [197]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])
text_clf = text_clf.fit(df.feature, df.level1)

In [198]:
# Performance of NB Classifier
import numpy as np
predicted = text_clf.predict(df.feature)


0.8381428571428572

In [205]:
test["prediction"] = text_clf.predict(test.feature)

np.mean(test["prediction"] == test["level1"])

0.7382271468144044

In [154]:
from collections import Counter

In [155]:
from sklearn.linear_model import SGDClassifier
text_clf_svm = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, n_iter=5, random_state=42))])

text_clf_svm = text_clf_svm.fit(df.feature, df.level1)
predicted_svm = text_clf_svm.predict(df.feature)
np.mean(predicted_svm == df.level1)



0.8425

In [156]:
from sklearn.externals import joblib
joblib.dump(text_clf_svm, 'clf.model') 

['clf.model']

In [157]:
model = joblib.load("clf.model")
prediction = model.predict(["sdfds"])