## Class

In [5]:
class Category:
    """Namespace of the string labels assigned to reviews, one per product category."""

    # Each constant's value is simply its own name as a string.
    BOOKS, CLOTHING, ELECTRONICS = "BOOKS", "CLOTHING", "ELECTRONICS"
    GROCERY, PATIO = "GROCERY", "PATIO"
    
class Data:
    """One sample: a piece of text together with the category it is labelled with."""

    def __init__(self, text, category):
        """Store `text` and `category` unchanged as instance attributes."""
        self.text, self.category = text, category

## Load Data

In [4]:
import json

In [9]:
file_names = ["Data/Books_small.json", "Data/Clothing_small.json", "Data/Electronics_small.json",
              "Data/Grocery_small.json", "Data/Patio_small.json"]
file_categories = [Category.BOOKS, Category.CLOTHING, Category.ELECTRONICS, Category.GROCERY, Category.PATIO]

# Build one Data sample per line of every file. Each line is parsed as its own
# JSON document and its "reviewText" field becomes the sample text; the label
# is the category paired with the file by position in the two lists above.
datas = []
for file_name, category in zip(file_names, file_categories):
    # The context manager closes the handle even when a line fails to parse;
    # previously every opened file was left open for the life of the kernel.
    with open(file_name) as review_file:
        for line in review_file:
            if not line.strip():
                # A blank line is not a JSON document; skip it instead of
                # letting json.loads raise.
                continue
            review = json.loads(line)
            datas.append(Data(review["reviewText"], category))

## Data Preparation

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed random_state keeps the
# split the same on every run.
train, test = train_test_split(
    datas,
    test_size=0.2,
    random_state=42,
)

In [44]:
from sklearn.feature_extraction.text import CountVectorizer

# Raw texts of each split.
X = [sample.text for sample in train]
X_test = [sample.text for sample in test]

# Matching labels, in the same order as the texts above.
y_train = [sample.category for sample in train]
y_test = [sample.category for sample in test]

# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that already-fitted vectorizer. Both results are converted
# to dense arrays.
vectorizer = CountVectorizer()
x_train = vectorizer.fit_transform(X).toarray()
x_test = vectorizer.transform(X_test).toarray()

In [46]:
# Width of the test feature matrix, i.e. the number of columns per sample.
x_test.shape[1]

17961

## Classification

In [48]:
from sklearn import tree

# Seed the tree so that the fitted model, and every score or prediction derived
# from it, is the same after a kernel restart. Without random_state the
# estimator may resolve splits differently from run to run (see the
# scikit-learn documentation for this parameter), which made the recorded
# results below impossible to reproduce even though the train/test split is
# already seeded.
clf = tree.DecisionTreeClassifier(random_state=42)
clf.fit(x_train, y_train)

DecisionTreeClassifier()

In [86]:
# NOTE(review): nothing in this cell uses a name brought in by the wildcard
# import below -- `score` is a method of the already-fitted `clf`. The import
# looks removable, but confirm first that no other cell relies on the names it
# pulls into the notebook namespace.
from sklearn import *
# Print the classifier's score on the held-out test features and labels.
print(clf.score(x_test, y_test))

0.703


In [92]:
# Classify a single ad-hoc phrase. It goes through the same fitted vectorizer
# as the training data so that its columns line up with what `clf` was fitted on.
sentence = ["This cable"]

sentence_counts = vectorizer.transform(sentence).toarray()
clf.predict(sentence_counts)

array(['ELECTRONICS'], dtype='<U11')

In [97]:
from sklearn.metrics import f1_score

y_pred = clf.predict(x_test)

# With average=None the result holds one F1 value per entry of `labels`,
# reported in this order.
category_order = [
    Category.BOOKS,
    Category.CLOTHING,
    Category.ELECTRONICS,
    Category.GROCERY,
    Category.PATIO,
]
f1_score(y_test, y_pred, average=None, labels=category_order)


array([0.87037037, 0.67676768, 0.55172414, 0.71755725, 0.67661692])

In [99]:
import os
import pickle

# open(..., 'wb') does not create missing parent directories, so on a fresh
# checkout without a `models` folder this cell failed with FileNotFoundError.
# Create the folder first; exist_ok=True makes re-running the cell harmless.
os.makedirs('models', exist_ok=True)

# Persist the fitted classifier. Only unpickle this file from a trusted source.
with open('models/dectree.pkl', 'wb') as f:
    pickle.dump(clf, f)

In [101]:
# tree.plot_tree(clf) 

In [105]:
from sklearn import tree

# Write the fitted tree to a Graphviz .dot file.
# `class_names` must contain one name per target class, in the order the
# classifier itself uses; `clf.classes_` is that list. The previous value,
# `y_train`, is the per-sample label list, so each node was labelled with
# whatever label happened to sit at that class index among the first training
# samples rather than with the class it actually predicts.
tree.export_graphviz(clf, out_file="predict_category.dot",
                     feature_names=None,
                     class_names=clf.classes_.tolist(),
                     filled=True, rounded=True,
                     special_characters=True)