In [155]:
import string

# data manipulation & visualization libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# sklearn imports
from sklearn import ensemble
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import NearestCentroid, KNeighborsClassifier
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

In [91]:
# Read the raw product descriptions, one per line.
with open('corpus.txt', 'rt', encoding='utf-8') as file:
    description = file.readlines()

# Read the category labels with the same explicit encoding as the corpus so
# decoding does not depend on the platform's default locale.
with open('final_categories.txt', 'rt', encoding='utf-8') as file:
    primary_category = file.readlines()

# Line i of one file is paired with line i of the other, so both files must
# contain the same number of lines.
if len(description) != len(primary_category):
    raise ValueError(
        f"corpus.txt has {len(description)} lines but final_categories.txt "
        f"has {len(primary_category)}"
    )

df = pd.DataFrame({"description": description, "primary_category": primary_category})

# readlines() keeps the line terminator on each entry; remove it (literal match).
df["description"] = df["description"].str.replace('\n', '', regex=False)
df["primary_category"] = df["primary_category"].str.replace('\n', '', regex=False)

df.head(3)

Unnamed: 0,description,primary_category
0,alisha solid woman cycling short cotton lycra ...,Clothing
1,fabhomedecor fabric double sofa bed finish col...,Furniture
2,belly sandal wedge heel casuals belly price ma...,Footwear


In [92]:
# Feature text and target labels as plain NumPy arrays.
X = np.array(df["description"])
y = np.array(df["primary_category"])

# Hold out 20% of the rows for testing; the fixed seed keeps the partition
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=73, test_size=0.20
)

# Report the sizes in the order: train X, train y, test X, test y.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(15732,)
(15732,)
(3934,)
(3934,)


In [93]:
# Learn the bag-of-words vocabulary from the training descriptions and encode
# them as a sparse term-count matrix.
count_vec = CountVectorizer()
X_train_features = count_vec.fit_transform(X_train)

# NOTE(review): todense() materializes the whole matrix as a dense array just
# to display it (15732 rows x 13595 terms per the outputs recorded in this
# notebook); showing the sparse matrix itself would avoid that allocation.
X_train_features.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [94]:
# Encode the test descriptions with the vocabulary already fitted on the
# training split (transform only, no refit).
X_test_features = count_vec.transform(X_test)

X_test_features

<3934x13595 sparse matrix of type '<class 'numpy.int64'>'
	with 82664 stored elements in Compressed Sparse Row format>

In [95]:
# Multinomial Naive Bayes Classifier with Count Vectorizer
# Fitted with default hyper-parameters on the term-count training features.

clf = MultinomialNB()
clf.fit(X_train_features, y_train)

MultinomialNB()

In [96]:
# Predict a category for every test description using the fitted classifier.
y_pred = clf.predict(X_test_features)

y_pred

array(['Jewellery', 'Computers', 'Beauty and Personal Care', ...,
       'Tools & Hardware', 'Jewellery', 'Home Decor & Festive Needs'],
      dtype='<U33')

In [97]:
# Per-class precision / recall / F1 on the test split, then the confusion matrix.
report_text = classification_report(y_test, y_pred)
print(report_text)
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

                                   precision    recall  f1-score   support

                       Automotive       0.90      0.97      0.93       193
                        Baby Care       0.96      0.74      0.84       121
            Bags, Wallets & Belts       0.89      0.95      0.92        43
         Beauty and Personal Care       0.93      0.95      0.94       135
            Cameras & Accessories       1.00      0.73      0.85        15
                         Clothing       0.97      1.00      0.98      1218
                        Computers       0.80      0.87      0.83       126
                          Eyewear       1.00      1.00      1.00         1
                         Footwear       1.00      0.97      0.98       259
                        Furniture       0.97      1.00      0.99        39
                           Gaming       0.55      0.60      0.57        10
Health & Personal Care Appliances       1.00      0.57      0.73         7
                   Home 

In [98]:
# TF-IDF encoding of the training descriptions; norm=None turns off the row
# normalization of the resulting vectors.
tf_idf_vec = TfidfVectorizer(norm = None)

# Rebinds X_train_features: from here on it holds TF-IDF weights, not counts.
X_train_features = tf_idf_vec.fit_transform(X_train)

X_train_features

<15732x13595 sparse matrix of type '<class 'numpy.float64'>'
	with 328094 stored elements in Compressed Sparse Row format>

In [99]:
# Number of distinct terms learned by the TF-IDF vectorizer. Uses the fitted
# vocabulary_ mapping (term -> column index) because get_feature_names() was
# deprecated in scikit-learn 1.0 and removed in 1.2.
len(tf_idf_vec.vocabulary_)

13595

In [100]:
# Encode the test descriptions with the TF-IDF vocabulary and weights fitted on
# the training split; rebinds X_test_features.
X_test_features = tf_idf_vec.transform(X_test)

X_test_features

<3934x13595 sparse matrix of type '<class 'numpy.float64'>'
	with 82664 stored elements in Compressed Sparse Row format>

In [101]:
# Multinomial Naive Bayes Algorithm with Tfidf Vectorizer
# alpha=3 sets the additive smoothing parameter of the classifier.

clf = MultinomialNB(alpha = 3)
clf.fit(X_train_features, y_train)

MultinomialNB(alpha=3)

In [102]:
# Predict test-set categories with the classifier fitted on TF-IDF features.
y_pred = clf.predict(X_test_features)

y_pred

array(['Jewellery', 'Computers', 'Beauty and Personal Care', ...,
       'Tools & Hardware', 'Jewellery', 'Home Decor & Festive Needs'],
      dtype='<U33')

In [103]:
# Per-class precision / recall / F1 on the test split, then the confusion matrix.
report_text = classification_report(y_test, y_pred)
print(report_text)
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

                                   precision    recall  f1-score   support

                       Automotive       0.95      0.96      0.96       193
                        Baby Care       0.93      0.83      0.88       121
            Bags, Wallets & Belts       0.86      0.98      0.91        43
         Beauty and Personal Care       0.96      0.94      0.95       135
            Cameras & Accessories       0.92      0.80      0.86        15
                         Clothing       0.98      0.99      0.99      1218
                        Computers       0.85      0.88      0.87       126
                          Eyewear       1.00      1.00      1.00         1
                         Footwear       1.00      0.97      0.98       259
                        Furniture       0.97      1.00      0.99        39
                           Gaming       0.47      0.70      0.56        10
Health & Personal Care Appliances       1.00      0.86      0.92         7
                   Home 

In [104]:
# Predict on the training features as well, so training-set metrics can be
# compared against the test-set metrics.
y_train_pred = clf.predict(X_train_features)

y_train_pred

array(['Beauty and Personal Care', 'Pens & Stationery', 'Clothing', ...,
       'Home Decor & Festive Needs', 'Watches', 'Jewellery'], dtype='<U33')

In [105]:
# Same two summaries as for the test split, computed on the training split.
train_report_text = classification_report(y_train, y_train_pred)
print(train_report_text)
train_conf_mat = confusion_matrix(y_train, y_train_pred)
print(train_conf_mat)

                                   precision    recall  f1-score   support

                       Automotive       1.00      0.99      0.99       819
                        Baby Care       0.89      0.90      0.89       362
            Bags, Wallets & Belts       0.94      0.99      0.96       222
         Beauty and Personal Care       1.00      0.98      0.99       575
            Cameras & Accessories       0.98      0.97      0.98        67
                         Clothing       0.99      0.99      0.99      4979
                        Computers       0.97      0.96      0.97       452
                          Eyewear       1.00      1.00      1.00         9
                         Footwear       1.00      1.00      1.00       968
                        Furniture       0.96      1.00      0.98       141
                           Gaming       0.77      0.96      0.86        25
Health & Personal Care Appliances       1.00      1.00      1.00        36
                   Home 

In [117]:
# GradientBoostingClassifier Algorithm
# Raw text -> term counts -> TF-IDF weights -> gradient-boosted trees, chained
# in one pipeline so the vectorizers are fitted on the training text only.

gbc_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', ensemble.GradientBoostingClassifier(n_estimators=100)),
]
text_clf = Pipeline(steps=gbc_steps)

text_clf.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', GradientBoostingClassifier())])

In [118]:
# The pipeline takes the raw test descriptions and vectorizes them internally.
y_pred = text_clf.predict(X_test)

y_pred

array(['Jewellery', 'Computers', 'Beauty and Personal Care', ...,
       'Tools & Hardware', 'Jewellery', 'Home Decor & Festive Needs'],
      dtype=object)

In [119]:
# Per-class precision / recall / F1 on the test split, then the confusion matrix.
report_text = classification_report(y_test, y_pred)
print(report_text)
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

                                   precision    recall  f1-score   support

                       Automotive       0.97      0.98      0.98       193
                        Baby Care       0.95      0.87      0.91       121
            Bags, Wallets & Belts       0.98      0.93      0.95        43
         Beauty and Personal Care       0.95      0.93      0.94       135
            Cameras & Accessories       0.93      0.93      0.93        15
                         Clothing       0.95      1.00      0.98      1218
                        Computers       0.97      0.83      0.89       126
                          Eyewear       0.33      1.00      0.50         1
                         Footwear       1.00      0.98      0.99       259
                        Furniture       0.95      0.95      0.95        39
                           Gaming       0.67      0.80      0.73        10
Health & Personal Care Appliances       1.00      0.71      0.83         7
                   Home 

In [132]:
# k-nearest-neighbours on TF-IDF features, using the same pipeline layout as the
# other text classifiers; rebinds text_clf.
knn_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', KNeighborsClassifier()),
]
text_clf = Pipeline(steps=knn_steps)

text_clf.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', KNeighborsClassifier())])

In [133]:
# The pipeline takes the raw test descriptions and vectorizes them internally.
y_pred = text_clf.predict(X_test)

y_pred

array(['Jewellery', 'Computers', 'Computers', ..., 'Tools & Hardware',
       'Jewellery', 'Home Decor & Festive Needs'], dtype=object)

In [134]:
# Per-class precision / recall / F1 on the test split, then the confusion matrix.
report_text = classification_report(y_test, y_pred)
print(report_text)
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

                                   precision    recall  f1-score   support

                       Automotive       1.00      0.85      0.92       193
                        Baby Care       0.94      0.72      0.81       121
            Bags, Wallets & Belts       0.94      0.72      0.82        43
         Beauty and Personal Care       1.00      0.61      0.76       135
            Cameras & Accessories       0.93      0.87      0.90        15
                         Clothing       0.99      0.79      0.88      1218
                        Computers       0.12      0.97      0.21       126
                          Eyewear       1.00      1.00      1.00         1
                         Footwear       0.99      0.68      0.81       259
                        Furniture       1.00      0.90      0.95        39
                           Gaming       0.50      0.50      0.50        10
Health & Personal Care Appliances       1.00      0.57      0.73         7
                   Home 

  _warn_prf(average, modifier, msg_start, len(result))


In [148]:
# Rebuild the term-count features; this rebinds count_vec and X_train_features.
count_vec = CountVectorizer()

X_train_features = count_vec.fit_transform(X_train)

# NOTE(review): todense() materializes the whole matrix as a dense array just
# to display it (15732 rows x 13595 terms per the outputs recorded in this
# notebook); showing the sparse matrix itself would avoid that allocation.
X_train_features.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [149]:
# Encode the test descriptions with the vocabulary fitted on the training split.
X_test_features = count_vec.transform(X_test)

X_test_features

<3934x13595 sparse matrix of type '<class 'numpy.int64'>'
	with 82664 stored elements in Compressed Sparse Row format>

In [150]:
# Bernoulli Naive Bayes with default hyper-parameters, fitted on the term-count
# training features; rebinds clf.
clf = BernoulliNB()
clf.fit(X_train_features, y_train)

BernoulliNB()

In [151]:
# Predict with the BernoulliNB model fitted above on the count features.
# (Previously this called text_clf, the pipeline left over from an earlier
# experiment, so the BernoulliNB model was never actually evaluated.)
y_pred = clf.predict(X_test_features)

y_pred

array(['Jewellery', 'Computers', 'Beauty and Personal Care', ...,
       'Jewellery', 'Jewellery', 'Home Decor & Festive Needs'],
      dtype='<U33')

In [152]:
# Per-class precision / recall / F1 on the test split, then the confusion matrix.
report_text = classification_report(y_test, y_pred)
print(report_text)
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

                                   precision    recall  f1-score   support

                       Automotive       0.90      0.94      0.92       193
                        Baby Care       0.67      0.12      0.20       121
            Bags, Wallets & Belts       0.86      0.42      0.56        43
         Beauty and Personal Care       0.87      0.34      0.49       135
            Cameras & Accessories       0.00      0.00      0.00        15
                         Clothing       0.94      0.99      0.96      1218
                        Computers       0.70      0.40      0.51       126
                          Eyewear       0.00      0.00      0.00         1
                         Footwear       0.96      0.46      0.62       259
                        Furniture       0.92      0.31      0.46        39
                           Gaming       0.00      0.00      0.00        10
Health & Personal Care Appliances       0.00      0.00      0.00         7
                   Home 

  _warn_prf(average, modifier, msg_start, len(result))


In [156]:
# Rebuild the term-count features for both splits. The vocabulary is learned
# from the training descriptions only and then reused for the test split.
count_vec = CountVectorizer()

X_train_features = count_vec.fit_transform(X_train)

# The matrices are kept sparse: the previous mid-cell todense() call built a
# full dense copy whose result was discarded.
X_test_features = count_vec.transform(X_test)

X_test_features

<3934x13595 sparse matrix of type '<class 'numpy.int64'>'
	with 82664 stored elements in Compressed Sparse Row format>