### For Exploratory Data Analysis, refer this notebook [Exploratory Data Analysis](./Exploratory-Data-Analysis.ipynb)

### For Deep Neural Networks implementation, refer this notebook [Deep Learning Approach](./Deep-Learning-Approach.ipynb)

# Importing Libraries

In [40]:
import string
import pickle
import joblib
from IPython.display import display

# data manipulation & vizualization libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# sklearn imports
from sklearn.preprocessing import LabelEncoder
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, \
AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, StackingClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold, cross_validate
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, \
f1_score, recall_score, roc_auc_score, precision_score, make_scorer
from sklearn.neighbors import NearestCentroid, KNeighborsClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

# Reading the clean data 

* reading the cleaned data from the saved files
* assigning them to their corresponding columns in the dataframe

In [41]:
# reading the corpus of cleaned data and asigning each column to the new dataframe

with open('clean-dataset/corpus.txt', 'rt', encoding='utf-8') as file:
    description = list(file.readlines())
    
with open('clean-dataset/final-categories.txt', 'rt') as file:
    primary_category = list(file.readlines())
    
df = pd.DataFrame()
df["description"] = description 
df["primary_category"] = primary_category
df["description"] = df["description"].str.replace('\n', '') 
df["primary_category"] = df["primary_category"].str.replace('\n', '')

df.head(3)

Unnamed: 0,description,primary_category
0,alisha solid woman cycling short cotton lycra ...,Clothing
1,fabhomedecor fabric double sofa bed finish col...,Furniture
2,belly sandal wedge heel casuals belly price ma...,Footwear


In [42]:
# total no of words in desciption column 

df["description"].str.split().str.len().sum()

738577

# Splitting the data into training and testing set

* Dividing the whole data into 80:20 ratio using train_test_split from sklearn

In [43]:
# splitting the X, y into X_train, X_test, y_train & y_test

X = np.array(df["description"])
y = np.array(df["primary_category"])

# le = LabelEncoder()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=73)
# y_train = le.fit_transform(y_train)
# y_test = le.transform(y_test)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(15732,)
(15732,)
(3934,)
(3934,)


# Feature Extraction Functions

* Count Vectorizer
* TF-IDF Vectorizer
* Pipeline of Count Vectorizer followed by TF-IDF Vectorizer followed by the model

In [44]:
# returns X_train_features and X_test_features after applying CountVectorizer on both 

def CountVec(X_train, X_test):
    count_vec = CountVectorizer()
    X_train_features = count_vec.fit_transform(X_train)
    X_test_features = count_vec.transform(X_test)
    return (X_train_features, X_test_features)

In [45]:
# returns X_train_features and X_test_features after applying TFIDFVectorizer on both 

def TFIDF(X_train, X_test):
    tfidf_vec = TfidfVectorizer()
    X_train_features = tfidf_vec.fit_transform(X_train)
    X_test_features = tfidf_vec.transform(X_test)
    return (X_train_features, X_test_features)

In [46]:
# returns a pipeline of CountVectorizer, followed by TFIDF Vectorizer and then followed by the ML model 

def CountVec_TFIDF_Pipeline(model):
    return Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', model)])

# Multinomial Naive Bayes

* Calculating the test accuracy
* Calculating the training accuracy
* Calculating the mean cross validation accuracy

In [24]:
# Multinomial Naive Bayes Classifier with Count Vectorizer

X_train_features, X_test_features = CountVec(X_train, X_test)
clf = MultinomialNB(alpha = 0.01)
clf.fit(X_train_features, y_train)

y_pred = clf.predict(X_test_features)
print(y_pred)

print(accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred))

MultinomialNB(alpha=0.01)

In [27]:
y_train_pred = clf.predict(X_train_features)

print(accuracy_score(y_train, y_train_pred))
# print(classification_report(y_train, y_train_pred))
# print(confusion_matrix(y_train, y_train_pred))

array(['Beauty and Personal Care', 'Pens & Stationery', 'Clothing', ...,
       'Home Decor & Festive Needs', 'Watches', 'Jewellery'], dtype='<U33')

In [29]:
clf = MultinomialNB(alpha = 0.01)
X_features = TfidfVectorizer().fit_transform(X)
scores = cross_validate(clf, X_features, y, cv=5, return_train_score=False)
print(np.mean(scores["test_score"]))

0.9202181920839543


In [13]:
# Multinomial Naive Bayes Algorithm with TF-IDF Vectorizer

X_train_features, X_test_features = TFIDF(X_train, X_test)
clf = MultinomialNB(alpha = 0.001)
clf.fit(X_train_features, y_train)

y_pred = clf.predict(X_test_features)
print(y_pred)

print(accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred))

MultinomialNB(alpha=0.001)

In [17]:
y_train_pred = clf.predict(X_train_features)

print(accuracy_score(y_train, y_train_pred))
# print(classification_report(y_train, y_train_pred))
# print(confusion_matrix(y_train, y_train_pred))

array(['Beauty and Personal Care', 'Pens & Stationery', 'Clothing', ...,
       'Home Decor & Festive Needs', 'Watches', 'Jewellery'], dtype='<U33')

In [23]:
clf = MultinomialNB(alpha = 0.01)
X_features = TfidfVectorizer().fit_transform(X)
scores = cross_validate(clf, X_features, y, cv=5, return_train_score=False)
print(np.mean(scores["test_score"]))

0.9202181920839543


# Support Vector Machines (SVM)

* Calculating the test accuracy
* Calculating the training accuracy
* Calculating the mean cross validation accuracy

In [47]:
# SVM Classifier with Count Vectorizer

X_train_features, X_test_features = CountVec(X_train, X_test)
# clf = svm.SVC()
# clf.fit(X_train_features, y_train)

# joblib.dump(clf, 'trained-models/svm-countvec.pkl')
clf = joblib.load('trained-models/svm-countvec.pkl') 

y_pred = clf.predict(X_test_features)
print(y_pred)

print(accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred))

['Jewellery' 'Computers' 'Beauty and Personal Care' ... 'Tools & Hardware'
 'Jewellery' 'Home Decor & Festive Needs']
0.9356888662938485


In [48]:
y_train_pred = clf.predict(X_train_features)

print(accuracy_score(y_train, y_train_pred))
# print(classification_report(y_train, y_train_pred))
# print(confusion_matrix(y_train, y_train_pred))

0.9789600813628273


In [None]:
clf = svm.SVC()
X_features = CountVectorizer().fit_transform(X)
scores = cross_validate(clf, X_features, y, cv=5, return_train_score=False)

print(np.mean(scores["test_score"]))

In [49]:
# SVM Classifier with TF-IDF Vectorizer

X_train_features, X_test_features = TFIDF(X_train, X_test)
# clf = svm.SVC()
# clf.fit(X_train_features, y_train)

# joblib.dump(clf, 'trained-models/svm-tfidf.pkl')
clf = joblib.load('trained-models/svm-tfidf.pkl') 

y_pred = clf.predict(X_test_features)
print(y_pred)

print(accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred))

['Jewellery' 'Computers' 'Beauty and Personal Care' ... 'Tools & Hardware'
 'Jewellery' 'Home Decor & Festive Needs']
0.9659379766141332


In [50]:
y_train_pred = clf.predict(X_train_features)

print(accuracy_score(y_train, y_train_pred))
# print(classification_report(y_train, y_train_pred))
# print(confusion_matrix(y_train, y_train_pred))

0.9951690821256038


In [None]:
clf = svm.SVC()
X_features = TfidfVectorizer().fit_transform(X)
scores = cross_validate(clf, X_features, y, cv=5, return_train_score=False)
print(np.mean(scores["test_score"]))

# Random Forest Classifier

* Calculating the test accuracy
* Calculating the training accuracy
* Calculating the mean cross validation accuracy

In [51]:
# Random Forest Classifier with Count Vectorizer

X_train_features, X_test_features = CountVec(X_train, X_test)
# clf = RandomForestClassifier()
# clf.fit(X_train_features, y_train)

# joblib.dump(clf, 'trained-models/rf-countvec.pkl')
clf = joblib.load('trained-models/rf-countvec.pkl') 

y_pred = clf.predict(X_test_features)
print(y_pred)

print(accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred))

['Jewellery' 'Computers' 'Beauty and Personal Care' ... 'Tools & Hardware'
 'Jewellery' 'Home Decor & Festive Needs']
0.9682257244534824


In [52]:
y_train_pred = clf.predict(X_train_features)

print(accuracy_score(y_train, y_train_pred))
# print(classification_report(y_train, y_train_pred))
# print(confusion_matrix(y_train, y_train_pred))

0.9995550470378846


In [None]:
clf = RandomForestClassifier()
X_features = CountVectorizer().fit_transform(X)
# scores = cross_validate(clf, X_features, y, cv=5, return_train_score=False)
# print(np.mean(scores["test_score"]))

In [53]:
# Random Forest Classifier with TF-IDF Vectorizer


X_train_features, X_test_features = TFIDF(X_train, X_test)
# clf = RandomForestClassifier()
# clf.fit(X_train_features, y_train)

# joblib.dump(clf, 'trained-models/rf-tfidf.pkl')
clf = joblib.load('trained-models/rf-tfidf.pkl') 

y_pred = clf.predict(X_test_features)
print(y_pred)

print(accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred))

['Jewellery' 'Computers' 'Beauty and Personal Care' ... 'Tools & Hardware'
 'Jewellery' 'Home Decor & Festive Needs']
0.9651753940010168


In [54]:
y_train_pred = clf.predict(X_train_features)

print(accuracy_score(y_train, y_train_pred))
# print(classification_report(y_train, y_train_pred))
# print(confusion_matrix(y_train, y_train_pred))

0.9995550470378846


In [None]:
clf = RandomForestClassifier()
X_features = TfidfVectorizer().fit_transform(X)
# scores = cross_validate(clf, X_features, y, cv=5, return_train_score=False)
# print(np.mean(scores["test_score"]))

# KNeighbours Classifier

* Calculating the test accuracy
* Calculating the training accuracy
* Calculating the mean cross validation accuracy

In [68]:
# KNeighbours Classifier with Count Vectorizer & with Tfidf Tranformer

clf = Pipeline([('vect', CountVectorizer()),
#                      ('tfidf', TfidfTransformer()),
                     ('clf', KNeighborsClassifier(n_neighbors=5)),
                     ])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(y_pred)

print(accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred))

['Jewellery' 'Computers' 'Beauty and Personal Care' ... 'Tools & Hardware'
 'Jewellery' 'Home Decor & Festive Needs']
0.9191662430096594


In [69]:
y_train_pred = clf.predict(X_train)

print(accuracy_score(y_train, y_train_pred))
# print(classification_report(y_train, y_train_pred))
# print(confusion_matrix(y_train, y_train_pred))

0.9445715738621917


In [70]:
clf = Pipeline([('vect', CountVectorizer()),
#                      ('tfidf', TfidfTransformer()),
                     ('clf', KNeighborsClassifier()),
                     ])
scores = cross_validate(clf, X, y, cv=5, return_train_score=False)
print(np.mean(scores["test_score"]))

0.830823939522849


# Gradient Boosting Classifier

* Calculating the test accuracy
* Calculating the training accuracy
* Calculating the mean cross validation accuracy

In [60]:
# Gradient Boosting Classifier with Count Vectorizer & with Tfidf Tranformer

# clf = Pipeline([('vect', CountVectorizer()),
# #                 ('tfidf', TfidfTransformer()),
#                 ('clf', GradientBoostingClassifier(n_estimators=100)),
#                 ])
# clf.fit(X_train, y_train)

# joblib.dump(clf, 'trained-models/gradient-boosting.pkl')
clf = joblib.load('trained-models/gradient-boosting.pkl') 

y_pred = clf.predict(X_test)
print(y_pred)

print(accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred))

['Jewellery' 'Computers' 'Beauty and Personal Care' ... 'Tools & Hardware'
 'Jewellery' 'Home Decor & Festive Needs']
0.9590747330960854


In [61]:
y_train_pred = clf.predict(X_train)

print(accuracy_score(y_train, y_train_pred))
# print(classification_report(y_train, y_train_pred))
# print(confusion_matrix(y_train, y_train_pred))

0.9988558352402745


In [75]:
clf = Pipeline([('vect', CountVectorizer()),
#                      ('tfidf', TfidfTransformer()),
                     ('clf', GradientBoostingClassifier(n_estimators=100)),
                     ])
# scores = cross_validate(clf, X, y, cv=5, return_train_score=False)
# print(np.mean(scores["test_score"]))

# Xgboost Classifier

* Calculating the test accuracy
* Calculating the training accuracy
* Calculating the mean cross validation accuracy

In [76]:
# Xgboost Classifier with Count Vectorizer & with Tfidf Tranformer

# clf = Pipeline([('vect', CountVectorizer()),
# #                 ('tfidf', TfidfTransformer()),
#                 ('clf', XGBClassifier()),
#                 ])
# clf.fit(X_train, y_train)

# joblib.dump(clf, 'trained-models/xgboost.pkl')
clf = joblib.load('trained-models/xgboost.pkl') 

y_pred = clf.predict(X_test)
print(y_pred)

print(accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred))

['Jewellery' 'Computers' 'Beauty and Personal Care' ... 'Tools & Hardware'
 'Jewellery' 'Home Decor & Festive Needs']
0.974580579562786


In [77]:
y_train_pred = clf.predict(X_train)

print(accuracy_score(y_train, y_train_pred))
# print(classification_report(y_train, y_train_pred))
# print(confusion_matrix(y_train, y_train_pred))

0.9977116704805492


In [74]:
clf = Pipeline([('vect', CountVectorizer()),
#                      ('tfidf', TfidfTransformer()),
                     ('clf', XGBClassifier()),
                     ])
# scores = cross_validate(clf, X, y, cv=5, return_train_score=False)
# print(np.mean(scores["test_score"]))