In [1]:
import string
import pickle
from IPython.display import display

# data manipulation & vizualization libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# sklearn imports
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, \
AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, StackingClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold, cross_validate
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, \
f1_score, recall_score, roc_auc_score, precision_score, make_scorer
from sklearn.neighbors import NearestCentroid, KNeighborsClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

In [2]:
# reading the corpus of cleaned data and asigning each column to the new dataframe

with open('corpus-spell.txt', 'rt', encoding='utf-8') as file:
    description = list(file.readlines())
    
with open('final-categories.txt', 'rt') as file:
    primary_category = list(file.readlines())
    
df = pd.DataFrame()
df["description"] = description 
df["primary_category"] = primary_category
df["description"] = df["description"].str.replace('\n', '') 
df["primary_category"] = df["primary_category"].str.replace('\n', '')

df.head(3)

Unnamed: 0,description,primary_category
0,alisha solid woman cycling short cotton lycra ...,Clothing
1,fabhomedecor fabric double sofa bed finish col...,Furniture
2,belly sandal wedge heel casuals belly price ma...,Footwear


In [3]:
# splitting the X, y into X_train, X_test, y_train & y_test

X = np.array(df["description"])
y = np.array(df["primary_category"])

# le = LabelEncoder()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=73)
# y_train = le.fit_transform(y_train)
# y_test = le.transform(y_test)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(15732,)
(15732,)
(3934,)
(3934,)


In [4]:
# returns X_train_features and X_test_features after applying CountVectorizer on both 

def CountVec(X_train, X_test):
    count_vec = CountVectorizer()
    X_train_features = count_vec.fit_transform(X_train)
    X_test_features = count_vec.transform(X_test)
    return (X_train_features, X_test_features)

In [5]:
# returns X_train_features and X_test_features after applying TFIDFVectorizer on both 

def TFIDF(X_train, X_test):
    tfidf_vec = TfidfVectorizer()
    X_train_features = tfidf_vec.fit_transform(X_train)
    X_test_features = tfidf_vec.transform(X_test)
    return (X_train_features, X_test_features)

In [6]:
# Multinomial Naive Bayes Classifier with Count Vectorizer

from sklearn import svm

X_train_features, X_test_features = CountVec(X_train, X_test)
clf = RandomForestClassifier()
clf.fit(X_train_features, y_train)

y_pred = clf.predict(X_test_features)
print(y_pred)

print(accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred))

['Jewellery' 'Computers' 'Beauty and Personal Care' ... 'Tools & Hardware'
 'Jewellery' 'Home Decor & Festive Needs']
0.9651753940010168


In [7]:
y_train_pred = clf.predict(X_train_features)

print(accuracy_score(y_train, y_train_pred))
# print(classification_report(y_train, y_train_pred))
# print(confusion_matrix(y_train, y_train_pred))

0.9995550470378846


In [10]:
clf = RandomForestClassifier()
X_features = CountVectorizer().fit_transform(X)
scores = cross_validate(clf, X_features, y, cv=5, return_train_score=False)
print(np.mean(scores["test_score"]))

0.9184385612026353


In [11]:
# Multinomial Naive Bayes Classifier with Count Vectorizer

from sklearn import svm

X_train_features, X_test_features = TFIDF(X_train, X_test)
clf = RandomForestClassifier()
clf.fit(X_train_features, y_train)

y_pred = clf.predict(X_test_features)
print(y_pred)

print(accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred))

['Jewellery' 'Computers' 'Beauty and Personal Care' ... 'Tools & Hardware'
 'Jewellery' 'Home Decor & Festive Needs']
0.9682257244534824


In [12]:
y_train_pred = clf.predict(X_train_features)

print(accuracy_score(y_train, y_train_pred))
# print(classification_report(y_train, y_train_pred))
# print(confusion_matrix(y_train, y_train_pred))

0.9995550470378846


In [13]:
clf = RandomForestClassifier()
X_features = TfidfVectorizer().fit_transform(X)
scores = cross_validate(clf, X_features, y, cv=5, return_train_score=False)
print(np.mean(scores["test_score"]))

0.9176758881059476
