In [1]:
# Imports and Data exploration
import pandas as pd
from pandas import *
import numpy as np

from scipy import stats
import warnings
warnings.filterwarnings("ignore")


In [2]:
# import file path
file_path = r"c:\Users\jonat\OneDrive\Documents\fun numbers\data for classification\pricerunner_aggregate.xlsx"
df = pd.read_excel(file_path)

In [3]:

df.head()


Unnamed: 0,Product ID,Product Title,Merchant ID,Cluster ID,Cluster Label,Category ID,Category Label
0,1,apple iphone 8 plus 64gb silver,1,1,Apple iPhone 8 Plus 64GB,2612,Mobile Phones
1,2,apple iphone 8 plus 64 gb spacegrau,2,1,Apple iPhone 8 Plus 64GB,2612,Mobile Phones
2,3,apple mq8n2b/a iphone 8 plus 64gb 5.5 12mp sim...,3,1,Apple iPhone 8 Plus 64GB,2612,Mobile Phones
3,4,apple iphone 8 plus 64gb space grey,4,1,Apple iPhone 8 Plus 64GB,2612,Mobile Phones
4,5,apple iphone 8 plus gold 5.5 64gb 4g unlocked ...,5,1,Apple iPhone 8 Plus 64GB,2612,Mobile Phones


In [4]:
#Counts of items in category

df[' Category Label'].value_counts()

 Category Label
Fridge Freezers     5501
Mobile Phones       4081
Washing Machines    4044
CPUs                3862
Fridges             3584
TVs                 3564
Dishwashers         3424
Digital Cameras     2697
Microwaves          2342
Freezers            2212
Name: count, dtype: int64

In [5]:

X = df['Product Title']
y = df[' Category Label']

len(X)


35311

In [6]:

# Build and Train the model
#Split into training and testing

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [7]:


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

le = LabelEncoder()
y_encoded_train = le.fit_transform(y_train)
y_encoded_test = le.fit_transform(y_test)
len(X_train)


28248

In [8]:

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB, GaussianNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.calibration import CalibratedClassifierCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import Perceptron, LogisticRegression, SGDClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import uniform
from xgboost import XGBClassifier
#Finding the size of the tfidf vector
vectorizer = TfidfVectorizer()
tfidfarray = vectorizer.fit_transform(X_train)
print(tfidfarray.shape)


(28248, 17504)


In [9]:
#Initialize the pipelines for each of the models

pipeMNB = Pipeline([('tfidf', TfidfVectorizer()),('clf', MultinomialNB())])
pipeCNB = Pipeline([('tfidf', TfidfVectorizer()),('feature_selection', SelectKBest(chi2, k=15000)),('clf', ComplementNB())])
pipeSVC = Pipeline([('tfidf', TfidfVectorizer()),('clf', LinearSVC())])
pipeKNN = Pipeline([('tfidf', TfidfVectorizer()),('feature_selection', SelectKBest(chi2, k=5000)),('KNN', KNeighborsClassifier(n_neighbors=3))])
pipeXGB = Pipeline([('tfidf', TfidfVectorizer()),('XGBoost', XGBClassifier(objective="multi:softmax"))])
pipeSGD = Pipeline([('tfidf', TfidfVectorizer()),('SGD', SGDClassifier(loss='log_loss', penalty='l2',max_iter=50))])


In [19]:
#Training and testing CNB
pipeMNB.fit(X_train, y_train)
predictMNB = pipeMNB.predict(X_test)
print(f'MNB: {accuracy_score(y_test, predictMNB):.4f}')

MNB: 0.9393


In [10]:

#Training and testing SGDClassifier
pipeSGD.fit(X_train, y_train)
predictSGD = pipeSGD.predict(X_test)
print(f'SGD: {accuracy_score(y_test, predictSGD):.4f}')

SGD: 0.9425


In [11]:
#Training and testing CNB
pipeCNB.fit(X_train, y_train)
predictCNB = pipeCNB.predict(X_test)
print(f'CNB: {accuracy_score(y_test, predictCNB):.4f}')


CNB: 0.9605


In [12]:
#XGBoost
pipeXGB.fit(X_train, y_encoded_train)
predictXGB = pipeXGB.predict(X_test)
print(f'XGB: {accuracy_score(y_encoded_test, predictXGB):.4f}')


XGB: 0.9418


In [13]:
#Training and testing linearSVC
pipeSVC.fit(X_train, y_train)
predictSVC = pipeSVC.predict(X_test)
print(f'SVC: {accuracy_score(y_test, predictSVC):.4f}')


SVC: 0.9693


In [15]:
#Determining best parameters for KNN

param_dist = {'KNN__n_neighbors': [5, 7, 9],
              'KNN__weights': ['distance', 'uniform'],
              'KNN__p': [1,2,3,4]}
random_search = RandomizedSearchCV(pipeKNN, param_distributions=param_dist, n_iter=5, cv=3, scoring='accuracy')
random_search.fit(X_train, y_train)
print("Best parameters: ", random_search.best_params_)
print("Best cross-validation score: ", random_search.best_score_)

Best parameters:  {'KNN__weights': 'distance', 'KNN__p': 2, 'KNN__n_neighbors': 5}
Best cross-validation score:  0.9025417728688757


In [16]:
#KNeighbors classifier
pipeKNN.fit(X_train, y_train)
predictKNN = pipeKNN.predict(X_test)
print(f'KNN: {accuracy_score(y_test, predictKNN):.4f}')

KNN: 0.9244


In [21]:

#Initialize the voting classifier

estimators = [('XGB', pipeXGB), ('KNN',pipeKNN), ('CNB', pipeCNB), ('SGD', pipeSGD)]

ensemble = VotingClassifier(estimators, voting='soft')

pipeVC = Pipeline([('ensemble', ensemble)])


In [22]:

#Train and test voting classifier

pipeVC.fit(X_train, y_train)


In [23]:

#Train and test voting classifier

predictVC = ensemble.predict(X_test)
print(f'VotingClassifier: {accuracy_score(y_test, predictVC):.4f}')


VotingClassifier: 0.9635


In [None]:
# Having build and trained the models, print scores
#Classification report for each category

print(f'SVC: {classification_report(y_test, predictSVC)}')
print(f'KNN: {classification_report(y_test, predictKNN)}')
print(f'CNB: {classification_report(y_test, predictCNB)}')


In [29]:
#Sample prediction for voting classifier

Desc = 'samsung galaxy'
pipeVC.predict_proba([Desc])


array([[0.00688121, 0.00842659, 0.00885409, 0.01106589, 0.02372134,
        0.01359037, 0.01682091, 0.86617645, 0.02731326, 0.01714991]])

In [30]:
# Make predictions using various files
#Import the under review excel file
UR_file_path = r"c:\Users\jonat\OneDrive\Documents\fun numbers\data for classification\Test file.xlsx"
urdf = pd.read_excel(UR_file_path)
urdf.head()


Unnamed: 0,Desc,Correct Category
0,Intel Xeon Processor E5-2658 2.10GHz,CPU
1,Intel Xeon Processor E5-2658 2.10GHz,CPU
2,AMD Ryzen 7 2700X 3.7GHz,CPU
3,Intel Core i7-8700K 3.7GHz,CPU
4,AMD Ryzen 5 1600X 3.6GHz,CPU


In [31]:

#Predict categories for under review data for both svc & cccv

predicted_probs = pipeVC.predict_proba(urdf['Desc'])
print('Order of the classes: ', pipeVC.classes_)


Order of the classes:  ['CPUs' 'Digital Cameras' 'Dishwashers' 'Freezers' 'Fridge Freezers'
 'Fridges' 'Microwaves' 'Mobile Phones' 'TVs' 'Washing Machines']


In [32]:
# get the indices of the top 5 classes
top_5_classes = np.argsort(predicted_probs, axis=1)[:, -5:][:, ::-1]

#Get the class labels
class_labels = pipeVC.classes_

# add new columns to the DataFrame with class names and scores
for i in range(5):
    # replace class indices with class names
    class_labels = [class_labels[index] for index in top_5_classes[:, i]]
    urdf[f'Top {i+1} Class'] = class_labels

    # get the scores for the top classes
    class_score = np.take_along_axis(predicted_probs, top_5_classes, axis=1)[:, i]
    urdf[f'Top {i+1} Score'] = class_score
urdf.head()


Unnamed: 0,Desc,Correct Category,Top 1 Class,Top 1 Score,Top 2 Class,Top 2 Score,Top 3 Class,Top 3 Score,Top 4 Class,Top 4 Score,Top 5 Class,Top 5 Score
0,Intel Xeon Processor E5-2658 2.10GHz,CPU,CPUs,0.963233,Digital Cameras,0.005636,CPUs,0.004709,CPUs,0.004091,CPUs,0.003984
1,Intel Xeon Processor E5-2658 2.10GHz,CPU,CPUs,0.963233,Digital Cameras,0.005636,CPUs,0.004709,CPUs,0.004091,CPUs,0.003984
2,AMD Ryzen 7 2700X 3.7GHz,CPU,CPUs,0.907794,Digital Cameras,0.014944,CPUs,0.012099,CPUs,0.010541,CPUs,0.009905
3,Intel Core i7-8700K 3.7GHz,CPU,CPUs,0.948559,Digital Cameras,0.008524,CPUs,0.006835,CPUs,0.005598,CPUs,0.005482
4,AMD Ryzen 5 1600X 3.6GHz,CPU,CPUs,0.910773,Digital Cameras,0.014918,CPUs,0.011542,CPUs,0.010141,CPUs,0.009515


In [33]:
urdf.to_excel("Test file predictions.xlsx")