In [1]:
import torch 
import torch.nn as nn 
import torch.nn.functional as F 
from torch.utils.data import dataset, DataLoader

import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score,classification_report
import matplotlib.pyplot as plt
import nltk

In [2]:
column_names = ['Item', 'Review']
data = pd.read_csv("ecommerceDataset.csv", names = column_names)
data.head(5)

Unnamed: 0,Item,Review
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [3]:
data.Item.unique()

array(['Household', 'Books', 'Clothing & Accessories', 'Electronics'],
      dtype=object)

In [4]:
data.describe()

Unnamed: 0,Item,Review
count,50425,50424
unique,4,27802
top,Household,Think & Grow Rich About the Author NAPOLEON HI...
freq,19313,30


In [5]:
data.dropna(inplace=True)

In [6]:
data.drop_duplicates(inplace=True)
data.shape

(27802, 2)

In [7]:
new_data = data.copy()
new_data = new_data.reset_index()

In [8]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
ps = PorterStemmer()

In [9]:
corpus = []

for row in range((new_data.shape[0])):

    clean_row = re.sub('[^a-zA-z]' , ' ', new_data['Review'][row])
    clean_row = clean_row.lower()
    clean_row = clean_row.split()
    stem_words = [ps.stem(word) for word in clean_row if word not in (stopwords.words('english'))]
    complete_clean_words = ''.join(stem_words)

    corpus.append(complete_clean_words)



## CountVectorizer - Convert Text to numeric vectors

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features = 2000, max_df = 0.6,min_df =10)
numeric_vectors = tfidf.fit_transform(corpus).toarray()

In [19]:
numeric_vectors.shape

(27802, 5)

In [20]:
y = data['Item'].values
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)
y

array([3, 3, 3, ..., 2, 2, 2])

In [21]:
#Data train test split 
X_train, X_test, y_train,y_test = train_test_split(numeric_vectors, y, test_size=0.3, random_state=0)
X_train.shape, X_test.shape, y_train.shape,y_test.shape

((19461, 5), (8341, 5), (19461,), (8341,))

In [22]:
input_size = X_train.shape[1]
hidden_size1= 200
hidden_size2 = 100
output_size = 4
batch_size = 32

In [23]:
## Network building

class Review_classifier(nn.Module):
    def __init__(self):
        super(Review_classifier, self).__init__()

        self.layer1 = nn.Linear(input_size, hidden_size1)
        self.layer2 = nn.Linear(hidden_size1, hidden_size2)
        self.layer3 = nn.Linear(hidden_size2, output_size)

    def forward(self,x):
        outputs = F.relu(self.layer1(x))
        outputs = F.relu(self.layer2(outputs))

        outputs = F.relu(self.layer3(outputs))

        return outputs

model = Review_classifier()
print(model)

Review_classifier(
  (layer1): Linear(in_features=5, out_features=200, bias=True)
  (layer2): Linear(in_features=200, out_features=100, bias=True)
  (layer3): Linear(in_features=100, out_features=4, bias=True)
)


## Model parameters and training

In [24]:
X_train = torch.from_numpy(X_train).float()
X_test = torch.from_numpy(X_test).float()

y_train = torch.from_numpy(y_train)
y_test = torch.from_numpy(y_test)

In [25]:
epochs = 200
learning_rate = 0.01
criterion = nn.CrossEntropyLoss()
adam_optim = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [26]:
#Training

running_loss = []
for epoch in range(epochs):

    adam_optim.zero_grad()
    y_pred = model(X_train)
    loss = criterion(y_pred, y_train)

    loss.backward()
    running_loss.append(loss)
    adam_optim.step()

    print(f"Epoch {epoch}, Loss : {loss.item()}")
    

Epoch 0, Loss : 1.3888499736785889
Epoch 1, Loss : 1.3861902952194214
Epoch 2, Loss : 1.3902260065078735
Epoch 3, Loss : 1.3858072757720947
Epoch 4, Loss : 1.3480374813079834
Epoch 5, Loss : 1.350712776184082
Epoch 6, Loss : 1.361232042312622
Epoch 7, Loss : 1.354092001914978
Epoch 8, Loss : 1.3454474210739136
Epoch 9, Loss : 1.3448598384857178
Epoch 10, Loss : 1.3488972187042236
Epoch 11, Loss : 1.3511933088302612
Epoch 12, Loss : 1.3502273559570312
Epoch 13, Loss : 1.347348690032959
Epoch 14, Loss : 1.3447182178497314
Epoch 15, Loss : 1.343867301940918
Epoch 16, Loss : 1.3449058532714844
Epoch 17, Loss : 1.346077561378479
Epoch 18, Loss : 1.345953345298767
Epoch 19, Loss : 1.344751238822937
Epoch 20, Loss : 1.343845009803772
Epoch 21, Loss : 1.3441561460494995
Epoch 22, Loss : 1.344933032989502
Epoch 23, Loss : 1.3449870347976685
Epoch 24, Loss : 1.3443994522094727
Epoch 25, Loss : 1.3438695669174194
Epoch 26, Loss : 1.343873143196106
Epoch 27, Loss : 1.3442288637161255
Epoch 28, Los

In [28]:
predictions = model(X_test)
predictions

tensor([[0.0000, 0.0000, 0.0000, 0.6193],
        [0.0000, 0.0000, 0.0000, 0.6193],
        [0.0000, 0.0000, 0.0000, 0.6193],
        ...,
        [0.0000, 0.0000, 0.0000, 0.6193],
        [0.0000, 0.0000, 0.0000, 0.6193],
        [0.0000, 0.0000, 0.0000, 0.6193]], grad_fn=<ReluBackward0>)