In [1]:
from sklearn.utils import shuffle
import pandas as pd
df = pd.read_csv(
    'https://raw.githubusercontent.com/sugatagh/E-commerce-Text-Classification/main/Dataset/ecommerceDataset.csv', 
    names = ['label', 'description']
)
df = df[['description', 'label']]

df = df.dropna(subset=['description'])

df.drop_duplicates(inplace = True) 
df.reset_index(drop = True, inplace = True) # Resetting index


df = df[0:10000]

In [2]:
df.reset_index(inplace=True)
df.head(3)

Unnamed: 0,index,description,label
0,0,Paper Plane Design Framed Wall Hanging Motivat...,Household
1,1,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",Household
2,2,SAF 'UV Textured Modern Art Print Framed' Pain...,Household


In [3]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])

In [4]:
df.head()

Unnamed: 0,index,description,label,label_encoded
0,1235,"Rikki Knight Compact Mirror, Mary Cassatt Art ...",Household,3
1,13728,Michael Jordan Review One of Amazon's Best Boo...,Books,0
2,19011,"Papaya Men's Cotton Lungi (Multi-Coloured, 2.1...",Clothing & Accessories,1
3,11793,International Relations About the Author Peu G...,Books,0
4,20712,Xs and Os Women's Lace Garter Belt and Sheer S...,Clothing & Accessories,1


In [None]:
import string
import spacy
nlp = spacy.load("en_core_web_lg")
def remove_punct(text):
    
    punctuations =string.punctuation
    mytokens = ''.join(word for word in text if word not in punctuations)
    
    return mytokens

In [None]:
from spacy.lang.en.stop_words import STOP_WORDS

def remove_stopwords(text):
    
    # gives us list stopword in english
    stop_words = spacy.lang.en.stop_words.STOP_WORDS
    
    # add more stop if requried based on domain knowledge 
    #  stop_words + = ['remove','item','rich']
    
    mytokens = [word for word in text if word not in stop_words]
    
    return mytokens

In [None]:
nlp.max_length = 19461259

def tokenization(text):
    
    token = nlp(text)
    
    ## lemma 
    token = [word.lemma_ for word in token]
    
    ## convert tokens into lower case
    token = [ word.lower() for word in token]
    
    return token

In [None]:
def text_norm(text):
        
    punct_text = remove_punct(text)
    
    tokens = tokenization(punct_text)
    
    final_tokens = remove_stopwords(tokens)
    
    return final_tokens

In [None]:
## CountVectorizer
'''
   min_df : is used for removing data values that appear too infrequently.
   max_df : is used for removing data values that appear too frequently.
   
   max_df = 9   means remove the words that appear in more than 9 documents
   max-df =0.90 means remove the words that appear in more tahn 90% documents
'''
from sklearn.feature_extraction.text import CountVectorizer

bow_vectorizer = CountVectorizer(tokenizer=text_norm,max_df=0.9,min_df=2,ngram_range=(1,1))


In [None]:
## TF-IDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf = TfidfVectorizer(min_df=2,max_df=0.90,tokenizer=text_norm,ngram_range=(1,1))

In [None]:
# pipeline creation
from sklearn.pipeline import Pipeline

# models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# spliting dataset
from sklearn.model_selection import train_test_split


from sklearn.metrics import classification_report,ConfusionMatrixDisplay,confusion_matrix

In [None]:
# train:test = 70:30
X = df['description'] 
y = df['label_encoded']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=7)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

In [None]:
print("Count Vectorizer + Logistic Regression  \n\n ")
## create a pipeline using Count Vectorizer
pipe_bow = Pipeline([('vectorizer', bow_vectorizer),
                    ('classifier', MultinomialNB())])

## fit data
pipe_bow.fit(X_train,y_train)
y_pred_bow = pipe_bow.predict(X_test)

print(classification_report(y_test,y_pred_bow))
ConfusionMatrixDisplay(confusion_matrix(y_test,y_pred_bow),display_labels=['Books','Clothing','Electronics','Household']).plot()

In [None]:
print("Count Vectorizer + Logistic Regression  \n\n ")
## create a pipeline using Count Vectorizer
pipe_bow = Pipeline([('vectorizer', bow_vectorizer),
                    ('classifier', KNeighborsClassifier())])

## fit data
pipe_bow.fit(X_train,y_train)
y_pred_bow = pipe_bow.predict(X_test)

print(classification_report(y_test,y_pred_bow))
ConfusionMatrixDisplay(confusion_matrix(y_test,y_pred_bow),display_labels=['Books','Clothing','Electronics','Household']).plot()

In [None]:
print("TF_IDF + Logistic Regression  \n\n ")
## create a pipeline using TF-IDF
pipe_tf = Pipeline([
                    ('vectorizer', tf_idf),
                    ('classifier', MultinomialNB())])

## fit data
pipe_tf.fit(X_train,y_train)
y_pred_tf = pipe_tf.predict(X_test)

print(classification_report(y_test,y_pred_tf))
ConfusionMatrixDisplay(confusion_matrix(y_test,y_pred_tf),display_labels=['Books','Clothing','Electronics','Household']).plot()

In [None]:
print("Count Vectorizer + Logistic Regression  \n\n ")
## create a pipeline using Count Vectorizer
pipe_bow = Pipeline([('vectorizer', tf_idf),
                    ('classifier', KNeighborsClassifier())])

## fit data
pipe_bow.fit(X_train,y_train)
y_pred_bow = pipe_bow.predict(X_test)

print(classification_report(y_test,y_pred_bow))
ConfusionMatrixDisplay(confusion_matrix(y_test,y_pred_bow),display_labels=['Books','Clothing','Electronics','Household']).plot()

In [None]:
import spacy


In [7]:
import spacy

# Load spaCy language model
nlp = spacy.load('en_core_web_md')  # or 'en_core_web_sm' if the medium model is not available

# Function to get the vector for each description
def get_vector(text):
    return nlp(text).vector

# Apply the function to the descriptions column and create a new column with the vectors
df['description_vector'] = df['description'].apply(get_vector)


In [10]:
df.head()

Unnamed: 0,index,description,label,label_encoded,description_vector
0,1235,"Rikki Knight Compact Mirror, Mary Cassatt Art ...",Household,3,"[-1.8166245, 0.7556441, -1.8953093, 0.735171, ..."
1,13728,Michael Jordan Review One of Amazon's Best Boo...,Books,0,"[-1.9969743, -0.06636697, -0.70505005, -0.1701..."
2,19011,"Papaya Men's Cotton Lungi (Multi-Coloured, 2.1...",Clothing & Accessories,1,"[-3.7544067, -2.0045116, -1.2673969, 2.5446517..."
3,11793,International Relations About the Author Peu G...,Books,0,"[-2.3403058, -1.0976384, 1.0231518, 1.211159, ..."
4,20712,Xs and Os Women's Lace Garter Belt and Sheer S...,Clothing & Accessories,1,"[-2.4635875, -0.64901465, -1.610657, 1.0111321..."


In [11]:
import numpy as np
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    np.stack(df['description_vector'].values), df['label_encoded'].values, test_size=0.2, random_state=42
)

In [37]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

class MyDataset(Dataset):
    def __init__(self, data, target, transform=None):
        self.data = torch.from_numpy(data).float()
        self.target = torch.from_numpy(target).long()
        self.transform = transform
        
    def __getitem__(self, index):
        x = self.data[index]
        y = self.target[index]
        
        if self.transform:
            x = self.transform(x)
        
        return x, y
    
    def __len__(self):
        return len(self.data)

# Create datasets
dataset_train = MyDataset(X_train, y_train)
dataset_test = MyDataset(X_test, y_test)


In [38]:
num_epochs = 25
num_classes = 3
learning_rate = 0.001
batch_size=100
vector_len=len(nlp(df["description"][0]).vector)

In [39]:
train_loader = DataLoader(dataset=dataset_train, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=dataset_test, batch_size=batch_size, shuffle=False)

In [40]:
class ConvNet(nn.Module):
    def __init__(self):
        super(ConvNet, self).__init__()
        self.layer13 = nn.Sequential(
            nn.Conv2d(1, 100, kernel_size=(3,300), stride=1,padding=0), 
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(70,1), stride=1))
        self.layer14 = nn.Sequential(
            nn.Conv2d(1, 100, kernel_size=(4,300), stride=1,padding=0), 
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(69,1), stride=1))
        self.layer15 = nn.Sequential(
            nn.Conv2d(1, 100, kernel_size=(5,300), stride=1,padding=0), 
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(68,1), stride=1))
        #self.layer2 = nn.Sequential(
            #nn.Conv2d(15, 30, kernel_size=5, stride=1, padding=0),
            #nn.ReLU(),
            #nn.MaxPool2d(kernel_size=2, stride=2))
        self.drop_out = nn.Dropout()
        #concat operation
        self.fc1 = nn.Linear(1 * 1 * 100 * 3, 30)
        self.fc2 = nn.Linear(30, 3)
        #self.fc3 = nn.Linear(100,3)
        
    def forward(self, x):
        x3 = self.layer13(x)
        x4 = self.layer14(x)
        x5 = self.layer15(x)
        x3 = x3.reshape(x3.size(0), -1)
        x4 = x4.reshape(x4.size(0), -1)
        x5 = x5.reshape(x5.size(0), -1)
        x3 = self.drop_out(x3)
        x4 = self.drop_out(x4)
        x5 = self.drop_out(x5)
        out = torch.cat((x3,x4,x5),1)
        out = self.fc1(out)
        out = self.fc2(out)
        return(out)

In [41]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [42]:
model = ConvNet()
model.to(device) #CNN to GPU


# Loss and optimizer
criterion = nn.CrossEntropyLoss()
#CrossEntropyLoss function combines both a SoftMax activation and a cross entropy loss function in the same function

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [43]:
# Training the model
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    for data, target in train_loader:
        print(data.shape)
        print(target.shape)
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, target)
        loss.backward()
        optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluate the model
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for data, target in test_loader:
        outputs = model(data)
        _, predicted = torch.max(outputs.data, 1)
        total += target.size(0)
        correct += (predicted == target).sum().item()
    
    print(f'Accuracy of the model on the test set: {100 * correct / total:.2f}%')

torch.Size([100, 300])
torch.Size([100])


RuntimeError: Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of size: [100, 300]

In [44]:
x = next(iter(train_loader))

In [34]:
x[0].shape

torch.Size([100, 1, 300, 1])