In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [3]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kk061\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kk061\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
df=pd.read_csv('ecommerceDataset.csv',header=None, names=["category", "description"])
df.head()

Unnamed: 0,category,description
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [5]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [6]:
from nltk.stem import SnowballStemmer
import re
stemmer = SnowballStemmer("english")

def preprocess_text(text):
    if not isinstance(text, str): 
        return ""  
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words and word.isalnum()]
    words = [stemmer.stem(word) for word in words]
    return ' '.join(words)
df['processed_description'] = df['description'].apply(preprocess_text)


In [7]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['category'])

In [8]:
X = df['processed_description']
y = df['label']

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report


In [12]:
models = {
    "Logistic Regression": make_pipeline(vectorizer, LogisticRegression(max_iter=1000)),
    "Random Forest": make_pipeline(vectorizer, RandomForestClassifier(n_estimators=100)),
    "Support Vector Machine": make_pipeline(vectorizer, SVC(kernel='linear')),
    "Naive Bayes": make_pipeline(vectorizer, MultinomialNB())
}

In [13]:
for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)   
    y_pred = model.predict(X_test)   
    print(f"\nClassification Report for {model_name}:\n")
    print(classification_report(y_test, y_pred))

Training Logistic Regression...

Classification Report for Logistic Regression:

              precision    recall  f1-score   support

           0       0.97      0.95      0.96      2387
           1       0.98      0.98      0.98      1744
           2       0.96      0.94      0.95      2067
           3       0.96      0.97      0.96      3887

    accuracy                           0.96     10085
   macro avg       0.96      0.96      0.96     10085
weighted avg       0.96      0.96      0.96     10085

Training Random Forest...

Classification Report for Random Forest:

              precision    recall  f1-score   support

           0       0.98      0.97      0.98      2387
           1       0.98      0.97      0.98      1744
           2       0.98      0.95      0.96      2067
           3       0.96      0.98      0.97      3887

    accuracy                           0.97     10085
   macro avg       0.97      0.97      0.97     10085
weighted avg       0.97      0.97  

In [15]:
new_data = ["iVoltaa 3.4A Dual Port Car Charger with Micro USB Cable - Black"]  # Replace with your actual text
logistic_regression_model = models["Logistic Regression"]
new_predictions = logistic_regression_model.predict(new_data)
decoded_predictions = label_encoder.inverse_transform(new_predictions)
print(f"Predicted categories for the input data: {decoded_predictions}")

Predicted categories for the input data: ['Electronics']
