## Import Libraries and Load Data

In [1]:
import pandas as pd
import numpy as np
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

nltk.download('stopwords')
nltk.download('vader_lexicon')

df = pd.read_csv("NLP_Task.csv")

print(df.shape)
df.head()


(2072, 4)


[nltk_data] Downloading package stopwords to C:\Users\Harsh
[nltk_data]     Patel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to C:\Users\Harsh
[nltk_data]     Patel\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,Brand Name,T-shirt Name,Review,sentiment
0,infirax,women round neck black t shirt,Fabric and size is good,
1,infirax,women round neck black t shirt,Too good,
2,infirax,women round neck black t shirt,Just loved the fabric of this tshirt ðŸ˜\r\nI...,
3,infirax,women round neck black t shirt,This product is really good and quality is ama...,
4,infirax,women round neck black t shirt,Perfect ðŸ‘ðŸ‘ðŸ‘,


## Basic EDA (Exploratory Data Analysis)

In [2]:
print(df.info())

print(df.isnull().sum())

print(df['Review'].head(10))

df['review_len'] = df['Review'].astype(str).apply(len)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2072 entries, 0 to 2071
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Brand Name    2072 non-null   object
 1   T-shirt Name  2072 non-null   object
 2   Review        2072 non-null   object
 3   sentiment     127 non-null    object
dtypes: object(4)
memory usage: 64.9+ KB
None
Brand Name         0
T-shirt Name       0
Review             0
sentiment       1945
dtype: int64
0                              Fabric and size is good
1                                             Too good
2    Just loved the fabric of this tshirt ðŸ˜\r\nI...
3    This product is really good and quality is ama...
4                                 Perfect ðŸ‘ðŸ‘ðŸ‘
5                  Very nice fitting . So soft fabric.
6                              Amazing cotton product,
7                                            Very nice
8                                     Very bad quality
9   

In [3]:
print("Unique brands:", df['Brand Name'].nunique())
print("Unique T-shirt names:", df['T-shirt Name'].nunique())

Unique brands: 18
Unique T-shirt names: 23


## Text Preprocessing

In [4]:
stop_words = set(stopwords.words('english'))

def clean_text(text, remove_stopwords=True):
    text = str(text).lower()
    
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    
    text = re.sub(r"\d+", "", text)
    
    text = text.translate(str.maketrans("", "", string.punctuation))
    
    text = re.sub(r"\s+", " ", text).strip()
    
    if remove_stopwords:
        tokens = text.split()
        tokens = [w for w in tokens if w not in stop_words]
        text = " ".join(tokens)
    
    return text

df['clean_review'] = df['Review'].apply(clean_text)
df[['Review', 'clean_review']].head()

Unnamed: 0,Review,clean_review
0,Fabric and size is good,fabric size good
1,Too good,good
2,Just loved the fabric of this tshirt ðŸ˜\r\nI...,loved fabric tshirt ðÿ˜ amazing comfortable
3,This product is really good and quality is ama...,product really good quality amazing go â¤ï¸âœ¨
4,Perfect ðŸ‘ðŸ‘ðŸ‘,perfect ðÿ‘ðÿ‘ðÿ‘


## Automatic Sentiment Labelling with VADER

In [5]:
sia = SentimentIntensityAnalyzer()

def get_vader_sentiment(text, pos_threshold=0.05, neg_threshold=-0.05):
    scores = sia.polarity_scores(text)
    compound = scores['compound']
    if compound >= pos_threshold:
        return "positive"
    elif compound <= neg_threshold:
        return "negative"
    else:
        return "neutral"

df['sentiment'] = df['clean_review'].apply(get_vader_sentiment)

df['sentiment'].value_counts()


sentiment
positive    1807
neutral      197
negative      68
Name: count, dtype: int64

## Prepare Data for Machine Learning

In [6]:
X = df['clean_review']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train size:", X_train.shape[0])
print("Test size:", X_test.shape[0])

Train size: 1657
Test size: 415


## Text Vectorization (TF-IDF)

In [7]:
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),  # unigrams + bigrams
)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

X_train_tfidf.shape, X_test_tfidf.shape


((1657, 3083), (415, 3083))

## Train a Classifier (Logistic Regression)

In [8]:
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_tfidf, y_train)

y_pred = clf.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:\n")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.9228915662650602

Classification Report:

              precision    recall  f1-score   support

    negative       1.00      0.50      0.67        14
     neutral       1.00      0.36      0.53        39
    positive       0.92      1.00      0.96       362

    accuracy                           0.92       415
   macro avg       0.97      0.62      0.72       415
weighted avg       0.93      0.92      0.91       415


Confusion Matrix:

[[  7   0   7]
 [  0  14  25]
 [  0   0 362]]


## Predict Sentiment for New Reviews (Demo)

In [9]:
def predict_sentiment(review_text):
    clean = clean_text(review_text)
    vec = tfidf.transform([clean])
    pred = clf.predict(vec)[0]
    return pred

examples = [
    "The fabric is really soft and comfortable.",
    "Very poor quality, I did not like it at all.",
    "nyc t-shirt"
]

for r in examples:
    print(r, "=>", predict_sentiment(r))

The fabric is really soft and comfortable. => positive
Very poor quality, I did not like it at all. => negative
nyc t-shirt => neutral


## Save Final Labelled Dataset

In [10]:
df.to_csv("NLP_Task_with_sentiment.csv", index=False)
print("Saved file: NLP_Task_with_sentiment.csv")

Saved file: NLP_Task_with_sentiment.csv
