In [1]:
#import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
# Define stop words BEFORE using the function
stop_words = set(stopwords.words('english'))
import matplotlib.pyplot as plt

#Import the dataset
df = pd.read_csv("D:/DataScience/Nlp/Reviews.csv")


[nltk_data] Downloading package stopwords to C:\Users\kpk
[nltk_data]     laptops\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
print(df.shape)

In [None]:
df.head()

In [2]:
# Keep only relevant columns
df = df[['Score','Text']].dropna()
# Map numerical rating to sentiment
def map_sentiment(Score):
    if Score <=2:
        return "negative"
    elif Score == 3:
        return "neutral"
    else:
        return "positive"

df['Sentiment'] = df['Score'].apply(map_sentiment)

In [3]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)  # remove URLs
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # keep only letters
    text = re.sub(r"\s+", " ", text).strip()
    words = [w for w in text.split() if w not in stop_words]
    return " ".join(words)

In [4]:
df['Clean_text'] = df['Text'].apply(clean_text)

In [None]:
df.head()

In [None]:
#Data Exploration EDA

# Sentiment distribution
sns.countplot(x='Sentiment', data=df)
plt.title("Sentiment Distribution")
plt.show()

# Average text length
df['Text_Length'] = df['Clean_text'].apply(len)
sns.histplot(df['Text_Length'], bins=50)
plt.title("Text Length Distribution")
plt.show()

In [5]:
#Train test split
#import libraries
from sklearn.model_selection import train_test_split
X = df['Clean_text']
y = df['Sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("Train size:", len(X_train))
print("Test size:", len(X_test))

Train size: 454763
Test size: 113691


In [6]:
#Baseline Model (TF-IDF + Logistic Regression)
#baseline sentiment classifier
#TF-IDF Vectorization: Converts text into numerical features.
#Logistic Regression: Fast, interpretable baseline for text classification.

#import libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

#TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features = 10000, ngram_range=(1,2))
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec =  vectorizer.transform(X_test)


#Logistic Regression model

lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_vec, y_train)

#Predictions

y_pred = lr_model.predict(X_test_vec)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))





Accuracy: 0.87906694461303

Classification Report:
               precision    recall  f1-score   support

    negative       0.77      0.71      0.74     16407
     neutral       0.59      0.24      0.34      8528
    positive       0.91      0.97      0.94     88756

    accuracy                           0.88    113691
   macro avg       0.75      0.64      0.67    113691
weighted avg       0.86      0.88      0.86    113691



In [None]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='tensorflow')

import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"


#Advanced Model (BERT Fine-Tuning)
#use Transformers (BERT) for state-of-the-art performance.
#import libraries 
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch

# Prepare Hugging Face Dataset
train_df = pd.DataFrame({'text': X_train, 'label': y_train.map({'negative':0, 'neutral':1, 'positive':2})})
test_df = pd.DataFrame({'text': X_test, 'label': y_test.map({'negative':0, 'neutral':1, 'positive':2})})

train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)

# Tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

def tokenize(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=128)

train_ds = train_ds.map(tokenize, batched=True)
test_ds = test_ds.map(tokenize, batched=True)

train_ds.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_ds.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# Model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

# Training setup
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
)

trainer.train()






Map:   0%|          | 0/454763 [00:00<?, ? examples/s]

Map:   0%|          | 0/113691 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss


In [None]:
df.head()

In [None]:

import bertopic
import sentence_transformers
import umap
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.cluster import MiniBatchKMeans

# Sample subset (BERTopic can be memory intensive)
sample_texts = df['Clean_text'].sample(2000, random_state=42).tolist()

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# Define KMeans clustering (replace default hdbscan)
kmeans_model = MiniBatchKMeans(n_clusters=20, random_state=42)

# Pass the custom clustering model to BERTopic
topic_model = BERTopic(
    embedding_model=embedding_model,
    hdbscan_model=kmeans_model,   # replaces hdbscan
    verbose=True
)

topics, probs = topic_model.fit_transform(sample_texts)

# Show top topics
topic_model.get_topic_info().head(10)


In [None]:
topic_model.visualize_topics()


In [None]:
topic_model.visualize_barchart(top_n_topics=10)


In [None]:
topic_model.get_representative_docs(2)  # for topic 2 (coffee)


In [None]:
#SAVE THE MODEL

import joblib

#  Save your Logistic Regression sentiment model and TF-IDF vectorizer
joblib.dump(lr_model, "sentiment_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

# Save your BERTopic model (includes the SentenceTransformer and KMeans)
topic_model.save("bertopic_model")
