# Flipkart Customer Reviews

Dataset Source: [Kaggle](https://www.kaggle.com/datasets/niraliivaghani/flipkart-product-customer-reviews-dataset)

## <b>Topic Modelling</b>

### <b><i>Using BERTopic (with spacy model 'en_core_web_sm' for embedding)</i></b>

<br><br><br>


## Libraries and Data

In [1]:
# import locale
# locale.getpreferredencoding = lambda: "UTF-8"

!pip install bertopic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bertopic
  Downloading bertopic-0.14.1-py2.py3-none-any.whl (120 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m120.7/120.7 KB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting umap-learn>=0.5.0
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.2/88.2 KB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transformers>=0.4.1
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 KB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting hdbscan>=0.8.29
  Downloading hdbscan-0.8.29.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m40.1 MB/s[0m 

In [2]:
import pandas as pd
import spacy
from bertopic import BERTopic


import re
import nltk.corpus
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Flipkart Product Reviews - Kaggle/Dataset-SA.csv')
data['sentiment_code'] = pd.Categorical(data.Sentiment).codes
data['sentiment_code'] = data['sentiment_code'].astype('Int64')
data.dropna(inplace = True)

In [5]:
print(data.product_name.nunique())
data.product_name.value_counts()

841


cello Pack of 18 Opalware Cello Dazzle Lush Fiesta Opalware Dinner Set, 18 Pieces Dinner SetÃÂ ÃÂ (White, Microwave Safe)                                                               6005
Lakm?? Eyeconic Kajal Twin Pack??????????(Deep Black, 0.7 g)                                                                                                                              5000
Mi 5A 80 cm (32 inch) HD Ready LED Smart Android TV with Dolby Audio (2022 Model)                                                                                                         2205
cello Pack of 18 Opalware Cello Dazzle Lush Fiesta Opalware Dinner Set 18 Pieces Dinner SetWhite Microwave Safe                                                                           2095
Home Sizzler 153 cm 502 ft Polyester Room Darkening Window Curtain Pack Of 2Floral Brown                                                                                                  2012
                                             

## Pre-process Text

In [6]:
# define function to clean text

stop = stopwords.words('english')
add_stopwords = ['aqua', 'black', 'blue', 'fuchsia', 'gray', 'green', 'lime', 'maroon', 'navy', 'olive', 'purple', 'red', 'silver', 'teal', 'white', 'yellow', 'pack', 'light']
stop.extend(add_stopwords)

lemmatizer = WordNetLemmatizer()


def clean_text(text):

  text = text.lower()
  text = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", text) # Punctuations, URLs and @
  text = " ".join([word for word in text.split() if word not in (stop)]) # Stopwords
  text = " ".join([lemmatizer.lemmatize(word) for word in text.split()]) # Stemming
  
  return(text)


In [7]:
data_sub = data[['product_name', 'Summary']].copy(deep = True)
data_sub = data_sub.applymap(clean_text)
data_sub

Unnamed: 0,product_name,Summary
0,candes 12 l roompersonal air coolerwhite elega...,great cooler excellent air flow price amazing ...
1,candes 12 l roompersonal air coolerwhite elega...,best budget 2 fit cooler nice cooling
2,candes 12 l roompersonal air coolerwhite elega...,quality good power air decent
3,candes 12 l roompersonal air coolerwhite elega...,bad product fan
4,candes 12 l roompersonal air coolerwhite elega...,ok ok product
...,...,...
205047,cello 18 opalware cello dazzle lush fiesta opa...,good product
205048,cello 18 opalware cello dazzle lush fiesta opa...,nice
205049,cello 18 opalware cello dazzle lush fiesta opa...,nice fast delivery
205050,cello 18 opalware cello dazzle lush fiesta opa...,awesome product


## Topic Modelling

In [12]:
# Topic modelling based on product name text

docs_product_name = data_sub.product_name
nlp = spacy.load('en_core_web_sm', exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])

topic_model_product_name = BERTopic(embedding_model=nlp, nr_topics=50)
topics, probs = topic_model_product_name.fit_transform(docs_product_name)

fig = topic_model_product_name.visualize_topics()
fig.show()


In [13]:
# Topic modelling based on Summary text

docs_Summary = data_sub.Summary
# nlp = spacy.load('en_core_web_sm', exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])

topic_model_Summary = BERTopic(embedding_model=nlp, nr_topics=20)
topics, probs = topic_model_Summary.fit_transform(docs_Summary)

fig = topic_model_Summary.visualize_topics()
fig.show()
