# Flipkart Customer Reviews

Dataset Source: [Kaggle](https://www.kaggle.com/datasets/niraliivaghani/flipkart-product-customer-reviews-dataset)

## <b>Sentiment Analysis</b>

### <b><i>Fine-tuning a DistilBERT model on the review data</i></b>

<br><br><br>
### Results:

Sentiment Categories - <i>Positive, Negative, Neutral</i> 

Evaluated on the 160,379 reviews held out from training (micro-averaged):

- Recall: 0.94
- Precision: 0.94


<br><br><br>

## Libraries and Data

In [None]:
import torch
# Report whether PyTorch can see a CUDA device before any further setup.
torch.cuda.is_available()

True

In [None]:
# import locale
# locale.getpreferredencoding = lambda: "UTF-8"
!pip install datasets transformers


In [None]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
from datasets import load_dataset, Dataset

import torch
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_metric


In [None]:
# Load the raw review table
csv_path = '/content/drive/MyDrive/Colab Notebooks/kaggle_flipkart/data/Dataset-SA.csv'
data = pd.read_csv(csv_path)

# Encode the sentiment label as a nullable integer category code, append it as
# a new column, then discard every row that contains a missing value
sentiment_codes = pd.Series(
    pd.Categorical(data['Sentiment']).codes,
    index=data.index,
).astype('Int64')
data = data.assign(sentiment_code=sentiment_codes).dropna()

# Wrap the two columns used for modelling in a Hugging Face Dataset
model_columns = ['Summary', 'sentiment_code']
data_2 = Dataset.from_pandas(data.loc[:, model_columns])


In [None]:
# Preview the first five cleaned rows, including the new sentiment_code column
data.head(n=5)

Unnamed: 0,product_name,product_price,Rate,Review,Summary,Sentiment,sentiment_code
0,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,super!,great cooler excellent air flow and for this p...,positive,2
1,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,awesome,best budget 2 fit cooler nice cooling,positive,2
2,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,the quality is good but the power of air is de...,positive,2
3,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,1,useless product,very bad product its a only a fan,negative,0
4,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,ok ok product,neutral,1


In [None]:
# Number of rows left after dropping incomplete records
data.shape[0]

180379

In [None]:
# Class balance of the sentiment labels
data['Sentiment'].value_counts()

positive    147171
negative     24401
neutral       8807
Name: Sentiment, dtype: int64

In [None]:
# Show the first remaining review text. Positional access is used because the
# row labelled 0 is not guaranteed to survive the earlier dropna(), in which
# case a label lookup would raise KeyError.
data.Summary.iloc[0]

'great cooler excellent air flow and for this price its so amazing and unbelievablejust love it'

## Modelling

In [None]:
# Re-check that a CUDA device is visible to PyTorch before modelling starts.
torch.cuda.is_available()

True

In [None]:
# Train-Test data split
#
# Shuffle once with a fixed seed, then use the first TRAIN_SIZE rows for
# training and every remaining row for testing. Selecting by `range` avoids
# materialising index lists with one Python int per row.
TRAIN_SIZE = 20000
SPLIT_SEED = 42

shuffled = data_2.shuffle(seed=SPLIT_SEED)
train = shuffled.select(range(TRAIN_SIZE))
test = shuffled.select(range(TRAIN_SIZE, len(shuffled)))

# The two splits must partition the dataset: no row lost, none duplicated.
assert len(train) + len(test) == len(data_2)

print(train[0])
print(test[0])

{'Summary': 'good quality product i think price little bit high otherwise awesome stuff', 'sentiment_code': 2, '__index_level_0__': 138001}
{'Summary': 'gud product', 'sentiment_code': 2, '__index_level_0__': 17880}


In [None]:
# Fetch the tokenizer that belongs to the DistilBERT checkpoint
tokenizer_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

In [None]:
# Tokenize train and test data

def tokenize_function(df):
    """Tokenize the ``Summary`` entry of ``df`` with the notebook's tokenizer.

    ``df`` is indexed by the key ``"Summary"``; the value found there is handed
    to the module-level ``tokenizer`` with truncation enabled, and whatever the
    tokenizer returns is passed straight back to the caller.
    """
    summaries = df["Summary"]
    encoded = tokenizer(summaries, truncation=True)
    return encoded

# Run the tokenizer over both splits in batched mode (train first, then test)
tokenized_train, tokenized_test = (
    split.map(tokenize_function, batched=True) for split in (train, test)
)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/160379 [00:00<?, ? examples/s]

In [None]:
# Process train and test data to match the requirements of distilBERT

def _prepare_for_trainer(dataset):
    """Return ``dataset`` with the column layout used for training.

    Drops the ``__index_level_0__`` column and renames ``Summary`` -> ``text``
    and ``sentiment_code`` -> ``labels``. Each step is applied only when the
    source column is still present, so the cell can be re-run safely on a
    dataset that has already been prepared.
    """
    if '__index_level_0__' in dataset.column_names:
        dataset = dataset.remove_columns('__index_level_0__')

    for old_name, new_name in (("Summary", "text"), ("sentiment_code", "labels")):
        if old_name in dataset.column_names:
            dataset = dataset.rename_column(old_name, new_name)

    return dataset


tokenized_train = _prepare_for_trainer(tokenized_train)
tokenized_test = _prepare_for_trainer(tokenized_test)


In [None]:
# Inspect the prepared test split: its column names and row count.
tokenized_test

Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 160379
})

In [None]:
# Define data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Class index <-> class name mapping stored in the model config, so that
# inference returns readable names instead of generic LABEL_0/1/2.
# The indices follow the codes produced by pd.Categorical on the Sentiment
# column, as shown by the data preview: negative=0, neutral=1, positive=2.
id2label = {0: "negative", 1: "neutral", 2: "positive"}
label2id = {name: index for index, name in id2label.items()}

# Define DistilBERT
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.w

In [None]:
# Define function to compute metrics

def compute_metrics(eval_pred):
    """Compute precision and recall from evaluation logits and labels.

    Parameters
    ----------
    eval_pred
        Something that unpacks into ``(logits, labels)``. The predicted class
        of each example is the argmax of its logits over the last axis.

    Returns
    -------
    dict
        ``"recall"`` and ``"precision"``: micro-averaged scores. With exactly
        one label per example, both equal the fraction of correct predictions.
        ``"recall_macro"`` and ``"precision_macro"``: unweighted mean of the
        per-class scores over every class that occurs in the labels or the
        predictions. A class with an empty denominator contributes 0. These
        expose minority-class performance that the micro average hides.

    All four values are 0.0 when there are no examples.

    The scores are computed directly with numpy, so nothing is downloaded or
    re-loaded each time the trainer evaluates.
    """
    logits, labels = eval_pred
    labels = np.asarray(labels)

    total = labels.size
    if total == 0:
        return {
            "recall": 0.0,
            "precision": 0.0,
            "recall_macro": 0.0,
            "precision_macro": 0.0,
        }

    predictions = np.argmax(logits, axis=-1)

    # Micro average: every example is counted once as predicted and once as
    # actual, so precision and recall share the same numerator and denominator.
    micro = int(np.sum(predictions == labels)) / total

    per_class_recall = []
    per_class_precision = []
    for cls in np.union1d(labels, predictions):
        is_actual = labels == cls
        is_predicted = predictions == cls
        hits = int(np.sum(is_actual & is_predicted))
        n_actual = int(np.sum(is_actual))
        n_predicted = int(np.sum(is_predicted))
        per_class_recall.append(hits / n_actual if n_actual else 0.0)
        per_class_precision.append(hits / n_predicted if n_predicted else 0.0)

    return {
        "recall": micro,
        "precision": micro,
        "recall_macro": float(np.mean(per_class_recall)),
        "precision_macro": float(np.mean(per_class_precision)),
    }

In [None]:
# Log in to your Hugging Face account; the training arguments defined in this
# notebook set push_to_hub=True.
# Get your API token here: https://huggingface.co/settings/tokens
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Build the Trainer used for both fine-tuning and evaluation

# Used as the local output directory of the run
repo_name = 'distilbert_finetuned_flipkart_product_reviews_kaggle'

# Hyper-parameters and checkpoint/upload settings
training_config = dict(
    output_dir=repo_name,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy="epoch",
    push_to_hub=True,
)
training_args = TrainingArguments(**training_config)

# Wire together model, data splits, tokenizer, collator and metric function
trainer_config = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_config)

Cloning https://huggingface.co/prajwalkhairnar/distilbert_finetuned_flipkart_product_reviews_kaggle into local empty directory.


In [None]:
# Fine-tune the model; the value returned by train() is shown as the cell result
train_output = trainer.train()
train_output

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.2559
1000,0.1946
1500,0.1596
2000,0.1498
2500,0.1486
3000,0.1065
3500,0.1151
4000,0.0929
4500,0.0889
5000,0.0869


TrainOutput(global_step=12500, training_loss=0.08212771041870118, metrics={'train_runtime': 1328.5725, 'train_samples_per_second': 150.538, 'train_steps_per_second': 9.409, 'total_flos': 2696856652046880.0, 'train_loss': 0.08212771041870118, 'epoch': 10.0})

In [None]:
# Score the fine-tuned model on the trainer's evaluation dataset
eval_metrics = trainer.evaluate()
eval_metrics

  load_recall = load_metric('recall')


{'eval_loss': 0.3674069344997406,
 'eval_recall': 0.9395182661071587,
 'eval_precision': 0.9395182661071587,
 'eval_runtime': 268.7207,
 'eval_samples_per_second': 596.824,
 'eval_steps_per_second': 37.303,
 'epoch': 10.0}

In [None]:
# Upload the model to the Hub.
# The call's return value is displayed as the cell result.
trainer.push_to_hub() 

Upload file runs/May01_19-25-17_0e70f2bbf7f0/events.out.tfevents.1682969122.0e70f2bbf7f0.27795.0: 100%|#######…

Upload file runs/May01_19-25-17_0e70f2bbf7f0/events.out.tfevents.1682970719.0e70f2bbf7f0.27795.2: 100%|#######…

To https://huggingface.co/prajwalkhairnar/distilbert_finetuned_flipkart_product_reviews_kaggle
   8bfe0ce..71f072c  main -> main

   8bfe0ce..71f072c  main -> main

To https://huggingface.co/prajwalkhairnar/distilbert_finetuned_flipkart_product_reviews_kaggle
   71f072c..acb5c1b  main -> main

   71f072c..acb5c1b  main -> main



'https://huggingface.co/prajwalkhairnar/distilbert_finetuned_flipkart_product_reviews_kaggle/commit/71f072cad1685c9e76390eb5b52ad20da8cc0fe9'

In [None]:
# Load the published model through a pipeline for inference
from transformers import pipeline

hub_model_id = "prajwalkhairnar/distilbert_finetuned_flipkart_product_reviews_kaggle"
sentiment_model = pipeline(model=hub_model_id)


Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
# Score three short example texts with the fine-tuned model
sample_texts = ["I love this move", "This movie is just so bad!", "okay"]
sentiment_model(sample_texts)

[{'label': 'LABEL_2', 'score': 0.999225378036499},
 {'label': 'LABEL_0', 'score': 0.9994316697120667},
 {'label': 'LABEL_1', 'score': 0.9128064513206482}]