In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Data Upload


In [4]:
# Read the raw reviews CSV into a DataFrame and preview the first rows.
raw_reviews_path = 'data/Kaggle/Reviews.csv'
df = pd.read_csv(raw_reviews_path)
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


## Inspecting the Dataset
### We'll begin by checking the structure of the dataset, including:
- Number of rows and columns
- Column names and types
- Sample records
- Missing values
- Duplicate records

In [5]:
# Report the dataset dimensions (row count with thousands separators)
shape_summary = f"Dataset contains {df.shape[0]:,} rows and {df.shape[1]} columns"
print(shape_summary)

# Per-column names, non-null counts and dtypes
df.info()

Dataset contains 568,454 rows and 10 columns
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      568454 non-null  int64 
 1   ProductId               568454 non-null  object
 2   UserId                  568454 non-null  object
 3   ProfileName             568428 non-null  object
 4   HelpfulnessNumerator    568454 non-null  int64 
 5   HelpfulnessDenominator  568454 non-null  int64 
 6   Score                   568454 non-null  int64 
 7   Time                    568454 non-null  int64 
 8   Summary                 568427 non-null  object
 9   Text                    568454 non-null  object
dtypes: int64(5), object(5)
memory usage: 43.4+ MB


In [6]:
# Count the missing entries in each column
# (isna is the documented alias of isnull)
df.isna().sum()

Id                         0
ProductId                  0
UserId                     0
ProfileName               26
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
dtype: int64

In [7]:
# Flag rows that exactly repeat an earlier row, then count the flags
duplicate_mask = df.duplicated()
duplicate_count = duplicate_mask.sum()
print(f"Number of duplicate rows: {duplicate_count}")

Number of duplicate rows: 0


### Handling missing values and dropping unnecessary columns

In [8]:
# Remove rows lacking a 'ProfileName' or 'Summary', then discard the columns
# that are not needed downstream, in a single chained step.
# NOTE(review): rows missing only 'ProfileName' are removed here even though
# that column is dropped immediately afterwards — confirm this is intended.
required_columns = ['ProfileName', 'Summary']
unneeded_columns = ['Id', 'UserId', 'ProfileName']
df = df.dropna(subset=required_columns).drop(columns=unneeded_columns)

# Report the resulting dimensions and preview the first rows
print(f"Shape after cleanup: {df.shape}")
df.head()

Shape after cleanup: (568401, 7)


Unnamed: 0,ProductId,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,B001E4KFG0,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,B00813GRG4,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,B000LQOCH0,1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,B000UA0QIQ,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,B006K2ZZ7K,0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


### Text cleaning 

In [9]:
import re
import string

def clean_text(text):
    """Normalize a raw review string into lowercase, letters-only text.

    Steps, in order: lowercase; replace HTML tags with a space; remove URLs;
    drop every character that is not an ASCII letter or whitespace; collapse
    whitespace runs into single spaces and trim both ends.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (for example the float NaN
        pandas uses for a missing cell) is treated as empty.

    Returns
    -------
    str
        The cleaned text. May be an empty string when nothing but
        markup, digits or punctuation was present.
    """
    # Guard against missing / non-text cells instead of raising AttributeError
    if not isinstance(text, str):
        return ''

    text = text.lower()  # Lowercase
    # Replace tags with a space rather than nothing: "good<br />stuff" must
    # become "good stuff", not "goodstuff". Extra spaces are collapsed below.
    text = re.sub(r'<.*?>', ' ', text)  # Remove HTML tags
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove punctuation/numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

# Run clean_text over every review body and keep the result in a new column
df['cleaned_text'] = df['Text'].map(clean_text)


In [10]:
# Spot-check ten raw/cleaned pairs. A fixed random_state makes the same rows
# appear on every run, so the displayed sample is reproducible after
# Restart & Run All.
df[['Text', 'cleaned_text']].sample(10, random_state=42)

Unnamed: 0,Text,cleaned_text
535269,I was running short on time but had to make a ...,i was running short on time but had to make a ...
190689,I am really happy with this AeroGarden. Plants...,i am really happy with this aerogarden plants ...
330847,"If you like Earl Grey tea, you'll find no bett...",if you like earl grey tea youll find no better...
326377,This makes the best pancakes in the world - be...,this makes the best pancakes in the world bett...
371449,This sauce is hot enough that there is almost ...,this sauce is hot enough that there is almost ...
169990,We enjoy our Keurig coffee maker and this is o...,we enjoy our keurig coffee maker and this is o...
225661,Buying this product in the bigger box and all ...,buying this product in the bigger box and all ...
466965,these are sooo good!! i couln't stop eating th...,these are sooo good i coulnt stop eating them ...
270285,This is the yummiest lebkuchen ever! too bad i...,this is the yummiest lebkuchen ever too bad it...
201113,I orderd Van Houtte Chocolate Raspberry Truffl...,i orderd van houtte chocolate raspberry truffl...


In [12]:
# Preview the first five rows of the frame, which now carries 'cleaned_text'
df.head(n=5)


Unnamed: 0,ProductId,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,cleaned_text
0,B001E4KFG0,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,i have bought several of the vitality canned d...
1,B00813GRG4,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled as jumbo salted peanut...
2,B000LQOCH0,1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,this is a confection that has been around a fe...
3,B000UA0QIQ,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,if you are looking for the secret ingredient i...
4,B006K2ZZ7K,0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,great taffy at a great price there was a wide ...


## Save the cleaned dataset

In [13]:
# Write the cleaned frame to CSV in the working directory, without the index.
# NOTE(review): clean_text can produce empty strings; presumably these will be
# read back as NaN by a default pd.read_csv — verify in downstream loaders.
cleaned_output_path = 'cleaned_reviews.csv'
df.to_csv(cleaned_output_path, index=False)

## EDA and Target Distribution Analysis