In [1]:
import pandas as pd
import re

In [2]:
# Read GroceryDataset.csv (resolved relative to the working directory) into `costco`
costco = pd.read_csv(filepath_or_buffer="GroceryDataset.csv")

In [3]:
# Count the missing entries in each column of the raw frame
costco.isna().sum()

Sub Category              0
Price                     3
Discount                  0
Rating                 1075
Title                     0
Currency                  5
Feature                  18
Product Description      42
dtype: int64

In [4]:
# Build the working frame without the Currency and Rating columns.
# errors="ignore" means a column that is already absent does not raise.
df = costco.drop(["Currency", "Rating"], axis="columns", errors="ignore")
df.head(n=5)

Unnamed: 0,Sub Category,Price,Discount,Title,Feature,Product Description
0,Bakery & Desserts,$56.99,No Discount,"David’s Cookies Mile High Peanut Butter Cake, ...","""10"""" Peanut Butter Cake\nCertified Kosher OU-...",A cake the dessert epicure will die for!Our To...
1,Bakery & Desserts,$159.99,No Discount,"The Cake Bake Shop 8"" Round Carrot Cake (16-22...",Spiced Carrot Cake with Cream Cheese Frosting ...,"Due to the perishable nature of this item, ord..."
2,Bakery & Desserts,$44.99,No Discount,"St Michel Madeleine, Classic French Sponge Cak...",100 count\nIndividually wrapped\nMade in and I...,Moist and buttery sponge cakes with the tradit...
3,Bakery & Desserts,$39.99,No Discount,"David's Cookies Butter Pecan Meltaways 32 oz, ...",Butter Pecan Meltaways\n32 oz 2-Pack\nNo Prese...,These delectable butter pecan meltaways are th...
4,Bakery & Desserts,$59.99,No Discount,"David’s Cookies Premier Chocolate Cake, 7.2 lb...","""10"" Four Layer Chocolate Cake\nCertified Kosh...",A cake the dessert epicure will die for!To the...


In [5]:
# Per-column count of missing values remaining in df
df.isna().sum()

Sub Category            0
Price                   3
Discount                0
Title                   0
Feature                18
Product Description    42
dtype: int64

In [6]:
# Discard every row that has at least one missing value, then renumber
# the index from 0 so it is contiguous again.
df = (
    df.dropna(axis=0, how="any")
    .reset_index(drop=True)
)

In [7]:
# Parse the Price column (strings such as "$56.99") into floats.
#
# The first numeric token of each value is extracted. The pattern tries a
# thousands-separated number first ("1,299.99") so that prices of $1,000 or
# more are captured whole instead of being cut off at the first comma; it then
# falls back to a plain decimal or integer. Separators are removed before the
# float conversion. Values with no digits at all become NaN.
df["Price"] = (
    df["Price"]
    .astype(str)
    .str.extract(r"(\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+\.\d+|\d+)", expand=False)
    .str.replace(",", "", regex=False)
    .astype(float)
)

In [8]:
# Binary-encode "Discount": the exact label "No Discount" maps to 0,
# every other value maps to 1.
discount_labels = df["Discount"]
df["Discount"] = discount_labels.apply(
    lambda label: int(not (label == "No Discount"))
)

In [9]:
# Preview the first five rows of the cleaned frame
df.head(n=5)

Unnamed: 0,Sub Category,Price,Discount,Title,Feature,Product Description
0,Bakery & Desserts,56.99,0,"David’s Cookies Mile High Peanut Butter Cake, ...","""10"""" Peanut Butter Cake\nCertified Kosher OU-...",A cake the dessert epicure will die for!Our To...
1,Bakery & Desserts,159.99,0,"The Cake Bake Shop 8"" Round Carrot Cake (16-22...",Spiced Carrot Cake with Cream Cheese Frosting ...,"Due to the perishable nature of this item, ord..."
2,Bakery & Desserts,44.99,0,"St Michel Madeleine, Classic French Sponge Cak...",100 count\nIndividually wrapped\nMade in and I...,Moist and buttery sponge cakes with the tradit...
3,Bakery & Desserts,39.99,0,"David's Cookies Butter Pecan Meltaways 32 oz, ...",Butter Pecan Meltaways\n32 oz 2-Pack\nNo Prese...,These delectable butter pecan meltaways are th...
4,Bakery & Desserts,59.99,0,"David’s Cookies Premier Chocolate Cake, 7.2 lb...","""10"" Four Layer Chocolate Cake\nCertified Kosh...",A cake the dessert epicure will die for!To the...


In [None]:
# Write df to cleaned_costco_data.csv, leaving out the index column
df.to_csv(path_or_buf="cleaned_costco_data.csv", index=False)

In [10]:
# Second working frame built from the raw data: Rating is kept this time,
# only Currency is dropped (no error if the column is already absent).
df_reviews = costco.drop(["Currency"], axis="columns", errors="ignore")
df_reviews.head(n=5)

Unnamed: 0,Sub Category,Price,Discount,Rating,Title,Feature,Product Description
0,Bakery & Desserts,$56.99,No Discount,Rated 4.3 out of 5 stars based on 265 reviews.,"David’s Cookies Mile High Peanut Butter Cake, ...","""10"""" Peanut Butter Cake\nCertified Kosher OU-...",A cake the dessert epicure will die for!Our To...
1,Bakery & Desserts,$159.99,No Discount,Rated 5 out of 5 stars based on 1 reviews.,"The Cake Bake Shop 8"" Round Carrot Cake (16-22...",Spiced Carrot Cake with Cream Cheese Frosting ...,"Due to the perishable nature of this item, ord..."
2,Bakery & Desserts,$44.99,No Discount,Rated 4.1 out of 5 stars based on 441 reviews.,"St Michel Madeleine, Classic French Sponge Cak...",100 count\nIndividually wrapped\nMade in and I...,Moist and buttery sponge cakes with the tradit...
3,Bakery & Desserts,$39.99,No Discount,Rated 4.7 out of 5 stars based on 9459 reviews.,"David's Cookies Butter Pecan Meltaways 32 oz, ...",Butter Pecan Meltaways\n32 oz 2-Pack\nNo Prese...,These delectable butter pecan meltaways are th...
4,Bakery & Desserts,$59.99,No Discount,Rated 4.5 out of 5 stars based on 758 reviews.,"David’s Cookies Premier Chocolate Cake, 7.2 lb...","""10"" Four Layer Chocolate Cake\nCertified Kosh...",A cake the dessert epicure will die for!To the...


In [11]:
# Per-column count of missing values in df_reviews
df_reviews.isna().sum()

Sub Category              0
Price                     3
Discount                  0
Rating                 1075
Title                     0
Feature                  18
Product Description      42
dtype: int64

In [12]:
# Discard every row that has at least one missing value, then renumber
# the index from 0 so it is contiguous again.
df_reviews = (
    df_reviews.dropna(axis=0, how="any")
    .reset_index(drop=True)
)

In [13]:
# Pull the numeric score out of strings like "Rated 4.3 out of 5 stars ..."
# and store it as a float. expand=False returns the single capture group
# directly as a Series.
rating_score = df_reviews["Rating"].str.extract(r'Rated ([\d\.]+) out of 5', expand=False)
df_reviews["Rating"] = rating_score.astype(float)

In [14]:
# Parse the Price column (strings such as "$56.99") into floats.
#
# The first numeric token of each value is extracted. The pattern tries a
# thousands-separated number first ("1,299.99") so that prices of $1,000 or
# more are captured whole instead of being cut off at the first comma; it then
# falls back to a plain decimal or integer. Separators are removed before the
# float conversion. Values with no digits at all become NaN.
df_reviews["Price"] = (
    df_reviews["Price"]
    .astype(str)
    .str.extract(r"(\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+\.\d+|\d+)", expand=False)
    .str.replace(",", "", regex=False)
    .astype(float)
)

In [15]:
# Binary-encode "Discount": the exact label "No Discount" maps to 0,
# every other value maps to 1.
discount_labels = df_reviews["Discount"]
df_reviews["Discount"] = discount_labels.apply(
    lambda label: int(not (label == "No Discount"))
)

In [16]:
# Preview the first five rows of the cleaned reviews frame
df_reviews.head(n=5)

Unnamed: 0,Sub Category,Price,Discount,Rating,Title,Feature,Product Description
0,Bakery & Desserts,56.99,0,4.3,"David’s Cookies Mile High Peanut Butter Cake, ...","""10"""" Peanut Butter Cake\nCertified Kosher OU-...",A cake the dessert epicure will die for!Our To...
1,Bakery & Desserts,159.99,0,5.0,"The Cake Bake Shop 8"" Round Carrot Cake (16-22...",Spiced Carrot Cake with Cream Cheese Frosting ...,"Due to the perishable nature of this item, ord..."
2,Bakery & Desserts,44.99,0,4.1,"St Michel Madeleine, Classic French Sponge Cak...",100 count\nIndividually wrapped\nMade in and I...,Moist and buttery sponge cakes with the tradit...
3,Bakery & Desserts,39.99,0,4.7,"David's Cookies Butter Pecan Meltaways 32 oz, ...",Butter Pecan Meltaways\n32 oz 2-Pack\nNo Prese...,These delectable butter pecan meltaways are th...
4,Bakery & Desserts,59.99,0,4.5,"David’s Cookies Premier Chocolate Cake, 7.2 lb...","""10"" Four Layer Chocolate Cake\nCertified Kosh...",A cake the dessert epicure will die for!To the...
