In [1]:
# 📌 Cell 1: Import libraries and load cleaned dataset

import pandas as pd
import numpy as np

# Load the cleaned dataset from previous notebook
df_clean = pd.read_csv("data/processed/amazon_clean.csv")
df_clean.head()

Unnamed: 0,product_id,product_name,category,discounted_price,actual_price,discount_percentage,rating,rating_count,about_product,user_id,user_name,review_id,review_title,review_content,img_link,product_link,discount_amount,is_heavily_discounted
0,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,Computers&Accessories|Accessories&Peripherals|...,399.0,1099.0,64.0,4.2,24269.0,High Compatibility : Compatible With iPhone 12...,"AG3D6O4STAQKAY2UVGEUV46KN35Q,AHMY5CWJMMK5BJRBB...","Manav,Adarsh gupta,Sundeep,S.Sayeed Ahmed,jasp...","R3HXWT0LRP0NMF,R2AJM3LFTLZHFO,R6AQJGUP6P86,R1K...","Satisfied,Charging is really fast,Value for mo...",Looks durable Charging is fine tooNo complains...,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Wayona-Braided-WN3LG1-Sy...,700.0,True
1,B098NS6PVG,Ambrane Unbreakable 60W / 3A Fast Charging 1.5...,Computers&Accessories|Accessories&Peripherals|...,199.0,349.0,43.0,4.0,43994.0,"Compatible with all Type C enabled devices, be...","AECPFYFQVRUWC3KGNLJIOREFP5LQ,AGYYVPDD7YG7FYNBX...","ArdKn,Nirbhay kumar,Sagar Viswanathan,Asp,Plac...","RGIQEG07R9HS2,R1SMWZQ86XIN8U,R2J3Y1WL29GWDE,RY...","A Good Braided Cable for Your Type C Device,Go...",I ordered this cable to connect my phone to An...,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Ambrane-Unbreakable-Char...,150.0,False
2,B096MSW6CT,Sounce Fast Phone Charging Cable & Data Sync U...,Computers&Accessories|Accessories&Peripherals|...,199.0,1899.0,90.0,3.9,7928.0,【 Fast Charger& Data Sync】-With built-in safet...,"AGU3BBQ2V2DDAMOAKGFAWDDQ6QHA,AESFLDV2PT363T2AQ...","Kunal,Himanshu,viswanath,sai niharka,saqib mal...","R3J3EQQ9TZI5ZJ,R3E7WBGK7ID0KV,RWU79XKQ6I1QF,R2...","Good speed for earlier versions,Good Product,W...","Not quite durable and sturdy,https://m.media-a...",https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Sounce-iPhone-Charging-C...,1700.0,True
3,B08HDJ86NZ,boAt Deuce USB 300 2 in 1 Type-C & Micro USB S...,Computers&Accessories|Accessories&Peripherals|...,329.0,699.0,53.0,4.2,94363.0,The boAt Deuce USB 300 2 in 1 cable is compati...,"AEWAZDZZJLQUYVOVGBEUKSLXHQ5A,AG5HTSFRRE6NL3M5S...","Omkar dhale,JD,HEMALATHA,Ajwadh a.,amar singh ...","R3EEUZKKK9J36I,R3HJVYCLYOY554,REDECAZ7AMPQC,R1...","Good product,Good one,Nice,Really nice product...","Good product,long wire,Charges good,Nice,I bou...",https://m.media-amazon.com/images/I/41V5FtEWPk...,https://www.amazon.in/Deuce-300-Resistant-Tang...,370.0,True
4,B08CF3B7N1,Portronics Konnect L 1.2M Fast Charging 3A 8 P...,Computers&Accessories|Accessories&Peripherals|...,154.0,399.0,61.0,4.2,16905.0,[CHARGE & SYNC FUNCTION]- This cable comes wit...,"AE3Q6KSUK5P75D5HFYHCRAOLODSA,AFUGIFH5ZAFXRDSZH...","rahuls6099,Swasat Borah,Ajay Wadke,Pranali,RVK...","R1BP4L2HH9TFUP,R16PVJEXKV6QZS,R2UPDB81N66T4P,R...","As good as original,Decent,Good one for second...","Bought this instead of original apple, does th...",https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Portronics-Konnect-POR-1...,245.0,True


In [2]:
# Ensure numeric types
df_clean['actual_price'] = pd.to_numeric(df_clean['actual_price'], errors='coerce')
df_clean['discounted_price'] = pd.to_numeric(df_clean['discounted_price'], errors='coerce')
df_clean['rating_count'] = pd.to_numeric(df_clean['rating_count'], errors='coerce')
df_clean['rating'] = pd.to_numeric(df_clean['rating'], errors='coerce')

# 1. Discount amount
df_clean['discount_amount'] = df_clean['actual_price'] - df_clean['discounted_price']

# 2. Discount ratio
df_clean['discount_ratio'] = df_clean['discount_amount'] / df_clean['actual_price']

# 3. Rating score = rating × log(1 + rating_count)
df_clean['rating_score'] = df_clean['rating'] * np.log1p(df_clean['rating_count'])

# Show sample
df_clean[['actual_price', 'discounted_price', 'discount_amount', 'discount_ratio', 'rating_score']].head()

Unnamed: 0,actual_price,discounted_price,discount_amount,discount_ratio,rating_score
0,1099.0,399.0,700.0,0.636943,42.407384
1,349.0,199.0,150.0,0.429799,42.767325
2,1899.0,199.0,1700.0,0.895208,35.015301
3,699.0,329.0,370.0,0.529328,48.110643
4,399.0,154.0,245.0,0.614035,40.88878


In [3]:
# 📌 Cell 3: Create text-based review features

# 1. Length of review title
df_clean['review_title_len'] = df_clean['review_title'].astype(str).apply(len)

# 2. Length of review content
df_clean['review_content_len'] = df_clean['review_content'].astype(str).apply(len)

# 3. Count number of words in review content
df_clean['review_content_word_count'] = df_clean['review_content'].astype(str).apply(lambda x: len(x.split()))

# 4. Presence of review (binary flag)
df_clean['has_review'] = df_clean['review_content'].notnull().astype(int)

# Display new text features summary
df_clean[['review_title_len', 'review_content_len', 'review_content_word_count', 'has_review']].describe()

Unnamed: 0,review_title_len,review_content_len,review_content_word_count,has_review
count,1465.0,1465.0,1465.0,1465.0
mean,181.91058,1394.137884,240.772696,1.0
std,55.574307,1823.491796,316.605516,0.0
min,7.0,67.0,11.0,1.0
25%,140.0,505.0,81.0,1.0
50%,176.0,823.0,139.0,1.0
75%,217.0,1388.0,243.0,1.0
max,407.0,18547.0,2974.0,1.0


In [5]:
# 📌 Cell 4: Category and product-based features

# 1. Total number of products per category
category_counts = df_clean['category'].value_counts().to_dict()
df_clean['category_product_count'] = df_clean['category'].map(category_counts)

# 2. Average rating per product
df_clean['product_avg_rating'] = df_clean.groupby('product_id')['rating'].transform('mean')

# 3. Number of reviews per product
df_clean['product_review_count'] = df_clean.groupby('product_id')['review_id'].transform('count')

# View sample of new features
df_clean[['category', 'category_product_count', 'product_avg_rating', 'product_review_count']].head(20)

Unnamed: 0,category,category_product_count,product_avg_rating,product_review_count
0,Computers&Accessories|Accessories&Peripherals|...,233,4.2,3
1,Computers&Accessories|Accessories&Peripherals|...,233,4.0,3
2,Computers&Accessories|Accessories&Peripherals|...,233,3.9,3
3,Computers&Accessories|Accessories&Peripherals|...,233,4.2,3
4,Computers&Accessories|Accessories&Peripherals|...,233,4.2,3
5,Computers&Accessories|Accessories&Peripherals|...,233,3.9,3
6,Computers&Accessories|Accessories&Peripherals|...,233,4.1,3
7,Computers&Accessories|Accessories&Peripherals|...,233,4.3,3
8,Computers&Accessories|NetworkingDevices|Networ...,18,4.2,1
9,Computers&Accessories|Accessories&Peripherals|...,233,4.0,2


In [6]:
# 📌 Cell 5: Save feature-engineered dataset

import os

# Ensure processed directory exists
os.makedirs("data/processed", exist_ok=True)

# Save the enriched dataset to CSV
df_clean.to_csv("data/processed/amazon_feature_engineered.csv", index=False)

# Confirm the file and preview
print("Saved file:", "data/processed/amazon_feature_engineered.csv")
df_check = pd.read_csv("data/processed/amazon_feature_engineered.csv")
df_check.head()

Saved file: data/processed/amazon_feature_engineered.csv


Unnamed: 0,product_id,product_name,category,discounted_price,actual_price,discount_percentage,rating,rating_count,about_product,user_id,...,is_heavily_discounted,discount_ratio,rating_score,review_title_len,review_content_len,review_content_word_count,has_review,category_product_count,product_avg_rating,product_review_count
0,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,Computers&Accessories|Accessories&Peripherals|...,399.0,1099.0,64.0,4.2,24269.0,High Compatibility : Compatible With iPhone 12...,"AG3D6O4STAQKAY2UVGEUV46KN35Q,AHMY5CWJMMK5BJRBB...",...,True,0.636943,42.407384,124,483,60,1,233,4.2,3
1,B098NS6PVG,Ambrane Unbreakable 60W / 3A Fast Charging 1.5...,Computers&Accessories|Accessories&Peripherals|...,199.0,349.0,43.0,4.0,43994.0,"Compatible with all Type C enabled devices, be...","AECPFYFQVRUWC3KGNLJIOREFP5LQ,AGYYVPDD7YG7FYNBX...",...,False,0.429799,42.767325,181,1186,201,1,233,4.0,3
2,B096MSW6CT,Sounce Fast Phone Charging Cable & Data Sync U...,Computers&Accessories|Accessories&Peripherals|...,199.0,1899.0,90.0,3.9,7928.0,【 Fast Charger& Data Sync】-With built-in safet...,"AGU3BBQ2V2DDAMOAKGFAWDDQ6QHA,AESFLDV2PT363T2AQ...",...,True,0.895208,35.015301,137,271,13,1,233,3.9,3
3,B08HDJ86NZ,boAt Deuce USB 300 2 in 1 Type-C & Micro USB S...,Computers&Accessories|Accessories&Peripherals|...,329.0,699.0,53.0,4.2,94363.0,The boAt Deuce USB 300 2 in 1 cable is compati...,"AEWAZDZZJLQUYVOVGBEUKSLXHQ5A,AG5HTSFRRE6NL3M5S...",...,True,0.529328,48.110643,140,443,77,1,233,4.2,3
4,B08CF3B7N1,Portronics Konnect L 1.2M Fast Charging 3A 8 P...,Computers&Accessories|Accessories&Peripherals|...,154.0,399.0,61.0,4.2,16905.0,[CHARGE & SYNC FUNCTION]- This cable comes wit...,"AE3Q6KSUK5P75D5HFYHCRAOLODSA,AFUGIFH5ZAFXRDSZH...",...,True,0.614035,40.88878,138,2451,415,1,233,4.2,3
