# Amazon Recommendation System Based On Preview

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch

In [2]:
import kagglehub
import os
import shutil
# Download the latest version of the Kaggle dataset.
# `path` is whatever location kagglehub returns for the downloaded files
# (presumably its local cache directory — confirm against the kagglehub docs).
path = kagglehub.dataset_download("karkavelrajaj/amazon-sales-dataset")
# Project-local folder that the following cells read the CSV from.
data_dir = "./datasets"

# Create the target folder if needed, then copy the downloaded files into it.
# dirs_exist_ok=True lets this cell be re-run when ./datasets already exists.
os.makedirs(data_dir, exist_ok=True)
shutil.copytree(path, data_dir, dirs_exist_ok=True)

# Report the absolute location of the copied dataset.
print("Dataset đã được lưu tại:", os.path.abspath(data_dir))

  from .autonotebook import tqdm as notebook_tqdm


Dataset đã được lưu tại: /home/dikhangcshcmut/Code/code_for_fun/Project/RecommendationProduct/AmazonRecommendationBasedOnPreview/datasets


## Reading dataset

In [3]:
# Load the raw CSV that the download cell copied into `data_dir`.
dataset = pd.read_csv(os.path.join(data_dir, "amazon.csv"))

# Report the size and the schema separately: `shape` is (rows, columns),
# so printing it under a "Number of rows" label was misleading.
n_rows, n_cols = dataset.shape
print(f"Number of rows: {n_rows}")
print(f"Number of columns: {n_cols}")
print(f"Columns: {dataset.columns.tolist()}")

Number of rows: (1465, 16)
Number of column: ['product_id', 'product_name', 'category', 'discounted_price', 'actual_price', 'discount_percentage', 'rating', 'rating_count', 'about_product', 'user_id', 'user_name', 'review_id', 'review_title', 'review_content', 'img_link', 'product_link']


## Preprocessing dataset

In [4]:
# Count the missing values in each column
print(dataset.isna().sum())

product_id             0
product_name           0
category               0
discounted_price       0
actual_price           0
discount_percentage    0
rating                 0
rating_count           2
about_product          0
user_id                0
user_name              0
review_id              0
review_title           0
review_content         0
img_link               0
product_link           0
dtype: int64


### Clean the values in each column


In [6]:
import re
def clear_special_character(x):
    """Strip everything except digits and dots from a value.

    Args:
        x: Any scalar (string, number, or missing value).

    Returns:
        The string form of ``x`` with every character other than ``0-9`` and
        ``.`` removed, or ``np.nan`` when ``x`` is missing or nothing is left
        after stripping.
    """
    if pd.isna(x):
        return np.nan

    # Currency symbols, thousands separators, percent signs, etc. are dropped.
    digits_only = re.sub(r"[^0-9.]", "", str(x))
    if digits_only == "":
        return np.nan
    return digits_only


def clear_price(x):
    """Convert a price-like value to a float.

    The value is first passed through ``clear_special_character`` so that only
    digits and dots remain.

    Args:
        x: Any scalar (string, number, or missing value).

    Returns:
        The parsed float, or ``np.nan`` when the value is missing or the
        cleaned text is not a valid number (e.g. ``"1.2.3"``).
    """
    cleaned = clear_special_character(x)
    try:
        return float(cleaned)
    except (TypeError, ValueError):
        # Only parsing failures are turned into NaN; a bare `except` would also
        # swallow KeyboardInterrupt / SystemExit.
        return np.nan
    
def clear_percent(x):
    """Convert a percentage-like value (e.g. ``"64%"``) to a fraction.

    The value is first passed through ``clear_special_character`` so that only
    digits and dots remain, then divided by 100.

    Args:
        x: Any scalar (string, number, or missing value).

    Returns:
        The parsed number divided by 100 as a float, or ``np.nan`` when the
        value is missing or the cleaned text is not a valid number.
    """
    cleaned = clear_special_character(x)
    try:
        return float(cleaned) / 100.0
    except (TypeError, ValueError):
        # Only parsing failures are turned into NaN; a bare `except` would also
        # swallow KeyboardInterrupt / SystemExit.
        return np.nan

In [7]:
# Peek at the first ten discount_percentage values
print(dataset["discount_percentage"].head(10))

0    64%
1    43%
2    90%
3    53%
4    61%
5    85%
6    65%
7    23%
8    50%
9    33%
Name: discount_percentage, dtype: object


In [8]:
# Apply the cleaners to the price / percentage columns.
# A column is only cleaned while it is still non-numeric, so re-running this
# cell cannot divide an already-converted discount_percentage by 100 again.
column_cleaners = {
    "discounted_price": clear_price,
    "actual_price": clear_price,
    "discount_percentage": clear_percent,
}
for col, cleaner in column_cleaners.items():
    if not pd.api.types.is_numeric_dtype(dataset[col]):
        dataset[col] = dataset[col].apply(cleaner)

print(dataset[["discounted_price", "actual_price", "discount_percentage"]].head(10))

   discounted_price  actual_price  discount_percentage
0            399.00        1099.0                 0.64
1            199.00         349.0                 0.43
2            199.00        1899.0                 0.90
3            329.00         699.0                 0.53
4            154.00         399.0                 0.61
5            149.00        1000.0                 0.85
6            176.63         499.0                 0.65
7            229.00         299.0                 0.23
8            499.00         999.0                 0.50
9            199.00         299.0                 0.33


In [10]:
def fill_missing_auto(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with missing numeric values mean-imputed.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A new frame in which every numeric column has its missing values
        replaced by that column's mean (computed ignoring missing values).
        In non-numeric columns, missing entries are set to ``np.nan``; all
        other values are left as they are.
    """
    filled = df.copy()

    for name in filled.columns:
        column = filled[name]
        if not pd.api.types.is_numeric_dtype(column):
            # Non-numeric: keep present values, write NaN where missing.
            filled[name] = column.where(column.notna(), np.nan)
            continue
        # Numeric: impute with the column mean.
        filled[name] = column.fillna(column.mean(skipna=True))

    return filled

numeric_cols = ["discounted_price", "actual_price", "discount_percentage", "rating", "rating_count"]

for col in numeric_cols:
    if not pd.api.types.is_numeric_dtype(dataset[col]):
        # Strip non-numeric characters before parsing. Without this, any value
        # that carries a thousands separator (rating_count presumably looks
        # like "24,269" — verify against the raw CSV) is coerced to NaN by
        # pd.to_numeric and then silently replaced by the column mean below.
        dataset[col] = dataset[col].apply(clear_special_character)
    # Anything that still cannot be parsed becomes NaN.
    dataset[col] = pd.to_numeric(dataset[col], errors="coerce")

# Then impute the remaining missing values
dataset = fill_missing_auto(dataset)
print(dataset.isna().sum())

product_id             0
product_name           0
category               0
discounted_price       0
actual_price           0
discount_percentage    0
rating                 0
rating_count           0
about_product          0
user_id                0
user_name              0
review_id              0
review_title           0
review_content         0
img_link               0
product_link           0
dtype: int64


### Ensure that rating is normalized

In [11]:
# Show the rating values as they are before scaling.
print(dataset["rating"].head(10))

from sklearn.preprocessing import MinMaxScaler

# Min-max scale every numeric column of the dataset in place.
# NOTE(review): the scaler is fitted on the whole dataset, before the
# train/test split performed later in this notebook, so test rows influence
# the scaling statistics — consider fitting on the training split only.
numeric_cols = dataset.select_dtypes(include=[np.number]).columns
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(dataset[numeric_cols])
dataset[numeric_cols] = scaled_values


0    4.2
1    4.0
2    3.9
3    4.2
4    4.2
5    3.9
6    4.1
7    4.3
8    4.2
9    4.0
Name: rating, dtype: float64


## Create price_category

In [None]:
# Keep only the rows that have an actual_price to bin.
dataset = dataset.dropna(subset=["actual_price"]).copy()

# Bin actual_price into four quantile-based classes.
# NOTE(review): the target is derived from actual_price, while the feature
# list used later contains discounted_price and discount_percentage —
# presumably actual_price can be reconstructed from those two; confirm that
# this leakage is intended.
labels = ["Low", "Medium", "High", "Luxury"]
price_bins = pd.qcut(dataset["actual_price"], q=4, labels=labels, duplicates="drop")
dataset["price_category"] = price_bins

print(dataset["price_category"].value_counts())

price_category
Low       368
High      366
Luxury    366
Medium    365
Name: count, dtype: int64


In [13]:
# Model inputs; fail fast on the first one that is absent from the dataset.
features = [
    "discounted_price",
    "discount_percentage",
    "rating",
    "rating_count",
    "category",
]
for c in features:
    if c in dataset.columns:
        continue
    raise ValueError(f"Missing column: {c}")


## One-hot encode category

In [14]:
# Feature matrix: the selected columns with `category` expanded into
# one indicator column per category value (prefixed "cat_").
X = pd.get_dummies(
    dataset[features].copy(),
    columns=["category"],
    prefix="cat",
    dummy_na=False,
)

# Target: the quantile-based price class.
y = dataset["price_category"]

print("X shape:", X.shape)
print("Example features:", list(X.columns[:10]))

X shape: (1465, 215)
Example features: ['discounted_price', 'discount_percentage', 'rating', 'rating_count', 'cat_Car&Motorbike|CarAccessories|InteriorAccessories|AirPurifiers&Ionizers', 'cat_Computers&Accessories|Accessories&Peripherals|Adapters|USBtoUSBAdapters', 'cat_Computers&Accessories|Accessories&Peripherals|Audio&VideoAccessories|PCHeadsets', 'cat_Computers&Accessories|Accessories&Peripherals|Audio&VideoAccessories|PCMicrophones', 'cat_Computers&Accessories|Accessories&Peripherals|Audio&VideoAccessories|PCSpeakers', 'cat_Computers&Accessories|Accessories&Peripherals|Audio&VideoAccessories|Webcams&VoIPEquipment|Webcams']


## Split into training set and test set

In [15]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; stratify on the target so both splits
# keep the same class proportions, and fix the seed for reproducibility.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Train size: {X_train.shape} Test size: {X_test.shape}")

Train size: (1172, 215) Test size: (293, 215)
