In [15]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [16]:
# Dynamic path settings
# A notebook kernel does not define __file__, so the project layout is anchored
# on the current working directory: BASE_DIR is the parent of the folder the
# kernel runs in (assumed to be the notebook's own folder -- TODO confirm).
BASE_DIR = os.path.dirname(os.getcwd())
DATA_DIR = os.path.join(BASE_DIR, "dataset")  # input CSV files are read from here
TRAIN_TEST_SPLIT_DIR = os.path.join(BASE_DIR, "train_test_split")  # split CSVs are written here

In [17]:
# Create the output directory for the train/test split files if it is missing.
# exist_ok=True makes this a no-op when the directory is already there, so the
# cell can be re-run safely. Only the output directory is created here.
os.makedirs(TRAIN_TEST_SPLIT_DIR, exist_ok=True)

In [18]:
# Locations of the three input CSV files, all resolved inside DATA_DIR.
product_csv_path, customer_csv_path, sales_csv_path = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ("product_details.csv", "customer_details.csv", "sales_data.csv")
)

In [19]:
# Load each input CSV into its own DataFrame using pandas' default parsing options.
products_df, customers_df, sales_df = (
    pd.read_csv(csv_path)
    for csv_path in (product_csv_path, customer_csv_path, sales_csv_path)
)

In [20]:
# Print the number of null entries per column for every loaded DataFrame.
for heading, frame in (
    ("Products DataFrame missing values:\n", products_df),
    ("Customers DataFrame missing values:\n", customers_df),
    ("Sales DataFrame missing values:\n", sales_df),
):
    print(heading, frame.isnull().sum(), "\n")

Products DataFrame missing values:
 Uniqe Id                     0
Product Name                 0
Brand Name               10002
Asin                     10002
Category                   830
Upc Ean Code              9968
List Price               10002
Selling Price              107
Quantity                 10002
Model Number              1772
About Product              273
Product Specification     1632
Technical Details          790
Shipping Weight           1138
Product Dimensions        9523
Image                        0
Variants                  7524
Sku                      10002
Product Url                  0
Stock                    10002
Product Details          10002
Dimensions               10002
Color                    10002
Ingredients              10002
Direction To Use         10002
Is Amazon Seller             0
Size Quantity Variant    10002
Product Description      10002
dtype: int64 

Customers DataFrame missing values:
 Customer ID               0
Age             

In [21]:
# Discard sales rows that lack any of the fields the later cells rely on.
# The frame is modified in place, so sales_df keeps referring to the same object.
required_sales_columns = ['user id', 'product id', 'Interaction type', 'Time stamp']
sales_df.dropna(subset=required_sales_columns, inplace=True)

In [22]:
# Preview the first rows of every DataFrame as a quick sanity check.
for heading, frame in (
    ("Products DataFrame:\n", products_df),
    ("Customers DataFrame:\n", customers_df),
    ("Sales DataFrame:\n", sales_df),
):
    print(heading, frame.head(), "\n")

Products DataFrame:
                            Uniqe Id   
0  4c69b61db1fc16e7013b43fc926e502d  \
1  66d49bbed043f5be260fa9f7fbff5957   
2  2c55cae269aebf53838484b0d7dd931a   
3  18018b6bc416dab347b1b7db79994afa   
4  e04b990e95bf73bbe6a3fa09785d7cd0   

                                        Product Name  Brand Name  Asin   
0  DB Longboards CoreFlex Crossbow 41" Bamboo Fib...         NaN   NaN  \
1  Electronic Snap Circuits Mini Kits Classpack, ...         NaN   NaN   
2  3Doodler Create Flexy 3D Printing Filament Ref...         NaN   NaN   
3  Guillow Airplane Design Studio with Travel Cas...         NaN   NaN   
4                   Woodstock- Collage 500 pc Puzzle         NaN   NaN   

                                            Category Upc Ean Code  List Price   
0  Sports & Outdoors | Outdoor Recreation | Skate...          NaN         NaN  \
1  Toys & Games | Learning & Education | Science ...          NaN         NaN   
2          Toys & Games | Arts & Crafts | Craft Kits    

In [23]:
# Converting timestamp to datetime format
# dayfirst=True tells pandas to read ambiguous dates as day/month rather than
# month/day. Any parsing failure is re-raised as a ValueError that names the
# offending column; "from e" keeps the original pandas error attached as the
# explicit cause so the full traceback stays available.
try:
    sales_df['Time stamp'] = pd.to_datetime(sales_df['Time stamp'], dayfirst=True)
except Exception as e:
    raise ValueError(f"Error converting 'Time stamp' to datetime: {e}") from e

In [24]:
# Turn the raw user and product identifiers into integer codes.
# Each identifier column gets its own fitted LabelEncoder, and the codes are
# stored in new columns ('user_id', 'product_id') next to the raw ones.
user_encoder, product_encoder = LabelEncoder(), LabelEncoder()
for id_encoder, raw_column, encoded_column in (
    (user_encoder, 'user id', 'user_id'),
    (product_encoder, 'product id', 'product_id'),
):
    sales_df[encoded_column] = id_encoder.fit_transform(sales_df[raw_column])

In [25]:
# Map interaction types to numeric values
interaction_mapping = {'view': 1, 'like': 2, 'purchase': 3}
encoded_interactions = sales_df['Interaction type'].map(interaction_mapping)

# Series.map yields NaN for every label that is not a key of the mapping.
# Rows with a missing 'Interaction type' were dropped earlier, so a NaN here
# means an unexpected label (e.g. different casing or spelling). Fail loudly
# instead of silently writing NaN labels into the train/test files.
unmapped_rows = encoded_interactions.isna()
if unmapped_rows.any():
    unknown_labels = sorted(
        sales_df.loc[unmapped_rows, 'Interaction type'].astype(str).unique()
    )
    raise ValueError(
        f"Unmapped 'Interaction type' values: {unknown_labels}; "
        f"expected one of {sorted(interaction_mapping)}"
    )

sales_df['interaction_type'] = encoded_interactions

In [26]:
# Randomly partition the sales rows into a training part and a held-out test part.
# The fixed seed makes the partition reproducible between runs.
TEST_FRACTION = 0.2
SPLIT_SEED = 42
train_df, test_df = train_test_split(
    sales_df,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [27]:
# Write both splits to CSV files inside the split directory (row index omitted).
train_sales_path = os.path.join(TRAIN_TEST_SPLIT_DIR, "train_sales_data.csv")
test_sales_path = os.path.join(TRAIN_TEST_SPLIT_DIR, "test_sales_data.csv")

for split_frame, destination in (
    (train_df, train_sales_path),
    (test_df, test_sales_path),
):
    split_frame.to_csv(destination, index=False)

In [28]:
# Confirm where each split was written.
for saved_message in (
    f"Training sales data saved to {train_sales_path}",
    f"Test sales data saved to {test_sales_path}",
):
    print(saved_message)

Training sales data saved to C:\Users\kaank\Desktop\Test\KG-Enhanced-Recommender\train_test_split\train_sales_data.csv
Test sales data saved to C:\Users\kaank\Desktop\Test\KG-Enhanced-Recommender\train_test_split\test_sales_data.csv


In [29]:
# Show the first rows of each split to verify the added encoded columns.
for heading, split_frame in (
    ("Train DataFrame:\n", train_df),
    ("Test DataFrame:\n", test_df),
):
    print(heading, split_frame.head(), "\n")

Train DataFrame:
       user id                        product id Interaction type   
1485   1486.0  bc6faefaf23665e596730c45eba88676             view  \
995     996.0  b154535de7fb99aaf266e7d4734d34a7         purchase   
608     609.0  6eb080a2fb050a168259b81c4085cbfb             view   
2664   2665.0  ac38f5e3be8aa9d157b9ff91029c5eee             like   
615     616.0  9ba26b43016cba318f40cccb1a816d47             like   

              Time stamp  Unnamed: 4  user_id  product_id  interaction_type  
1485 2023-01-20 10:00:00         NaN     1423        2140                 1  
995  2023-10-22 08:00:00         NaN      958        2013                 3  
608  2023-10-31 08:00:00         NaN      605        1218                 1  
2664 2023-04-27 10:00:00         NaN     2561        1956                 2  
615  2023-11-07 08:00:00         NaN      612        1760                 2   

Test DataFrame:
       user id                        product id Interaction type   
732     733.0  583