In [13]:
import os
import warnings

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [14]:
# Dynamic path settings
# A notebook kernel does not define __file__, so the project root is anchored
# on the kernel's working directory: BASE_DIR is the parent of the current
# working directory. (The previous abspath("__file__") form used a quoted
# string literal and therefore resolved to exactly the same location, just
# less obviously.)
# NOTE(review): this assumes the kernel is started from the notebook's own
# folder, one level below the project root -- confirm.
BASE_DIR = os.path.dirname(os.getcwd())
DATA_DIR = os.path.join(BASE_DIR, "dataset")
TRAIN_TEST_SPLIT_DIR = os.path.join(BASE_DIR, "train_test_split")

In [15]:
# Make sure the output directory exists.
# exist_ok=True makes the call a no-op when the directory is already there,
# which avoids the check-then-create race of a separate os.path.exists() test
# and keeps the cell safe to re-run.
os.makedirs(TRAIN_TEST_SPLIT_DIR, exist_ok=True)

In [16]:
# Locations of the three input CSV files inside DATA_DIR
product_csv_path, customer_csv_path, sales_csv_path = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in (
        "product_details.csv",
        "customer_details.csv",
        "sales_data.csv",
    )
)

In [17]:
# Load each input CSV into its own DataFrame (products, customers, sales)
products_df, customers_df, sales_df = (
    pd.read_csv(csv_path)
    for csv_path in (product_csv_path, customer_csv_path, sales_csv_path)
)

In [18]:
# Remove sales rows that are missing a value in any of these columns.
# The drop happens in place, so sales_df itself is modified.
required_sales_columns = ['user id', 'product id', 'Interaction type', 'Time stamp']
sales_df.dropna(subset=required_sales_columns, inplace=True)

In [19]:
# Print the first rows of every loaded frame as a quick sanity check
frame_previews = (
    ("Products DataFrame:\n", products_df),
    ("Customers DataFrame:\n", customers_df),
    ("Sales DataFrame:\n", sales_df),
)
for heading, frame in frame_previews:
    print(heading, frame.head(), "\n")

Products DataFrame:
                            Uniqe Id   
0  4c69b61db1fc16e7013b43fc926e502d  \
1  66d49bbed043f5be260fa9f7fbff5957   
2  2c55cae269aebf53838484b0d7dd931a   
3  18018b6bc416dab347b1b7db79994afa   
4  e04b990e95bf73bbe6a3fa09785d7cd0   

                                        Product Name  Brand Name  Asin   
0  DB Longboards CoreFlex Crossbow 41" Bamboo Fib...         NaN   NaN  \
1  Electronic Snap Circuits Mini Kits Classpack, ...         NaN   NaN   
2  3Doodler Create Flexy 3D Printing Filament Ref...         NaN   NaN   
3  Guillow Airplane Design Studio with Travel Cas...         NaN   NaN   
4                   Woodstock- Collage 500 pc Puzzle         NaN   NaN   

                                            Category Upc Ean Code  List Price   
0  Sports & Outdoors | Outdoor Recreation | Skate...          NaN         NaN  \
1  Toys & Games | Learning & Education | Science ...          NaN         NaN   
2          Toys & Games | Arts & Crafts | Craft Kits    

In [20]:
# Converting timestamp to datetime format
# dayfirst=True tells pandas to read ambiguous dates day-first
# (e.g. "01/02/2023" is taken as 1 February 2023).
try:
    sales_df['Time stamp'] = pd.to_datetime(sales_df['Time stamp'], dayfirst=True)
except Exception as exc:
    # Re-raise as ValueError with explicit chaining so the traceback shows the
    # original pandas error as the direct cause of this one.
    raise ValueError(f"Error converting 'Time stamp' to datetime: {exc}") from exc

In [21]:
# Turn the raw user and product identifiers into integer labels.
# One encoder per column; both fitted encoders stay available afterwards as
# user_encoder and product_encoder.
user_encoder = LabelEncoder()
sales_df['user_id'] = user_encoder.fit_transform(sales_df['user id'])

product_encoder = LabelEncoder()
sales_df['product_id'] = product_encoder.fit_transform(sales_df['product id'])

In [22]:
# Map interaction types to numeric values
interaction_mapping = {'view': 1, 'like': 2, 'purchase': 3}
sales_df['interaction_type'] = sales_df['Interaction type'].map(interaction_mapping)

# Series.map yields NaN for every label that is not a key of the mapping.
# Report such rows instead of letting them pass silently into later steps;
# the column values themselves are left exactly as the mapping produced them.
unmapped_rows = sales_df['interaction_type'].isna() & sales_df['Interaction type'].notna()
if unmapped_rows.any():
    unknown_labels = sorted(
        sales_df.loc[unmapped_rows, 'Interaction type'].astype(str).unique()
    )
    warnings.warn(
        f"{int(unmapped_rows.sum())} row(s) have an 'Interaction type' outside "
        f"{sorted(interaction_mapping)}: {unknown_labels}. "
        "'interaction_type' is NaN for these rows."
    )

In [23]:
# Split the sales rows into a training part and a held-out test part
TEST_FRACTION = 0.2  # share of rows placed in the test set
SPLIT_SEED = 42      # fixed seed so the split is reproducible across runs
train_df, test_df = train_test_split(
    sales_df,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [24]:
# Write both splits to CSV files inside TRAIN_TEST_SPLIT_DIR (index omitted)
train_sales_path = os.path.join(TRAIN_TEST_SPLIT_DIR, "train_sales_data.csv")
test_sales_path = os.path.join(TRAIN_TEST_SPLIT_DIR, "test_sales_data.csv")

for split_df, split_path in ((train_df, train_sales_path), (test_df, test_sales_path)):
    split_df.to_csv(split_path, index=False)

In [25]:
# Tell the user where each split was written
save_messages = (
    f"Training sales data saved to {train_sales_path}",
    f"Test sales data saved to {test_sales_path}",
)
for message in save_messages:
    print(message)

Training sales data saved to C:\Users\kaank\Desktop\Test\KG-Enhanced-Recommender\train_test_split\train_sales_data.csv
Test sales data saved to C:\Users\kaank\Desktop\Test\KG-Enhanced-Recommender\train_test_split\test_sales_data.csv
