In [3]:
import kagglehub
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

# Step 1: Download the dataset using kagglehub
path = kagglehub.dataset_download("felixzhao/productdemandforecasting")
print("Path to dataset files:", path)

# Load the dataset file (adjust filename if necessary)
file_path = f"{path}/Historical Product Demand.csv"
data = pd.read_csv(file_path, nrows=50000)  # Load only a subset

# Step 2: Handle Missing Values
data.dropna(inplace=True)

# Step 3: Feature Selection and Preprocessing
# Drop unnecessary columns
X = data.drop(columns=["Date", "Order_Demand"])

# Clean the target column (Order_Demand)
y = (
    data["Order_Demand"]
    .astype(str)  # Convert to string
    .str.replace('(', '', regex=False)  # Remove '('
    .str.replace(')', '', regex=False)  # Remove ')'
    .replace('', np.nan)  # Replace empty strings with NaN
    .astype(float)  # Convert to float
)

# Drop rows with NaN in the target column and align features
y.dropna(inplace=True)
X = X.loc[y.index]

# Identify categorical and numerical features
categorical_cols = X.select_dtypes(include=["object"]).columns
numerical_cols = X.select_dtypes(include=["number"]).columns

# Efficient One-Hot Encoding
if not categorical_cols.empty:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    encoded_features = ohe.fit_transform(X[categorical_cols])
else:
    encoded_features = np.empty((len(X), 0))

# Scale numerical features
if not numerical_cols.empty:
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(X[numerical_cols])
else:
    scaled_features = np.empty((len(X), 0))

# Combine scaled numerical features and encoded categorical features
X_preprocessed = np.hstack([scaled_features, encoded_features])

# Step 4: Split the Data
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X_preprocessed, y, test_size=0.1, random_state=0
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.1111, random_state=0
)

# Step 5: Train Random Forest Regressor
rf_model = RandomForestRegressor(random_state=0, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Evaluate on validation set
y_val_pred = rf_model.predict(X_val)
rf_val_score = r2_score(y_val, y_val_pred)
print(f"Random Forest R-squared score on validation set: {rf_val_score}")

# Evaluate on test set
y_test_pred = rf_model.predict(X_test)
rf_test_score = r2_score(y_test, y_test_pred)
print(f"Random Forest R-squared score on test set: {rf_test_score}")

# Step 6: Optional - SVR for Smaller Data
small_X_train, _, small_y_train, _ = train_test_split(
    X_train, y_train, test_size=0.8, random_state=0
)  # Use a smaller subset for SVR

svr_model = SVR(kernel='linear', C=1.0, epsilon=0.1)  # Linear kernel for faster computation
svr_model.fit(small_X_train, small_y_train)

# Predict and evaluate on the test set
y_test_pred_svr = svr_model.predict(X_test)
svr_test_score = r2_score(y_test, y_test_pred_svr)
print(f"SVR (Linear Kernel) R-squared score on test set: {svr_test_score}")


Path to dataset files: C:\Users\justino\.cache\kagglehub\datasets\felixzhao\productdemandforecasting\versions\1
Random Forest R-squared score on validation set: 0.2261002205511743
Random Forest R-squared score on test set: 0.25442006097430336
SVR (Linear Kernel) R-squared score on test set: -0.03849910610601848
