## Load the Processed Dataset

In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

# Directory (relative path) that holds the processed CSV files
DATA_DIR = "../data/"
df = pd.read_csv(os.path.join(DATA_DIR, "train_processed.csv"))

print("Processed dataset loaded!")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
df.info()


Processed dataset loaded!
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128585 entries, 0 to 128584
Data columns (total 13 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   record_ID        128585 non-null  float64
 1   store_id         128585 non-null  float64
 2   sku_id           128585 non-null  float64
 3   total_price      128585 non-null  float64
 4   base_price       128585 non-null  float64
 5   is_featured_sku  128585 non-null  float64
 6   is_display_sku   128585 non-null  float64
 7   units_sold       128585 non-null  float64
 8   year             128585 non-null  float64
 9   month            128585 non-null  float64
 10  week_num         128585 non-null  float64
 11  quarter          128585 non-null  float64
 12  day_of_week      128585 non-null  float64
dtypes: float64(13)
memory usage: 12.8 MB
None


In [2]:
## Split Data into Train & Test Sets

In [3]:
# Column the models are trained to predict
TARGET = "units_sold"  # Adjust if needed

# Pull out the target vector, then keep every remaining column as a feature
y = df[TARGET]
X = df.drop(columns=[TARGET])
# NOTE(review): X still includes `record_ID`, which looks like a row identifier
# rather than a predictive feature — confirm it should be fed to the models.

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [4]:
# Report the (rows, columns) shape of each feature matrix produced by the split
print("Data split into train & test sets!")
print("Train shape: {}, Test shape: {}".format(X_train.shape, X_test.shape))


Data split into train & test sets!
Train shape: (102868, 12), Test shape: (25717, 12)


## Train Multiple Models

In [5]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Candidate regressors to compare, keyed by display name.
# All three are built with the same number of estimators and the same seed.
shared_params = {"n_estimators": 100, "random_state": 42}
models = {
    "RandomForest": RandomForestRegressor(**shared_params),
    "XGBoost": XGBRegressor(**shared_params),
    "GradientBoosting": GradientBoostingRegressor(**shared_params),
}

In [6]:
# Fit every candidate on the training split and score it on the held-out split.
# `results` maps model name -> {"MAE", "MSE", "R2"} computed on the test set.
results = {}
for name, model in models.items():
    print(f" Training {name}...")
    model.fit(X_train, y_train)

    # Predict on rows the model has not seen during fitting
    y_pred = model.predict(X_test)

    # Evaluate the three metrics in a fixed order: MAE, MSE, R²
    mae, mse, r2 = (
        metric(y_test, y_pred)
        for metric in (mean_absolute_error, mean_squared_error, r2_score)
    )

    results[name] = dict(MAE=mae, MSE=mse, R2=r2)
    print(f"✅ {name} - MAE: {mae:.4f}, MSE: {mse:.4f}, R²: {r2:.4f}")


 Training RandomForest...
✅ RandomForest - MAE: 0.3756, MSE: 0.2721, R²: 0.7259
 Training XGBoost...
✅ XGBoost - MAE: 0.3738, MSE: 0.2519, R²: 0.7462
 Training GradientBoosting...
✅ GradientBoosting - MAE: 0.5735, MSE: 0.5555, R²: 0.4404


In [7]:
# Tabulate the collected metrics: one row per model, one column per metric
import pandas as pd
metrics_by_model = pd.DataFrame(results)  # here the columns are the model names
results_df = metrics_by_model.transpose()  # flip so each model becomes a row
print("\n🔍 Model Performance Comparison:\n", results_df)



🔍 Model Performance Comparison:
                        MAE       MSE        R2
RandomForest      0.375562  0.272121  0.725851
XGBoost           0.373834  0.251892  0.746230
GradientBoosting  0.573512  0.555499  0.440361


In [8]:
## Save the Best Model

In [31]:
import joblib
import os

# Pick the model whose R² on the test split is highest
best_model_name = results_df["R2"].idxmax()
best_model = models[best_model_name]

# Save the model in .pkl format
MODEL_DIR = "../models/"
os.makedirs(MODEL_DIR, exist_ok=True)
# Derive the file name from the winning model instead of hard-coding
# "xgboost_model.pkl": the winner is chosen dynamically, so a fixed name would
# mislabel the artifact whenever a different model scores best.
# (For "XGBoost" this still resolves to "xgboost_model.pkl".)
model_path = os.path.join(MODEL_DIR, f"{best_model_name.lower()}_model.pkl")

joblib.dump(best_model, model_path)
print(f"✅ Best model '{best_model_name}' saved at {model_path}!")


✅ Best model 'XGBoost' saved at ../models/xgboost_model.pkl!
