# In [None]:
# ## Simple XGBoost Model with Evaluation Metrics

# ### Step 1: Import Required Libraries
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
import numpy as np
import os

# Set the working directory
# The processed-data directory defaults to the original author's local path.
# Set the AT2_DATA_DIR environment variable to run the script on another
# machine without editing the source.
DATA_DIR = os.environ.get(
    "AT2_DATA_DIR",
    "/Users/alexanderschou/Desktop/Projects/adv_mla_2024/AT2_experimentation/data/processed",
)
# The working directory is still changed so that relative paths used later in
# the script keep resolving against the data directory.
os.chdir(DATA_DIR)

# ### Step 2: Load the Data
sales_train_processed = pd.read_csv('sales_train_processed.csv')

# ### Step 3: Prepare the Feature Matrix (X) and Target (y)
# The target is the 'revenue' column; the feature frame is everything else.
y = sales_train_processed['revenue']
X = sales_train_processed.drop(columns='revenue')

# Columns that the preprocessing step treats as categorical
categorical_columns = [
    'item_id',
    'store_id',
]

# ### Step 4: Preprocessing Pipeline
# Categorical handling: missing values are replaced by the literal string
# 'None', then the columns are one-hot encoded. handle_unknown='ignore' lets
# the encoder accept categories at predict time that were absent during fit.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_preprocessor = Pipeline(categorical_steps)

# Route the categorical columns through the steps above.
# NOTE: ColumnTransformer's default remainder='drop' means columns of X that
# are not listed in categorical_columns do not reach the model.
preprocessor = ColumnTransformer(
    [('cat', categorical_preprocessor, categorical_columns)]
)

# ### Step 5: Define XGBoost Pipeline
# Preprocessing first, then a squared-error XGBoost regressor with a fixed seed.
regressor = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', regressor),
])

# ### Step 6: Train-Test Split
# Hold out 20% of the rows for validation; the fixed seed makes the split
# reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# ### Step 7: Model Training
# ### Step 8: Model Evaluation
# Pipeline.fit returns the fitted pipeline itself, so fitting on the training
# rows and predicting on the validation rows can be chained.
y_pred = pipeline.fit(X_train, y_train).predict(X_val)

# ### Step 9: Compute Evaluation Metrics
# All three scores compare the validation targets with the predictions.
mae = mean_absolute_error(y_val, y_pred)  # Mean Absolute Error
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)  # Root Mean Squared Error
r2 = r2_score(y_val, y_pred)  # R-squared

# Report the scores, each rounded to two decimals.
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R-squared (R2): {r2:.2f}")

# ### Step 10: Save the Model
# Refit on the entire dataset and save the trained model (currently disabled; uncomment the lines below to run)
#pipeline.fit(X, y)
#os.chdir("/Users/alexanderschou/Desktop/Projects/adv_mla_2024/AT2_experimentation/models/predictive")
#import joblib
#joblib.dump(pipeline, 'xgboost_model_new.joblib')


  # NOTE(review): this repeats the Step 2 load and is indented unlike the rest
  # of the script -- presumably a leftover notebook cell. Confirm whether it
  # should be removed or dedented; as written it will not parse as top-level code.
  sales_train_processed = pd.read_csv('sales_train_processed.csv')
