<a href="https://colab.research.google.com/github/katenovita/retail-sales-prediction/blob/main/rossmann_sales_prediction_CNK_vFinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Rossmann Store Sales - Catherine Novita Kusumaningrum

Data are taken from the Kaggle competition "Rossmann Store Sales" (Florian Knauer and Will Cukierski. Rossmann Store Sales. https://kaggle.com/competitions/rossmann-store-sales, 2015. Kaggle.)

## Load packages and data

In [None]:
# Colab-only step: opens a file picker so the competition archive
# (read later as "rossmann-store-sales.zip") can be uploaded to the runtime.
from google.colab import files
files.upload()

In [None]:
%pip install optuna optuna-integration[xgboost]

In [None]:
# Required packages
import os
import pandas as pd
import math
import numpy as np
from datetime import timedelta

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch
from matplotlib.colors import ListedColormap

# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Preprocessing and Feature Engineering
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder,
    OrdinalEncoder,
    LabelEncoder,
    MinMaxScaler,
    StandardScaler,
    FunctionTransformer,
)
from sklearn.impute import SimpleImputer

# Model Selection and Evaluation
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_predict,
    StratifiedKFold,
)
from sklearn.metrics import (
    mean_squared_error,
    make_scorer,
    log_loss,
    roc_curve,
    roc_auc_score
)

from optuna.integration import XGBoostPruningCallback
import optuna

# Machine Learning Models
from sklearn.linear_model import Lasso, LogisticRegression
import xgboost as xgb
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [None]:
import zipfile

zip_path = "rossmann-store-sales.zip"

# Read every CSV in the archive into a DataFrame and expose it as a
# notebook-level variable named after the file (e.g. "train.csv" -> `train`).
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    csv_members = (member for member in zip_ref.namelist() if member.endswith(".csv"))
    for member in csv_members:
        # Drop the extension and any folder prefix to obtain the variable name
        table_name = member.replace(".csv", "").split("/")[-1]
        globals()[table_name] = pd.read_csv(zip_ref.open(member), low_memory=False)


In [None]:
# Show the archive members. `zip_ref` was created (and already closed) by the loading cell.
# NOTE(review): namelist() is expected to work on a closed ZipFile because it reads the
# member list cached at open time - confirm.
print(zip_ref.namelist())

In [None]:
# Shape and schema of each raw table.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line after each report.
for label, frame in (("Train data:", train), ("Test data:", test), ("Store data:", store)):
    print(label, frame.shape)
    frame.info()

In [None]:
# Preview the first rows of each raw table
for heading, table in (
    ("Train data: \n", train),
    ("Test data: \n", test),
    ("Store data: \n", store),
):
    print(heading, table.head())

In [None]:
# Derive the same calendar features on both the train and the test frame
for frame in (train, test):
    # Parse Date first so the .dt accessor is available
    frame['Date'] = pd.to_datetime(frame['Date'])
    date_parts = frame['Date'].dt
    frame['Year'] = date_parts.year
    frame['Month'] = date_parts.month
    frame['Day'] = date_parts.day
    frame['WeekOfYear'] = date_parts.isocalendar().week

# A missing 'Open' flag in the test set is filled with 1 (treated as open)
test['Open'] = test['Open'].fillna(1)

In [None]:
# Build the modelling frame from the raw training data.
# Keep only observations where the store was open and sold something; the explicit
# .copy() makes `df` an independent frame so the column assignment below does not
# write into a slice of `train` (avoids SettingWithCopyWarning).
df = train[(train["Open"] != 0) & (train["Sales"] > 0)].copy()

# Model the target on the log scale (log1p pairs with np.expm1 used when converting back)
df['log_sales'] = np.log1p(df['Sales'])

# Attach the store-level attributes.
# how='left' keeps every sales/test row (an inner join would silently drop rows whose
# Store is missing from `store`, which would break the submission), and
# validate='many_to_one' fails loudly if `store` ever contains duplicate Store ids.
df = pd.merge(df, store, on='Store', how='left', validate='many_to_one')

# `test` is overwritten in place, so drop any store columns left over from a previous
# run of this cell first; this keeps the cell idempotent (no *_x / *_y duplicates).
store_only_cols = [col for col in store.columns if col != 'Store']
test = pd.merge(
    test.drop(columns=store_only_cols, errors='ignore'),
    store,
    on='Store',
    how='left',
    validate='many_to_one',
)

In [None]:
# Schema of the merged frames. info() prints its report itself and returns None,
# so it is called directly instead of being wrapped in print() (which adds "None").
df.info()
test.info()

In [None]:
# Partition the columns by dtype: object-typed columns go to 'cat',
# every other column goes to 'num' (column order is preserved in both lists).
is_object_col = [df[name].dtype == 'O' for name in df.columns]
cat = [name for name, is_obj in zip(df.columns, is_object_col) if is_obj]
num = [name for name, is_obj in zip(df.columns, is_object_col) if not is_obj]

In [None]:
# Size of the filtered, merged training frame, followed by summary statistics
# of the object-typed (categorical) columns collected in `cat`.
print("Train data:", df.shape)
df[cat].describe()

In [None]:
# Summary statistics of the non-object columns collected in `num`
df[num].describe()

In [None]:
# Names of the columns classified as numeric (non-object dtype)
num

In [None]:
# Names of the columns classified as categorical (object dtype)
cat

#### Visualizing data structures

In [None]:
# Quick look at the data structure (numerical attributes): one count plot per column.
# The grid height is derived from the number of columns so the cell keeps working
# when `num` grows beyond the original fixed 8x3 (= 24 panel) layout; with 24 or
# fewer columns the layout is unchanged.
n_grid_cols = 3
n_grid_rows = max(8, math.ceil(len(num) / n_grid_cols))

plt.figure(figsize=(25,20))
for position, column in enumerate(num, start=1):
    plt.subplot(n_grid_rows, n_grid_cols, position)
    sns.countplot(x=column, data=df, legend=False)
    sns.despine()
plt.tight_layout()

In [None]:
# Quick look at the data structure (categorical attributes): one horizontal count
# plot per column, categories ordered by frequency.
# The grid height is derived from the number of columns so the cell keeps working
# beyond the original fixed 4x5 (= 20 panel) layout; with 20 or fewer columns the
# layout is unchanged.
n_grid_cols = 5
n_grid_rows = max(4, math.ceil(len(cat) / n_grid_cols))

plt.figure(figsize=(30,15))
for position, column in enumerate(cat, start=1):
    plt.subplot(n_grid_rows, n_grid_cols, position)
    ax = sns.countplot(y=column, data=df, legend=False, order=df[column].value_counts().index)
    # Label the bars with their counts; skip when nothing was drawn (e.g. an all-missing column)
    if ax.containers:
        plt.bar_label(ax.containers[0])
    sns.despine()
plt.tight_layout()

#### Correlation Matrix

In [None]:
# Normalise StateHoliday and derive a binary holiday flag on both frames.
# The column may be parsed as a mix of the integer 0 and the strings '0'/'a'/'b'/'c';
# mapping the integer 0 to the string '0' makes the comparison below reliable
# (the previous {'0': '0'} mapping replaced a value with itself and changed nothing).
for frame in (df, test):
    frame['StateHoliday'] = frame['StateHoliday'].replace({0: '0'})
    frame['StateHoliday_flag'] = frame['StateHoliday'].ne('0').astype(int)  # 1 if holiday, 0 otherwise

# Columns included in the correlation matrix.
# 'Customers' and 'Open' are intentionally left out.
corr_columns = [
    'Sales',
    'log_sales',
    'Store',
    'DayOfWeek',
    'Date',
    'Promo',
    'SchoolHoliday',
    'CompetitionDistance',
    'CompetitionOpenSinceMonth',
    'CompetitionOpenSinceYear',
    'Promo2',
    'Promo2SinceWeek',
    'Promo2SinceYear',
    'StateHoliday_flag',
    'Year',
    'Month',
    'Day',
    'WeekOfYear',
]
correlation = df[corr_columns].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlation, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Matrix with Sales')
plt.show()

## Data preprocessing

In [None]:
# Set a seed for reproducibility of this notebook's output.
# This seeds NumPy's global random number generator only; the models further down
# receive their own seeds through their parameters ('random_state' / 'seed').
np.random.seed(42)

In [None]:
# Work on copies so the merged frames stay untouched
df_copy = df.copy()
test_copy = test.copy()

# Columns excluded from modelling:
#   Sales     - replaced by log_sales as the target
#   Open      - no variation after filtering to open days
#   Customers - not available in the test set, so it cannot be used for prediction
manual_cols = ['Sales','Open','Customers']
all_cols_to_drop = manual_cols

# Modelling frame in chronological order; the test frame only loses 'Open'
# (the model is not trained on it, so it must not be passed at prediction time)
d = df_copy.drop(columns=all_cols_to_drop).sort_values('Date')
test_set = test_copy.drop(columns=['Open'])

# Hold out the last 6 weeks (the same horizon as the test set) for validation
cutoff_date = d["Date"].max() - pd.Timedelta(weeks=6)
train_set = d.loc[d['Date'] <= cutoff_date]  # strictly for training
valid_set = d.loc[d['Date'] > cutoff_date]   # validates training results before predicting on the real test set

# Sample count and number of columns per set (Date is still included at this point)
print(f"Train shape: {train_set.shape}, Valid shape: {valid_set.shape}, Test shape: {test_set.shape}")

# Date has served its purpose for the split; remove it from every set
train_set, valid_set, test_set = (
    part.drop(columns=['Date']) for part in (train_set, valid_set, test_set)
)

In [None]:
# First rows of the training split (Date already removed)
train_set.head()

In [None]:
# Last date that belongs to the training split; later dates form the validation split
cutoff_date

In [None]:
# Targets on the log scale, copied out of the train / validation splits
y_train_log, y_valid_log = (part['log_sales'].copy() for part in (train_set, valid_set))

# The same targets converted back to actual sales
y_train_sales, y_valid_sales = np.expm1(y_train_log), np.expm1(y_valid_log)

# Feature frames: everything except the target column
X_train, X_valid = (part.drop(columns=['log_sales']) for part in (train_set, valid_set))

In [None]:
### Define functions to build preprocessing pipeline

## Passthrough Transformer to create placeholder step for features that don't need to be transformed
class PassthroughTransformer(BaseEstimator, TransformerMixin):
    """Identity transformer: learns nothing and returns its input unchanged."""

    def fit(self, X, y=None):
        """Nothing to fit; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` exactly as received."""
        return X

    def get_feature_names_out(self, input_features=None):
        """Return the given input feature names as the output feature names."""
        return input_features

# Define the PassthroughTransformer
passthrough_transformer = PassthroughTransformer()

# CompetitionDistance: a missing distance is filled with a very large value (99999),
# i.e. the competitor is assumed to be far away (0 would wrongly mean "next door");
# the column is then standardised.
far_away_imputer = SimpleImputer(strategy="constant", fill_value=99999)
num_pipeline = Pipeline(steps=[
    ('imputer', far_away_imputer),
    ('standard', StandardScaler()),
])

# Bounded numeric columns: missing values become 0, then min-max scaling is applied
zero_imputer = SimpleImputer(strategy="constant", fill_value=0)
num_minmax_pipeline = Pipeline(steps=[
    ('imputer', zero_imputer),
    ('minmax', MinMaxScaler()),
])

# Custom transformer that converts its input columns to strings
class ToStringTransformer(BaseEstimator, TransformerMixin):
    """Cast every input column to ``str``.

    ``fit`` records the input column names (when the input has any) and the
    number of columns, so that ``transform`` can rebuild a DataFrame from array
    input and ``get_feature_names_out`` works without arguments. Previously
    ``feature_names_in_`` was read but never set, which raised AttributeError
    for array input.
    """

    def fit(self, X, y=None):
        """Remember column names / column count of ``X``; returns self."""
        if hasattr(X, "columns"):
            self.feature_names_in_ = np.asarray(X.columns, dtype=object)
        elif hasattr(self, "feature_names_in_"):
            # Refit on unnamed data: discard names from an earlier fit
            del self.feature_names_in_
        self.n_features_in_ = np.shape(X)[1] if np.ndim(X) > 1 else 1
        return self

    def transform(self, X):
        """Return ``X`` as a DataFrame whose values are all strings."""
        if not isinstance(X, pd.DataFrame):
            # Array input (e.g. coming from another transformer): rebuild a DataFrame,
            # reusing the names seen during fit when they are available
            X = pd.DataFrame(X, columns=getattr(self, "feature_names_in_", None))

        return X.astype(str)

    def get_feature_names_out(self, input_features=None):
        """Return the output feature names (identical to the input names).

        Uses ``input_features`` when given, otherwise the names recorded by
        ``fit``; falls back to ``x0, x1, ...`` when the fit data had no names.
        """
        if input_features is not None:
            return input_features
        if hasattr(self, "feature_names_in_"):
            return self.feature_names_in_
        return np.asarray([f"x{i}" for i in range(self.n_features_in_)], dtype=object)

# Store ids: cast to string, fill gaps with '0', then map each id to an integer code
# (ids not seen during fit are encoded as -1)
store_steps = [
    ('to_string', ToStringTransformer()),
    ('imputer', SimpleImputer(strategy='constant', fill_value='0')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
]
store_transformer = Pipeline(steps=store_steps)

# PromoInterval: one-hot encoded directly; there is no imputation step, so missing
# values are handed to the encoder as they are
promo_steps = [
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
cat_promoint_pipeline = Pipeline(steps=promo_steps)

# Remaining nominal columns: fill gaps with the most frequent value, then one-hot encode
ohe_steps = [
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
cat_ohe_pipeline = Pipeline(steps=ohe_steps)

In [None]:
# Column groups, one per preprocessing route
num_features = ['CompetitionDistance']
minmax_features = [
    'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear',
    'Promo2SinceWeek', 'Promo2SinceYear',
    'Year', 'Month', 'Day', 'WeekOfYear',
]
binary_features = ['Promo', 'Promo2', 'SchoolHoliday']
store_features = ['Store']
promo_features = ['PromoInterval']

# Nominal features without an inherent order are one-hot encoded
# (DayOfWeek is better one-hot encoded: https://otexts.com/fpp2/useful-predictors.html)
ohe_features = ['StoreType', 'Assortment', 'DayOfWeek', 'StateHoliday']

# (name, transformer, columns) triples tying every route together
transformer_specs = [
    ('num', num_pipeline, num_features),
    ('minmax', num_minmax_pipeline, minmax_features),
    ('binary', PassthroughTransformer(), binary_features),
    ('store', store_transformer, store_features),
    ('promo', cat_promoint_pipeline, promo_features),
    ('ohe', cat_ohe_pipeline, ohe_features),
]
preprocessor = ColumnTransformer(transformers=transformer_specs)

In [None]:
# Fit and transform the pipeline only on the training set
# (fitting on the training split alone keeps validation/test statistics out of the
# imputers, scalers and encoders)
train_prepared = preprocessor.fit_transform(X_train)
print(train_prepared)

# Transform pipeline on valid and test sets
# (transform only: these sets reuse what was learned from the training split)
valid_prepared = preprocessor.transform(X_valid)
test_prepared = preprocessor.transform(test_set)

In [None]:
# Turn the preprocessor's array output back into labelled DataFrames.
# Each frame reuses the index of the set it was derived from, so features and the
# matching targets (which keep their original index) stay aligned row by row for any
# index-aware pandas operation; a fresh RangeIndex would pair the wrong rows.
column_names = preprocessor.get_feature_names_out()
X_train_prepared = pd.DataFrame(train_prepared, columns=column_names, index=X_train.index)
X_valid_prepared = pd.DataFrame(valid_prepared, columns=column_names, index=X_valid.index)
test_prepared_df = pd.DataFrame(test_prepared, columns=column_names, index=test_set.index)

# Preview the prepared training features
X_train_prepared.head()

In [None]:
# Summary statistics of the prepared training features (sanity check of the scaling/encoding)
X_train_prepared.describe()

In [None]:
# Replace every run of characters other than letters, digits and underscores
# in the column names with a single underscore, on all three prepared frames
for prepared_frame in (X_train_prepared, X_valid_prepared, test_prepared_df):
    prepared_frame.columns = prepared_frame.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)

## Modelling

In [None]:
# Define global functions for evaluating the models based on RMSPE and scaling back to actual sales metrics

def rmspe(y_true, y_pred):
    """Root Mean Squared Percentage Error.

    Observations whose true value is 0 are ignored: the percentage error is
    undefined for them and would otherwise turn the whole score into inf/nan
    (the competition metric skips zero-sales days in the same way).

    Parameters
    ----------
    y_true : array-like
        Actual values (lists, NumPy arrays and pandas Series are accepted).
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``; values are paired
        by position.

    Returns
    -------
    numpy.float64
        The RMSPE over the non-zero targets, or nan when every target is 0.
    """
    y_true, y_pred = np.broadcast_arrays(
        np.asarray(y_true, dtype=float), np.asarray(y_pred, dtype=float)
    )
    nonzero = y_true != 0
    if not nonzero.any():
        # Nothing left to score; return nan explicitly instead of averaging an empty array
        return np.float64(np.nan)
    pct_error = (y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero]
    return np.sqrt(np.mean(np.square(pct_error)))

def evaluate_model(y_valid_log, y_pred_log, model_name="Model"):
    """Report RMSE and RMSPE on the log scale and on the original sales scale.

    Both inputs are on the log1p scale; they are converted back with expm1 for
    the sales-scale metrics. Prints a short report and returns the four metrics
    in a dict keyed 'rmse_log', 'rmspe_log', 'rmse_sales', 'rmspe_sales'.
    """

    def _rmse(actual, predicted):
        # Root of the mean squared error
        return np.sqrt(mean_squared_error(actual, predicted))

    # Metrics in log space
    metrics = {
        "rmse_log": _rmse(y_valid_log, y_pred_log),
        "rmspe_log": rmspe(y_valid_log, y_pred_log),
    }

    # Metrics on real sales values
    actual_sales = np.expm1(y_valid_log)
    predicted_sales = np.expm1(y_pred_log)
    metrics["rmse_sales"] = _rmse(actual_sales, predicted_sales)
    metrics["rmspe_sales"] = rmspe(actual_sales, predicted_sales)

    report_labels = {
        "rmse_log": "RMSE (log)",
        "rmspe_log": "RMSPE (log)",
        "rmse_sales": "RMSE (sales)",
        "rmspe_sales": "RMSPE (sales)",
    }
    print(f"✅ {model_name} Evaluation:")
    for key, label in report_labels.items():
        print(f"  • {label}: {metrics[key]:.4f}")
    print("-" * 50)

    return metrics

### Linear Regression

In [None]:
from sklearn import linear_model

# Baseline: ordinary least squares fitted on the prepared training features (log target)
reg = linear_model.LinearRegression().fit(X_train_prepared, y_train_log)

# Score the validation predictions (log scale in, both scales reported)
y_pred_log = reg.predict(X_valid_prepared)
evaluate_model(y_valid_log, y_pred_log, model_name="Linear Regression")

### LightGBM

In [None]:
# 1. LightGBM with its default hyperparameters, fitted on the log target
lgb_model = LGBMRegressor().fit(X_train_prepared, y_train_log)

# Score the validation predictions
y_pred_log = lgb_model.predict(X_valid_prepared)
evaluate_model(y_valid_log, y_pred_log, model_name="LightGBM")

#### LightGBM hyperparameter tuning

In [None]:
import lightgbm as lgb

def objective(trial):
    """Optuna objective for LightGBM.

    Samples one hyperparameter set, fits an LGBMRegressor on the training split
    (log target) and returns the RMSPE on the validation split measured on the
    original sales scale (lower is better).
    """
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        # NOTE(review): LightGBM documents that bagging ('subsample') only takes effect
        # when 'subsample_freq' / 'bagging_freq' is non-zero; it is left at its default
        # here, so this value may have no effect on training - confirm.
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 10.0, log=True),
        'random_state': 42
    }

    model = LGBMRegressor(**params)
    model.fit(
        X_train_prepared, y_train_log,
        eval_set=[(X_valid_prepared, y_valid_log)],
        eval_metric="rmse"
    )

    # Predictions, converted from the log scale back to sales
    y_pred_log = model.predict(X_valid_prepared)
    y_pred_sales = np.expm1(y_pred_log)
    y_valid_sales = np.expm1(y_valid_log)

    # Calculate RMSPE (sales)
    rmspe_sales = rmspe(y_valid_sales, y_pred_sales)

    return rmspe_sales  # must return a single float

# Seed the sampler so the same hyperparameter sets are proposed on every run
lgb_study = optuna.create_study(
    study_name="rossmann_lgbm_study",
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
lgb_study.optimize(objective, n_trials=30, show_progress_bar=True)

# `best_trial` / `best_value` raise when no trial has completed, so check the trial
# states explicitly before reading them.
completed_trials = [
    t for t in lgb_study.trials if t.state == optuna.trial.TrialState.COMPLETE
]
if completed_trials:
    # The objective returns RMSPE on the sales scale, so label it as such
    print("Best RMSPE:", lgb_study.best_value)
    print("Best Params:", lgb_study.best_trial.params)
else:
    print("No successful trials.")

In [None]:
# Retrain with the best hyperparameters found by the study.
# `best_params` only holds the values suggested by Optuna, so the fixed settings used
# inside the objective are added back; without 'random_state' (and the other fixed
# keys) this model would not be the one that was actually tuned.
best_params_lgb = {
    **lgb_study.best_params,
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'random_state': 42,
}
best_model_lgb = lgb.train(
    best_params_lgb,
    lgb.Dataset(X_train_prepared, label=y_train_log),
    valid_sets=[lgb.Dataset(X_valid_prepared, label=y_valid_log)],
)

# Predict (in log scale), then convert to real sales
y_pred_log = best_model_lgb.predict(X_valid_prepared)
y_pred_sales = np.expm1(y_pred_log)
y_valid_sales = np.expm1(y_valid_log)

# Evaluate
evaluate_model(y_valid_log, y_pred_log, model_name="Best Tuned LightGBM")

# Plot feature importance (top 20 by gain)
fig, ax = plt.subplots(figsize=(10, 8))  # adjust size as needed
lgb.plot_importance(best_model_lgb, max_num_features=20, importance_type='gain', height=0.5, ax=ax)

ax.grid(False)  # Remove grid lines
ax.set_title("Top 20 Feature Importances LightGBM", fontsize=14)
plt.tight_layout(pad=1.5)
plt.show()

### XGBoost

In [None]:
# Wrap the prepared frames in XGBoost's DMatrix container (log target as label)
dtrain = xgb.DMatrix(X_train_prepared, label=y_train_log)
dvalid = xgb.DMatrix(X_valid_prepared, label=y_valid_log)

# Baseline parameters; training is configured for the GPU ('device': 'cuda')
params = dict(
    objective='reg:squarederror',
    eval_metric='rmse',
    tree_method="hist",
    device="cuda",
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    seed=42,
)

# Up to 1000 boosting rounds, stopping once the validation RMSE has not improved
# for 50 rounds; progress is printed every 100 rounds
evals = [(dtrain, 'train'), (dvalid, 'valid')]
model = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=evals,
    early_stopping_rounds=50,
    verbose_eval=100,
)

# Validation predictions (log scale) and their evaluation
y_pred_log = model.predict(dvalid)
evaluate_model(y_valid_log, y_pred_log, model_name="XGBoost")

#### XGBoost hyperparameter tuning

In [None]:
# The data does not change between trials, so the DMatrix objects are built once
# here instead of being rebuilt inside every trial.
dtrain_tune = xgb.DMatrix(X_train_prepared, label=y_train_log)
dvalid_tune = xgb.DMatrix(X_valid_prepared, label=y_valid_log)

def objective(trial):
    """Optuna objective for XGBoost.

    Samples one hyperparameter set, trains with early stopping on the validation
    split and returns the validation RMSPE measured on the original sales scale
    (lower is better).
    """
    # Suggest hyperparameters space to search
    param = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'lambda': trial.suggest_float('lambda', 1e-4, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-4, 10.0, log=True),
        'tree_method': 'hist',
        'device': "cuda",
        'seed': 42
    }

    model = xgb.train(
        params=param,
        dtrain=dtrain_tune,
        num_boost_round=1000,
        evals=[(dtrain_tune, 'train'), (dvalid_tune, 'valid')],
        early_stopping_rounds=50,
        verbose_eval=False
    )

    # Predictions, converted from the log scale back to sales
    y_pred_log = model.predict(dvalid_tune)
    y_pred_sales = np.expm1(y_pred_log)
    y_valid_sales = np.expm1(y_valid_log)

    # Calculate RMSPE (sales)
    rmspe_sales = rmspe(y_valid_sales, y_pred_sales)

    return rmspe_sales  # must return a single float

# Create and run the optimization process with 30 trials.
# The sampler is seeded and the trials run one after another (n_jobs=1): parallel
# trials would compete for the single GPU and make the sequence of suggested
# hyperparameters depend on thread timing, i.e. non-reproducible.
study_xgb = optuna.create_study(
    study_name="rossmann_xgboost_study",
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_xgb.optimize(objective, n_trials=30, show_progress_bar=True, n_jobs=1)

# Retrieve the best parameter values
best_params_xgb = study_xgb.best_params
print(f"\nBest parameters: {best_params_xgb}")

In [None]:
# Retrieve the best parameter values
# (only the hyperparameters suggested by Optuna; fixed settings such as the
# objective or the seed are not part of `best_params`)
best_params_xgb = study_xgb.best_params
print(f"\nBest parameters: {best_params_xgb}")

In [None]:
# Retrain with the best hyperparameters found by the study.
# `best_params_xgb` only contains the values suggested by Optuna, so the fixed settings
# used during tuning are merged back in; otherwise this run would silently fall back to
# XGBoost's default device and seed and would not reproduce the tuned trial.
fixed_params_xgb = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'tree_method': 'hist',
    'device': "cuda",
    'seed': 42,
}
final_params_xgb = {**fixed_params_xgb, **best_params_xgb}

# `dtrain` and `evals` are the DMatrix objects / watchlist built in the XGBoost baseline cell
best_model_xgb = xgb.train(
    params=final_params_xgb,
    dtrain=dtrain,
    num_boost_round=1000,
    evals=evals,
    early_stopping_rounds=50,
    verbose_eval=100
)

# Predict (in log scale), then convert to real sales
y_pred_log = best_model_xgb.predict(dvalid)
y_pred_sales = np.expm1(y_pred_log)
y_valid_sales = np.expm1(y_valid_log)

# Evaluate
evaluate_model(y_valid_log, y_pred_log, model_name="Best Tuned XGBoost")

# Plot feature importance (top 20 by gain)
fig, ax = plt.subplots(figsize=(10, 10))  # adjust size as needed
xgb.plot_importance(best_model_xgb, max_num_features=20, importance_type='gain', height=0.5, ax=ax)

ax.grid(False)  # Remove grid lines
ax.set_title("Top 20 Feature Importances XGBoost", fontsize=14)
plt.tight_layout(pad=1.5)
plt.show()

In [None]:
# Results (values entered by hand from the evaluation outputs of the model cells)
models = ['Linear Regression', 'LightGBM', 'XGBoost']
rmse_sales = [2603.3063, 1322.0624, 885.7547]
rmspe_sales = [0.4325, 0.1829, 0.1240]
bar_colors = ['#A5C9CA', '#E7F6F2', '#395B64']

# Two panels side by side: RMSE on the left, RMSPE on the right
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# (axis, values, metric name, number format of the bar annotations)
panels = [
    (axes[0], rmse_sales, 'RMSE', ',.0f'),
    (axes[1], rmspe_sales, 'RMSPE', '.3f'),
]
for panel_ax, values, metric, number_format in panels:
    bars = panel_ax.bar(models, values, color=bar_colors)
    panel_ax.set_title(f'{metric} (Sales) Comparison')
    panel_ax.set_ylabel(metric)
    panel_ax.set_xlabel('Model')

    # Write each bar's value just above it
    for bar in bars:
        height = bar.get_height()
        panel_ax.text(bar.get_x() + bar.get_width()/2, height,
                      format(height, number_format), ha='center', va='bottom', fontsize=10)

plt.suptitle('Model Performance Comparison', fontsize=14, fontweight='bold')
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

#### Save the best model and re-use it on the test set

In [None]:
# Persist the tuned booster in XGBoost's own JSON format.
# Booster.save_model() picks the format from the file extension; a ".pkl" name is
# misleading because the file is not a pickle, so an explicit ".json" is used.
best_model_xgb.save_model("best_xgb_model.json")
# loaded_model = xgb.Booster()
# loaded_model.load_model("best_xgb_model.json")

# # Predict (in log scale), then convert to real sales
# y_pred_log = loaded_model.predict(dvalid)

# # Evaluate
# evaluate_model(y_valid_log, y_pred_log, model_name="Best Tuned XGBoost")

In [None]:
# Convert to DMatrix
# (no label: the true sales of the test set are unknown)
dtest = xgb.DMatrix(test_prepared_df)

# Predict on actual test set
# The model was trained on log1p(Sales), so predictions come out on the log scale
y_test_log_pred = best_model_xgb.predict(dtest)
y_test_pred = np.expm1(y_test_log_pred)  # Convert back from log

In [None]:
# Final output - consists of 'Id' and predicted 'Sales'.
# Predictions are paired with the test rows by position, in the same row order that
# was fed to the preprocessor.
submission = pd.DataFrame({
    'Id': test['Id'],
    'Sales': y_test_pred
})

# Competition rules: no negative predictions, and closed stores sell nothing.
# (The frame used to be rebuilt from the raw predictions after these corrections,
# which silently discarded them; the corrections are now the last step before saving.)
submission['Sales'] = submission['Sales'].clip(lower=0)
closed_mask = (test['Open'] == 0).to_numpy()  # positional mask, independent of index labels
submission.loc[closed_mask, 'Sales'] = 0

# Save to CSV
submission.to_csv('submission.csv', index=False)

In [None]:
# Preview the first 20 rows of the submission (Id and predicted Sales)
submission.head(20)

#### Saving the final model and submission to Google Drive

In [None]:
# Colab-only step: mount Google Drive under /content/drive so that outputs can be
# written to it by the following cells
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Set your Drive folder
# (created if it does not exist yet; an existing folder is left as it is)
target_folder = '/content/drive/My Drive/Colab Notebooks/0_Portfolio/outputs'
os.makedirs(target_folder, exist_ok=True)

In [None]:
import joblib  # or pickle

# Objects to persist: the submission table and the tuned XGBoost model
submission_df = submission
best_model = best_model_xgb

# Destination files inside the Drive folder
csv_path = os.path.join(target_folder, 'submission.csv')
model_path = os.path.join(target_folder, 'best_model_xgb.pkl')

# Write the CSV (without the index) and pickle the model with joblib
submission_df.to_csv(csv_path, index=False)
joblib.dump(best_model, model_path)

for saved_item, saved_path in (("CSV", csv_path), ("model", model_path)):
    print(f"Saved {saved_item} to {saved_path}")