In [2]:
import pandas as pd
import altair as alt
import matplotlib.pyplot as plt 
import numpy as np
import optuna
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import holidays 

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_absolute_percentage_error, mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit

import category_encoders as ce
import xgboost as xgb
from xgboost import XGBRegressor
from ydata_profiling import ProfileReport
from sklearn.linear_model import LinearRegression

import catboost as cb
from scalecast.Forecaster import Forecaster

import lightgbm as lgb
import torch

import time

import warnings
warnings.filterwarnings('ignore')

alt.data_transformers.disable_max_rows()
sklearn.set_config(transform_output="pandas")

## Import Data

In [3]:
train = pd.read_csv('train.csv')
blind = pd.read_csv('test.csv')

In [4]:
def is_holiday(row):
    """
    Determines if a given date is a holiday in the specified country.
    Optimized for Canada, Finland, and Italy.
    
    """
    country = row['country']
    date = pd.to_datetime(row['date']).date()  # Convert to date only

    country_mapping = {
        'Canada': holidays.CA(),
        'Finland': holidays.FI(),
        'Italy': holidays.IT(), 
        'Kenya': holidays.KE(),
        'Norway': holidays.NO(),
        'Singapore': holidays.SG(),
    }
    
    try:
        # Get the holidays object for the country
        if country in country_mapping:
            country_holidays = country_mapping[country]
            return date in country_holidays
        else:
            return False
    except Exception as e:
        print(f"Error checking holiday for {country} on {date}: {e}")
        return False

# Create a cache of holiday objects to improve performance
def initialize_holiday_detection(df):
    """
    Initialize holiday detection by creating holiday objects for all unique years in the dataset.
    This improves performance by avoiding repeated creation of holiday objects.
    
    Parameters:
    df: pandas DataFrame containing 'date' column
    """
    years = pd.to_datetime(df['date']).dt.year.unique()
    holiday_cache = {
        'Canada': {year: holidays.CA(years=year) for year in years},
        'Finland': {year: holidays.FI(years=year) for year in years},
        'Italy': {year: holidays.IT(years=year) for year in years}, 
        'Kenya': {year: holidays.KE(years=year) for year in years},
        'Norway': {year: holidays.NO(years=year) for year in years},
        'Singapore': {year: holidays.SG(years=year) for year in years}

    }
    
    def is_holiday_cached(row):
        country = row['country']
        date = pd.to_datetime(row['date']).date()
        year = date.year
        
        try:
            if country in holiday_cache:
                return date in holiday_cache[country][year]
            return False
        except Exception as e:
            print(f"Error checking holiday for {country} on {date}: {e}")
            return False
    
    return is_holiday_cached

# Example usage:
# First initialize the cached version
holiday_checker = initialize_holiday_detection(train)

# Then apply it to create the holiday flag
train['is_holiday'] = train.apply(holiday_checker, axis=1)

holiday_checker = initialize_holiday_detection(blind)
blind['is_holiday'] = blind.apply(holiday_checker, axis=1)



In [130]:

def feature_eng(df):
    df = df.copy()
    
    # Convert date column and extract features
    if 'date' in df.columns:
        df['date'] = pd.to_datetime(df['date'], errors='coerce')
        df['day_of_year'] = df['date'].dt.dayofyear
        df['month'] = df['date'].dt.month
        df['year'] = df['date'].dt.year
        df.drop(columns=['date'], inplace=True)

    # Assign seasons based on month
    df['season'] = pd.cut(
        df['month'], bins=[0, 3, 6, 9, 12], labels=['Winter', 'Spring', 'Summer', 'Fall']
    )

    # Convert categorical features
    categorical_cols = ["country", "store", "product", "is_holiday", "season"]
    df[categorical_cols] = df[categorical_cols].astype("category")

    # Ensure 'id' column is the index if available
    if 'id' in df.columns:
        df.set_index('id', inplace=True)

    # Impute missing values for num_sold per country using KNN
    try:
        imputer = SimpleImputer(strategy='mean')
        for country in df['country'].unique():
            mask = df['country'] == country
            df.loc[mask, 'num_sold'] = imputer.fit_transform(df.loc[mask, ['num_sold']])
    except: pass
    return df

# Feature engineering
train = feature_eng(train)
blind = feature_eng(blind)

# Splitting the data
SPLIT = 0.85
split_index = int(SPLIT * len(X))

X_train = X[:split_index]
y_train = y[:split_index]
X_test = X[split_index:]
y_test = y[split_index:]

# Verify shapes
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)


X_train shape: (195610, 7)
y_train shape: (195610,)
X_test shape: (34520, 7)
y_test shape: (34520,)


In [131]:
blind

Unnamed: 0_level_0,country,store,product,is_holiday,day_of_year,month,year,season
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
230130,Canada,Discount Stickers,Holographic Goose,True,1,1,2017,Winter
230131,Canada,Discount Stickers,Kaggle,True,1,1,2017,Winter
230132,Canada,Discount Stickers,Kaggle Tiers,True,1,1,2017,Winter
230133,Canada,Discount Stickers,Kerneler,True,1,1,2017,Winter
230134,Canada,Discount Stickers,Kerneler Dark Mode,True,1,1,2017,Winter
...,...,...,...,...,...,...,...,...
328675,Singapore,Premium Sticker Mart,Holographic Goose,False,365,12,2019,Fall
328676,Singapore,Premium Sticker Mart,Kaggle,False,365,12,2019,Fall
328677,Singapore,Premium Sticker Mart,Kaggle Tiers,False,365,12,2019,Fall
328678,Singapore,Premium Sticker Mart,Kerneler,False,365,12,2019,Fall


In [132]:
blind.info()

<class 'pandas.core.frame.DataFrame'>
Index: 98550 entries, 230130 to 328679
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   country      98550 non-null  category
 1   store        98550 non-null  category
 2   product      98550 non-null  category
 3   is_holiday   98550 non-null  category
 4   day_of_year  98550 non-null  int32   
 5   month        98550 non-null  int32   
 6   year         98550 non-null  int32   
 7   season       98550 non-null  category
dtypes: category(5), int32(3)
memory usage: 2.4 MB


In [150]:


class SalesForecastModel:
    def __init__(self, n_trials=20, n_splits=5):
        self.n_trials = n_trials
        self.n_splits = n_splits
        self.country_models = {}
        self.label_encoders = {}
        
    def _create_features(self, df):
        """Enhanced feature engineering with time-based and categorical features"""
        df = df.copy()
        
        return df
    
    def _prepare_features(self, df):
        """Prepare final feature set for modeling"""
        
        return df
    
    def _objective(self, trial, X, y, country_data):
        """Optuna objective function with cross-validation"""
        trial_start = time.time()
        print(f"🔄 Starting Trial {trial.number + 1} out of {self.n_trials}")

        param = {
            "n_estimators": trial.suggest_int("n_estimators", 20, 1500),
            "max_depth": trial.suggest_int("max_depth", 4, 24),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "subsample": trial.suggest_float("subsample", 0.6, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 7),
            "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1.0, log=True),
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 1.0, log=True),
            "device": "gpu" if torch.cuda.is_available() else "cpu",
            "tree_method": "hist" if torch.cuda.is_available() else "auto", 
            "enable_categorical": True
        }
        
        # Time series cross-validation
        tscv = TimeSeriesSplit(n_splits=self.n_splits)
        mape_scores = []
        
        for train_idx, val_idx in tscv.split(X):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
            
            model = xgb.XGBRegressor(
                **param,
                    early_stopping_rounds=50,
                    eval_metric=['mae', 'mape']
            )
            X_train = X_train.astype('category')
            
            model.fit(
                X_train, y_train,
                eval_set=[(X_val, y_val)],
                #early_stopping_rounds=50,
                verbose=False
            )
            
            predictions = model.predict(X_val)
            mape = mean_absolute_percentage_error(y_val, predictions)
            mape_scores.append(mape)
        
        avg_mape = np.mean(mape_scores)
        trial.set_user_attr('cv_mape_std', np.std(mape_scores))
        
        return avg_mape
    
    def fit(self, train_df):
        """Train models for each country"""
        processed_df = self._create_features(train_df)
        X = self._prepare_features(processed_df)
        X = X.astype('category')
        y = processed_df['num_sold']
        
        for country in processed_df['country'].unique():
            print(f"\nTraining model for {country}")
            
            # Filter data for current country
            country_mask = processed_df['country'] == country
            X_country = X[country_mask]
            y_country = y[country_mask]
            
            # Create and optimize model
            study = optuna.create_study(direction="minimize")
            study.optimize(
                lambda trial: self._objective(trial, X_country, y_country, processed_df[country_mask]),
                n_trials=self.n_trials,
                show_progress_bar=True
            )
            
            # Train final model with best parameters
            best_params = study.best_params
            best_params.update({
                "device": "gpu" if torch.cuda.is_available() else "cpu",
                "tree_method": "hist" if torch.cuda.is_available() else "auto", 
                 "enable_categorical": True
            })
            
            final_model = xgb.XGBRegressor(**best_params)
            final_model.fit(X_country, y_country)
            
            self.country_models[country] = {
                'model': final_model,
                'best_params': best_params,
                'study': study,
                'mape': study.best_value,
                'feature_importance': dict(zip(X.columns, final_model.feature_importances_))
            }
            
            print(f"Best MAPE for {country}: {study.best_value:.4f}")
        
        return self
    
    def predict(self, test_df):
        """Generate predictions for test data"""
        processed_df = self._create_features(test_df)
        X = self._prepare_features(processed_df)
        predictions = np.zeros(len(test_df))
        
        for country, model_dict in self.country_models.items():
            country_mask = processed_df['country'] == country
            if country_mask.any():
                predictions[country_mask] = model_dict['model'].predict(X[country_mask])
        
        return predictions
    
    def get_feature_importance(self):
        """Return feature importance for each country"""
        importance_dict = {}
        for country, model_dict in self.country_models.items():
            importance_dict[country] = model_dict['feature_importance']
        return pd.DataFrame(importance_dict)

# Example usage:


In [None]:
# Initialize and train model
model = SalesForecastModel(n_trials=5, n_splits=2)


model.fit(train)

# Make predictions


[I 2025-01-31 12:49:37,104] A new study created in memory with name: no-name-9aeb5a1e-b117-4042-b1ec-8cd21c698021



Training model for Canada


  0%|          | 0/5 [00:00<?, ?it/s]

🔄 Starting Trial 1 out of 5
[I 2025-01-31 12:49:55,407] Trial 0 finished with value: 0.1086425869314824 and parameters: {'n_estimators': 517, 'max_depth': 15, 'learning_rate': 0.1616181544067575, 'subsample': 0.9325617502054365, 'colsample_bytree': 0.9265335429097733, 'min_child_weight': 5, 'reg_alpha': 0.0019145027669666824, 'reg_lambda': 8.475569997600753e-05}. Best is trial 0 with value: 0.1086425869314824.
🔄 Starting Trial 2 out of 5
[I 2025-01-31 12:50:38,885] Trial 1 finished with value: 0.1353150824655538 and parameters: {'n_estimators': 1493, 'max_depth': 17, 'learning_rate': 0.05345941540000315, 'subsample': 0.6012530496039133, 'colsample_bytree': 0.6900143713240914, 'min_child_weight': 4, 'reg_alpha': 0.3792977413392054, 'reg_lambda': 3.505732621673327e-05}. Best is trial 0 with value: 0.1086425869314824.
🔄 Starting Trial 3 out of 5
[I 2025-01-31 12:51:47,441] Trial 2 finished with value: 0.13060187073549548 and parameters: {'n_estimators': 1202, 'max_depth': 18, 'learning_ra

[I 2025-01-31 12:52:55,578] A new study created in memory with name: no-name-ce0005ff-194a-4012-927e-47e4b82045c1


Best MAPE for Canada: 0.1086

Training model for Finland


  0%|          | 0/5 [00:00<?, ?it/s]

🔄 Starting Trial 1 out of 5
[I 2025-01-31 12:53:03,204] Trial 0 finished with value: 0.112885951846148 and parameters: {'n_estimators': 870, 'max_depth': 4, 'learning_rate': 0.03207816244646398, 'subsample': 0.7695005057527081, 'colsample_bytree': 0.698494883347731, 'min_child_weight': 3, 'reg_alpha': 7.344628990445784e-06, 'reg_lambda': 1.2097303000124497e-05}. Best is trial 0 with value: 0.112885951846148.
🔄 Starting Trial 2 out of 5
[I 2025-01-31 12:53:05,146] Trial 1 finished with value: 0.12938627061850805 and parameters: {'n_estimators': 467, 'max_depth': 6, 'learning_rate': 0.12993698147418947, 'subsample': 0.6314756859674517, 'colsample_bytree': 0.6352779442753548, 'min_child_weight': 3, 'reg_alpha': 3.386480559228881e-08, 'reg_lambda': 0.1504799830748681}. Best is trial 0 with value: 0.112885951846148.
🔄 Starting Trial 3 out of 5
[I 2025-01-31 12:59:21,153] Trial 2 finished with value: 0.07698770736352768 and parameters: {'n_estimators': 959, 'max_depth': 21, 'learning_rate': 

In [140]:
predictions = model.predict(blind)

# Get feature importance
feature_importance = model.get_feature_importance()

In [141]:
feature_importance

Unnamed: 0,Canada,Finland,Italy,Kenya,Norway,Singapore
month,0.007331,0.007772,0.006246,0.012169,0.002352,0.002724
year,0.013081,0.009031,0.011852,0.0595,0.012687,0.005273
is_holiday,0.000679,0.00189,0.000748,0.001749,0.000663,0.000115
country,0.0,0.0,0.0,0.0,0.0,0.0
store,0.495207,0.397404,0.394152,0.377797,0.366428,0.375421
product,0.470081,0.568169,0.576305,0.532654,0.614379,0.613896
season,0.013622,0.015734,0.010697,0.016131,0.003491,0.002571


In [142]:
predictions

array([ 811.48297119,  895.15722656,  732.99316406, ..., 1841.95874023,
       1105.24316406, 1276.53076172])

In [143]:

predictions_df = pd.DataFrame({"num_sold": predictions})
predictions_df = predictions_df.reset_index().rename(columns={"index": "on"})

temp_blind = blind.copy()
temp_blind['on'] = range(1, len(temp_blind) + 1)
temp_blind = temp_blind.reset_index()

In [144]:
submissions = temp_blind.merge(predictions_df, on='on', how='left')
submissions

Unnamed: 0,id,country,store,product,is_holiday,day_of_year,month,year,season,on,num_sold
0,230130,Canada,Discount Stickers,Holographic Goose,True,1,1,2017,Winter,1,895.157227
1,230131,Canada,Discount Stickers,Kaggle,True,1,1,2017,Winter,2,732.993164
2,230132,Canada,Discount Stickers,Kaggle Tiers,True,1,1,2017,Winter,3,333.550079
3,230133,Canada,Discount Stickers,Kerneler,True,1,1,2017,Winter,4,519.088257
4,230134,Canada,Discount Stickers,Kerneler Dark Mode,True,1,1,2017,Winter,5,565.036377
...,...,...,...,...,...,...,...,...,...,...,...
98545,328675,Singapore,Premium Sticker Mart,Holographic Goose,False,365,12,2019,Fall,98546,2136.420898
98546,328676,Singapore,Premium Sticker Mart,Kaggle,False,365,12,2019,Fall,98547,1841.958740
98547,328677,Singapore,Premium Sticker Mart,Kaggle Tiers,False,365,12,2019,Fall,98548,1105.243164
98548,328678,Singapore,Premium Sticker Mart,Kerneler,False,365,12,2019,Fall,98549,1276.530762


In [123]:
submissions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98550 entries, 0 to 98549
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   id           98550 non-null  int64   
 1   country      98550 non-null  category
 2   store        98550 non-null  category
 3   product      98550 non-null  category
 4   is_holiday   98550 non-null  category
 5   day_of_year  98550 non-null  int32   
 6   month        98550 non-null  int32   
 7   year         98550 non-null  int32   
 8   season       98550 non-null  category
 9   on           98550 non-null  int64   
 10  num_sold     98549 non-null  float64 
dtypes: category(5), float64(1), int32(3), int64(2)
memory usage: 3.9 MB


In [145]:
imputer = SimpleImputer(strategy="mean")

In [146]:
submissions["num_sold"] = submissions["num_sold"].astype("float")

In [147]:
submissions['num_sold'] = imputer.fit_transform(submissions[['num_sold']])

In [149]:


submissions = submissions[["id","num_sold"]]
submissions = submissions.set_index("id")
submissions.to_csv("submission_0131.csv")


In [148]:
submissions

Unnamed: 0,id,country,store,product,is_holiday,day_of_year,month,year,season,on,num_sold
0,230130,Canada,Discount Stickers,Holographic Goose,True,1,1,2017,Winter,1,895.157227
1,230131,Canada,Discount Stickers,Kaggle,True,1,1,2017,Winter,2,732.993164
2,230132,Canada,Discount Stickers,Kaggle Tiers,True,1,1,2017,Winter,3,333.550079
3,230133,Canada,Discount Stickers,Kerneler,True,1,1,2017,Winter,4,519.088257
4,230134,Canada,Discount Stickers,Kerneler Dark Mode,True,1,1,2017,Winter,5,565.036377
...,...,...,...,...,...,...,...,...,...,...,...
98545,328675,Singapore,Premium Sticker Mart,Holographic Goose,False,365,12,2019,Fall,98546,2136.420898
98546,328676,Singapore,Premium Sticker Mart,Kaggle,False,365,12,2019,Fall,98547,1841.958740
98547,328677,Singapore,Premium Sticker Mart,Kaggle Tiers,False,365,12,2019,Fall,98548,1105.243164
98548,328678,Singapore,Premium Sticker Mart,Kerneler,False,365,12,2019,Fall,98549,1276.530762
