In [1]:
import pandas as pd
import altair as alt
import matplotlib.pyplot as plt 
import numpy as np
import optuna
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import holidays 

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_absolute_percentage_error, mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

import category_encoders as ce
import xgboost as xgb
from xgboost import XGBRegressor
from ydata_profiling import ProfileReport
from sklearn.linear_model import LinearRegression

import catboost as cb
from scalecast.Forecaster import Forecaster

import lightgbm as lgb
import torch

import time

alt.data_transformers.disable_max_rows()
sklearn.set_config(transform_output="pandas")

## Import Data

In [2]:
train = pd.read_csv('train.csv')
blind = pd.read_csv('test.csv')

In [3]:
def is_holiday(row):
    """
    Determines if a given date is a holiday in the specified country.
    Optimized for Canada, Finland, and Italy.
    
    Parameters:
    row: pandas Series containing 'country' and 'date' columns
    
    Returns:
    bool: True if the date is a holiday in the given country, False otherwise
    """
    country = row['country']
    date = pd.to_datetime(row['date']).date()  # Convert to date only

    country_mapping = {
        'Canada': holidays.CA(),
        'Finland': holidays.FI(),
        'Italy': holidays.IT(), 
        'Kenya': holidays.KE(),
        'Norway': holidays.NO(),
        'Singapore': holidays.SG(),
    }
    
    try:
        # Get the holidays object for the country
        if country in country_mapping:
            country_holidays = country_mapping[country]
            return date in country_holidays
        else:
            return False
    except Exception as e:
        print(f"Error checking holiday for {country} on {date}: {e}")
        return False

# Create a cache of holiday objects to improve performance
def initialize_holiday_detection(df):
    """
    Initialize holiday detection by creating holiday objects for all unique years in the dataset.
    This improves performance by avoiding repeated creation of holiday objects.
    
    Parameters:
    df: pandas DataFrame containing 'date' column
    """
    years = pd.to_datetime(df['date']).dt.year.unique()
    holiday_cache = {
        'Canada': {year: holidays.CA(years=year) for year in years},
        'Finland': {year: holidays.FI(years=year) for year in years},
        'Italy': {year: holidays.IT(years=year) for year in years}, 
        'Kenya': {year: holidays.KE(years=year) for year in years},
        'Norway': {year: holidays.NO(years=year) for year in years},
        'Singapore': {year: holidays.SG(years=year) for year in years}

    }
    
    def is_holiday_cached(row):
        country = row['country']
        date = pd.to_datetime(row['date']).date()
        year = date.year
        
        try:
            if country in holiday_cache:
                return date in holiday_cache[country][year]
            return False
        except Exception as e:
            print(f"Error checking holiday for {country} on {date}: {e}")
            return False
    
    return is_holiday_cached

# Example usage:
# First initialize the cached version
holiday_checker = initialize_holiday_detection(train)

# Then apply it to create the holiday flag
train['is_holiday'] = train.apply(holiday_checker, axis=1)

holiday_checker = initialize_holiday_detection(blind)
blind['is_holiday'] = blind.apply(holiday_checker, axis=1)



In [4]:
def feature_eng(df):
    df = df.copy()
    try: 
        df['date'] = pd.to_datetime(df['date'])
        df['day_of_year'] = df['date'].dt.dayofyear
        df['month'] = df['date'].dt.month
        df['year'] = df['date'].dt.year
    except: pass

    df[["country", "store", "product", "is_holiday"]] = df[["country", "store", "product", "is_holiday"]].astype("category")

    # Encode categorical features
    # Drop missing values and irrelevant columns
    imputer = SimpleImputer(strategy="mean")  # Change to "mean", "most_frequent", or "constant"
    scaler = MinMaxScaler()
    try: 
        df = df.set_index('id')

        df = df.drop(columns=['date'])  # Dropping columns that are not useful for modeling
    except: pass
    # Separate features and target
    try: 
        df["num_sold"] = imputer.fit_transform(df[["num_sold"]])
       # df["num_sold"] = scaler.fit_transform(df[["num_sold"]])
    except: pass
    
    try: 
        X = df.drop(columns=['num_sold'])
        #X = pd.get_dummies(X)
    except:
        X=df#pd.get_dummies(df)
    try:
        y = df['num_sold']
    except:
        y = None
    # Scale features
    return df, X, y

# Feature engineering
df, X, y = feature_eng(train)
blind = feature_eng(blind)[1]

# Splitting the data
SPLIT = 0.85
split_index = int(SPLIT * len(X))

X_train = X[:split_index]
y_train = y[:split_index]
X_test = X[split_index:]
y_test = y[split_index:]

# Verify shapes
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)


X_train shape: (195610, 7)
y_train shape: (195610,)
X_test shape: (34520, 7)
y_test shape: (34520,)


In [5]:
import time

def objective(trial, X_train, X_test, y_train, y_test, total_trials):
    """
    Optuna objective function for XGBoost hyperparameter optimization with GPU support.
    """
    trial_start = time.time()
    print(f"🔄 Starting Trial {trial.number + 1} out of {total_trials}")
    
    gpu_available = torch.cuda.is_available()
    device = "gpu" if gpu_available else "cpu"
    tree_method = "hist" if gpu_available else "auto"
    
    param = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 18),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.1, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.1, 10.0, log=True),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 7),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "enable_categorical": True,
        "device": device,
        "tree_method": tree_method
    }
    
    try:
        if device == "gpu":
            X_train_device = xgb.DMatrix(X_train, y_train)
            X_test_device = xgb.DMatrix(X_test)
        else:
            X_train_device = X_train
            X_test_device = X_test
        
        model = xgb.XGBRegressor(
            **param,
            early_stopping_rounds=50,
            eval_metric=['mae', 'mape']
        )
        
        model.fit(
            X_train_device, y_train,
            eval_set=[(X_test_device, y_test)],
            verbose=False
        )
        
        preds = model.predict(X_test_device)
        mape = mean_absolute_percentage_error(y_test, preds)
        mae = np.mean(np.abs(y_test - preds))
        rmse = np.sqrt(np.mean((y_test - preds) ** 2))
        
        trial.set_user_attr('mae', mae)
        trial.set_user_attr('rmse', rmse)
        trial.set_user_attr('n_estimators_used', model.best_iteration or param['n_estimators'])
        
        trial_time = time.time() - trial_start
        print(f"✅ Trial {trial.number + 1} completed in {trial_time:.1f}s")
        print(f"   MAPE: {mape:.4f}, MAE: {mae:.4f}, RMSE: {rmse:.4f}")
        
        return mape
        
    except Exception as e:
        print(f"❌ Trial {trial.number + 1} failed: {str(e)}")
        raise optuna.exceptions.TrialPruned()

def run_optimization(X_train, X_test, y_train, y_test, n_trials=2):
    """
    Run the complete optimization process and return the best model.
    """
    study = optuna.create_study(direction="minimize")
    
    try:
        study.optimize(
            lambda trial: objective(trial, X_train, X_test, y_train, y_test, n_trials),
            n_trials=n_trials,
            show_progress_bar=True
        )
        
        best_params = study.best_params
        best_params.update({
            "enable_categorical": True,
            "device": "gpu" if torch.cuda.is_available() else "cpu",
            "tree_method": "hist" if torch.cuda.is_available() else "auto"
        })
        
        best_model = xgb.XGBRegressor(**best_params)
        best_model.fit(X_train, y_train)
        
        print("\n🏆 Best Trial Results:")
        print(f"MAPE: {study.best_value:.4f}")
        print("Best Parameters:", best_params)
        
        return best_model, study
        
    except KeyboardInterrupt:
        print("\n⚠️ Optimization interrupted by user")
        return None, study

def train_country_models(df, X_train, X_test, y_train, y_test, n_trials=2):
    """
    Train separate XGBoost models for each country using Optuna optimization.
    """
    start_time = time.time()
    countries = df['country'].unique()
    country_models = {}
    
    for country in countries:
        country_start_time = time.time()
        print(f"\n🌍 Training model for {country}")
        print("=" * 50)
        
        train_country_mask = df.loc[X_train.index]['country'] == country
        test_country_mask = df.loc[X_test.index]['country'] == country
        
        X_train_country = X_train[train_country_mask]
        X_test_country = X_test[test_country_mask]
        y_train_country = y_train[train_country_mask]
        y_test_country = y_test[test_country_mask]
        
        if len(X_train_country) < 10:
            print(f"⚠️ Insufficient data for {country}. Skipping...")
            continue
            
        try:
            best_model, study = run_optimization(
                X_train_country, 
                X_test_country,
                y_train_country, 
                y_test_country,
                n_trials=n_trials
            )
            
            if best_model is not None:
                predictions = best_model.predict(X_test_country)
                final_mape = mean_absolute_percentage_error(y_test_country, predictions)
                
                country_models[country] = {
                    'model': best_model,
                    'study': study,
                    'mape': final_mape,
                    'n_train_samples': len(X_train_country),
                    'n_test_samples': len(X_test_country),
                    'best_parameters': study.best_params,
                    'training_time': time.time() - country_start_time
                }
                
                print(f"\n📊 Results for {country}:")
                print(f"Training samples: {len(X_train_country)}")
                print(f"Test samples: {len(X_test_country)}")
                print(f"Final MAPE: {final_mape:.4f}")
                print(f"Training time: {country_models[country]['training_time']:.1f}s")
                
        except Exception as e:
            print(f"❌ Error training model for {country}: {str(e)}")
            continue
    
    total_time = time.time() - start_time
    print(f"\n⏱️ Total training time: {total_time:.1f}s")
    
    print("\n📑 Overall Summary:")
    print("=" * 50)
    summary_data = []
    for country, results in country_models.items():
        summary_data.append({
            'Country': country,
            'MAPE': results['mape'],
            'Training Samples': results['n_train_samples'],
            'Test Samples': results['n_test_samples'],
            'Training Time (s)': results['training_time']
        })
    
    summary_df = pd.DataFrame(summary_data)
    print(summary_df)
    
    return country_models

def predict_with_country_models(X_new, country_models, country_column):
    """
    Make predictions using country-specific models on new, unlabeled data.
    
    Parameters:
    X_new: DataFrame containing features for prediction (without target)
    country_models: Dictionary of trained models from train_country_models()
    country_column: String or array containing country values for each row
    
    Returns:
    array: Predictions for each row
    """
    predictions = np.zeros(len(X_new))
    
    for country, model_dict in country_models.items():
        # Get indices for this country
        country_mask = country_column == country
        
        if country_mask.any():
            # Get predictions for this country
            country_data = X_new[country_mask]
            predictions[country_mask] = model_dict['model'].predict(country_data)
    
    return predictions

# Example usage:

# Train the models
country_models = train_country_models(
    df=train,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    n_trials=20
)

# Make predictions
predictions = predict_with_country_models(
    blind, 
    country_models, 
    blind['country'])   



[I 2025-01-30 16:34:32,854] A new study created in memory with name: no-name-ce08c678-0768-4f1a-93d4-76e988aed254



🌍 Training model for Canada


  0%|          | 0/20 [00:00<?, ?it/s]

🔄 Starting Trial 1 out of 20
✅ Trial 1 completed in 0.4s
   MAPE: 0.1480, MAE: 91.3790, RMSE: 124.9555
[I 2025-01-30 16:34:33,324] Trial 0 finished with value: 0.148006009725715 and parameters: {'n_estimators': 89, 'learning_rate': 0.11004005019811143, 'max_depth': 6, 'subsample': 0.8825560098780566, 'colsample_bytree': 0.6298945689472297, 'reg_alpha': 4.222504380990488, 'reg_lambda': 0.7813791824154713, 'min_child_weight': 3, 'gamma': 0.17091598545284414}. Best is trial 0 with value: 0.148006009725715.
🔄 Starting Trial 2 out of 20
✅ Trial 2 completed in 1.0s
   MAPE: 0.1958, MAE: 105.0906, RMSE: 130.7108
[I 2025-01-30 16:34:34,292] Trial 1 finished with value: 0.1958252808730472 and parameters: {'n_estimators': 413, 'learning_rate': 0.010032680748256856, 'max_depth': 6, 'subsample': 0.5916275003461888, 'colsample_bytree': 0.7076861888821623, 'reg_alpha': 0.4622464534607482, 'reg_lambda': 0.1811274379543044, 'min_child_weight': 4, 'gamma': 2.3822884395077235}. Best is trial 0 with valu

[I 2025-01-30 16:34:58,958] A new study created in memory with name: no-name-7fd48d1e-5c8b-4c84-9bd7-170947d0a9e4



🏆 Best Trial Results:
MAPE: 0.1288
Best Parameters: {'n_estimators': 199, 'learning_rate': 0.19925144741463305, 'max_depth': 15, 'subsample': 0.7796923094040221, 'colsample_bytree': 0.9844347586090221, 'reg_alpha': 0.1096781616965365, 'reg_lambda': 1.1262048708328567, 'min_child_weight': 2, 'gamma': 4.1264740724932345, 'enable_categorical': True, 'device': 'cpu', 'tree_method': 'auto'}

📊 Results for Canada:
Training samples: 32610
Test samples: 5745
Final MAPE: 0.1327
Training time: 26.1s

🌍 Training model for Finland


  0%|          | 0/20 [00:00<?, ?it/s]

🔄 Starting Trial 1 out of 20
✅ Trial 1 completed in 0.9s
   MAPE: 0.1201, MAE: 69.1807, RMSE: 98.8046
[I 2025-01-30 16:34:59,875] Trial 0 finished with value: 0.12014569980305223 and parameters: {'n_estimators': 272, 'learning_rate': 0.012123798737494066, 'max_depth': 7, 'subsample': 0.8070877632803615, 'colsample_bytree': 0.9299792639514792, 'reg_alpha': 0.4657284633412854, 'reg_lambda': 1.2096407340003825, 'min_child_weight': 2, 'gamma': 1.2871133636064735}. Best is trial 0 with value: 0.12014569980305223.
🔄 Starting Trial 2 out of 20
✅ Trial 2 completed in 0.9s
   MAPE: 0.0938, MAE: 66.1215, RMSE: 98.0266
[I 2025-01-30 16:35:00,742] Trial 1 finished with value: 0.0938392470825015 and parameters: {'n_estimators': 164, 'learning_rate': 0.03798009654132587, 'max_depth': 15, 'subsample': 0.7042739083365632, 'colsample_bytree': 0.8332773556121789, 'reg_alpha': 0.9800262995709539, 'reg_lambda': 1.4518993073200224, 'min_child_weight': 7, 'gamma': 2.91018165402569}. Best is trial 1 with val

[I 2025-01-30 16:35:16,723] A new study created in memory with name: no-name-fac16239-36d0-4ce3-9198-eb1170803dd5



🏆 Best Trial Results:
MAPE: 0.0896
Best Parameters: {'n_estimators': 206, 'learning_rate': 0.09039349965146042, 'max_depth': 5, 'subsample': 0.5936670113870827, 'colsample_bytree': 0.9939686359408468, 'reg_alpha': 2.225596452631923, 'reg_lambda': 4.181708490399382, 'min_child_weight': 6, 'gamma': 3.6920951631305043, 'enable_categorical': True, 'device': 'cpu', 'tree_method': 'auto'}

📊 Results for Finland:
Training samples: 32610
Test samples: 5745
Final MAPE: 0.0927
Training time: 17.8s

🌍 Training model for Italy


  0%|          | 0/20 [00:00<?, ?it/s]

🔄 Starting Trial 1 out of 20
✅ Trial 1 completed in 1.2s
   MAPE: 0.1091, MAE: 45.2541, RMSE: 64.2481
[I 2025-01-30 16:35:17,927] Trial 0 finished with value: 0.10906520506146102 and parameters: {'n_estimators': 373, 'learning_rate': 0.040834948839449256, 'max_depth': 13, 'subsample': 0.7668403519358427, 'colsample_bytree': 0.5468468601204305, 'reg_alpha': 3.176576376345812, 'reg_lambda': 2.91647641685125, 'min_child_weight': 7, 'gamma': 4.03005220065191}. Best is trial 0 with value: 0.10906520506146102.
🔄 Starting Trial 2 out of 20
✅ Trial 2 completed in 0.8s
   MAPE: 0.1150, MAE: 46.4377, RMSE: 66.3975
[I 2025-01-30 16:35:18,757] Trial 1 finished with value: 0.11504352328473448 and parameters: {'n_estimators': 248, 'learning_rate': 0.02328012567919698, 'max_depth': 12, 'subsample': 0.9035618589322498, 'colsample_bytree': 0.6387898369394501, 'reg_alpha': 0.5238192179084964, 'reg_lambda': 5.560945917383307, 'min_child_weight': 2, 'gamma': 3.5205773758682524}. Best is trial 0 with value

[I 2025-01-30 16:35:32,243] A new study created in memory with name: no-name-72296325-989d-4cf2-9e56-9ee263b46dd4



🏆 Best Trial Results:
MAPE: 0.0844
Best Parameters: {'n_estimators': 144, 'learning_rate': 0.08391450115544734, 'max_depth': 7, 'subsample': 0.6204110017627611, 'colsample_bytree': 0.9864058090663964, 'reg_alpha': 9.686922411831155, 'reg_lambda': 9.874567429043495, 'min_child_weight': 5, 'gamma': 2.9884836695563055, 'enable_categorical': True, 'device': 'cpu', 'tree_method': 'auto'}

📊 Results for Italy:
Training samples: 32605
Test samples: 5750
Final MAPE: 0.0871
Training time: 15.5s

🌍 Training model for Kenya


  0%|          | 0/20 [00:00<?, ?it/s]

🔄 Starting Trial 1 out of 20
✅ Trial 1 completed in 0.5s
   MAPE: 5.8651, MAE: 93.8730, RMSE: 151.7847
[I 2025-01-30 16:35:32,725] Trial 0 finished with value: 5.865053148261212 and parameters: {'n_estimators': 302, 'learning_rate': 0.11271190898222559, 'max_depth': 16, 'subsample': 0.7254187012982074, 'colsample_bytree': 0.5236300832836958, 'reg_alpha': 1.9991902818097396, 'reg_lambda': 0.18216199102068564, 'min_child_weight': 2, 'gamma': 1.6635074218365486}. Best is trial 0 with value: 5.865053148261212.
🔄 Starting Trial 2 out of 20
✅ Trial 2 completed in 0.8s
   MAPE: 2.3676, MAE: 21.1038, RMSE: 62.7864
[I 2025-01-30 16:35:33,559] Trial 1 finished with value: 2.3675835606415028 and parameters: {'n_estimators': 220, 'learning_rate': 0.014423617798988455, 'max_depth': 10, 'subsample': 0.9910195993219921, 'colsample_bytree': 0.896416545909304, 'reg_alpha': 5.052679864008598, 'reg_lambda': 2.38199243799127, 'min_child_weight': 4, 'gamma': 3.2510151891758294}. Best is trial 1 with value:

[I 2025-01-30 16:36:02,806] A new study created in memory with name: no-name-a852c13c-c6f7-43eb-b0d1-983d4f1c4044



🏆 Best Trial Results:
MAPE: 2.0085
Best Parameters: {'n_estimators': 337, 'learning_rate': 0.06839743886316835, 'max_depth': 15, 'subsample': 0.5142914229082487, 'colsample_bytree': 0.9205177788310376, 'reg_alpha': 0.11802029567947248, 'reg_lambda': 9.529229623725607, 'min_child_weight': 3, 'gamma': 0.14415173940310755, 'enable_categorical': True, 'device': 'cpu', 'tree_method': 'auto'}

📊 Results for Kenya:
Training samples: 32595
Test samples: 5760
Final MAPE: 2.0905
Training time: 30.6s

🌍 Training model for Norway


  0%|          | 0/20 [00:00<?, ?it/s]

🔄 Starting Trial 1 out of 20
✅ Trial 1 completed in 2.3s
   MAPE: 0.1360, MAE: 135.4440, RMSE: 192.8875
[I 2025-01-30 16:36:05,128] Trial 0 finished with value: 0.13602472150949416 and parameters: {'n_estimators': 185, 'learning_rate': 0.022855961238391287, 'max_depth': 18, 'subsample': 0.6014593020260105, 'colsample_bytree': 0.9129782653674798, 'reg_alpha': 1.4377589420036718, 'reg_lambda': 1.0443042073202553, 'min_child_weight': 2, 'gamma': 1.0264700280698846}. Best is trial 0 with value: 0.13602472150949416.
🔄 Starting Trial 2 out of 20
✅ Trial 2 completed in 1.4s
   MAPE: 0.1169, MAE: 132.7593, RMSE: 187.1188
[I 2025-01-30 16:36:06,495] Trial 1 finished with value: 0.11685945975040565 and parameters: {'n_estimators': 450, 'learning_rate': 0.013517938819948987, 'max_depth': 6, 'subsample': 0.6769715516673728, 'colsample_bytree': 0.7172893316322333, 'reg_alpha': 0.37015705819023975, 'reg_lambda': 7.673839644918192, 'min_child_weight': 6, 'gamma': 1.8604262904996183}. Best is trial 1 

[I 2025-01-30 16:36:25,692] A new study created in memory with name: no-name-ff81ed96-4788-4cb1-956a-da6e2661a6e1



🏆 Best Trial Results:
MAPE: 0.1157
Best Parameters: {'n_estimators': 482, 'learning_rate': 0.0219996624434328, 'max_depth': 9, 'subsample': 0.7576232106530884, 'colsample_bytree': 0.6936335154713036, 'reg_alpha': 0.11824350959205644, 'reg_lambda': 9.402879805990972, 'min_child_weight': 7, 'gamma': 3.9786238157573868, 'enable_categorical': True, 'device': 'cpu', 'tree_method': 'auto'}

📊 Results for Norway:
Training samples: 32595
Test samples: 5760
Final MAPE: 0.1158
Training time: 22.9s

🌍 Training model for Singapore


  0%|          | 0/20 [00:00<?, ?it/s]

🔄 Starting Trial 1 out of 20
✅ Trial 1 completed in 1.1s
   MAPE: 0.0938, MAE: 84.7305, RMSE: 125.0689
[I 2025-01-30 16:36:26,762] Trial 0 finished with value: 0.09379330023953264 and parameters: {'n_estimators': 347, 'learning_rate': 0.05906260908030551, 'max_depth': 6, 'subsample': 0.5982821464023126, 'colsample_bytree': 0.6028490877609953, 'reg_alpha': 0.11089757662334422, 'reg_lambda': 2.2827388884268203, 'min_child_weight': 3, 'gamma': 0.6295150415820505}. Best is trial 0 with value: 0.09379330023953264.
🔄 Starting Trial 2 out of 20
✅ Trial 2 completed in 0.5s
   MAPE: 0.0850, MAE: 79.8805, RMSE: 120.2206
[I 2025-01-30 16:36:27,240] Trial 1 finished with value: 0.08498676821021622 and parameters: {'n_estimators': 159, 'learning_rate': 0.0696684891064068, 'max_depth': 4, 'subsample': 0.9051860105873033, 'colsample_bytree': 0.9227110009671095, 'reg_alpha': 0.5348972845071818, 'reg_lambda': 3.184326819742593, 'min_child_weight': 1, 'gamma': 0.03860664387659862}. Best is trial 1 with 

In [6]:
country_models

{'Canada': {'model': XGBRegressor(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=0.9844347586090221, device='cpu',
               early_stopping_rounds=None, enable_categorical=True,
               eval_metric=None, feature_types=None, gamma=4.1264740724932345,
               grow_policy=None, importance_type=None,
               interaction_constraints=None, learning_rate=0.19925144741463305,
               max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
               max_delta_step=None, max_depth=15, max_leaves=None,
               min_child_weight=2, missing=nan, monotone_constraints=None,
               multi_strategy=None, n_estimators=199, n_jobs=None,
               num_parallel_tree=None, random_state=None, ...),
  'study': <optuna.study.study.Study at 0x1f9ddfcd8d0>,
  'mape': 0.13270264595981837,
  'n_train_samples': 32610,
  'n_test_samples': 5745,
  'best_parameters

In [7]:
X_train

Unnamed: 0_level_0,country,store,product,is_holiday,day_of_year,month,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,Canada,Discount Stickers,Holographic Goose,True,1,1,2010
1,Canada,Discount Stickers,Kaggle,True,1,1,2010
2,Canada,Discount Stickers,Kaggle Tiers,True,1,1,2010
3,Canada,Discount Stickers,Kerneler,True,1,1,2010
4,Canada,Discount Stickers,Kerneler Dark Mode,True,1,1,2010
...,...,...,...,...,...,...,...
195605,Italy,Stickers for Less,Holographic Goose,False,348,12,2015
195606,Italy,Stickers for Less,Kaggle,False,348,12,2015
195607,Italy,Stickers for Less,Kaggle Tiers,False,348,12,2015
195608,Italy,Stickers for Less,Kerneler,False,348,12,2015


In [9]:
predictions_df

Unnamed: 0,on,num_sold
0,0,752.895813
1,1,659.886780
2,2,559.243164
3,3,302.295563
4,4,331.575073
...,...,...
98545,98545,428.819061
98546,98546,2209.399414
98547,98547,1989.598511
98548,98548,1101.572266


In [10]:
blind

Unnamed: 0_level_0,country,store,product,is_holiday,day_of_year,month,year,on
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
230130,Canada,Discount Stickers,Holographic Goose,True,1,1,2017,1
230131,Canada,Discount Stickers,Kaggle,True,1,1,2017,2
230132,Canada,Discount Stickers,Kaggle Tiers,True,1,1,2017,3
230133,Canada,Discount Stickers,Kerneler,True,1,1,2017,4
230134,Canada,Discount Stickers,Kerneler Dark Mode,True,1,1,2017,5
...,...,...,...,...,...,...,...,...
328675,Singapore,Premium Sticker Mart,Holographic Goose,False,365,12,2019,98546
328676,Singapore,Premium Sticker Mart,Kaggle,False,365,12,2019,98547
328677,Singapore,Premium Sticker Mart,Kaggle Tiers,False,365,12,2019,98548
328678,Singapore,Premium Sticker Mart,Kerneler,False,365,12,2019,98549


In [24]:

predictions_df = pd.DataFrame({"num_sold": predictions})
predictions_df = predictions_df.reset_index().rename(columns={"index": "on"})


blind['on'] = range(1, len(blind) + 1)

submissions = blind.merge(predictions_df, on='on', how='left')
submissions = submissions[["num_sold", "id"]]
submissions = submissions.set_index("id")

In [26]:
submissions.to_csv("submission.csv")

  values = values.astype(str)
