In [1]:
import pandas as pd
import altair as alt
import matplotlib.pyplot as plt 
import numpy as np
import optuna
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import holidays 

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_absolute_percentage_error, mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

import category_encoders as ce
import xgboost as xgb
from xgboost import XGBRegressor
from ydata_profiling import ProfileReport
from sklearn.linear_model import LinearRegression

import catboost as cb
from scalecast.Forecaster import Forecaster

import lightgbm as lgb
import torch

import time

# Remove Altair's default cap on the number of rows a chart's dataset may contain.
alt.data_transformers.disable_max_rows()
# Make scikit-learn transformers return pandas DataFrames rather than NumPy arrays.
sklearn.set_config(transform_output="pandas")

ModuleNotFoundError: No module named 'altair'

## Import Data

In [2]:
# Read the labelled training file first, then the unlabelled ("blind") file.
train, blind = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [3]:
def is_holiday(row):
    """
    Report whether a row's date is a public holiday in the row's country.

    Supported countries: Canada, Finland, Italy, Kenya, Norway and Singapore.
    Only the calendar of the row's own country is instantiated, so a call no
    longer pays for building all six calendars.

    Parameters:
    row: mapping-like object (dict, pandas Series / DataFrame row) exposing
         'country' and 'date' entries; 'date' is anything pd.to_datetime accepts

    Returns:
    bool: True when the date is in the country's holiday calendar. False for
          unsupported countries, and also (after printing the error) when the
          calendar lookup raises.
    """
    country = row['country']
    date = pd.to_datetime(row['date']).date()  # Convert to date only

    # Country name -> attribute name of the calendar class in the `holidays` package.
    country_codes = {
        'Canada': 'CA',
        'Finland': 'FI',
        'Italy': 'IT',
        'Kenya': 'KE',
        'Norway': 'NO',
        'Singapore': 'SG',
    }

    try:
        code = country_codes.get(country)
        if code is None:
            return False
        # Resolves to the same classes as holidays.CA(), holidays.FI(), ...
        country_holidays = getattr(holidays, code)()
        return date in country_holidays
    except Exception as e:
        print(f"Error checking holiday for {country} on {date}: {e}")
        return False

# Create a cache of holiday objects to improve performance
def initialize_holiday_detection(df):
    """
    Build a row-wise holiday checker backed by pre-built holiday calendars.

    One calendar per (country, year) is created up front for every year that
    occurs in df['date'], so applying the checker row by row does not rebuild
    calendars. A year that was not seen at initialisation time is added to the
    cache on first use.

    Parameters:
    df: pandas DataFrame containing a 'date' column

    Returns:
    callable: is_holiday_cached(row) -> bool, where row exposes 'country' and
              'date'. Returns False for unsupported countries, and also (after
              printing the error) when the lookup raises.
    """
    # Country name -> attribute name of the calendar class in the `holidays` package.
    country_codes = {
        'Canada': 'CA',
        'Finland': 'FI',
        'Italy': 'IT',
        'Kenya': 'KE',
        'Norway': 'NO',
        'Singapore': 'SG',
    }

    # .dt.year yields NumPy scalars (NaN for missing dates); hand the holidays
    # package plain Python ints and skip missing values.
    years = sorted({int(year) for year in pd.to_datetime(df['date']).dt.year.dropna().unique()})
    holiday_cache = {
        country: {year: getattr(holidays, code)(years=year) for year in years}
        for country, code in country_codes.items()
    }

    def is_holiday_cached(row):
        country = row['country']
        date = pd.to_datetime(row['date']).date()
        year = date.year

        try:
            calendars = holiday_cache.get(country)
            if calendars is None:
                return False
            if year not in calendars:
                # Year not present when the cache was built: create it once and keep it.
                calendars[year] = getattr(holidays, country_codes[country])(years=year)
            return date in calendars[year]
        except Exception as e:
            print(f"Error checking holiday for {country} on {date}: {e}")
            return False

    return is_holiday_cached

# Flag holidays in both datasets. Each frame gets its own checker so that the
# calendar cache is built from the years that frame actually contains.
for frame in (train, blind):
    holiday_checker = initialize_holiday_detection(frame)
    frame['is_holiday'] = frame.apply(holiday_checker, axis=1)



In [None]:
def feature_eng(df):
    """
    Turn a raw sales frame into model-ready features.

    Steps, each applied only when the relevant column is present:
    - derive 'day_of_year', 'month' and 'year' from 'date', then drop 'date'
    - cast 'country', 'store', 'product' and 'is_holiday' to category dtype
    - move 'id' into the index
    - fill missing 'num_sold' values with the column mean

    Parameters:
    df: pandas DataFrame; must contain the four categorical columns, the other
        columns are optional. The input frame is not modified.

    Returns:
    tuple: (df, X, y) where df is the processed frame, X is df without
           'num_sold' (df itself when there is no target column) and y is the
           'num_sold' Series, or None when there is no target column.

    Raises:
    KeyError: if one of the categorical columns is missing.
    """
    df = df.copy()

    # Optional columns are checked explicitly, so a genuine failure (e.g. an
    # unparseable date) surfaces instead of being silently skipped.
    if 'date' in df.columns:
        df['date'] = pd.to_datetime(df['date'])
        df['day_of_year'] = df['date'].dt.dayofyear
        df['month'] = df['date'].dt.month
        df['year'] = df['date'].dt.year

    # NOTE(review): categories are inferred from this frame alone, so two frames
    # share category codes only if they contain the same set of values -- verify
    # this holds for the train and blind data.
    categorical_cols = ["country", "store", "product", "is_holiday"]
    df = df.astype({col: "category" for col in categorical_cols})

    if 'id' in df.columns:
        df = df.set_index('id')
    if 'date' in df.columns:
        df = df.drop(columns=['date'])  # Raw timestamp is replaced by the derived features

    has_target = 'num_sold' in df.columns
    # A fully-missing target has no mean to impute with, so it is left untouched.
    if has_target and df['num_sold'].notna().any():
        imputer = SimpleImputer(strategy="mean")  # Or "median", "most_frequent", "constant"
        imputed = imputer.fit_transform(df[["num_sold"]])
        # Flatten so the assignment works whether sklearn returns a DataFrame or an ndarray.
        df["num_sold"] = np.asarray(imputed).ravel()

    df.info()

    # Separate features and target
    if has_target:
        X = df.drop(columns=['num_sold'])
        y = df['num_sold']
    else:
        X = df
        y = None
    return df, X, y

# Build model-ready frames; only the feature matrix of the blind set is kept.
df, X, y = feature_eng(train)
blind = feature_eng(blind)[1]

# Positional hold-out: the first SPLIT share of rows trains, the remainder validates.
SPLIT = 0.85
split_index = int(len(X) * SPLIT)

X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

# Sanity-check the resulting shapes
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)


In [None]:
import time

def objective(trial, X_train, X_test, y_train, y_test, total_trials):
    """
    Optuna objective function for XGBoost hyperparameter optimization with GPU support.

    Samples a hyperparameter set, fits an XGBRegressor on the training data with
    early stopping monitored on the evaluation data, and scores it there.

    Parameters:
    trial: optuna trial used to sample hyperparameters and store extra metrics
    X_train, y_train: training features / target
    X_test, y_test: evaluation features / target (early stopping and scoring)
    total_trials: total number of trials, used only in progress messages

    Returns:
    float: MAPE on the evaluation data (the value Optuna minimises). MAE, RMSE
           and the number of trees actually used are stored as trial user attrs.

    Raises:
    optuna.exceptions.TrialPruned: if training or scoring fails; the underlying
        error is printed and chained as the cause.
    """
    trial_start = time.time()
    print(f"🔄 Starting Trial {trial.number + 1} out of {total_trials}")

    gpu_available = torch.cuda.is_available()
    device = "gpu" if gpu_available else "cpu"
    tree_method = "hist" if gpu_available else "auto"

    param = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 18),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.1, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.1, 10.0, log=True),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 7),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "enable_categorical": True,
        "device": device,
        "tree_method": tree_method
    }

    try:
        model = xgb.XGBRegressor(
            **param,
            early_stopping_rounds=50,
            eval_metric=['mae', 'mape']
        )

        # The scikit-learn wrapper expects array-likes / DataFrames and builds its
        # own DMatrix (honouring enable_categorical); a pre-built xgb.DMatrix is
        # not a valid input to fit() or predict().
        model.fit(
            X_train, y_train,
            eval_set=[(X_test, y_test)],
            verbose=False
        )

        preds = model.predict(X_test)
        mape = mean_absolute_percentage_error(y_test, preds)
        # Plain floats keep the user attrs serialisable by any Optuna storage.
        mae = float(np.mean(np.abs(y_test - preds)))
        rmse = float(np.sqrt(np.mean((y_test - preds) ** 2)))

        # best_iteration is a 0-based index, so the tree count is one more; a
        # best iteration of 0 is a valid result and must not be treated as missing.
        best_iteration = getattr(model, "best_iteration", None)
        if best_iteration is None:
            n_estimators_used = param['n_estimators']
        else:
            n_estimators_used = int(best_iteration) + 1

        trial.set_user_attr('mae', mae)
        trial.set_user_attr('rmse', rmse)
        trial.set_user_attr('n_estimators_used', n_estimators_used)

        trial_time = time.time() - trial_start
        print(f"✅ Trial {trial.number + 1} completed in {trial_time:.1f}s")
        print(f"   MAPE: {mape:.4f}, MAE: {mae:.4f}, RMSE: {rmse:.4f}")

        return mape

    except Exception as e:
        # Best-effort: a failing parameter set is reported and skipped so that
        # the remaining trials still run; the cause stays attached for debugging.
        print(f"❌ Trial {trial.number + 1} failed: {str(e)}")
        raise optuna.exceptions.TrialPruned() from e

def run_optimization(X_train, X_test, y_train, y_test, n_trials=2):
    """
    Run the complete optimization process and return the best model.

    Parameters:
    X_train, y_train: training features / target
    X_test, y_test: evaluation features / target handed to each trial
    n_trials: number of Optuna trials to run

    Returns:
    tuple: (best_model, study). best_model is an XGBRegressor refit on the
           training data with the best trial's settings, or None when the run
           was interrupted or no trial completed.
    """
    study = optuna.create_study(direction="minimize")

    try:
        study.optimize(
            lambda trial: objective(trial, X_train, X_test, y_train, y_test, n_trials),
            n_trials=n_trials,
            show_progress_bar=True
        )

        # study.best_trial raises when every trial was pruned or failed; report
        # that explicitly through the existing "no model" return value.
        completed = [
            t for t in study.trials
            if t.state == optuna.trial.TrialState.COMPLETE
        ]
        if not completed:
            print("\n⚠️ No trial completed successfully; no model was trained")
            return None, study

        best_trial = study.best_trial
        best_params = dict(best_trial.params)

        # The best score was measured on an early-stopped model, so refit with
        # the tree count that trial actually used rather than the sampled maximum.
        n_estimators_used = best_trial.user_attrs.get("n_estimators_used")
        if n_estimators_used:
            best_params["n_estimators"] = max(1, int(n_estimators_used))

        best_params.update({
            "enable_categorical": True,
            "device": "gpu" if torch.cuda.is_available() else "cpu",
            "tree_method": "hist" if torch.cuda.is_available() else "auto"
        })

        best_model = xgb.XGBRegressor(**best_params)
        best_model.fit(X_train, y_train)

        print("\n🏆 Best Trial Results:")
        print(f"MAPE: {study.best_value:.4f}")
        print("Best Parameters:", best_params)

        return best_model, study

    except KeyboardInterrupt:
        print("\n⚠️ Optimization interrupted by user")
        return None, study

def train_country_models(df, X_train, X_test, y_train, y_test, n_trials=2):
    """
    Tune and fit one XGBoost model per country found in df['country'].

    Parameters:
    df: frame with a 'country' column, indexed so that the indices of X_train
        and X_test can be looked up in it
    X_train, y_train: training features / target
    X_test, y_test: evaluation features / target
    n_trials: Optuna trials per country

    Returns:
    dict: country -> dict with the fitted 'model', its 'study', the evaluation
          'mape', sample counts, 'best_parameters' and 'training_time'.
          Countries with fewer than 10 training rows, an interrupted run or a
          training error are left out.
    """
    overall_start = time.time()
    country_models = {}

    for country in df['country'].unique():
        started_at = time.time()
        print(f"\n🌍 Training model for {country}")
        print("=*" * 50)

        # Row selectors for this country, looked up through df by index label.
        in_train = df.loc[X_train.index]['country'] == country
        in_test = df.loc[X_test.index]['country'] == country

        X_train_country, X_test_country = X_train[in_train], X_test[in_test]
        y_train_country, y_test_country = y_train[in_train], y_test[in_test]

        if len(X_train_country) < 10:
            print(f"⚠️ Insufficient data for {country}. Skipping...")
            continue

        try:
            best_model, study = run_optimization(
                X_train_country,
                X_test_country,
                y_train_country,
                y_test_country,
                n_trials=n_trials,
            )

            # No model means the optimisation did not finish; nothing to record.
            if best_model is None:
                continue

            final_mape = mean_absolute_percentage_error(
                y_test_country, best_model.predict(X_test_country)
            )

            country_models[country] = {
                'model': best_model,
                'study': study,
                'mape': final_mape,
                'n_train_samples': len(X_train_country),
                'n_test_samples': len(X_test_country),
                'best_parameters': study.best_params,
                'training_time': time.time() - started_at,
            }

            print(f"\n📊 Results for {country}:")
            print(f"Training samples: {len(X_train_country)}")
            print(f"Test samples: {len(X_test_country)}")
            print(f"Final MAPE: {final_mape:.4f}")
            print(f"Training time: {country_models[country]['training_time']:.1f}s")

        except Exception as e:
            # Best-effort: one failing country must not stop the others.
            print(f"❌ Error training model for {country}: {str(e)}")

    total_time = time.time() - overall_start
    print(f"\n⏱️ Total training time: {total_time:.1f}s")

    print("\n📑 Overall Summary:")
    print("=" * 50)
    summary_df = pd.DataFrame([
        {
            'Country': name,
            'MAPE': info['mape'],
            'Training Samples': info['n_train_samples'],
            'Test Samples': info['n_test_samples'],
            'Training Time (s)': info['training_time'],
        }
        for name, info in country_models.items()
    ])
    print(summary_df)

    return country_models

def predict_with_country_models(X_new, country_models, country_column):
    """
    Make predictions using country-specific models on new, unlabeled data.

    Parameters:
    X_new: DataFrame containing features for prediction (without target)
    country_models: Dictionary of trained models from train_country_models()
    country_column: either the name of a column of X_new, or an array/Series
                    with one country value per row of X_new (same row order)

    Returns:
    array: Predictions for each row, in the row order of X_new. Rows whose
           country has no trained model keep the value 0 and are reported.
    """
    if isinstance(country_column, str):
        country_column = X_new[country_column]

    # Work positionally so the result does not depend on index alignment
    # between X_new and country_column.
    countries = np.asarray(country_column)
    predictions = np.zeros(len(X_new))
    covered = np.zeros(len(X_new), dtype=bool)

    for country, model_dict in country_models.items():
        # Rows belonging to this country
        country_mask = np.asarray(countries == country, dtype=bool)

        if country_mask.any():
            # Get predictions for this country
            country_data = X_new[country_mask]
            predictions[country_mask] = model_dict['model'].predict(country_data)
            covered |= country_mask

    n_uncovered = int((~covered).sum())
    if n_uncovered:
        print(
            f"⚠️ {n_uncovered} row(s) belong to countries without a trained model; "
            "their predictions are left at 0"
        )

    return predictions

# Tune and fit one model per country on the train / hold-out split.
country_models = train_country_models(df, X_train, X_test, y_train, y_test, n_trials=20)

# Score the blind set, sending every row to the model of its own country.
predictions = predict_with_country_models(
    blind,
    country_models,
    country_column=blind['country'],
)



In [None]:
# Inspect the per-country results dictionary returned by train_country_models().
country_models

In [None]:
# Show column dtypes and non-null counts of the hold-out feature frame.
X_test.info()

In [None]:
# Display the blind-set feature frame (the X output of feature_eng).
blind

In [24]:

# `predictions` comes back in the row order of `blind`, so the ids can be
# attached positionally. Joining on a synthetic row counter is unnecessary and
# makes it easy to shift every prediction by one row.
assert len(predictions) == len(blind), "predictions and blind must have the same number of rows"

# feature_eng moves 'id' into the index; fall back to the column if it is still there.
submission_ids = blind["id"] if "id" in blind.columns else blind.index
submissions = pd.DataFrame(
    {"num_sold": np.asarray(predictions)},
    index=pd.Index(submission_ids, name="id"),
)

In [None]:
# Write the submission file; the frame's index is written as the first column.
submissions.to_csv("submission.csv")