In [1]:
# Imports: train/test splitting, numerical arrays, truncated-normal sampling, and dataframes
from sklearn.model_selection import train_test_split
import numpy as np
from scipy.stats import truncnorm
import pandas as pd

In [2]:
import requests
import pandas as pd


# Specify the countries and indicator
countries = "CAN;FIN;ITA;KEN;NOR;SGP"  # ISO codes for Canada, Finland, Italy, Kenya, Norway, Singapore
indicator = "NY.GDP.PCAP.CD"  # GDP per capita (current US$)
years = ["2010","2011","2012","2013","2014","2015","2016","2017","2018","2019","2020","2021","2022","2023"]

# Nested lookup filled below: gdp_data[country name][year string] -> GDP per capita.
gdp_data = {}

for year in years:
    url = f"https://api.worldbank.org/v2/country/{countries}/indicator/{indicator}?format=json&date={year}"

    # Fetch the data. The timeout keeps a stalled connection from hanging the
    # notebook, and raise_for_status() reports HTTP errors at the request
    # instead of as a confusing parsing failure further down.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    data = response.json()

    # The rows are read from the second element of the decoded JSON; fail with
    # a readable message when that element is missing or null.
    if not isinstance(data, list) or len(data) < 2 or data[1] is None:
        raise RuntimeError(f"Unexpected World Bank API response for {year}: {data!r}")

    # Extract relevant data
    for entry in data[1]:
        country = entry['country']['value']
        gdp_per_capita = entry['value']
        # Separate name so the outer loop variable `year` is not overwritten.
        entry_year = entry['date']
        # Keep the first value seen for each (country, year) pair.
        gdp_data.setdefault(country, {}).setdefault(entry_year, gdp_per_capita)


print(gdp_data)


{'Canada': {'2010': 47560.6666009406, '2011': 52223.8588398531, '2012': 52670.3447335415, '2013': 52638.1187235237, '2014': 50960.8431174661, '2015': 43594.1941045394, '2016': 42314.0615817218, '2017': 45129.628116623, '2018': 46539.1761570405, '2019': 46352.8693445211, '2020': 43537.839298904, '2021': 52496.8441693242, '2022': 55509.393176404, '2023': 53431.1857063879}, 'Finland': {'2010': 46506.2919016566, '2011': 51060.3242589767, '2012': 47551.6740841369, '2013': 49691.0145200739, '2014': 50073.7760815871, '2015': 42560.3456767103, '2016': 43451.2562442158, '2017': 46085.0174739036, '2018': 49654.2497035329, '2019': 48358.1807773701, '2020': 48828.6846862799, '2021': 53099.1351400148, '2022': 50438.4753952355, '2023': 52925.6897638424}, 'Italy': {'2010': 36184.7118698678, '2011': 38851.3881339353, '2012': 35235.7989058242, '2013': 35747.707952689, '2014': 35750.7197500382, '2015': 30387.1293187854, '2016': 31126.3246947273, '2017': 32554.14668453, '2018': 34746.3441392416, '2019': 

In [3]:

from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
import pandas as pd

def compute_gdp_per_capita(X,gdp_data):
    """Add a ``gdp_per_capita`` column looked up from ``gdp_data``.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain a ``country`` column and a ``year`` column. ``year`` is
        an offset from 2010 (0 -> "2010", 1 -> "2011", ...).
    gdp_data : dict
        Nested mapping ``gdp_data[country][year_string] -> value``.

    Returns
    -------
    pandas.DataFrame
        The same object ``X``, modified in place with the new column.

    Raises
    ------
    KeyError
        If a (country, year) pair of ``X`` is absent from ``gdp_data``.
    """
    # Build the lookup keys column-wise instead of with a row-wise
    # DataFrame.apply(axis=1): this avoids creating one Series per row and
    # also works on an empty frame, where apply returns a DataFrame that
    # cannot be assigned to a single column.
    country_keys = X["country"].astype(str)
    year_keys = (X["year"].astype(int) + 2010).astype(str)

    X["gdp_per_capita"] = [
        gdp_data[country][year] for country, year in zip(country_keys, year_keys)
    ]
    return X

def generate_features(X, train=False):
    """Build the model feature frame from a raw train/test frame.

    Works on a copy of ``X``. When ``train`` is true, rows with a missing
    ``num_sold`` are removed and the ``id`` column is dropped. The ``date``
    column is parsed as ``%Y-%m-%d`` and turned into a weekend flag plus
    sine/cosine encodings of day-of-week (period 7), zero-based month
    (period 12) and years since 2010 (period 10). GDP per capita is attached
    through ``compute_gdp_per_capita`` using the module-level ``gdp_data``.
    The helper columns ``date``, ``month``, ``year`` and ``dayofweek`` are
    removed before returning.
    """
    features = X.copy()

    if train:
        features = features.dropna(subset=["num_sold"]).drop(columns=["id"])

    features["date"] = pd.to_datetime(features["date"], format="%Y-%m-%d")
    parsed = features["date"].dt
    features["dayofweek"] = parsed.dayofweek
    features["month"] = parsed.month - 1
    features["year"] = parsed.year - 2010
    features["is_weekend"] = features["dayofweek"].isin([5,6])

    # Cyclical encodings, emitted as sin_<name>/cos_<name> pairs in this order.
    for name, period in (("dayofweek", 7), ("month", 12), ("year", 10)):
        angle = features[name] * (2 * np.pi / period)
        features[f"sin_{name}"] = np.sin(angle)
        features[f"cos_{name}"] = np.cos(angle)

    # The GDP lookup needs the "country" and "year" columns, so it runs
    # before the helper columns are dropped.
    features = compute_gdp_per_capita(features, gdp_data)

    return features.drop(columns=["date","month","year","dayofweek"])

# Directory holding the competition CSV files.
FOLDER = "playground-series-s5e1/"
train_data = pd.read_csv(f"{FOLDER}train.csv")
test_data = pd.read_csv(f"{FOLDER}test.csv")

# Show the country names present in the raw training data.
print(train_data["country"].unique())

# Replace the raw frames with their engineered-feature versions.
train_data = generate_features(train_data, train=True)
test_data = generate_features(test_data)

# Preview the first rows of each engineered frame.
print("Train data", train_data.head(), sep="\n")
print("Test data", test_data.head(), sep="\n")

['Canada' 'Finland' 'Italy' 'Kenya' 'Norway' 'Singapore']
Train data
  country              store             product  num_sold  is_weekend  \
1  Canada  Discount Stickers              Kaggle     973.0       False   
2  Canada  Discount Stickers        Kaggle Tiers     906.0       False   
3  Canada  Discount Stickers            Kerneler     423.0       False   
4  Canada  Discount Stickers  Kerneler Dark Mode     491.0       False   
5  Canada  Stickers for Less   Holographic Goose     300.0       False   

   sin_dayofweek  cos_dayofweek  sin_month  cos_month  sin_year  cos_year  \
1      -0.433884      -0.900969        0.0        1.0       0.0       1.0   
2      -0.433884      -0.900969        0.0        1.0       0.0       1.0   
3      -0.433884      -0.900969        0.0        1.0       0.0       1.0   
4      -0.433884      -0.900969        0.0        1.0       0.0       1.0   
5      -0.433884      -0.900969        0.0        1.0       0.0       1.0   

   gdp_per_capita  
1  

In [14]:
from xgboost import XGBRegressor
import optuna
from sklearn.metrics import mean_squared_error
# Split the engineered training frame into features and target.
X = train_data.drop(columns=['num_sold'])
y = train_data['num_sold']

cat_cols = ['country', 'store', 'product']
num_cols = ["sin_dayofweek", "cos_dayofweek", "sin_month", "cos_month", "sin_year", "cos_year","gdp_per_capita"]
# Boolean flag created by generate_features. It was listed in neither
# cat_cols nor num_cols, so ColumnTransformer (default remainder='drop')
# silently discarded it; it is now passed through unchanged.
bool_cols = ["is_weekend"]

# Create different preprocessors for different models
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# For models that need one-hot encoding (XGBoost, RandomForest)
categorical_transformer_ohe = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Create different preprocessors
# The boolean block is appended last so the positions of the scaled numeric
# and one-hot columns in the output stay where they were.
preprocessor_ohe = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer_ohe, cat_cols),
    ('bool', 'passthrough', bool_cols)
])

# For LightGBM and CatBoost, we'll just scale numerics and pass categoricals as is
# NOTE(review): with the default remainder='drop' this transformer outputs only
# the scaled numeric columns; cat_cols and is_weekend are not passed through.
# Confirm against the code that consumes it before relying on the comment above.
preprocessor_native = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_cols)
])

# Baseline XGBoost pipeline with fixed hyperparameters.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor_ohe),
    ('regressor', XGBRegressor(
        n_estimators=5000,
        learning_rate=0.1,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    ))
])

def objective(trial):
    """Optuna objective: mean validation MAPE of an XGBoost pipeline.

    Samples XGBoost hyperparameters from ``trial``, builds a pipeline from
    the module-level ``preprocessor_ohe`` and an ``XGBRegressor``, and
    evaluates it on the module-level ``X``/``y`` with a 5-split
    ``TimeSeriesSplit``. The average train and validation MAPE are printed.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean validation MAPE across the folds (lower is better).
    """
    # Search space. The order of the suggest_* calls is kept fixed.
    max_depth = trial.suggest_int("max_depth", 3, 10)
    min_child_weight = trial.suggest_int("min_child_weight", 1, 10)
    gamma = trial.suggest_int("gamma", 0, 10)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
    n_estimators = trial.suggest_int("n_estimators", 100, 5000)
    subsample = trial.suggest_float("subsample", 0.5, 1)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.5, 1)
    colsample_bylevel = trial.suggest_float("colsample_bylevel", 0.5, 1)
    colsample_bynode = trial.suggest_float("colsample_bynode", 0.5, 1)
    lambda_l1 = trial.suggest_float("lambda_l1", 0, 10)
    lambda_l2 = trial.suggest_float("lambda_l2", 0, 10)

    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor_ohe),
        ('regressor', XGBRegressor(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            min_child_weight=min_child_weight,
            gamma=gamma,
            random_state=42,
            objective='reg:squarederror',
            n_jobs=4,
            booster="gbtree",
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            colsample_bylevel=colsample_bylevel,
            colsample_bynode=colsample_bynode,
            # The trial names lambda_l1 / lambda_l2 feed XGBoost's
            # reg_alpha / reg_lambda arguments.
            reg_alpha=lambda_l1,
            reg_lambda=lambda_l2
        ))
    ])

    # Time-series cross-validation. Splits are made by row position, so this
    # assumes the rows of X are in chronological order -- TODO confirm.
    time_series = TimeSeriesSplit(n_splits=5)
    fold_scores_val = []
    fold_scores_train = []
    model_name = "xgboost"
    for train_index, val_index in time_series.split(X):
        X_train_fold = X.iloc[train_index]
        y_train_fold = y.iloc[train_index]
        X_val_fold = X.iloc[val_index]
        y_val_fold = y.iloc[val_index]

        # Fitting the whole pipeline per fold means the scaler and encoder
        # are fitted on that fold's training rows only.
        model_pipeline.fit(X_train_fold, y_train_fold)
        y_pred_val = model_pipeline.predict(X_val_fold)
        y_pred_train = model_pipeline.predict(X_train_fold)

        fold_scores_val.append(mean_absolute_percentage_error(y_val_fold, y_pred_val))
        fold_scores_train.append(mean_absolute_percentage_error(y_train_fold, y_pred_train))

    mean_mape_val = np.mean(fold_scores_val)
    print(f"Average MAPE for {model_name}: {mean_mape_val:.4f}")
    print(f"Average MAPE for {model_name} train: {np.mean(fold_scores_train):.4f}")

    return mean_mape_val

# Seed the sampler so that a fresh run proposes the same sequence of trials;
# without a seed the hyperparameter search cannot be reproduced.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=40)
print(f"Best parameters: {study.best_params}")
print(f"Best value: {study.best_value:.4f}")





[I 2025-01-16 15:06:38,556] A new study created in memory with name: no-name-2d2e70d2-d55d-4152-a26e-e14082ce8e22
