In [3]:
#import split data
from sklearn.model_selection import train_test_split
import numpy as np
from scipy.stats import truncnorm
import pandas as pd

In [4]:
!pip install catboost lightgbm xgboost



In [5]:
import requests
import pandas as pd


# Specify the countries and indicator
countries = "CAN;FIN;ITA;KEN;NOR;SGP"  # ISO codes for Canada, Finland, Italy, Kenya, Norway, Singapore
indicator = "NY.GDP.PCAP.CD"  # GDP per capita (current US$)
years = [str(y) for y in range(2010, 2024)]  # "2010" ... "2023"

# Nested lookup filled below: gdp_data[country name][year string] -> GDP per capita
gdp_data = {}

for year in years:
    url = f"https://api.worldbank.org/v2/country/{countries}/indicator/{indicator}?format=json&date={year}"

    # Fetch the data. The timeout stops a stalled connection from hanging the
    # cell, and raise_for_status turns HTTP errors into an explicit exception.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    data = response.json()

    # The records are read from the second element of the decoded JSON list;
    # fail with a readable message when that element is absent or null.
    if not isinstance(data, list) or len(data) < 2 or data[1] is None:
        raise ValueError(f"Unexpected World Bank API response for {year}: {data!r}")

    # Extract relevant data
    for entry in data[1]:
        country = entry['country']['value']
        gdp_per_capita = entry['value']  # stored as-is, including a null value
        entry_year = entry['date']  # separate name: do not overwrite the loop variable
        # First value seen for a (country, year) pair wins.
        gdp_data.setdefault(country, {}).setdefault(entry_year, gdp_per_capita)


print(gdp_data)


{'Canada': {'2010': 47560.6666009406, '2011': 52223.8588398531, '2012': 52670.3447335415, '2013': 52638.1187235237, '2014': 50960.8431174661, '2015': 43594.1941045394, '2016': 42314.0615817218, '2017': 45129.628116623, '2018': 46539.1761570405, '2019': 46352.8693445211, '2020': 43537.839298904, '2021': 52496.8441693242, '2022': 55509.393176404, '2023': 53431.1857063879}, 'Finland': {'2010': 46506.2919016566, '2011': 51060.3242589767, '2012': 47551.6740841369, '2013': 49691.0145200739, '2014': 50073.7760815871, '2015': 42560.3456767103, '2016': 43451.2562442158, '2017': 46085.0174739036, '2018': 49654.2497035329, '2019': 48358.1807773701, '2020': 48828.6846862799, '2021': 53099.1351400148, '2022': 50438.4753952355, '2023': 52925.6897638424}, 'Italy': {'2010': 36184.7118698678, '2011': 38851.3881339353, '2012': 35235.7989058242, '2013': 35747.707952689, '2014': 35750.7197500382, '2015': 30387.1293187854, '2016': 31126.3246947273, '2017': 32554.14668453, '2018': 34746.3441392416, '2019': 

**Training and prediction**

In [6]:

from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
import pandas as pd

def compute_gdp_per_capita(X, gdp_data):
    """Return a copy of ``X`` with a ``gdp_per_capita`` column looked up in ``gdp_data``.

    Parameters
    ----------
    X : pandas.DataFrame
        Must provide a ``country`` column and a ``year`` column. ``year`` is
        an offset from 2010 (0 -> "2010", 1 -> "2011", ...).
    gdp_data : dict
        Nested mapping ``gdp_data[country][year_string] -> value``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra float column. ``X`` itself is not modified,
        and an empty ``X`` yields an empty frame that still has the column.

    Raises
    ------
    KeyError
        If ``X`` lacks a required column or a (country, year) pair is not
        present in ``gdp_data``.
    """
    country_keys = X["country"].astype(str)
    # Turn the stored offset back into the calendar-year string used as key.
    year_keys = (X["year"].astype(int) + 2010).astype(str)

    values = [gdp_data[country][year] for country, year in zip(country_keys, year_keys)]

    # Explicit index and dtype keep row alignment and make the empty case
    # well-defined; a None value in gdp_data becomes NaN.
    gdp_column = pd.Series(values, index=X.index, dtype="float64")
    return X.assign(gdp_per_capita=gdp_column)

def generate_features(X, train=False):
    """Build the model input frame from a raw data frame.

    Works on a copy of ``X``. With ``train=True`` the rows whose ``num_sold``
    is missing are removed and the ``id`` column is dropped. The ``date``
    column is parsed (``%Y-%m-%d``) and expanded into a weekend flag and
    sine/cosine encodings of day-of-week (period 7), zero-based month
    (period 12) and year offset from 2010 (period 10). GDP per capita is
    attached through ``compute_gdp_per_capita`` using the module-level
    ``gdp_data``. The helper columns ``date``, ``month``, ``year`` and
    ``dayofweek`` are removed before returning.
    """
    frame = X.copy()

    if train:
        frame = frame.dropna(subset=["num_sold"])
        frame = frame.drop(columns=["id"])

    frame["date"] = pd.to_datetime(frame["date"], format="%Y-%m-%d")
    parts = frame["date"].dt
    frame["dayofweek"] = parts.dayofweek
    frame["month"] = parts.month - 1
    frame["year"] = parts.year - 2010
    frame["is_weekend"] = frame["dayofweek"].isin([5, 6])

    # (source column, cycle length, sine column, cosine column)
    cyclic_specs = (
        ("dayofweek", 7, "sin_dayofweek", "cos_dayofweek"),
        ("month", 12, "sin_month", "cos_month"),
        ("year", 10, "sin_year", "cos_year"),
    )
    for source, period, sin_name, cos_name in cyclic_specs:
        angle = frame[source] * (2 * np.pi / period)
        frame[sin_name] = np.sin(angle)
        frame[cos_name] = np.cos(angle)

    # The lookup needs "country" and "year", so it runs before "year" is dropped.
    frame = compute_gdp_per_capita(frame, gdp_data)

    return frame.drop(columns=["date", "month", "year", "dayofweek"])

FOLDER = "playground-series-s5e1/"

# Read the raw competition files; the raw frames keep their own names so the
# feature frames below are clearly a derived stage.
raw_train = pd.read_csv(FOLDER + "train.csv")
raw_test = pd.read_csv(FOLDER + "test.csv")
print(raw_train["country"].unique())

# train=True removes rows without a target and the id column; the test frame
# keeps its id column.
train_data = generate_features(raw_train, train=True)
test_data = generate_features(raw_test)

for title, frame in (("Train data", train_data), ("Test data", test_data)):
    print(title)
    print(frame.head())


['Canada' 'Finland' 'Italy' 'Kenya' 'Norway' 'Singapore']
Train data
  country              store             product  num_sold  is_weekend  \
1  Canada  Discount Stickers              Kaggle     973.0       False   
2  Canada  Discount Stickers        Kaggle Tiers     906.0       False   
3  Canada  Discount Stickers            Kerneler     423.0       False   
4  Canada  Discount Stickers  Kerneler Dark Mode     491.0       False   
5  Canada  Stickers for Less   Holographic Goose     300.0       False   

   sin_dayofweek  cos_dayofweek  sin_month  cos_month  sin_year  cos_year  \
1      -0.433884      -0.900969        0.0        1.0       0.0       1.0   
2      -0.433884      -0.900969        0.0        1.0       0.0       1.0   
3      -0.433884      -0.900969        0.0        1.0       0.0       1.0   
4      -0.433884      -0.900969        0.0        1.0       0.0       1.0   
5      -0.433884      -0.900969        0.0        1.0       0.0       1.0   

   gdp_per_capita  
1  

In [17]:
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# OrdinalEncoder is only needed by the tree-model preprocessor defined in this cell.
from sklearn.preprocessing import OrdinalEncoder

X = train_data.drop(columns=['num_sold'])
y = train_data['num_sold']

cat_cols = ['country', 'store', 'product']
num_cols = ["sin_dayofweek", "cos_dayofweek", "sin_month", "cos_month", "sin_year", "cos_year","gdp_per_capita"]
# NOTE(review): "is_weekend" is in neither list, so every preprocessor below
# drops it (ColumnTransformer keeps only the listed columns by default).
# Confirm whether that is intended.

# Numeric features are standardised for every model.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# For models that need one-hot encoding (XGBoost, RandomForest)
categorical_transformer_ohe = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# For LightGBM and CatBoost: encode each categorical as an integer code.
# Categories not seen during fit are mapped to -1 instead of raising.
categorical_transformer_ordinal = Pipeline(steps=[
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# Create different preprocessors
preprocessor_ohe = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer_ohe, cat_cols)
])

# The categorical columns must be listed explicitly: a ColumnTransformer
# discards every column it is not told about, so a numeric-only transformer
# would train LightGBM/CatBoost without country, store and product.
preprocessor_native = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer_ordinal, cat_cols)
])

# Define models with appropriate parameters
# Each entry is (estimator, preprocessor to put in front of it).
models = {
    'xgb': (XGBRegressor(
        n_estimators=5000,
        learning_rate=0.1,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    ), preprocessor_ohe),
    
    'lgbm': (LGBMRegressor(
        n_estimators=5000,
        learning_rate=0.1,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    ), preprocessor_native),
    
    'catboost': (CatBoostRegressor(
        n_estimators=5000,
        learning_rate=0.1,
        depth=6,
        subsample=0.8,
        random_state=42,
        verbose=False
    ), preprocessor_native)
}

# Create pipelines for each model
model_pipelines = {
    name: Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model)
    ]) for name, (model, preprocessor) in models.items()
}

# LightGBM helper: give the categorical columns the pandas 'category' dtype
def prepare_data_lgbm(X):
    """Return a copy of ``X`` in which every column named in ``cat_cols`` is cast to 'category'."""
    return X.astype({name: 'category' for name in cat_cols})

# Walk-forward evaluation of every candidate pipeline
time_series = TimeSeriesSplit(n_splits=5)

# Mean validation MAPE per model; reused below to pick the winner so the
# cross-validation is not run a second time.
cv_scores = {}

for model_name, pipeline in model_pipelines.items():
    print(f"\nEvaluating {model_name}")
    fold_scores = []
    
    for fold_index, (train_index, val_index) in enumerate(time_series.split(X)):
        X_train_fold = X.iloc[train_index]
        y_train_fold = y.iloc[train_index]
        X_val_fold = X.iloc[val_index]
        y_val_fold = y.iloc[val_index]
        
        # Special handling for LightGBM
        if model_name == 'lgbm':
            X_train_fold = prepare_data_lgbm(X_train_fold)
            X_val_fold = prepare_data_lgbm(X_val_fold)
        
        pipeline.fit(X_train_fold, y_train_fold)
        y_pred_val = pipeline.predict(X_val_fold)    
        mape = mean_absolute_percentage_error(y_val_fold, y_pred_val)
        fold_scores.append(mape)
        print(f"Fold {fold_index + 1} MAPE: {mape:.4f}")
    
    cv_scores[model_name] = np.mean(fold_scores)
    print(f"Average MAPE for {model_name}: {cv_scores[model_name]:.4f}")

# Pick the model with the lowest mean MAPE from the scores gathered above
# (ties resolve to the first model in model_pipelines order).
best_model_name = min(cv_scores, key=cv_scores.get)

print(f"\nBest model: {best_model_name}")

# Train the best model on full dataset
best_pipeline = model_pipelines[best_model_name]

# Use separate names for the final inputs so X and test_data stay unchanged
# for any cell that runs afterwards.
X_final, test_final = X, test_data
if best_model_name == 'lgbm':
    X_final = prepare_data_lgbm(X)
    test_final = prepare_data_lgbm(test_data)

best_pipeline.fit(X_final, y)
y_pred_test = best_pipeline.predict(test_final)


Evaluating xgb


KeyboardInterrupt: 

In [36]:

# Numeric features: standard scaling only (no imputation step)
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical features: one-hot encoding; handle_unknown='ignore' so that
# categories not seen during fit do not raise at transform time
categorical_transformer = Pipeline([('encoder', OneHotEncoder(handle_unknown='ignore'))])

# Route each column group to its transformer
column_routes = [
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols),
]
preprocessor = ColumnTransformer(column_routes)

# Preprocessing followed by a 50-tree random forest with a fixed seed
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=50, random_state=42)),
])


In [37]:
# Walk-forward validation: fit on each training split, report MAPE on the
# following validation split.
time_series = TimeSeriesSplit(n_splits=5)

for train_index, val_index in time_series.split(X):
    model_pipeline.fit(X.iloc[train_index], y.iloc[train_index])

    y_pred_val = model_pipeline.predict(X.iloc[val_index])
    mape = mean_absolute_percentage_error(y.iloc[val_index], y_pred_val)
    print(f"MAPE: {mape}")

# Final fit on all rows; this is the model used for inference.
model_pipeline.fit(X, y)

MAPE: 0.08561272829569276
MAPE: 0.07944890426994645
MAPE: 0.07684108964408275
MAPE: 0.0885103730156893
MAPE: 0.07450817040666825


**Inference**

In [39]:
# Predict on the featurised test frame
y_pred_test = model_pipeline.predict(test_data)

# Submission file: the test ids next to their predicted num_sold
submission = test_data[["id"]].assign(num_sold=y_pred_test)
submission_path = FOLDER + "submission.csv"
submission.to_csv(submission_path, index=False)
