Published on January 01st, 2025. By Prata, Marília (mpwolke).

In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

#Two lines Required to Plot Plotly
import plotly.io as pio
# pio.renderers.default = 'iframe'

import plotly.graph_objs as go
import plotly.offline as py
import plotly.express as px

#Ignore warnings
import warnings
warnings.filterwarnings('ignore')


In [None]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
sub = pd.read_csv('sample_submission.csv')

In [None]:
train_data.head()

Unnamed: 0,id,date,country,store,product,num_sold
0,0,2010-01-01,Canada,Discount Stickers,Holographic Goose,
1,1,2010-01-01,Canada,Discount Stickers,Kaggle,973.0
2,2,2010-01-01,Canada,Discount Stickers,Kaggle Tiers,906.0
3,3,2010-01-01,Canada,Discount Stickers,Kerneler,423.0
4,4,2010-01-01,Canada,Discount Stickers,Kerneler Dark Mode,491.0


In [None]:


train_data_row, train_data_col = train_data.shape
print('Total rows: ', train_data_row)
print('Total columns: ', train_data_col)

Total rows:  230130
Total columns:  6


In [None]:
train_data.tail()

Unnamed: 0,id,date,country,store,product,num_sold
230125,230125,2016-12-31,Singapore,Premium Sticker Mart,Holographic Goose,466.0
230126,230126,2016-12-31,Singapore,Premium Sticker Mart,Kaggle,2907.0
230127,230127,2016-12-31,Singapore,Premium Sticker Mart,Kaggle Tiers,2299.0
230128,230128,2016-12-31,Singapore,Premium Sticker Mart,Kerneler,1242.0
230129,230129,2016-12-31,Singapore,Premium Sticker Mart,Kerneler Dark Mode,1622.0


### Kaggle Mart info()

In [None]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230130 entries, 0 to 230129
Data columns (total 6 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   id        230130 non-null  int64  
 1   date      230130 non-null  object 
 2   country   230130 non-null  object 
 3   store     230130 non-null  object 
 4   product   230130 non-null  object 
 5   num_sold  221259 non-null  float64
dtypes: float64(1), int64(1), object(4)
memory usage: 10.5+ MB


In [None]:
train_data['date'] = pd.to_datetime(train_data['date'])
test_data['date'] = pd.to_datetime(test_data['date'])

## Fun fact, No Kaggle stickers on US!

We have Kaggle stickers sales on Kenya, though none on US 😆 😆 😆

In [None]:
train_data['country'].value_counts()

Unnamed: 0_level_0,count
country,Unnamed: 1_level_1
Canada,38355
Finland,38355
Italy,38355
Kenya,38355
Norway,38355
Singapore,38355


### Missing and Duplicated values

In [None]:
print("duplicated data :", train_data.duplicated().sum())
print("null data : ", train_data.isnull().sum().sum())

duplicated data : 0
null data :  8871


In [None]:
train_data.describe().loc[['mean','min','max']].T

Unnamed: 0,mean,min,max
id,115064.5,0.0,230129.0
date,2013-07-02 00:00:00,2010-01-01 00:00:00,2016-12-31 00:00:00
num_sold,752.527382,5.0,5939.0


In [None]:
train_data = train_data.dropna()

#Split datetime

In [None]:
import numpy as np

def transform_date(df, col):
    # Convert the column to datetime
    df[col] = pd.to_datetime(df[col])

    # Extract temporal features
    df[f'{col}_year'] = df[col].dt.year.astype('float64')
    df[f'{col}_quarter'] = df[col].dt.quarter.astype('float64')
    df[f'{col}_month'] = df[col].dt.month.astype('float64')
    df[f'{col}_day'] = df[col].dt.day.astype('float64')
    df[f'{col}_day_of_week'] = df[col].dt.dayofweek.astype('float64')
    df[f'{col}_week_of_year'] = df[col].dt.isocalendar().week.astype('float64')
    df[f'{col}_hour'] = df[col].dt.hour.astype('float64')
    df[f'{col}_minute'] = df[col].dt.minute.astype('float64')

    # Add cyclical encodings
    df[f'{col}_day_sin'] = np.sin(2 * np.pi * df[f'{col}_day'] / 365.0)
    df[f'{col}_day_cos'] = np.cos(2 * np.pi * df[f'{col}_day'] / 365.0)
    df[f'{col}_month_sin'] = np.sin(2 * np.pi * df[f'{col}_month'] / 12.0)
    df[f'{col}_month_cos'] = np.cos(2 * np.pi * df[f'{col}_month'] / 12.0)
    df[f'{col}_year_sin'] = np.sin(2 * np.pi * df[f'{col}_year'] / 7.0)
    df[f'{col}_year_cos'] = np.cos(2 * np.pi * df[f'{col}_year'] / 7.0)


    # Add group feature (for time-based grouping)
    df[f'{col}_Group'] = (df[f'{col}_year'] - 2010) * 48 + df[f'{col}_month'] * 4 + df[f'{col}_day'] // 7

    return df

# Apply the function to your datasets
new_train = transform_date(train_data, 'date')
new_test = transform_date(test_data, 'date')


In [None]:
new_train['num_sold'] = np.log1p(new_train['num_sold'])
new_train = new_train.drop(columns=['date', 'id'], axis=1)
new_test = new_test.drop(columns=['date', 'id'], axis=1)

In [None]:
num_cols = list(new_train.select_dtypes(exclude=['object']).columns.difference(['num_sold']))
cat_ftrs = list(new_train.select_dtypes(include=['object']).columns)

num_cols_test = list(new_test.select_dtypes(exclude=['object']).columns.difference(['id']))
cat_ftrs_test = list(new_test.select_dtypes(include=['object']).columns)

In [None]:
train_test_comb = pd.concat([new_train, new_test], axis=0, ignore_index=True)
for col in cat_ftrs:
    train_test_comb[col], _ = train_test_comb[col].factorize()
    train_test_comb[col] -= train_test_comb[col].min()
    # label encode to categorical and convert int32 to category
    train_test_comb[col] = train_test_comb[col].astype('int32')
    train_test_comb[col] = train_test_comb[col].astype('category')

for col in num_cols:
    if train_test_comb[col].dtype=='float64':
        train_test_comb[col].astype('float32')
    if train_test_comb[col].dtype=='int64':
        train_test_comb[col].astype('int32')

new_train = train_test_comb.iloc[:len(new_train)].copy()
new_test = train_test_comb.iloc[len(new_train):].copy()

In [None]:
new_test = new_test.drop(columns='num_sold', axis=1)

In [None]:

X= new_train.drop(columns=['num_sold'])
y = new_train['num_sold']

## Libraries to try our model

In [None]:


import lightgbm as lgbm
from lightgbm import LGBMRegressor
from sklearn.metrics import make_scorer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV , cross_val_score ,cross_validate, train_test_split, KFold

## Label encoding

### Drop date and id

## Fillna Better later, than never. Fingers crossed!

## Define X and y

In [None]:
pip install optuna

Collecting optuna
  Downloading optuna-4.2.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.2.0-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.4/383.4 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.1-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.6/233.6 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.8-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [None]:
pip install grpcio-tools==1.67.0


Collecting grpcio-tools==1.67.0
  Downloading grpcio_tools-1.67.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.3 kB)
Collecting protobuf<6.0dev,>=5.26.1 (from grpcio-tools==1.67.0)
  Downloading protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Downloading grpcio_tools-1.67.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64.whl (319 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m319.7/319.7 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: protobuf, grpcio-tools
  Attempting uninstall: protobuf
    Found existing installation: protobuf 4.25.5
    Uninstalling protobuf-4.25.5:
      Successfully uninstalled protobuf-4.25.5
[31mERROR: pip's dependency resolver does not currently take into a

In [None]:
import lightgbm as lgb
import optuna
from sklearn.metrics import mean_absolute_percentage_error
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

def objective(trial):
    params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'mape',  # We'll evaluate on MAPE
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
        'max_depth': trial.suggest_int('max_depth', 5, 25),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-4, 1.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-4, 1.0),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 300),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
        'random_state': 42,
        'verbose': -1,
        'device': 'cpu'
    }

    model = lgb.LGBMRegressor(**params)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)])

    y_pred = model.predict(X_val)
    mape = mean_absolute_percentage_error(y_val, y_pred)
    return mape

# Run Optuna optimization
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Best parameters and MAPE
print("Best parameters:", study.best_params)
print("Best MAPE:", study.best_value)

[I 2025-01-24 22:47:23,930] A new study created in memory with name: no-name-7e97ff45-c893-4da6-b5fa-2a952a0c1d01
[I 2025-01-24 22:47:46,401] Trial 0 finished with value: 0.008341448506445547 and parameters: {'n_estimators': 361, 'learning_rate': 0.07227543541158188, 'max_depth': 19, 'reg_alpha': 0.1628006358552527, 'lambda_l2': 0.00011499348559006007, 'min_child_samples': 54, 'colsample_bytree': 0.7606089213177164, 'subsample': 0.5776998852671172}. Best is trial 0 with value: 0.008341448506445547.
[I 2025-01-24 22:48:28,920] Trial 1 finished with value: 0.008088747704141572 and parameters: {'n_estimators': 1644, 'learning_rate': 0.025192237186689673, 'max_depth': 18, 'reg_alpha': 0.0008739267255433046, 'lambda_l2': 0.0005644354320681499, 'min_child_samples': 91, 'colsample_bytree': 0.6039112943170623, 'subsample': 0.8051514786553111}. Best is trial 1 with value: 0.008088747704141572.
[I 2025-01-24 22:49:05,753] Trial 2 finished with value: 0.008067960293171942 and parameters: {'n_esti

Best parameters: {'n_estimators': 1821, 'learning_rate': 0.08225310457160621, 'max_depth': 16, 'reg_alpha': 0.3496714683659068, 'lambda_l2': 0.4986654468849776, 'min_child_samples': 61, 'colsample_bytree': 0.7033737020022187, 'subsample': 0.932921207563455}
Best MAPE: 0.007866413631004801


In [None]:

lgb_params = study.best_params
lgb_params.update({
    'device': 'cpu',                # Use GPU for training
    'n_jobs': -1,                   # Use all available CPU threads
})

# K-Fold Cross-validation with LightGBM
scores, lgb_test_preds = [], []

kfold = KFold(n_splits=5, shuffle=True, random_state=42)
for i, (train_idx, val_idx) in enumerate(kfold.split(X)):
    print(f'Fold {i}')
    X_train_fold, X_val_fold = X.iloc[train_idx].copy(), X.iloc[val_idx].copy()
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

    # Train the model with the best parameters
    lgb_model = lgb.LGBMRegressor(**lgb_params)
    lgb_model.fit(X_train_fold, y_train_fold, eval_set=[(X_val_fold, y_val_fold)])

    y_preds = lgb_model.predict(X_val_fold)
    mape_score = mean_absolute_percentage_error(y_val_fold, y_preds)
    print(f'MAPE Score for fold {i}:', mape_score)
    scores.append(mape_score)
    lgb_test_preds.append(lgb_model.predict(X))

# Calculate mean and std of MAPE
lgb_score = np.mean(scores)
lgb_std = np.std(scores)

print(f"Mean MAPE: {lgb_score}, Std MAPE: {lgb_std}")

Fold 0
MAPE Score for fold 0: 0.007881438953762823
Fold 1
MAPE Score for fold 1: 0.007882508468324025
Fold 2
MAPE Score for fold 2: 0.00790701832455299
Fold 3
MAPE Score for fold 3: 0.007905085296033195
Fold 4
MAPE Score for fold 4: 0.007864530072731381
Mean MAPE: 0.007888116223080882, Std MAPE: 1.5984856609872988e-05


#Grid Search

In [None]:
sub['num_sold'] = np.expm1(lgb_model.predict(new_test))
sub.to_csv('submission.csv', index=False)
sub.head()

Unnamed: 0,id,num_sold
0,230130,133.612313
1,230131,851.818147
2,230132,689.189588
3,230133,382.371017
4,230134,446.439937
