In [14]:
import warnings
warnings.filterwarnings('ignore')

path = "../../kaggle/data/"

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

#main module for evaluation
from sklearn.metrics import mean_squared_log_error, mean_absolute_error, mean_squared_error,r2_score

def calcMetrics(testActualVal, predictions):
    #regression evaluation measures
    data={"RMSLE":[mean_squared_log_error(testActualVal, predictions)**0.5],
         "MAE":[mean_absolute_error(testActualVal, predictions)],
         "RMSE":[mean_squared_error(testActualVal, predictions)**0.5],
         "R2":[r2_score(testActualVal, predictions)]}
    metric_df=pd.DataFrame(data)
    return metric_df

def split(df, useful_features, target):
    val_start_index=df.shape[0]-len(df["date"][df["date"] >=pd.to_datetime("2017-01-01")])
    train = df[:val_start_index]
    val=df[val_start_index:]
    X_train = train[useful_features]
    y_train = train[target]
    X_valid = val[useful_features]
    y_valid = val[target]
    return X_train, y_train, X_valid, y_valid

def pl(df_compare, actual, pred):
    fig, ax = plt.subplots(figsize=(14,3))
    df = df_compare[df_compare.date >= pd.to_datetime("2017-01-02")]
    g1=sns.lineplot(data=df, x = "date", y = actual, ax = ax)
    g2=sns.lineplot(data=df, x = "date", y = pred, ax = ax)
    g1.set(yscale='log')
    g2.set(yscale='log')
    plt.show()
    plt.close()

In [15]:
train_merged = pd.read_csv(path + "train_merged.csv")
train_merged["date"] = pd.to_datetime(train_merged.date)

In [16]:
train_merged.head(2)

Unnamed: 0,date,store_nbr,family,onpromotion,sales,city,state,typestores,cluster,typeholiday,locale,locale_name,description,transferred,dcoilwtico,day_of_week,month,year
0,2013-01-01,1,AUTOMOTIVE,0,0.0,Quito,Pichincha,D,13,Holiday,National,Ecuador,Primer dia del ano,False,93.14,2,1,2013
1,2013-01-01,1,BABY CARE,0,0.0,Quito,Pichincha,D,13,Holiday,National,Ecuador,Primer dia del ano,False,93.14,2,1,2013


In [17]:
train_merged.isna().sum()

date                 0
store_nbr            0
family               0
onpromotion          0
sales            28512
city                 0
state                0
typestores           0
cluster              0
typeholiday          0
locale         2578554
locale_name    2578554
description    2578554
transferred    2578554
dcoilwtico           0
day_of_week          0
month                0
year                 0
dtype: int64

In [18]:
np.array(train_merged.columns)

array(['date', 'store_nbr', 'family', 'onpromotion', 'sales', 'city',
       'state', 'typestores', 'cluster', 'typeholiday', 'locale',
       'locale_name', 'description', 'transferred', 'dcoilwtico',
       'day_of_week', 'month', 'year'], dtype=object)

## Selection des feautures

In [48]:
useful_features = ['date', 'family', 'city', 'typestores', 'cluster', 'typeholiday',
                   'day_of_week', 'month', 'year', 'onpromotion', 'dcoilwtico', 'sales'] 


category_columns = ['family', 'city', 'typestores', 'cluster', 'typeholiday']

time_columns = ['day_of_week', 'month', 'year', 'onpromotion', 'dcoilwtico']

# names of columns to train xgb
col_names_classic_ml = ['family', 'onpromotion', 'city', 'typestores', 'cluster', 'typeholiday', 'dcoilwtico',
                           'day_of_week', 'month', 'year'] 


# names of columns after pipeline transformations, 
# note ordering of this list isn't arbitrary.
# I manually adjusted ordering after getting feature names out of pipeline and verfiying ordering 
col_names_classic_ml_transformed = ['family', 'typeholiday','city', 'typestores', 'cluster',
       'day_of_week_sin','day_of_week_cos', 'month_sin', 'month_cos','year_sin', 'year_cos',
       'day_of_week_sin day_of_week_cos','day_of_week_sin month_sin', 'day_of_week_sin month_cos',
       'day_of_week_sin year_sin', 'day_of_week_sin year_cos','day_of_week_cos month_sin',
       'day_of_week_cos month_cos','day_of_week_cos year_sin', 'day_of_week_cos year_cos','month_sin month_cos',
       'month_sin year_sin','month_sin year_cos','month_cos year_sin','month_cos year_cos', 'year_sin year_cos',
       'onpromotion', 'dcoilwtico']

In [49]:
# selection des feautures
df = train_merged[useful_features]

# Cast en string des variables catégoriales
for column in category_columns:
    df[column] = df[column].astype('str')

# Cast en float des variables numériques
for column in time_columns:
    df[column] = df[column].astype('float')

## Split en données de training, de validation et de test

In [50]:
# données de test pour la prédiction de submissoin
X_test = df.drop(['date', 'sales'], axis=1)[df.index > 3000887]

# données de train, split en données d'entrainement et donnée de validation
X_train, y_train, X_valid, y_valid =split(df[df.index < 3000888], list(df.drop(['date', 'sales'], axis=1)), 'sales')

print(X_test.shape)
print(X_train.shape)
print(y_train.shape)
print(X_valid.shape)
print(y_valid.shape)
print(y_valid.shape[0] / y_train.shape[0])

(28512, 10)
(2596374, 10)
(2596374,)
(404514, 10)
(404514,)
0.15579958819492107


In [51]:
X_train.head()

Unnamed: 0,family,city,typestores,cluster,typeholiday,day_of_week,month,year,onpromotion,dcoilwtico
0,AUTOMOTIVE,Quito,D,13,Holiday,2.0,1.0,2013.0,0.0,93.14
1,BABY CARE,Quito,D,13,Holiday,2.0,1.0,2013.0,0.0,93.14
2,BEAUTY,Quito,D,13,Holiday,2.0,1.0,2013.0,0.0,93.14
3,BEVERAGES,Quito,D,13,Holiday,2.0,1.0,2013.0,0.0,93.14
4,BOOKS,Quito,D,13,Holiday,2.0,1.0,2013.0,0.0,93.14


In [52]:
X_valid.head()

Unnamed: 0,family,city,typestores,cluster,typeholiday,day_of_week,month,year,onpromotion,dcoilwtico
2596374,AUTOMOTIVE,Quito,D,13,NDay,7.0,1.0,2017.0,0.0,53.286493
2596375,BABY CARE,Quito,D,13,NDay,7.0,1.0,2017.0,0.0,53.286233
2596376,BEAUTY,Quito,D,13,NDay,7.0,1.0,2017.0,0.0,53.285973
2596377,BEVERAGES,Quito,D,13,NDay,7.0,1.0,2017.0,0.0,53.285713
2596378,BOOKS,Quito,D,13,NDay,7.0,1.0,2017.0,0.0,53.285454


In [53]:
y_valid

2596374       0.000
2596375       0.000
2596376       0.000
2596377       0.000
2596378       0.000
             ...   
3000883     438.133
3000884     154.553
3000885    2419.729
3000886     121.000
3000887      16.000
Name: sales, Length: 404514, dtype: float64

## Numérisation des données catégriales et scalarisatrion

In [54]:
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.preprocessing import TargetEncoder
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler, PolynomialFeatures
from sklearn.preprocessing import FunctionTransformer, KBinsDiscretizer

category_feat = Pipeline(steps=[("target_encode", TargetEncoder(target_type="continuous"))])

# helper functions to be able to get feature names out of functional transformer 
def f_out_sin(self,input_features):
    return input_features
def f_out_cos(self,input_features):
    return input_features
    
# functions to transform time features with sine cosine transformation 
def sin_transformer(period):
    return FunctionTransformer(lambda x: np.sin(x / period * 2 * np.pi),feature_names_out=f_out_sin)

def cos_transformer(period):
    return FunctionTransformer(lambda x: np.cos(x / period * 2 * np.pi), feature_names_out=f_out_cos)


#adding polynomial transformation on sine_cosine transformed time features to capture linear interactions between time features
time_feat = make_pipeline(
                        ColumnTransformer([
                            #("cyclic_day_of_week", periodic_spline_transformer(7, n_splines=3), ["day_of_week"]),
                            ("day_of_week_sin", sin_transformer(7), ["day_of_week"]),
                            ("day_of_week_cos", cos_transformer(7), ["day_of_week"]),
                            #("cyclic_month", periodic_spline_transformer(12, n_splines=6), ["month"]),
                            ("month_sin", sin_transformer(12), ["month"]),
                            ("month_cos", cos_transformer(12), ["month"]),
                            ("year_sin", sin_transformer(365), ["year"]),
                            ("year_cos", cos_transformer(365), ["year"]),   
                            ],remainder='drop'),
    #Nystroem(kernel="poly", degree=2,n_jobs=-1, n_components=85, random_state=0),
    PolynomialFeatures(degree=2, interaction_only=True, include_bias=False))


preprocess_pipe = Pipeline(steps=[
    ('encoder', ColumnTransformer(
                    transformers=[
                        ("category_trans",category_feat, category_columns),
                        ("time_trans",time_feat,["day_of_week","month","year"] ),
                                ],
                                remainder="passthrough", verbose_feature_names_out=True
                            )),
    ('scaler', MinMaxScaler()),
    ("pandarizer2", FunctionTransformer(lambda x: pd.DataFrame(x, columns =  col_names_classic_ml_transformed)))
                            ],verbose = True)

In [55]:
preprocess_pipe.fit(X_train[col_names_classic_ml], y_train)

[Pipeline] ........... (step 1 of 3) Processing encoder, total=   2.6s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.3s
[Pipeline] ....... (step 3 of 3) Processing pandarizer2, total=   0.0s


In [56]:
X_train = preprocess_pipe.transform(X_train[col_names_classic_ml])
X_valid = preprocess_pipe.transform(X_valid[col_names_classic_ml])

In [58]:
X_train.head(3)

Unnamed: 0,family,typeholiday,city,typestores,cluster,day_of_week_sin,day_of_week_cos,month_sin,month_cos,year_sin,...,day_of_week_cos year_sin,day_of_week_cos year_cos,month_sin month_cos,month_sin year_sin,month_sin year_cos,month_cos year_sin,month_cos year_cos,year_sin year_cos,onpromotion,dcoilwtico
0,0.001604,0.997941,0.302996,0.191951,0.029673,1.0,0.356896,0.75,0.933013,1.0,...,0.601947,0.643104,1.0,0.337899,0.25,0.219233,0.066987,0.0,0.0,0.792965
1,1.1e-05,0.997941,0.302996,0.191951,0.029673,1.0,0.356896,0.75,0.933013,1.0,...,0.601947,0.643104,1.0,0.337899,0.25,0.219233,0.066987,0.0,0.0,0.792965
2,0.000932,0.997941,0.302996,0.191951,0.029673,1.0,0.356896,0.75,0.933013,1.0,...,0.601947,0.643104,1.0,0.337899,0.25,0.219233,0.066987,0.0,0.0,0.792965


In [59]:
import xgboost as xgb
from xgboost import XGBRegressor
from xgboost import plot_importance, plot_tree

#Enabling memory growth for GPU
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession
config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)
import tensorflow as tf
tf.config.list_physical_devices('GPU') 

# for reproducibility
seed0=1337
np.random.seed(seed0) 
tf.keras.utils.set_random_seed(seed0)
tf.config.experimental.enable_op_determinism()
tf.random.set_seed(seed0)

# call back to avoid overfitting
early_stop = xgb.callback.EarlyStopping(rounds=10,
                                        metric_name='rmse',
                                        maximize=False,
                                        save_best= True,
                                        )

# training xgboost
xgboost_v00=XGBRegressor(random_state=seed0,verbosity=0, n_jobs = -1, reg_lambda=0.005, 
                         learning_rate=0.01, device='gpu',
                          n_estimators=5000, objective='reg:squarederror',
                        callbacks=[early_stop])
xgboost_v00.fit(X_train, y_train, eval_set=[(X_valid, y_valid)],verbose=False)

## Prédiction et évaluation des données de validation

In [60]:
# prédition des données de test
y_pred_xgb = xgboost_v00.predict(X_valid)

# conversion des valeurs negatives en 0
y_pred_xgb=np.where(y_pred_xgb<0,0,y_pred_xgb)

# calcule de performnce
calcMetrics(y_valid, y_pred_xgb)

Unnamed: 0,RMSLE,MAE,RMSE,R2
0,0.971758,128.965031,453.124764,0.888202


In [None]:
df_compare = pd.DataFrame({'date': df[df.date >= pd.to_datetime("2017-01-01")].date, 'pred': y_pred_xgb})
df_compare['actual'] = df[df.date >= pd.to_datetime("2017-01-01")].sales
pl(df_compare, 'actual', 'pred')

## Prédiction et évaluation des données de test et création de fichier de submission

In [63]:
# scalarisation de la matrice de test 
X_test = preprocess_pipe.transform(X_test[col_names_classic_ml])

# prédition des données de test
y_pred_test = xgboost_v00.predict(X_test)

# conversion des valeurs negatives en 0
y_pred_test=np.where(y_pred_test<0,0,y_pred_test)

y_pred_test.shape

(28512,)

In [64]:
submission = pd.DataFrame(y_pred_test, columns=['sales'])
submission['id'] = index = df[df.index > 3000887].index
submission[['id', 'sales']]

Unnamed: 0,id,sales
0,3000888,6.758023
1,3000889,4.601942
2,3000890,19.225246
3,3000891,2881.416992
4,3000892,4.601942
...,...,...
28507,3029395,365.883179
28508,3029396,70.453186
28509,3029397,1317.520020
28510,3029398,57.347305


In [65]:
submission[['id', 'sales']].to_csv(path + "submission.csv", index=False, header=True, sep=',')

**Score Kaggle : 0.94056**