In [171]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found beneath it
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tabular-playground-series-jan-2022/sample_submission.csv
/kaggle/input/tabular-playground-series-jan-2022/train.csv
/kaggle/input/tabular-playground-series-jan-2022/test.csv


In [172]:
from sklearn.model_selection import train_test_split

from sklearn import datasets, ensemble
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error

#from sklearn.metrics import mean_absolute_percentage_error

from sklearn.preprocessing import StandardScaler

In [173]:
# Read the training and test tables from the read-only Kaggle input directory
train_csv_path = '/kaggle/input/tabular-playground-series-jan-2022/train.csv'
test_csv_path = '/kaggle/input/tabular-playground-series-jan-2022/test.csv'

df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

In [174]:
# Preview the first five rows of the raw training table
df_train.head(n=5)

Unnamed: 0,row_id,date,country,store,product,num_sold
0,0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329
1,1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520
2,2,2015-01-01,Finland,KaggleMart,Kaggle Sticker,146
3,3,2015-01-01,Finland,KaggleRama,Kaggle Mug,572
4,4,2015-01-01,Finland,KaggleRama,Kaggle Hat,911


In [175]:
def df_processing(df):
    """Turn a raw sales frame into a purely numeric feature frame.

    The input must contain the columns ``date``, ``country``, ``store`` and
    ``product``; every other column is passed through unchanged.

    The returned frame has:
      * ``date`` replaced by the integer columns ``year``, ``month``, ``day``
        and ``weekday`` (Monday is 0, Sunday is 6);
      * ``country``, ``store`` and ``product`` replaced by one-hot indicator
        columns named ``<column>_<value>``.

    The caller's frame is NOT modified; a new frame is returned.

    Note that the indicator columns depend on the values present in ``df``,
    so two frames with different category values yield different columns.
    """
    categorical_cols = ['country', 'store', 'product']

    # Parse the dates into a separate Series instead of overwriting df['date'],
    # so the frame passed in by the caller is left untouched.
    dates = pd.to_datetime(df['date'])

    # drop() and assign() both return new frames; the time features are
    # appended after the remaining original columns.
    features = df.drop(columns=['date']).assign(
        year=dates.dt.year,
        month=dates.dt.month,
        day=dates.dt.day,
        weekday=dates.dt.weekday,  # notation: Monday is 0, Sunday is 6
    )

    # One-hot encode the categorical columns and swap them in for the originals
    data_dummies = pd.get_dummies(features[categorical_cols])
    features = pd.concat([features, data_dummies], axis=1)
    return features.drop(columns=categorical_cols)

In [176]:
# Process both train and test with the same feature pipeline:

df_train_processed = df_processing(df_train)

df_submission_test_processed = df_processing(df_test)


# Just the training data: features are every column except the target and the row id
X = df_train_processed.drop(columns=['num_sold', 'row_id'])
y = df_train_processed['num_sold']

# Normalise: fit the scaler on the training features and reuse it for the test features.
# NOTE: the scaler is fit on every training row, including the rows that
# train_test_split later holds out for validation.
s = StandardScaler().fit(X)
X = pd.DataFrame(s.transform(X), columns=X.columns, index=X.index)

# The one-hot columns are derived separately for each frame, so force the test
# features onto the training column set and order before scaling: categories
# missing from test become all-zero columns, categories unseen in training are dropped.
submission_X = (
    df_submission_test_processed
    .drop(columns=['row_id'])
    .reindex(columns=X.columns, fill_value=0)
)
submission_X = pd.DataFrame(
    s.transform(submission_X),
    columns=submission_X.columns,
    index=submission_X.index,
)

In [177]:
# Preview the first five rows of the scaled test-set features
submission_X.head(n=5)

Unnamed: 0,year,month,day,weekday,country_Finland,country_Norway,country_Sweden,store_KaggleMart,store_KaggleRama,product_Kaggle Hat,product_Kaggle Mug,product_Kaggle Sticker
0,2.236987,-1.601451,-1.673805,-1.00077,1.414214,-0.707107,-0.707107,1.0,-1.0,-0.707107,1.414214,-0.707107
1,2.236987,-1.601451,-1.673805,-1.00077,1.414214,-0.707107,-0.707107,1.0,-1.0,1.414214,-0.707107,-0.707107
2,2.236987,-1.601451,-1.673805,-1.00077,1.414214,-0.707107,-0.707107,1.0,-1.0,-0.707107,-0.707107,1.414214
3,2.236987,-1.601451,-1.673805,-1.00077,1.414214,-0.707107,-0.707107,-1.0,1.0,-0.707107,1.414214,-0.707107
4,2.236987,-1.601451,-1.673805,-1.00077,1.414214,-0.707107,-0.707107,-1.0,1.0,1.414214,-0.707107,-0.707107


In [178]:
# Hold out 20% of the training rows for validation; the fixed seed makes the split repeatable
VALIDATION_FRACTION = 0.2
SPLIT_SEED = 42

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=VALIDATION_FRACTION,
    random_state=SPLIT_SEED,
)

In [179]:
# Model: gradient boosting regressor fitted on the training part of the split
params = {
    "n_estimators": 600,
    "max_depth": 4,
    "min_samples_split": 5,
    "learning_rate": 0.01,
    # Fixed seed so that re-running the notebook reproduces the same fitted model
    "random_state": 42,
}

reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)


# A - actual, F - Forecasted
def smape(A, F):
    """Return the symmetric mean absolute percentage error, in percent.

    Computes ``100 / len(A) * sum(2 * |F - A| / (|A| + |F|))``.

    Parameters
    ----------
    A : array-like of actual values.
    F : array-like of forecasted values, matched to ``A`` by position.

    Returns
    -------
    The SMAPE as a float between 0 and 200.

    Raises
    ------
    ValueError
        If ``A`` is empty.

    A pair where actual and forecast are both zero is a perfect prediction
    and contributes 0 to the sum (instead of the NaN that 0/0 would produce).
    """
    actual = np.asarray(A, dtype=float)
    forecast = np.asarray(F, dtype=float)
    if len(actual) == 0:
        raise ValueError("smape() requires at least one observation")

    abs_error = 2 * np.abs(forecast - actual)
    scale = np.abs(actual) + np.abs(forecast)

    # Divide only where the denominator is non-zero; the remaining entries keep
    # the 0.0 they were initialised with in `out`.
    ratios = np.divide(abs_error, scale, out=np.zeros_like(scale), where=scale != 0)
    return 100 / len(actual) * np.sum(ratios)


# Score the hold-out split: predict once and reuse the predictions for both metrics
holdout_pred = reg.predict(X_test)

mse = mean_squared_error(y_test, holdout_pred)
print("mean_squared_error: ", mse, sep="")

sm_val = smape(y_test, holdout_pred)
print("smape: ", sm_val, sep="")

mean_squared_error: 2920.435280350332
smape: 7.558460528426352


In [180]:
# Predict num_sold for the scaled test-set features with the fitted model
prediction = reg.predict(X=submission_X)

In [181]:
# Pair every test row_id with its predicted num_sold
df_1 = df_submission_test_processed['row_id'].copy()

# Give the predictions the same index as row_id: concat(axis=1) aligns on index
# labels, so a fresh 0..n-1 index would mis-pair rows (and add NaNs) whenever the
# test frame does not carry the default index. A length mismatch raises here.
num = pd.DataFrame(prediction, columns=['num_sold'], index=df_1.index)

df_to_submit = pd.concat([df_1, num], axis=1)
df_to_submit.head()

Unnamed: 0,row_id,num_sold
0,26298,297.867259
1,26299,499.38905
2,26300,185.617078
3,26301,446.505167
4,26302,792.54076


In [182]:
# Save the submission as CSV without the index column, then preview the frame
submission_path = 'Gradient_Booster_Regressor.csv'
df_to_submit.to_csv(path_or_buf=submission_path, index=False)
df_to_submit.head(n=5)

Unnamed: 0,row_id,num_sold
0,26298,297.867259
1,26299,499.38905
2,26300,185.617078
3,26301,446.505167
4,26302,792.54076
