In [1]:
#import split data
from sklearn.model_selection import train_test_split
import numpy as np
from scipy.stats import truncnorm
import pandas as pd




**Train and prediction**

In [2]:

from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
import pandas as pd

def generate_features(X, train=False):
    X = X.copy()
        
    if train:
        X = X.dropna(subset=["num_sold"])
        X = X.drop(columns=["id"])

    X["date"] = pd.to_datetime(X["date"], format="%Y-%m-%d")
    X["dayofweek"] = X["date"].dt.dayofweek
    X["month"] = X["date"].dt.month - 1
    X["year"] = X["date"].dt.year - 2010
    X["is_weekend"] = X["dayofweek"].isin([5,6])
        
    X["sin_dayofweek"] = np.sin(X["dayofweek"] * (2 * np.pi / 7))
    X["cos_dayofweek"] = np.cos(X["dayofweek"] * (2 * np.pi / 7))
    X["sin_month"] = np.sin(X["month"] * (2 * np.pi / 12))
    X["cos_month"] = np.cos(X["month"] * (2 * np.pi / 12))
    X["sin_year"] = np.sin(X["year"] * (2 * np.pi / 10))
    X["cos_year"] = np.cos(X["year"] * (2 * np.pi / 10))
    
    X = X.drop(columns=["date","month","year","dayofweek"])    
    return X

FOLDER = "playground-series-s5e1/"
train_data = pd.read_csv(FOLDER + "train.csv")
test_data = pd.read_csv(FOLDER + "test.csv")

train_data = generate_features(train_data,train=True)
test_data = generate_features(test_data)

print("Train data")
print(train_data.head())
print("Test data")
print(test_data.head())


Train data
  country              store             product  num_sold  is_weekend  \
1  Canada  Discount Stickers              Kaggle     973.0       False   
2  Canada  Discount Stickers        Kaggle Tiers     906.0       False   
3  Canada  Discount Stickers            Kerneler     423.0       False   
4  Canada  Discount Stickers  Kerneler Dark Mode     491.0       False   
5  Canada  Stickers for Less   Holographic Goose     300.0       False   

   sin_dayofweek  cos_dayofweek  sin_month  cos_month  sin_year  cos_year  
1      -0.433884      -0.900969        0.0        1.0       0.0       1.0  
2      -0.433884      -0.900969        0.0        1.0       0.0       1.0  
3      -0.433884      -0.900969        0.0        1.0       0.0       1.0  
4      -0.433884      -0.900969        0.0        1.0       0.0       1.0  
5      -0.433884      -0.900969        0.0        1.0       0.0       1.0  
Test data
       id country              store             product  is_weekend  \
0  230

In [3]:
X = train_data.drop(columns=['num_sold'])
y = train_data['num_sold']

cat_cols = ['country', 'store', 'product']
num_cols = ["sin_dayofweek", "cos_dayofweek", "sin_month", "cos_month", "sin_year", "cos_year"]

#Create a numeric transformer with imputer and scaler
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols)
])

model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=50, random_state=42))
])


In [4]:
time_series = TimeSeriesSplit(n_splits=5)

for fold_index, (train_index, val_index) in enumerate(time_series.split(X)):
    
    X_train_fold = X.iloc[train_index]
    y_train_fold = y.iloc[train_index]
    X_val_fold = X.iloc[val_index]
    y_val_fold = y.iloc[val_index]
    model_pipeline.fit(X_train_fold, y_train_fold)

    y_pred_val = model_pipeline.predict(X_val_fold)    
    mape = mean_absolute_percentage_error(y_val_fold, y_pred_val)
    print(f"MAPE: {mape}")
    
model_pipeline.fit(X,y)

MAPE: 0.09002418397708276
MAPE: 0.08633618478424229
MAPE: 0.08134808649381839
MAPE: 0.10526276780775626
MAPE: 0.08089637594129193


**Inference**

In [5]:
y_pred_test = model_pipeline.predict(test_data)
#submission
submission = pd.DataFrame({"id": test_data["id"],"num_sold":y_pred_test})
submission.to_csv(FOLDER + "submission222.csv", index=False)
