In [248]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import truncnorm

class FeatureTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def compute_temporal_variables(self, X):
        X["date"] = pd.to_datetime(X["date"],format="%Y-%m-%d")
        X["year"] = X["date"].dt.year
        X["month"] = X["date"].dt.month
        X["day"] = X["date"].dt.day
        X["quarter"] = X["date"].dt.quarter
        X["weekday"] = X["date"].dt.weekday
        X["weeks"] = (X["date"].dt.day-1)//7
        X["is_weekend"] = X["weekday"].isin([5,6])
        return X

    def compute_sin_cos(self, X):
        X["sin_day"] = np.sin(2 * np.pi * (X["day"]-1)/31)
        X["cos_day"] = np.cos(2 * np.pi * (X["day"]-1)/31)
        X["sin_month"] = np.sin(2 * np.pi * (X["month"]-1)/12)
        X["cos_month"] = np.cos(2 * np.pi * (X["month"]-1)/12)
        X["sin_weekday"] = np.sin(2 * np.pi * (X["weekday"])/7)
        X["cos_weekday"] = np.cos(2 * np.pi * (X["weekday"])/7)
        X["sin_year"] = np.sin(2 * np.pi * (X["year"]-2010)/7)
        X["cos_year"] = np.cos(2 * np.pi * (X["year"]-2010)/7)
        return X
    
    def compute_onehot_variables(self, X):
        encoder = OneHotEncoder(sparse_output=False)
        one_hot_variables = ["store","product","country","quarter","weeks"]
        one_hot_encoded = encoder.fit_transform(X[one_hot_variables])
        one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(one_hot_variables), index=X.index)
        X = X.drop(columns=one_hot_variables)
        X = pd.concat([X, one_hot_df], axis=1)
        return X
    
    def transform(self, X):
        X = X.copy()
        X = self.compute_temporal_variables(X)
        X = self.compute_sin_cos(X)
        X = self.compute_onehot_variables(X)
        X.drop(columns=["id","date"],inplace=True)
        return X

In [249]:
#import split data
from sklearn.model_selection import train_test_split
FOLDER = "playground-series-s5e1/"
train_data = pd.read_csv(FOLDER + "train.csv")
test_data = pd.read_csv(FOLDER + "test.csv")
y = train_data["num_sold"]

def transform_y(y):
    y = y.copy()
    n_missing_values = y.isna().sum()
    random_values = truncnorm.rvs(a=0, b=np.inf, loc=y.mean(), scale=y.std(), size=n_missing_values)
    y.loc[y.isna()] = random_values
    return np.log1p(y)

def inverse_transform_y(y):
    return np.expm1(y)

y = transform_y(y)
X_train, X_test, y_train, y_test = train_test_split(train_data.drop(columns=["num_sold"]),y, test_size=0.15, random_state=42)


**Train and prediction**

In [257]:
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error

feature_transformer = FeatureTransformer()

#Model for now we are using RandomForestRegressor
model = RandomForestRegressor()

#The pipeline calls in cascade the transformers for the X_train until the last step. In the last step, the output is transformed and the model is trained
pipeline = Pipeline(steps=[("feature_transformer", feature_transformer), 
                           ("model", model)])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

y_pred_anti = inverse_transform_y(y_pred)
y_test_anti = inverse_transform_y(y_test)

mape = mean_absolute_percentage_error(y_test_anti, y_pred_anti)
print(f"MAPE: {mape:.2f}%")

17.038609332142883


**Inference**

In [251]:
y_pred_test = pipeline.predict(test_data)
y_pred_test_anti = inverse_transform_y(y_pred_test)

submission = pd.DataFrame({"id":test_data["id"],"num_sold":y_pred_test_anti})
submission.to_csv(FOLDER + "submission.csv", index=False)