In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [2]:
df = pd.read_csv("train.csv", parse_dates=["date"])
df.head(2)

Unnamed: 0,id,date,country,store,product,num_sold
0,0,2010-01-01,Canada,Discount Stickers,Holographic Goose,
1,1,2010-01-01,Canada,Discount Stickers,Kaggle,973.0


In [3]:
df = df[df["num_sold"].notna()]
df = pd.get_dummies(df, dtype=int)
df["date"] = (df["date"] - df["date"].min()).dt.days

In [4]:
# train test split
train = df.sample(frac=0.8, random_state=42)
test = df.drop(train.index)
X_train, y_train = train.drop('num_sold', axis=1), train['num_sold']
X_test, y_test = test.drop('num_sold', axis=1), test['num_sold']

In [None]:
# lasso pipeline
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.linear_model import LinearRegression, Lasso, Ridge

param_grid = {
    "model__alpha": np.logspace(-2, 2, num=20)
}

sqrt_transform = FunctionTransformer(np.sqrt, inverse_func=np.square)

lasso_pipe = Pipeline([
    ("shift", sqrt_transform),
    ("scaler", StandardScaler()),
    ("model", Lasso())
])

lasso_search = GridSearchCV(lasso_pipe, param_grid, cv=5, scoring="neg_mean_absolute_percentage_error", verbose=10)
lasso_search.fit(X_train, y_train)
lasso_rmse = np.sqrt(-lasso_search.best_score_)

In [7]:
lasso_rmse

14.912256137334982

In [26]:
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_absolute_percentage_error

ridge_pipe = Pipeline([
    ("shift", sqrt_transform),
    ("scaler", StandardScaler())
])

param_grid = {
    "model__regressor__alpha": np.logspace(-2, 6, num=20)
}

regressor = TransformedTargetRegressor(
    regressor=Ridge(),  # Use pipeline to preprocess X
    transformer=StandardScaler()                # Scale y
)

full_pipe = Pipeline([
    ("preprocesser", ridge_pipe),
    ("model", regressor)
])

ridge_search = GridSearchCV(full_pipe, param_grid, cv=5, scoring="neg_mean_absolute_percentage_error", verbose=1)
ridge_search.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [27]:
preds = ridge_search.predict(X_test)
mape = mean_absolute_percentage_error(y_test, preds) * 100
print(f"MAPE: {mape:.2f}%")

MAPE: 297.04%


In [15]:
y_test

6         1837.0
11        2212.0
16         926.0
17         774.0
19         450.0
           ...  
230105     474.0
230106    2767.0
230118     556.0
230123    1052.0
230125     466.0
Name: num_sold, Length: 44252, dtype: float64