In [61]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer

In [62]:
# Read the training data; the `date` column is parsed to datetimes on load.
TRAIN_CSV = "train.csv"
df = pd.read_csv(TRAIN_CSV, parse_dates=["date"])
# Show the first two rows as a quick sanity check.
df.head(2)

Unnamed: 0,id,date,country,store,product,num_sold
0,0,2010-01-01,Canada,Discount Stickers,Holographic Goose,
1,1,2010-01-01,Canada,Discount Stickers,Kaggle,973.0


In [63]:
# Discard rows whose target (`num_sold`) is missing.
df = df.dropna(subset=["num_sold"])
# Express `date` as whole days elapsed since the earliest remaining date.
days_since_start = (df["date"] - df["date"].min()).dt.days
# Swap the numeric date in, then one-hot encode the categorical columns as 0/1 ints.
# NOTE(review): `id` stays in the frame and therefore ends up as a model feature - confirm that is intended.
df = pd.get_dummies(df.assign(date=days_since_start), dtype=int)

In [65]:
# Random 80/20 hold-out split; the fixed seed makes the partition reproducible.
train = df.sample(frac=0.8, random_state=42)
# Everything that was not sampled into `train` becomes the test set.
test = df.drop(index=train.index)

# Separate the target column from the feature columns for each partition.
y_train = train["num_sold"]
X_train = train.drop(columns="num_sold")
y_test = test["num_sold"]
X_test = test.drop(columns="num_sold")

In [None]:
# Lasso regression fitted on a square-root-transformed target.
#
# The sqrt/square pair is attached through TransformedTargetRegressor so that
# it acts on y (`num_sold`): the regressor is trained on sqrt(y) and its
# predictions are squared back, so the cross-validated MSE is computed on the
# original target scale. When the same FunctionTransformer is used as a plain
# Pipeline step it is applied to the feature matrix X instead, and its
# inverse_func is never called by predict().
# NOTE(review): assumes the transform was meant for the target, as the
# presence of inverse_func suggests - confirm with the author.

# Regularisation strengths to try: 5 log-spaced values from 1e-2 to 1e1.
# The key addresses the "model" step of a scaler -> model pipeline.
param_grid = {
    "model__alpha": np.logspace(-2, 1, num=5)
}

# Forward / inverse pair used as the target transformer.
sqrt_transform = FunctionTransformer(np.sqrt, inverse_func=np.square)

# Feature side: standardise the columns, then fit the L1-penalised model.
lasso_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("model", Lasso())
])

# Target side: TransformedTargetRegressor clones `transformer` before fitting,
# so the shared sqrt_transform object itself is left unfitted.
lasso_model = TransformedTargetRegressor(regressor=lasso_pipe, transformer=sqrt_transform)

# The pipeline now lives under the wrapper's `regressor` parameter, so each
# grid key needs that prefix (model__alpha -> regressor__model__alpha).
lasso_param_grid = {f"regressor__{name}": values for name, values in param_grid.items()}

lasso_search = GridSearchCV(lasso_model, lasso_param_grid, cv=4, scoring="neg_mean_squared_error", verbose=1)
lasso_search.fit(X_train, y_train)

# best_score_ is the mean negated MSE over the folds; flip the sign and take the root.
lasso_rmse = np.sqrt(-lasso_search.best_score_)
lasso_rmse

Fitting 4 folds for each of 5 candidates, totalling 20 fits


In [None]:
# Ridge regression fitted on a square-root-transformed target.
#
# Reuses `param_grid` and `sqrt_transform` from the lasso cell. The sqrt/square
# pair is attached through TransformedTargetRegressor so that it acts on y
# (`num_sold`) and predictions are squared back before scoring. As a plain
# Pipeline step it would be applied to the feature matrix X instead, and its
# inverse_func would never be called by predict().
# NOTE(review): assumes the transform was meant for the target, as the
# presence of inverse_func suggests - confirm with the author.

# Feature side: standardise the columns, then fit the L2-penalised model.
ridge_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("model", Ridge())
])

# Target side: TransformedTargetRegressor clones `transformer` before fitting,
# so sharing sqrt_transform with another cell is safe.
ridge_model = TransformedTargetRegressor(regressor=ridge_pipe, transformer=sqrt_transform)

# The pipeline now lives under the wrapper's `regressor` parameter, so each
# grid key needs that prefix (model__alpha -> regressor__model__alpha).
ridge_param_grid = {f"regressor__{name}": values for name, values in param_grid.items()}

ridge_search = GridSearchCV(ridge_model, ridge_param_grid, cv=4, scoring="neg_mean_squared_error", verbose=1)
ridge_search.fit(X_train, y_train)

# best_score_ is the mean negated MSE over the folds; flip the sign and take the root.
ridge_rmse = np.sqrt(-ridge_search.best_score_)
ridge_rmse