In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

# Directory holding the competition CSV files, relative to the notebook.
FOLDER = "playground-series-s5e1/"
train_data = pd.read_csv(FOLDER + "train.csv")
test_data = pd.read_csv(FOLDER + "test.csv")

# Not the last expression of the cell, so this preview is not rendered.
test_data.head()

# Print the number of distinct values found in every column of the test set.
for column_name, distinct_count in test_data.nunique().items():
    print(column_name, int(distinct_count))


id 230130
date 2557
country 6
store 3
product 5


In [2]:
# Split the ISO-formatted date into calendar features and discard the raw column.
parsed_dates = pd.to_datetime(test_data["date"], format="%Y-%m-%d")
test_data = test_data.drop(columns=["date"]).assign(
    year=parsed_dates.dt.year,
    month=parsed_dates.dt.month,
    day=parsed_dates.dt.day,
    quarter=parsed_dates.dt.quarter,
    weekday=parsed_dates.dt.weekday,
    # Zero-based week within the month: days 1-7 -> 0, 8-14 -> 1, ..., 29-31 -> 4.
    weeks=(parsed_dates.dt.day - 1) // 7,
)

# Count the missing values in each column.
test_data.isna().sum()



id         0
country    0
store      0
product    0
year       0
month      0
day        0
quarter    0
weekday    0
weeks      0
dtype: int64

In [3]:
# Summary statistics for the numeric columns (id plus the derived date parts).
test_data.describe()


Unnamed: 0,id,year,month,day,quarter,weekday,weeks
count,98550.0,98550.0,98550.0,98550.0,98550.0,98550.0,98550.0
mean,279404.5,2018.0,6.526027,15.720548,2.509589,2.998174,1.69863
std,28449.078852,0.816501,3.447869,8.796292,1.116773,2.002291,1.267999
min,230130.0,2017.0,1.0,1.0,1.0,0.0,0.0
25%,254767.25,2017.0,4.0,8.0,2.0,1.0,1.0
50%,279404.5,2018.0,7.0,16.0,3.0,3.0,2.0
75%,304041.75,2019.0,10.0,23.0,4.0,5.0,3.0
max,328679.0,2019.0,12.0,31.0,4.0,6.0,4.0


In [4]:
# Feature groups used by the encoding steps.
categorical_variables = ["country", "product", "store"]
temporal_variables = ["year", "month", "day", "quarter", "weekday", "weeks"]

# Columns to one-hot encode: the string categoricals plus two discrete
# calendar features.
onehot_variables = [*categorical_variables, "quarter", "weeks"]
# Calendar features to encode as sine/cosine pairs.
sinusoidal_variables = ["year", "month", "day", "weekday"]
print(onehot_variables)


['country', 'product', 'store', 'quarter', 'weeks']


In [5]:
# One-hot encode the categorical columns and the two discrete calendar features.
# NOTE(review): the encoder is fitted on the test frame itself, so the generated
# columns depend on the categories present in test.csv -- confirm they match the
# columns produced for the training data.
encoder = OneHotEncoder(sparse_output=False)
encoded_data = encoder.fit_transform(test_data[onehot_variables])
encoded_features_names = encoder.get_feature_names_out(onehot_variables)
# Reuse the source frame's index: pd.concat(axis=1) aligns on index labels, so a
# fresh RangeIndex would misalign rows (and introduce NaNs) whenever test_data
# does not carry the default 0..n-1 index.
encoded_data = pd.DataFrame(
    encoded_data,
    columns=encoded_features_names,
    index=test_data.index,
)
test_data = test_data.drop(columns=onehot_variables)
test_data = pd.concat([test_data, encoded_data], axis=1)


In [6]:
# Sanity check: count missing values per column after the one-hot columns were concatenated.
test_data.isna().sum()


id                            0
year                          0
month                         0
day                           0
weekday                       0
country_Canada                0
country_Finland               0
country_Italy                 0
country_Kenya                 0
country_Norway                0
country_Singapore             0
product_Holographic Goose     0
product_Kaggle                0
product_Kaggle Tiers          0
product_Kerneler              0
product_Kerneler Dark Mode    0
store_Discount Stickers       0
store_Premium Sticker Mart    0
store_Stickers for Less       0
quarter_1                     0
quarter_2                     0
quarter_3                     0
quarter_4                     0
weeks_0                       0
weeks_1                       0
weeks_2                       0
weeks_3                       0
weeks_4                       0
dtype: int64

In [7]:
# Encode each cyclical calendar feature as a (sin, cos) pair.
# Each entry is (source column, value subtracted before scaling, period).
# NOTE(review): with origin 2010 and period 7, years that differ by a multiple
# of 7 (e.g. 2010 and 2017) receive identical sin/cos values -- confirm intended.
cyclical_specs = [
    ("day", 1, 31),
    ("month", 1, 12),
    ("weekday", 0, 7),
    ("year", 2010, 7),
]
for feature, origin, period in cyclical_specs:
    angle = 2 * np.pi * (test_data[feature] - origin) / period
    test_data[f"sin_{feature}"] = np.sin(angle)
    test_data[f"cos_{feature}"] = np.cos(angle)

# Not the last expression of the cell, so this preview is not rendered.
test_data.head()

# The raw integer columns are no longer needed once their sin/cos pairs exist.
test_data = test_data.drop(columns=[feature for feature, _, _ in cyclical_specs])
test_data.head()


Unnamed: 0,id,country_Canada,country_Finland,country_Italy,country_Kenya,country_Norway,country_Singapore,product_Holographic Goose,product_Kaggle,product_Kaggle Tiers,...,weeks_3,weeks_4,sin_day,cos_day,sin_month,cos_month,sin_weekday,cos_weekday,sin_year,cos_year
0,230130,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,-0.781831,0.62349,-2.449294e-16,1.0
1,230131,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,-0.781831,0.62349,-2.449294e-16,1.0
2,230132,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,-0.781831,0.62349,-2.449294e-16,1.0
3,230133,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,-0.781831,0.62349,-2.449294e-16,1.0
4,230134,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,-0.781831,0.62349,-2.449294e-16,1.0


In [8]:
# Write the encoded test features to a CSV file inside FOLDER, without the index column.
encoded_test_path = FOLDER + "test_data.csv"
test_data.to_csv(encoded_test_path, index=False)



In [72]:
# Separate the regression target from the feature matrix.
# NOTE(review): the recorded output shows one-hot and sin/cos columns, but the
# only visible assignment to train_data is the raw read_csv -- presumably it was
# preprocessed in cells that are not part of this view; confirm the notebook
# still runs from a fresh kernel.
target_column = "num_sold"
X = train_data.drop(columns=[target_column])
y = train_data[target_column]
X.head()

Unnamed: 0,country_Canada,country_Finland,country_Italy,country_Kenya,country_Norway,country_Singapore,product_Holographic Goose,product_Kaggle,product_Kaggle Tiers,product_Kerneler,...,weeks_3,weeks_4,sin_day,cos_day,sin_month,cos_month,sin_weekday,cos_weekday,sin_year,cos_year
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,-0.433884,-0.900969,0.0,1.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,-0.433884,-0.900969,0.0,1.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,-0.433884,-0.900969,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,-0.433884,-0.900969,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,-0.433884,-0.900969,0.0,1.0


In [73]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split reproducible.
# X_test / y_test are this hold-out split, not the competition test set kept in test_data.
# NOTE(review): this is a random row-wise split of rows carrying date-derived
# features -- confirm that a time-ordered split is not required.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
  

In [96]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Baseline: ordinary least squares fitted on the training split and scored on
# the hold-out split.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

linear_mse = mean_squared_error(y_test, y_pred)
linear_r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", linear_mse)
# The value labelled "Accuracy" is the R^2 score, not a classification accuracy.
print("Accuracy:", linear_r2)

Mean Squared Error: 140039.12970292647
Accuracy: 0.6948143553537942


In [108]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score

# Ridge (L2-regularised) regression with alpha=1.0, scored on the hold-out split.
ridge = Ridge(alpha=1.0).fit(X_train, y_train)
y_pred_ridge = ridge.predict(X_test)

ridge_mse = mean_squared_error(y_test, y_pred_ridge)
ridge_r2 = r2_score(y_test, y_pred_ridge)
print("Mean Squared Error:", ridge_mse)
# The value labelled "Accuracy" is the R^2 score, not a classification accuracy.
print("Accuracy:", ridge_r2)

Mean Squared Error: 140044.18070023507
Accuracy: 0.694803347774177


In [109]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score

# Lasso (L1-regularised) regression with alpha=0.5, scored on the hold-out split.
lasso = Lasso(alpha=0.5).fit(X_train, y_train)
y_pred_lasso = lasso.predict(X_test)

lasso_mse = mean_squared_error(y_test, y_pred_lasso)
lasso_r2 = r2_score(y_test, y_pred_lasso)
print("Mean Squared Error:", lasso_mse)
# The value labelled "Accuracy" is the R^2 score, not a classification accuracy.
print("Accuracy:", lasso_r2)

Mean Squared Error: 140066.76588278692
Accuracy: 0.6947541281488396
