In [1]:
# libraries for data exploration.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# library for splitting data.
from sklearn.model_selection import train_test_split

# libraries for preparing the data.
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

# libraries for building linear models.
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

# library for measuring error. 
from sklearn.metrics import mean_squared_error

# user defined.
from helpful_functions import MLFunctions

In [2]:
# load sales data.
sales = pd.read_csv("../../../Data/future_sales/sales_train.csv")

In [3]:
# use MLfunction class to convert date to equivalent numeric values.
user_function = MLFunctions(sales)
date_num = user_function.dateNum()

In [4]:
# create new column with numeric values equivalent to dates. 
sales["date_num"] = date_num

In [5]:
# split sales into train and test sets. 
train_set, test_set = train_test_split(sales, test_size=0.2, random_state=44)

In [6]:
# training and test data created.
X_train = train_set.loc[:, ["date_num", "shop_id", "item_id", "item_price"]]
y_train = train_set["item_cnt_day"]
X_test = test_set.loc[:, ["date_num", "shop_id", "item_id", "item_price"]]
y_test = test_set["item_cnt_day"]

In [7]:
# adds additional features to fit the nonlinear data.
poly_feat = PolynomialFeatures(degree=3, include_bias=False)
X_train_poly = poly_feat.fit_transform(X_train)
X_test_poly = poly_feat.fit_transform(X_test)

In [8]:
# ensures that columns have same scale.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_poly)
X_test_scaled = scaler.fit_transform(X_test_poly)

In [9]:
# linear regression without polynomial features.
lin_reg = LinearRegression()
lin_reg.fit(X_train_scaled, y_train)

In [10]:
# measures training error.
sales_predictions_train = lin_reg.predict(X_train_scaled)
lin_mse = mean_squared_error(sales_predictions_train, y_train)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

2.713546771502155

In [11]:
# measures test error.
sales_predictions_test = lin_reg.predict(X_test_scaled)
lin_mse = mean_squared_error(sales_predictions_test, y_test)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

2.16161374695729

In [12]:
sgd_reg = SGDRegressor(max_iter=10000, tol=1e-10, penalty=None, eta0=1e-100)
sgd_reg.fit(X_train_scaled, y_train)

In [13]:
# measures training error.
sales_predictions_train = sgd_reg.predict(X_train_scaled)
sgd_mse = mean_squared_error(sales_predictions_train, y_train)
sgd_rmse = np.sqrt(sgd_mse)
sgd_rmse

2.9902927476979446

In [14]:
# measures test error.
sales_predictions_test = sgd_reg.predict(X_test_scaled)
sgd_mse = mean_squared_error(sales_predictions_test, y_test)
sgd_rmse = np.sqrt(sgd_mse)
sgd_rmse

2.4989672066652013

In [15]:
# regularized linear model. 
ridge_reg = Ridge(alpha=1, solver="cholesky")
ridge_reg.fit(X_train_scaled, y_train)

In [16]:
# measures training error.
sales_predictions_train = ridge_reg.predict(X_train_scaled)
grid_mse = mean_squared_error(sales_predictions_train, y_train)
grid_rmse = np.sqrt(grid_mse)
grid_rmse

2.71354677158303

In [17]:
# measures test error.
sales_predictions_test = ridge_reg.predict(X_test_scaled)
grid_mse = mean_squared_error(sales_predictions_test, y_test)
grid_rmse = np.sqrt(grid_mse)
grid_rmse

2.161613347317552

In [18]:
# regularized linear model.
# unnecessary features are dropped.
lasso_reg = Lasso(alpha=0.1)
lasso_reg.fit(X_train_scaled, y_train)

In [19]:
# measures training error.
sales_predictions_train = lasso_reg.predict(X_train_scaled)
grid_mse = mean_squared_error(sales_predictions_train, y_train)
grid_rmse = np.sqrt(grid_mse)
grid_rmse

2.7197873194636992

In [20]:
# measures test error.
sales_predictions_test = lasso_reg.predict(X_test_scaled)
grid_mse = mean_squared_error(sales_predictions_test, y_test)
grid_rmse = np.sqrt(grid_mse)
grid_rmse

2.168521623644962

In [21]:
# regularized linear model.
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic_net.fit(X_train_scaled, y_train)

In [22]:
# measures training error.
sales_predictions_train = elastic_net.predict(X_train_scaled)
grid_mse = mean_squared_error(sales_predictions_train, y_train)
grid_rmse = np.sqrt(grid_mse)
grid_rmse

2.7190832692452838

In [23]:
# measures test error.
sales_predictions_test = elastic_net.predict(X_test_scaled)
grid_mse = mean_squared_error(sales_predictions_test, y_test)
grid_rmse = np.sqrt(grid_mse)
grid_rmse

2.1676270527193062