In [None]:
# libraries for data exploration.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# library for splitting data.
from sklearn.model_selection import train_test_split

# libraries for preparing the data.
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

# libraries for building linear models.
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

# library for measuring error. 
from sklearn.metrics import mean_squared_error

# user defined 
from helpful_functions import MLFunctions

In [2]:
# load sales data.
sales = pd.read_csv("../../../Data/future_sales/sales_train.csv")

In [3]:
# purpose of this code is to replace dates with equivalent numeric values.
date_dict = dict()
arr = []
counter = 0

for date in sales["date"]:
    if date not in date_dict:
        date_dict[date] = counter 
        counter += 1
    else:
        pass
    
for date in sales["date"]:
    arr.append(date_dict[date])

In [4]:
sales["date_num"] = arr
sales = sales.sort_values("date_num")

In [5]:
# this helps avoid using the separate test set data.
# test set data only used after a compitent model has being built.


train_set, test_set = train_test_split(sales, test_size=0.2, random_state=44)

In [6]:
# training and test data created.
X_train = train_set.loc[:, ["date_num", "shop_id", "item_id", "item_price"]]
y_train = train_set["item_cnt_day"]
X_test = test_set.loc[:, ["date_num", "shop_id", "item_id", "item_price"]]
y_test = test_set["item_cnt_day"]

In [8]:
# linear regression without polynomial features.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

In [9]:
# measures training error.
sales_predictions_train = lin_reg.predict(X_train)
grid_mse = mean_squared_error(sales_predictions_train, y_train)
grid_rmse = np.sqrt(grid_mse)
grid_rmse

2.7303022682486957

In [10]:
# measures test error.
sales_predictions_test = lin_reg.predict(X_test)
grid_mse = mean_squared_error(sales_predictions_test, y_test)
grid_rmse = np.sqrt(grid_mse)
grid_rmse

2.1105118673573418

In [11]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [12]:
sgd_reg = SGDRegressor(max_iter=1000, tol=1e-3, penalty=None, eta0=0.01)
sgd_reg.fit(X_train_scaled, y_train.ravel())

In [13]:
# measures training error.
sales_predictions_train = sgd_reg.predict(X_train_scaled)
grid_mse = mean_squared_error(sales_predictions_train, y_train)
grid_rmse = np.sqrt(grid_mse)
grid_rmse

2.7304707353467306

In [14]:
# measures test error.
sales_predictions_test = sgd_reg.predict(X_test_scaled)
grid_mse = mean_squared_error(sales_predictions_test, y_test)
grid_rmse = np.sqrt(grid_mse)
grid_rmse

2.1107577677421454

In [15]:
# adds additional features to fit the nonlinear data.
poly_feat = PolynomialFeatures(degree=3, include_bias=False)
X_train_poly = poly_feat.fit_transform(X_train)
X_test_poly = poly_feat.fit_transform(X_test)

In [16]:
# linear regression with polynomial features.
lin_reg_poly = LinearRegression()
lin_reg_poly.fit(X_train_poly, y_train)

In [17]:
# measures training error.
sales_predictions_train = lin_reg_poly.predict(X_train_poly)
grid_mse = mean_squared_error(sales_predictions_train, y_train)
grid_rmse = np.sqrt(grid_mse)
grid_rmse

2.7247525680072426

In [18]:
# measures test error.
sales_predictions_test = lin_reg_poly.predict(X_test_poly)
grid_mse = mean_squared_error(sales_predictions_test, y_test)
grid_rmse = np.sqrt(grid_mse)
grid_rmse

2.1033813679483604

In [19]:

ridge_reg = Ridge(alpha=1, solver="cholesky")
ridge_reg.fit(X_train_scaled, y_train)

In [20]:
# measures training error.
sales_predictions_train = ridge_reg.predict(X_train_scaled)
grid_mse = mean_squared_error(sales_predictions_train, y_train)
grid_rmse = np.sqrt(grid_mse)
grid_rmse

2.7303022682486944

In [21]:
# measures test error.
sales_predictions_test = ridge_reg.predict(X_test_scaled)
grid_mse = mean_squared_error(sales_predictions_test, y_test)
grid_rmse = np.sqrt(grid_mse)
grid_rmse

2.110511990644951

In [22]:

lasso_reg = Lasso(alpha=0.1)
lasso_reg.fit(X_train_scaled, y_train)

In [23]:
# measures training error.
sales_predictions_train = lasso_reg.predict(X_train_scaled)
grid_mse = mean_squared_error(sales_predictions_train, y_train)
grid_rmse = np.sqrt(grid_mse)
grid_rmse

2.730996680800453

In [24]:
# measures test error.
sales_predictions_test = lasso_reg.predict(X_test_scaled)
grid_mse = mean_squared_error(sales_predictions_test, y_test)
grid_rmse = np.sqrt(grid_mse)
grid_rmse

2.1114185101970957

In [25]:

elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic_net.fit(X_train_scaled, y_train)

In [26]:
# measures training error.
sales_predictions_train = elastic_net.predict(X_train_scaled)
grid_mse = mean_squared_error(sales_predictions_train, y_train)
grid_rmse = np.sqrt(grid_mse)
grid_rmse

2.730996680800453

In [27]:
# measures test error.
sales_predictions_test = elastic_net.predict(X_test_scaled)
grid_mse = mean_squared_error(sales_predictions_test, y_test)
grid_rmse = np.sqrt(grid_mse)
grid_rmse

2.1114185101970957