# Linear Regression

- It's always a good idea to start with a simple model before adding complexity. This is especially true in time series analysis, where overfitting can be a significant concern and simpler models often provide more interpretable and generalizable insights.

## Imports

In [38]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

## Load Data

In [39]:
# Read the cleaned daily and monthly tables; the daily table's `id` column is
# dropped as soon as the file is loaded.
daily_df = pd.read_csv('data/clean_data/daily.csv').drop(columns=['id'])
monthly_df = pd.read_csv('data/clean_data/monthly.csv')

In [40]:
# Quick look at the daily table: (rows, columns) first, then the leading rows.
print(daily_df.shape)
daily_df.head(n=5)

(188340, 9)


Unnamed: 0,date,store_id,store_type,location_type,region_code,holiday,discount,orders,sales
0,2018-01-01,1,S1,L3,R1,1,1,9,7011.84
1,2018-01-01,253,S4,L2,R1,1,1,60,51789.12
2,2018-01-01,252,S3,L2,R1,1,1,42,36868.2
3,2018-01-01,251,S2,L3,R1,1,1,23,19715.16
4,2018-01-01,250,S2,L3,R4,1,1,62,45614.52


In [41]:
# Quick look at the monthly table: (rows, columns) first, then the leading rows.
print(monthly_df.shape)
monthly_df.head(n=5)

(6205, 9)


Unnamed: 0,date,store_id,store_type,location_type,region_code,holiday,discount,orders,sales
0,2018-01,1,S1,L3,R1,5,15,1884,1145607.81
1,2018-01,2,S3,L1,R3,5,15,2144,1440212.91
2,2018-01,3,S4,L2,R1,5,15,3225,1881592.47
3,2018-01,4,S1,L1,R2,5,15,1871,1196211.06
4,2018-01,5,S1,L1,R3,5,15,2091,1445288.52


In [42]:
# Hold out 20% of the daily rows for evaluation; the fixed seed makes the
# split repeatable across runs.
# NOTE(review): train_test_split shuffles by default, so dated rows from the
# same period can land in both train and test. Confirm this is acceptable for
# the intended use, or consider a chronological split.
Xtrain_day, Xtest_day, ytrain_day, ytest_day = train_test_split(
    daily_df.drop(columns='sales'),
    daily_df['sales'],
    test_size=0.2,
    random_state=42,
)

In [43]:
# Hold out 20% of the monthly rows for evaluation; the fixed seed makes the
# split repeatable across runs.
# NOTE(review): train_test_split shuffles by default, so dated rows from the
# same period can land in both train and test. Confirm this is acceptable for
# the intended use, or consider a chronological split.
Xtrain_month, Xtest_month, ytrain_month, ytest_month = train_test_split(
    monthly_df.drop(columns='sales'),
    monthly_df['sales'],
    test_size=0.2,
    random_state=42,
)

___

## Linear Regression

In [44]:
# Columns the model actually sees. Anything not listed here (date, store_id,
# holiday, discount) is discarded, because ColumnTransformer defaults to
# remainder='drop'.
num_features = ['orders']
ohe_cols = ['store_type','location_type','region_code']

# OneHotEncoder's `sparse` keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4. Try the current spelling first and fall
# back to the old one so the cell runs on either side of that change.
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Output column order follows the transformer order: one-hot columns first,
# then the degree-2 polynomial terms of `orders` (no bias column).
ct = ColumnTransformer([
    ('ohe', ohe, ohe_cols),
    ('poly', PolynomialFeatures(degree=2, include_bias=False), num_features),
])


In [45]:
# Bundle preprocessing and the ordinary least-squares model into one estimator
# so that fit/score always apply the same column transformations.
lr_pipe = Pipeline(steps=[('ct', ct), ('lr', LinearRegression())])

In [46]:
# Fit on the daily training split, then report the pipeline's score (R^2 for a
# regressor): training score is printed, test score is the cell's output.
lr_pipe.fit(Xtrain_day, ytrain_day)
train_r2 = lr_pipe.score(Xtrain_day, ytrain_day)
print(train_r2)
lr_pipe.score(Xtest_day, ytest_day)

0.9172791563787519


0.9151996030192711

In [50]:
# List the columns available in the daily training features, for comparison
# with the subset selected by the column transformer.
Xtrain_day.columns

Index(['date', 'store_id', 'store_type', 'location_type', 'region_code',
       'holiday', 'discount', 'orders'],
      dtype='object')

In [49]:
# Fitted linear-regression coefficients, in the column order produced by the
# transformer step (one-hot columns, then the polynomial terms of `orders`).
lr_pipe.named_steps['lr'].coef_

array([ 1.61806481e+03,  5.32795087e+03, -7.43099025e+02, -6.20291666e+03,
        1.66254173e+03,  5.65816559e+03, -1.53374975e+03, -2.30421433e+03,
       -3.48274324e+03, -1.69771525e+03, -2.09577981e+02,  9.60823720e+02,
        9.46469509e+02,  6.83416489e+02, -3.79240796e-01])

### Residual Analysis

___

## Ridge Regression

### Residual Analysis

___

## Lasso Regression

### Residual Analysis

___