# **Linear regression**

In [42]:
# Keep only the columns used by the regression. The explicit .copy() makes this
# an independent frame, so adding/dropping columns later neither triggers
# pandas' SettingWithCopyWarning nor risks touching `full_week`.
data_for_regression = full_week[['DAY_OF_WEEK','DELAY','ORIGIN_TIMESTAMP','SEQUENCE']].copy()

In [43]:
# Convert ORIGIN_TIMESTAMP into the number of seconds elapsed since midnight.
# Only the hour/minute/second components are used; any date part is ignored.
timestamp = pd.to_datetime(data_for_regression['ORIGIN_TIMESTAMP'].astype(str))
seconds = timestamp.dt.second
minutes = timestamp.dt.minute
hours = timestamp.dt.hour

timestamp_seconds = seconds + minutes * 60 + hours * 60 * 60

# Build a new frame instead of mutating in place: `data_for_regression` was
# created as a column subset of `full_week`, so assigning a column to it
# directly (or dropping with inplace=True) can raise SettingWithCopyWarning.
data_for_regression = (
    data_for_regression
    .assign(TIMESTAMP_SECONDS=timestamp_seconds)
    .drop(columns=["ORIGIN_TIMESTAMP"])
)

In [44]:
# Show the prepared frame: ORIGIN_TIMESTAMP has been replaced by TIMESTAMP_SECONDS.
data_for_regression

Unnamed: 0,DAY_OF_WEEK,DELAY,SEQUENCE,TIMESTAMP_SECONDS
2100,Tuesday,0.0,1,27896
2101,Tuesday,66.0,2,28080
2102,Tuesday,19.0,3,28281
2103,Tuesday,17.0,4,28347
2104,Tuesday,-21.0,5,28375
...,...,...,...,...
5711,Sunday,5.0,12,31568
5712,Sunday,0.0,13,31632
5713,Sunday,-66.0,14,31675
5714,Sunday,-138.0,15,31831


source: *https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html*

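One-hot encoding of the categorical columns `DAY_OF_WEEK` and `SEQUENCE`. The first category of each column is dropped and serves as the reference level.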

In [45]:
# One encoder per categorical column; both are kept so the very same category
# mapping can be re-applied later with .transform().
# drop="first" leaves out the first category of each feature (reference level).
# NOTE(review): `sparse=` and `get_feature_names()` belong to the older
# scikit-learn API — confirm against the installed version before upgrading.
one_hot_dow = OneHotEncoder(drop="first", sparse=False)
one_hot_seq = OneHotEncoder(drop="first", sparse=False)

# The encoders need 2-D input, so each column is turned into an (n, 1) array.
A_dummies_tmp = one_hot_dow.fit_transform(
    data_for_regression["DAY_OF_WEEK"].to_numpy()[:, None]
)
B_dummies_tmp = one_hot_seq.fit_transform(
    data_for_regression["SEQUENCE"].to_numpy()[:, None]
)

# Wrap the encoded arrays in DataFrames labelled with the generated feature names.
A_dummies = pd.DataFrame(
    data=A_dummies_tmp,
    columns=one_hot_dow.get_feature_names(),
).dropna()
B_dummies = pd.DataFrame(
    data=B_dummies_tmp,
    columns=one_hot_seq.get_feature_names(),
).dropna()

In [46]:
# Our old version, source: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.get_dummies.html
# A_dumies = pd.get_dummies(data_for_regression['DAY_OF_WEEK'],drop_first=True).dropna()
# B_dumies = pd.get_dummies(data_for_regression['SEQUENCE'],drop_first=True).dropna()

In [47]:
# Put the original columns and both dummy blocks side by side. Every frame gets
# a fresh 0..n-1 index first so the rows are paired purely by position.
merged = pd.concat(
    [
        frame.reset_index(drop=True)
        for frame in (data_for_regression, A_dummies, B_dummies)
    ],
    axis=1,
)

In [49]:
# Check column names, dtypes and non-null counts of the merged frame.
merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 978 entries, 0 to 977
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   DAY_OF_WEEK        978 non-null    category
 1   DELAY              978 non-null    float64 
 2   SEQUENCE           978 non-null    int64   
 3   TIMESTAMP_SECONDS  978 non-null    int64   
 4   x0_Monday          978 non-null    float64 
 5   x0_Saturday        978 non-null    float64 
 6   x0_Sunday          978 non-null    float64 
 7   x0_Thursday        978 non-null    float64 
 8   x0_Tuesday         978 non-null    float64 
 9   x0_Wednesday       978 non-null    float64 
 10  x0_2               978 non-null    float64 
 11  x0_3               978 non-null    float64 
 12  x0_4               978 non-null    float64 
 13  x0_5               978 non-null    float64 
 14  x0_6               978 non-null    float64 
 15  x0_7               978 non-null    float64 
 16  x0_8    

In [50]:
# The raw categorical columns are now represented by their dummy columns.
final = merged.drop(columns=['DAY_OF_WEEK', 'SEQUENCE'])

#### We will predict Y from X

In [51]:
# Feature matrix: everything except the target column DELAY.
X = final.drop(columns=['DELAY'])

Přidání konstanty (const = 1) z teoretického hlediska (=intercept).

In [52]:
# Add a `const` column to the features so the OLS fit below estimates an intercept.
X = sm.add_constant(X)

In [53]:
# Target vector: the observed delay.
Y = merged["DELAY"]

#### Metoda nejmenších čtverců

zdroj: *https://www.statsmodels.org/devel/generated/statsmodels.regression.linear_model.OLS.html?highlight=ols#statsmodels.regression.linear_model.OLS*

In [54]:
# Ordinary least squares with statsmodels: Y is the response (endog),
# X the design matrix including the constant column (exog).
sm_model = sm.OLS(endog=Y, exog=X)
sm_result = sm_model.fit()

Přehled 

In [55]:
# Full regression report: fit statistics, per-coefficient estimates and diagnostics.
sm_result.summary()

0,1,2,3
Dep. Variable:,DELAY,R-squared:,0.27
Model:,OLS,Adj. R-squared:,0.254
Method:,Least Squares,F-statistic:,16.8
Date:,"Sun, 31 May 2020",Prob (F-statistic):,7.189999999999999e-52
Time:,13:36:25,Log-Likelihood:,-6042.4
No. Observations:,978,AIC:,12130.0
Df Residuals:,956,BIC:,12240.0
Df Model:,21,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-591.5553,71.235,-8.304,0.000,-731.350,-451.761
TIMESTAMP_SECONDS,0.0198,0.002,8.624,0.000,0.015,0.024
x0_Monday,82.7690,13.795,6.000,0.000,55.697,109.841
x0_Saturday,16.1607,20.915,0.773,0.440,-24.883,57.205
x0_Sunday,-39.7629,20.758,-1.916,0.056,-80.500,0.974
x0_Thursday,35.4912,15.812,2.245,0.025,4.462,66.521
x0_Tuesday,55.4007,13.846,4.001,0.000,28.228,82.573
x0_Wednesday,68.4139,13.794,4.960,0.000,41.343,95.484
x0_2,19.6053,21.052,0.931,0.352,-21.708,60.919

0,1,2,3
Omnibus:,245.663,Durbin-Watson:,0.426
Prob(Omnibus):,0.0,Jarque-Bera (JB):,697.199
Skew:,1.261,Prob(JB):,4.0299999999999995e-152
Kurtosis:,6.278,Cond. No.,587000.0


#### Fitování modelu

In [56]:
# Linear regression estimator with default settings.
# NOTE(review): presumably sklearn.linear_model.LinearRegression — the import is not visible here.
model = LinearRegression()

In [57]:
# The estimator fits its own intercept (fit_intercept=True), so the explicit
# `const` column added for statsmodels is removed before fitting.
model.fit(X.drop("const", axis="columns"), Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [58]:
# Fitted coefficients, one per feature column passed to fit() (intercept not included).
model.coef_

array([ 1.98052282e-02,  8.27690426e+01,  1.61607046e+01, -3.97628671e+01,
        3.54912499e+01,  5.54007061e+01,  6.84139499e+01,  1.96053389e+01,
        2.29975278e+01,  1.83723165e+01,  1.03532262e+00,  2.03841754e+01,
        4.16058980e+01,  6.84581019e+01,  7.82277019e+01,  6.56721603e+01,
        7.43963946e+01,  1.02866564e+02,  1.53388904e+02,  1.65325975e+02,
        1.60414099e+02])

In [59]:
# Reminder of the input layout that a prediction row has to follow.
data_for_regression.head()

Unnamed: 0,DAY_OF_WEEK,DELAY,SEQUENCE,TIMESTAMP_SECONDS
2100,Tuesday,0.0,1,27896
2101,Tuesday,66.0,2,28080
2102,Tuesday,19.0,3,28281
2103,Tuesday,17.0,4,28347
2104,Tuesday,-21.0,5,28375


#### Predikce

In [127]:
# A single hand-made observation to predict the delay for, laid out with the
# same raw feature columns as the training data (minus the DELAY target).
data_for_prediction = pd.DataFrame(
    data=[["Tuesday", 13, 28000]],
    columns=["DAY_OF_WEEK", "SEQUENCE", "TIMESTAMP_SECONDS"],
)

In [128]:
# Encode the new observation with the encoders fitted on the training data
# (.transform, not .fit_transform), so the dummy columns match the ones the
# model was trained on.
# NOTE: this rebinds A_dummies / B_dummies, which held the training dummies.
A_dummies_tmp = one_hot_dow.transform(
    data_for_prediction["DAY_OF_WEEK"].to_numpy()[:, None]
)
B_dummies_tmp = one_hot_seq.transform(
    data_for_prediction["SEQUENCE"].to_numpy()[:, None]
)

A_dummies = pd.DataFrame(
    data=A_dummies_tmp,
    columns=one_hot_dow.get_feature_names(),
).dropna()
B_dummies = pd.DataFrame(
    data=B_dummies_tmp,
    columns=one_hot_seq.get_feature_names(),
).dropna()

In [129]:
# Assemble the prediction row the same way as the training matrix: raw columns
# plus both dummy blocks, paired by position on a fresh index.
# NOTE: `merged` and `final` are rebound here and no longer hold the training data.
merged = pd.concat(
    [
        frame.reset_index(drop=True)
        for frame in (data_for_prediction, A_dummies, B_dummies)
    ],
    axis=1,
)

# Drop the raw categorical columns; their dummy columns stay.
final = merged.drop(columns=['DAY_OF_WEEK', 'SEQUENCE'])

In [130]:
# Inspect the encoded feature row that will be passed to the model.
final

Unnamed: 0,TIMESTAMP_SECONDS,x0_Monday,x0_Saturday,x0_Sunday,x0_Thursday,x0_Tuesday,x0_Wednesday,x0_2,x0_3,x0_4,...,x0_6,x0_7,x0_8,x0_9,x0_10,x0_11,x0_12,x0_13,x0_14,x0_15
0,28000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [131]:
# Predicted DELAY for the single row in `final`.
model.predict(final)

array([171.7806759])