# 4.1 Linear Regression Model

## 4.1.1 Import libraries

In [23]:
import pandas as pd
import numpy as np

import warnings

from sklearn.neighbors import LocalOutlierFactor
from sklearn import datasets, linear_model, metrics
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
warnings.filterwarnings('ignore')

## 4.1.2 Load dataset

In [24]:
# no outliers
X_train_no = pd.read_csv('../dataset/no_outliers/X_train.csv')
X_test_no = pd.read_csv('../dataset/no_outliers/X_test.csv')
y_train_no = pd.read_csv('../dataset/no_outliers/y_train.csv')
y_test_no = pd.read_csv('../dataset/no_outliers/y_test.csv')

# original data
X_train_with = pd.read_csv('../dataset/with_outliers/X_train.csv')
X_test_with = pd.read_csv('../dataset/with_outliers/X_test.csv')
y_train_with = pd.read_csv('../dataset/with_outliers/y_train.csv')
y_test_with = pd.read_csv('../dataset/with_outliers/y_test.csv')

## 4.1.3 Model training and evaluation

In [25]:
model = LinearRegression()

### Without outliers

In [27]:
reg = model.fit(X_train_no, y_train_no)

In [28]:
pred_no = reg.predict(X_test_no)

In [29]:
pred2_no = np.array(pred_no).flatten()
pred2_no

array([0.82054053, 0.71948332, 0.79178193, 0.60754567, 0.61189722,
       0.69439322, 0.37885467, 0.17774356, 0.13055142, 0.19382167,
       0.64181612, 0.61431353, 0.23172503, 0.39351248, 0.48853896,
       0.23416289, 0.45753897, 0.52966265, 0.48296494, 0.83734826,
       0.34011356, 0.29130584, 0.45522435, 0.65969691, 0.60825841,
       0.62172779, 0.51556193, 0.74872801, 0.09752597, 0.51369892,
       0.50904461, 0.58737231, 0.70639886, 0.58396554, 0.82990136,
       0.97277379, 0.68736439, 0.50608642, 0.30604265, 0.17056992,
       0.27991999, 0.47395354, 0.24398121, 0.77292584, 0.60204261,
       0.62204647, 0.39381209, 0.44141737, 0.40060625, 0.77860739,
       0.4778154 , 0.73462769, 0.89031034, 0.83887849, 0.68256123,
       0.55394808, 0.36176806, 0.32885214, 0.8615979 , 0.84320029,
       0.57508769, 0.9030528 , 0.73080545, 0.14682104, 0.54671629,
       0.65461289, 0.33412581, 0.25206183, 0.65090183, 0.60491997,
       0.61644744, 0.84412341, 0.41043367, 0.37242051, 0.29593

In [30]:
df_no = pd.DataFrame(data = X_test_no, columns = X_test_no.columns)
df_no.head()

Unnamed: 0,season,year,month,holiday,weekday,weather,temperature,humidity,windspeed
0,0.666667,1.0,0.545455,0.0,0.0,0.0,0.838615,0.499402,0.33711
1,0.333333,1.0,0.363636,0.0,0.666667,0.0,0.676174,0.350479,0.492593
2,0.666667,1.0,0.636364,0.0,0.833333,0.5,0.894519,0.526316,0.456582
3,0.333333,1.0,0.272727,0.0,0.833333,0.0,0.435679,0.145335,0.67922
4,1.0,1.0,0.909091,0.0,0.166667,0.0,0.329145,0.313398,0.51062


In [31]:
df_no['y_test_count(with)'] = y_test_with

df_no['predict_count'] = pred2_no

df_no.head()

Unnamed: 0,season,year,month,holiday,weekday,weather,temperature,humidity,windspeed,y_test_count(with),predict_count
0,0.666667,1.0,0.545455,0.0,0.0,0.0,0.838615,0.499402,0.33711,0.69155,0.820541
1,0.333333,1.0,0.363636,0.0,0.666667,0.0,0.676174,0.350479,0.492593,0.535568,0.719483
2,0.666667,1.0,0.636364,0.0,0.833333,0.5,0.894519,0.526316,0.456582,0.143677,0.791782
3,0.333333,1.0,0.272727,0.0,0.833333,0.0,0.435679,0.145335,0.67922,0.156803,0.607546
4,1.0,1.0,0.909091,0.0,0.166667,0.0,0.329145,0.313398,0.51062,0.474979,0.611897


In [32]:
mape_no = metrics.mean_absolute_percentage_error(y_test_no, pred2_no)
mse_no = metrics.mean_squared_error(y_test_no, pred2_no)
rmse_no = np.sqrt(metrics.mean_squared_error(y_test_no, pred2_no))
r2_no = metrics.r2_score(y_test_no, pred2_no)

print('Mean Absolute Error:', mape_no)
print('Mean Squared Error:', mse_no)
print('Root Mean Squared Error:', rmse_no)
print('Root Square:', r2_no)

Mean Absolute Error: 0.20316398898483953
Mean Squared Error: 0.010412185085195525
Root Mean Squared Error: 0.10204011507831381
Root Square: 0.8188807369784254


In [33]:
print("Accuracy: {}%".format(r2_no * 100))

Accuracy: 81.88807369784254%


### With outliers

In [34]:
reg = model.fit(X_train_with, y_train_with)

In [35]:
pred = reg.predict(X_test_with)


In [36]:
pred2 = np.array(pred).flatten()
pred2

array([0.60523268, 0.63554391, 0.20553659, 0.13078415, 0.47687984,
       0.32944032, 0.40660713, 0.70324813, 0.66323184, 0.14698711,
       0.17716138, 0.13726535, 0.17533248, 0.53865147, 0.6327383 ,
       0.41358627, 0.71697761, 0.73659321, 0.43220689, 0.25144433,
       0.87439885, 0.38154652, 0.61015544, 0.49885937, 0.28304488,
       0.71447303, 0.72490858, 0.60061261, 0.74849211, 0.73370616,
       0.09417281, 0.57705451, 0.61681121, 0.62895702, 0.23408131,
       0.52326795, 0.84635611, 0.97490211, 0.32112013, 0.35462822,
       0.67819579, 0.17431712, 0.58845313, 0.46838579, 0.8368095 ,
       0.77259489, 0.38481929, 0.40244801, 0.42281706, 0.20679013,
       0.70149363, 0.83728941, 0.5666676 , 0.69266976, 0.51638795,
       0.43694033, 0.36911559, 0.73403107, 0.82902155, 0.50733193,
       0.68905493, 0.54708631, 0.59849863, 0.77854947, 0.73669225,
       0.38633451, 0.55990651, 0.54429325, 0.77336531, 0.59246534,
       0.62657292, 0.84289286, 0.48138957, 0.8157162 , 0.45003

In [37]:
df = pd.DataFrame(data = X_test_with, columns = X_test_with.columns)
df.head()

Unnamed: 0,season,year,month,holiday,weekday,weather,temperature,humidity,windspeed
0,0.666667,0.0,0.545455,0.0,1.0,0.0,0.794313,0.44378,0.436975
1,0.666667,0.0,0.545455,0.0,0.666667,0.0,0.874478,0.538876,0.307644
2,0.0,0.0,0.0,0.0,1.0,0.5,0.220499,0.319976,0.304389
3,0.0,0.0,0.090909,0.0,0.333333,0.5,0.168402,0.79483,0.02871
4,0.0,1.0,0.0,0.0,0.666667,0.5,0.357625,0.708732,0.081826


In [38]:
df['y_test_count(with)'] = y_test_with

df['predict_count'] = pred2

df.head()

Unnamed: 0,season,year,month,holiday,weekday,weather,temperature,humidity,windspeed,y_test_count(with),predict_count
0,0.666667,0.0,0.545455,0.0,1.0,0.0,0.794313,0.44378,0.436975,0.69155,0.605233
1,0.666667,0.0,0.545455,0.0,0.666667,0.0,0.874478,0.538876,0.307644,0.535568,0.635544
2,0.0,0.0,0.0,0.0,1.0,0.5,0.220499,0.319976,0.304389,0.143677,0.205537
3,0.0,0.0,0.090909,0.0,0.333333,0.5,0.168402,0.79483,0.02871,0.156803,0.130784
4,0.0,1.0,0.0,0.0,0.666667,0.5,0.357625,0.708732,0.081826,0.474979,0.47688


In [39]:
mape = metrics.mean_absolute_percentage_error(y_test_with, pred2)
mse = metrics.mean_squared_error(y_test_with, pred2)
rmse = np.sqrt(metrics.mean_squared_error(y_test_with, pred2))
r2 = metrics.r2_score(y_test_with, pred2)

print('Mean Absolute Error:', mape)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)
print('Root Square:', r2)

Mean Absolute Error: 0.21613177289094146
Mean Squared Error: 0.0116497901542051
Root Mean Squared Error: 0.10793419362836366
Root Square: 0.8005602933174292


In [40]:
print("Accuracy: {}%".format(r2 * 100))

Accuracy: 80.05602933174292%
