In [128]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [129]:
# Load the daily bike-sharing dataset.
# The path is written as a raw string: a Windows path in a normal string literal
# contains backslash sequences that Python does not recognise as escapes, which
# emits a DeprecationWarning/SyntaxWarning. The raw form yields the identical path.
# TODO: replace this machine-specific absolute path with a configurable data directory.
DATA_PATH = r'D:\Hutson\learning-materials\AI&ML\Khóa 12-02AIMLDLCV nâng cao\Class\Datasets\day.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [130]:
# Print column dtypes and non-null counts to check the frame for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     731 non-null    int64  
 1   dteday      731 non-null    object 
 2   season      731 non-null    int64  
 3   yr          731 non-null    int64  
 4   mnth        731 non-null    int64  
 5   holiday     731 non-null    int64  
 6   weekday     731 non-null    int64  
 7   workingday  731 non-null    int64  
 8   weathersit  731 non-null    int64  
 9   temp        731 non-null    float64
 10  atemp       731 non-null    float64
 11  hum         731 non-null    float64
 12  windspeed   731 non-null    float64
 13  casual      731 non-null    int64  
 14  registered  731 non-null    int64  
 15  cnt         731 non-null    int64  
dtypes: float64(4), int64(11), object(1)
memory usage: 91.5+ KB


In [131]:
# Columns excluded from modelling. In the rows displayed by df.head(),
# casual + registered equals cnt, so keeping those two would leak the target;
# 'instant' is a running row number and 'dteday' is the date stored as text.
unused_columns = ['instant', 'dteday', 'casual', 'registered']

# errors='ignore' keeps this cell re-runnable: because the result is assigned back
# to `df`, a second execution would otherwise raise KeyError for the columns that
# were already removed on the first run.
df = df.drop(columns=unused_columns, errors='ignore')
df.head()

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,985
1,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,801
2,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,1349
3,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,1562
4,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,1600


## Data preprocessing

In [132]:
# Name of the column to predict (despite the variable name, this is the target).
feature = 'cnt'

# Target vector first, then the predictor matrix with the target column removed.
y = df[feature]
x = df.drop(columns=feature)

In [133]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [134]:
# Numeric preprocessing: fill gaps with the column median, then standardise.
num_steps = [
    ('num_imputer', SimpleImputer(strategy='median')),
    ('num_scaler', StandardScaler()),
]
num_transformer = Pipeline(steps=num_steps)

In [135]:
# Collect the names of the integer- and float-typed training columns.
numeric_frame = x_train.select_dtypes(include=[int, float])
numeric_columns = numeric_frame.columns
numeric_columns

Index(['season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday',
       'weathersit', 'temp', 'atemp', 'hum', 'windspeed'],
      dtype='object')

In [136]:
# Route the numeric columns through the impute-and-scale pipeline.
numeric_branch = ("num_features", num_transformer, numeric_columns)
preprocessor = ColumnTransformer(transformers=[numeric_branch])

In [137]:
# Display the frame after the unused columns were dropped.
# NOTE(review): a bare frame display in the middle of preprocessing adds little;
# consider df.head() or df.shape if only a sanity check is wanted.
df

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,985
1,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,801
2,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,1349
3,1,0,1,0,2,1,1,0.200000,0.212122,0.590435,0.160296,1562
4,1,0,1,0,3,1,1,0.226957,0.229270,0.436957,0.186900,1600
...,...,...,...,...,...,...,...,...,...,...,...,...
726,1,1,12,0,4,1,2,0.254167,0.226642,0.652917,0.350133,2114
727,1,1,12,0,5,1,2,0.253333,0.255046,0.590000,0.155471,3095
728,1,1,12,0,6,0,2,0.253333,0.242400,0.752917,0.124383,1341
729,1,1,12,0,0,0,1,0.255833,0.231700,0.483333,0.350754,1796


In [138]:
# Search space for a support-vector regressor grid search.
param_grid = dict(
    kernel=['linear', 'rbf'],
    C=[0.1, 1, 10],
    gamma=[0.1, 1, 10],
)

# NOTE(review): this grid-search estimator is constructed but never fitted and is
# not part of the pipeline below, which trains LinearRegression instead. Either
# wire `model` into the pipeline or drop it once it is confirmed to be unused.
model = GridSearchCV(SVR(), param_grid, cv=5)

# Full regression pipeline: preprocessing followed by ordinary least squares.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', LinearRegression()),
]
reg = Pipeline(steps=pipeline_steps)

# Fit on the training split, then predict the held-out rows.
y_pred = reg.fit(x_train, y_train).predict(x_test)

In [139]:
# Print each test-set prediction next to its true value for a quick eyeball check.
for predicted, actual in zip(y_pred, y_test):
    print("Predicted: ", predicted, "Actual: ", actual)

Predicted:  6220.416612145106 Actual:  6606
Predicted:  1571.6057632139778 Actual:  1550
Predicted:  3043.967028078629 Actual:  3747
Predicted:  4307.202484001378 Actual:  6041
Predicted:  6765.659071662067 Actual:  7538
Predicted:  7231.536105964395 Actual:  7264
Predicted:  774.904146714166 Actual:  1605
Predicted:  2139.6066014898197 Actual:  2209
Predicted:  7112.745605248194 Actual:  7499
Predicted:  6458.992168786488 Actual:  5743
Predicted:  2899.520114198105 Actual:  1796
Predicted:  2118.7822368942975 Actual:  3068
Predicted:  4072.2422231250084 Actual:  4891
Predicted:  5451.212988187958 Actual:  5260
Predicted:  2384.5383859186613 Actual:  2133
Predicted:  2016.2439964955402 Actual:  2471
Predicted:  2383.719503495796 Actual:  2046
Predicted:  7428.59110259186 Actual:  8156
Predicted:  5679.62921960381 Actual:  5362
Predicted:  2937.1022198593246 Actual:  2298
Predicted:  7227.782544192331 Actual:  7697
Predicted:  8234.5057493547 Actual:  5463
Predicted:  5165.556254775544 

In [140]:
# R^2 is the most common regression metric (higher is better); the pipeline's
# score() reports it for the held-out test split.
r2 = reg.score(x_test, y_test)
print(r2)

0.8276670090367212
