# AI 221 Machine Exercise 2
# Name | SN | Batch
#
To replicate the results of the notebook, run all cells in sequence.

In [None]:
#Import libraries
import os
import sklearn as sk
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC, SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import root_mean_squared_error

In [None]:
#Master seed: every other seed below is derived from it, so this single
#number is enough to reproduce the splits and models of the report.
seed = 413094539 #seed used for the report
#uncomment to generate a new seed
#seed = np.random.randint((2**31)-1)
rng = np.random.default_rng(seed)

#Exclusive upper bound shared by all derived seeds
seed_upper_bound = (2**31)-1

#The order of these draws matters: each call advances the generator, so
#reordering the lines would change every derived seed.
penguin_split_seed = rng.integers(seed_upper_bound)
penguin_svc_state = rng.integers(seed_upper_bound)
bike_split_seed = rng.integers(seed_upper_bound)
bike_svr_state = rng.integers(seed_upper_bound)


print(f"Seed used for randomizer: {seed}")
print(f"Palmer Penguin Train-test Splitter Seed: {penguin_split_seed}")
print(f"Palmer Penguin SVM Classifier Seed: {penguin_svc_state}")
print(f"Bike Sharing Train-test Splitter Seed: {bike_split_seed}")
#bike_svr_state belongs to the SVR (regression) problem, so label it as a regressor
print(f"Bike Sharing SVM Regressor Seed: {bike_svr_state}")

## Problem 1: Palmer Penguin Species Data Set

In [None]:
#Read the Palmer Penguins CSV (path relative to the notebook) into df_p
#and print the resulting dataframe
df_p = pd.read_csv(
    filepath_or_buffer="./datasets/penguins_size.csv",
)
print(df_p)


## Problem 2: Predicting Bike Sharing Demand in Seoul, South Korea

In [None]:
#Load the Seoul bike sharing dataset and print it.
#Decoding with the default UTF-8 codec plus encoding_errors="replace" turned the
#degree sign in the temperature headers into U+FFFD, so decode the file as
#Latin-1 instead to keep the column names intact.
#NOTE(review): Latin-1 is assumed from the single-byte degree sign - confirm
#against the actual file. Later cells select columns by position or by the
#ASCII names "Seasons" / "Rented Bike Count", so they do not depend on the
#repaired header text.
#NOTE(review): this rebinds df_p, which held the penguin data in Problem 1.
df_p = pd.read_csv("./datasets/SeoulBikeData.csv",encoding="latin-1")
print(df_p)

In [None]:
#Preview the regression target column
print(df_p.loc[:, "Rented Bike Count"])

In [31]:
#Preprocess data: restrict the bike data to the Winter season, then
#separate the features from the target

#rows recorded during Winter
winter_mask = df_p["Seasons"]=="Winter"
winter_rows = df_p.loc[winter_mask]

#features: positional columns 2..9 (8 columns)
#NOTE(review): the slice is positional - confirm it covers exactly the intended features
X = winter_rows.iloc[:,2:10]
#labels: kept as a one-column dataframe
y = winter_rows.loc[:,["Rented Bike Count"]]

print(X)
print(y)

      Hour  Temperature(�C)  Humidity(%)  Wind speed (m/s)  Visibility (10m)  \
0        0             -5.2           37               2.2              2000   
1        1             -5.5           38               0.8              2000   
2        2             -6.0           39               1.0              2000   
3        3             -6.2           40               0.9              2000   
4        4             -6.0           36               2.3              2000   
...    ...              ...          ...               ...               ...   
2155    19              2.5           95               1.9               838   
2156    20              2.7           96               1.5              1479   
2157    21              2.4           95               2.5              1349   
2158    22              2.3           96               1.9              1207   
2159    23              1.8           96               1.2               745   

      Dew point temperature(�C)  Solar 

### 2.b.
* Split samples into 70% training and 30% testing data randomly with stratify=y.

In [32]:
#Random 70-30 train/test split, seeded for reproducibility
#NOTE(review): the task text mentions stratify=y, but no stratify argument is passed here - confirm this is intended
X_train,X_test,y_train,y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=bike_split_seed,
)

#report the size of each partition as a count and as a share of all samples
n_total = len(X)
n_train = len(X_train)
n_test = len(X_test)
print(f"Number of training samples: {n_train} -> {round(100.0*n_train/n_total,2)}%")
print(f"Number of testing samples: {n_test} -> {round(100.0*n_test/n_total,2)}%")

Number of training samples: 1512 -> 70.0%
Number of testing samples: 648 -> 30.0%


* Construct pipeline with Standard Scaler and SVR
* Compute RMSE

In [33]:
#SVR hyperparameters
#NOTE(review): gamma is presumably unused by the linear kernel - confirm against the SVR docs
kernel = "linear"
gamma = 3
epsilon = 0.4

#standardize the features, then fit a support vector regressor
svr_model = SVR(kernel=kernel,gamma=gamma,epsilon=epsilon)
bike_svr_pipeline = Pipeline(steps=[
    ("scaler",StandardScaler()),
    ("svr",svr_model),
])

#fit on the training split; the one-column label frame is flattened to 1-D
bike_svr_pipeline.fit(X_train,y_train.to_numpy().ravel())

#predictions of the SVR pipeline on both splits
train_prediction = bike_svr_pipeline.predict(X_train)
test_prediction = bike_svr_pipeline.predict(X_test)

#RMSE on both splits
train_rmse = root_mean_squared_error(y_train,train_prediction)
test_rmse = root_mean_squared_error(y_test,test_prediction)

#report the configuration and the errors
print(f"Kernel used: {kernel}")
print(f"Kernel parameter used: {gamma}")
print(f"Epsilon used: {epsilon}")
print(f"Training Set Error: {round(train_rmse,4)}")
print(f"Testing Set Error: {round(test_rmse,4)}")

Kernel used: linear
Kernel parameter used: 3
Epsilon used: 0.4
Training Set Error: 130.5076
Testing Set Error: 123.6246


### 2.c.
* Repeat using Kernel Ridge Regression (KRR) instead of SVR

In [34]:
#Kernel ridge regression hyperparameters
#NOTE(review): gamma is presumably unused by the linear kernel - confirm against the KernelRidge docs
kernel = "linear"
gamma = 3
alpha = 0.4

#standardize the features, then fit a kernel ridge regressor
#NOTE(review): KernelRidge appears to fit no intercept term, which may explain
#the larger errors compared with the other models - confirm
krr_model = KernelRidge(kernel=kernel,gamma=gamma,alpha=alpha)
bike_krr_pipeline = Pipeline(steps=[
    ("scaler",StandardScaler()),
    ("krr",krr_model),
])

#fit on the training split; the one-column label frame is flattened to 1-D
bike_krr_pipeline.fit(X_train,y_train.to_numpy().ravel())

#predictions of the KRR pipeline on both splits
train_prediction = bike_krr_pipeline.predict(X_train)
test_prediction = bike_krr_pipeline.predict(X_test)

#RMSE on both splits
train_rmse = root_mean_squared_error(y_train,train_prediction)
test_rmse = root_mean_squared_error(y_test,test_prediction)

#report the configuration and the errors
print(f"Kernel used: {kernel}")
print(f"Kernel parameter used: {gamma}")
print(f"Alpha used: {alpha}")
print(f"Training Set Error: {round(train_rmse,4)}")
print(f"Testing Set Error: {round(test_rmse,4)}")

Kernel used: linear
Kernel parameter used: 3
Alpha used: 0.4
Training Set Error: 260.8477
Testing Set Error: 254.1985


### 2.d.
* Repeat for Linear Regression instead of SVR

In [35]:
#standardize the features, then fit ordinary least squares with an intercept
linreg_model = LinearRegression(fit_intercept=True)
bike_lin_pipeline = Pipeline(steps=[
    ("scaler",StandardScaler()),
    ("linreg",linreg_model),
])

#fit on the training split; the one-column label frame is flattened to 1-D
bike_lin_pipeline.fit(X_train,y_train.to_numpy().ravel())

#predictions of the linear regression pipeline on both splits
train_prediction = bike_lin_pipeline.predict(X_train)
test_prediction = bike_lin_pipeline.predict(X_test)

#RMSE on both splits
train_rmse = root_mean_squared_error(y_train,train_prediction)
test_rmse = root_mean_squared_error(y_test,test_prediction)

#report the errors
print(f"Training Set Error: {round(train_rmse,4)}")
print(f"Testing Set Error: {round(test_rmse,4)}")

Training Set Error: 127.1177
Testing Set Error: 122.4427


* Compare results for all three models