<h1><center>SEIS 763 Machine Learning<br>Team 4 Project - Max Features<br>Wess Kilker&emsp;Tianyu Lei&emsp;Jason Xiao &emsp;Jessica Zastoupil</center></h1>

### Data Load, Feature Updates & Data Prep

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Use the encoding as latin1 to read this file as there are some special characters in the file
df = pd.read_csv('SeoulBikeData.csv',encoding='latin1')

# Filter out non functioning days
df = df.loc[df['Functioning Day'] == 'Yes']

#Add Weekday Column
df['DayOfWeek'] = pd.to_datetime(df['Date']).apply(lambda x: x.weekday())

# Move rented bike count to end and remove unneeded features: Date and Functioning Day
df = df[['Hour', 'Temperature(°C)', 'Humidity(%)', 'Wind speed (m/s)', 'Visibility (10m)', 'Dew point temperature(°C)', 'Solar Radiation (MJ/m2)', 'Rainfall(mm)', 'Snowfall (cm)', 'Seasons', 'Holiday','DayOfWeek', 'Rented Bike Count']]

# Split into X and y
X = df.iloc[:,0:-1].values #All except the last column
y = df.iloc[:,-1].values #Only the last column

# Deal with categorical variables: OneHot Encoding
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
col_trans = make_column_transformer((OneHotEncoder(), [9,10]), remainder='passthrough')
X = col_trans.fit_transform(X)

# Drop excess column for each encoded feature
X=X[:,[1,2,3,5,6,7,8,9,10,11,12,13,14,15]]

# Normalizing the features
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X = sc_X.fit_transform(X)

#Splitting the data into Training Set and Test Set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=0)
print(X_train.shape)
print(y_train.shape)

(5925, 14)
(5925,)


### PCA Transformation

In [None]:
#Find PCA number of components
#Took about 25 min.
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
for i in range(14, 1, -1):
    comp=i if i > 0 else None
    pcaObj = PCA(n_components=comp)
    X_train_PCA = pcaObj.fit_transform(X_train)
    X_test_PCA = pcaObj.transform(X_test)
    components_variance = pcaObj.explained_variance_ratio_

    #Fitting Logistic Regression to Training Set
    classifierObj = LogisticRegression(max_iter=5000, random_state=0)
    classifierObj.fit(X_train_PCA, y_train)


    #Model Accuracy
    print('The PCA with ' + str(i) + ' components model accuracy of the training set is: ' + str(classifierObj.score(X_train_PCA,y_train)))
    print('The PCA with ' + str(i) + ' components model accuracy of the test set is: ' + str(classifierObj.score(X_test_PCA,y_test)))

The PCA with 14 components model accuracy of the training set is: 0.05063291139240506
The PCA with 14 components model accuracy of the test set is: 0.0031496062992125984
The PCA with 13 components model accuracy of the training set is: 0.049789029535864976
The PCA with 13 components model accuracy of the test set is: 0.0031496062992125984
The PCA with 12 components model accuracy of the training set is: 0.043544303797468355
The PCA with 12 components model accuracy of the test set is: 0.001968503937007874
The PCA with 11 components model accuracy of the training set is: 0.04033755274261604
The PCA with 11 components model accuracy of the test set is: 0.002362204724409449
The PCA with 10 components model accuracy of the training set is: 0.036793248945147676
The PCA with 10 components model accuracy of the test set is: 0.002362204724409449
The PCA with 9 components model accuracy of the training set is: 0.02751054852320675
The PCA with 9 components model accuracy of the test set is: 0.00

In [None]:
#Applying PCA using number of components from best fit in code above
from sklearn.decomposition import PCA

pcaObj = PCA(n_components=14)
X_train_PCA = pcaObj.fit_transform(X_train)
X_test_PCA = pcaObj.transform(X_test)

print(X_train_PCA.shape)
print(X_test_PCA.shape)

(5925, 14)
(2540, 14)


### Models

#### Linear Regression

In [None]:
#Fitting Linear Regression to Training Set
from sklearn.linear_model import LinearRegression
import numpy as np

lrObj = LinearRegression()
lrObj.fit(X_train_PCA, y_train)

#Prediction on the Test Set
y_pred_lr = lrObj.predict(X_test_PCA)

#accuracy (R squared)
print('The linear model accuracy is: ' + str(lrObj.score(X_test_PCA,y_test)))

# mean squared error/Root Mean Square Error
from sklearn.metrics import mean_squared_error
mse_LR=round((mean_squared_error(y_test, y_pred_lr))/100, 4)
rmse_LR = round((np.sqrt(mse_LR))/100 ,4)

print("MSE:", mse_LR)
print("RMSE:",rmse_LR)

#K-Fold Cross Validation
print('Cross Validation')
from sklearn.model_selection import cross_val_score
modelAccuracies_LR = cross_val_score(estimator=lrObj, X=X_train_PCA, y=y_train, cv=10)
print('Model Accuracy Mean:',modelAccuracies_LR.mean())
print('Model Accuracy StDev:',modelAccuracies_LR.std())

The linear model accuracy is: 0.5369674634407413
MSE: 1874.214
RMSE: 0.4329
Cross Validation
Model Accuracy Mean: 0.5398046502372322
Model Accuracy StDev: 0.027462818358036992


#### Polynomial (Degrees = 2)

In [None]:
# Polynomial Regression
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model

polyFeatureObj = PolynomialFeatures(degree=2)
poly_X_train = polyFeatureObj.fit_transform(X_train)
poly_X_test = polyFeatureObj.transform(X_test)

#Fit LinerR Model
lrObjAfterPoly = linear_model.LinearRegression()
model = lrObjAfterPoly.fit(poly_X_train, y_train)

#Prediction on the Test Set
poly_y_pred = lrObjAfterPoly.predict(poly_X_test)
score_poly = model.score(poly_X_test, y_test)

#accuracy (R squared)
print('The model accuracy after Polynomial (degree=2) is: ' + str(score_poly))

# mean squared error/Root Mean Square Error
from sklearn.metrics import mean_squared_error
mse_poly=round((mean_squared_error(y_test, poly_y_pred))/100, 4)
rmse_poly = round((np.sqrt(mse_poly))/100 ,4)

print("MSE:", mse_poly)
print("RMSE:",rmse_poly)

#K-Fold Cross Validation
print('Cross Validation')
from sklearn.model_selection import cross_val_score
modelAccuracies_poly = cross_val_score(estimator=lrObjAfterPoly, X=poly_X_train, y=y_train, cv=10)
print('Model Accuracy Mean:',modelAccuracies_poly.mean())
print('Model Accuracy StDev:',modelAccuracies_poly.std())

The model accuracy after Polynomial (degree=2) is: 0.6357086650330619
MSE: 1474.5398
RMSE: 0.384
Cross Validation
Model Accuracy Mean: 0.642394813169344
Model Accuracy StDev: 0.05650869906670755


#### Tree Forest

In [None]:
#Create and Fit a Regressor Object
from sklearn.ensemble import RandomForestRegressor

regObj = RandomForestRegressor(n_estimators=10, random_state=0)
regObj.fit(X_train_PCA,y_train)
reg_pred_y = regObj.predict(X_test_PCA)

score_reg = regObj.score(X_test_PCA,y_test)

#accuracy (R squared)
print('The model accuracy after Polynomial (degree=2) is: ' + str(score_reg))

# mean squared error/Root Mean Square Error
from sklearn.metrics import mean_squared_error
mse_reg=round((mean_squared_error(y_test, reg_pred_y))/100, 4)
rmse_reg = round((np.sqrt(mse_reg))/100 ,4)

print("MSE:", mse_reg)
print("RMSE:",rmse_reg)

#K-Fold Cross Validation
print('\nCross Validation:')
from sklearn.model_selection import cross_val_score
modelAccuracies_reg = cross_val_score(estimator=regObj, X=X_train_PCA, y=y_train, cv=10)
print('Model Accuracy Mean:',modelAccuracies_reg.mean())
print('Model Accuracy StDev:',modelAccuracies_reg.std())

#Visualizing after Tree Forest Regression
import matplotlib.pyplot as plt

#compare results vs actual
fig, ax = plt.subplots()
plt.hist(y_test - reg_pred_y)
plt.annotate("MSE: " + str(round(mse_reg,4)) + "\nRMSE: " +str(round(rmse_reg,4)) + "\nAccuracy: " + str(round(score_reg,4)), (-1500,700))
plt.title('Histogram of prediction errors')
plt.xlabel('Prediction error')
plt.ylabel('Frequency')
plt.show()

The model accuracy after Polynomial (degree=2) is: 0.7749862598617001
MSE: 910.7867
RMSE: 0.3018

Cross Validation:
Model Accuracy Mean: 0.7786229569461549
Model Accuracy StDev: 0.021592380402799295


#### KNN Regressor

In [None]:
#KNN - Regression
from sklearn.neighbors import KNeighborsRegressor

X_Results = np.empty((0), int)
Y_Results = np.empty((0), float)
for i in range(1,20, 1):
    KNeighborsRegressorObj = KNeighborsRegressor(n_neighbors=i, p=2, metric='minkowski')
    KNeighborsRegressorObj.fit(X_train_PCA, y_train)
    
    #Making predictions on the Test Set
    y_pred = KNeighborsRegressorObj.predict(X_test_PCA)
    X_Results = np.append(X_Results, [i], axis = 0)
    Y_Results = np.append(Y_Results, [1-KNeighborsRegressorObj.score(X_test_PCA,y_test)], axis = 0)
   
#plt.xlabel('kNN')
#plt.ylabel('Misclassification Rate')
#plt.plot(X_Results, Y_Results)
minY = min(Y_Results)
y_index = np.where(Y_Results == minY)

#Keep the best result for use later
KNeighborsRegressorObj = KNeighborsRegressor(n_neighbors=X_Results[y_index][0], p=2, metric='minkowski')
KNeighborsRegressorObj.fit(X_train_PCA, y_train)

#Making predictions on the Test Set
y_pred_knn = KNeighborsRegressorObj.predict(X_test_PCA)

#accuracy 
score_knn = KNeighborsRegressorObj.score(X_test_PCA,y_test)
print('The accuracy after kNN is: ' + str(score_knn))

# mean squared error/Root Mean Square Error
mse_knn=round((mean_squared_error(y_test, reg_pred_y))/100, 4)
rmse_knn = round((np.sqrt(mse_knn))/100 ,4)

print("MSE:", mse_knn)
print("RMSE:",rmse_knn)


#K-Fold Cross Validation
print('\nCross Validation with kNN set to ' + str(X_Results[y_index][0]) + ':')
from sklearn.model_selection import cross_val_score
modelAccuracies_knn = cross_val_score(estimator=KNeighborsRegressorObj, X=X_train_PCA, y=y_train, cv=10)
print('Model Accuracy Mean:',modelAccuracies_knn.mean())
print('Model Accuracy StDev:',modelAccuracies_knn.std())

#print('The best  misclassification rate is: ' + str(minY) + ', with kNN set to: ' + str(X_Results[y_index][0]))

The accuracy after kNN is: 0.75195453821425
MSE: 910.7867
RMSE: 0.3018

Cross Validation with kNN set to 4:
Model Accuracy Mean: 0.7622241254722171
Model Accuracy StDev: 0.020320443208732542
