In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler as sc
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.ensemble import RandomForestClassifier as rf

# Function Design for Classifier
## Include data preparation and model eveluation

In [2]:
# Data preprocess
def knnprep(y):
    y = str(y)
    dataname = '../Data/'+y+'_ParkingLot.csv'
    df = pd.read_csv(dataname)
    df = df.drop(columns = ['SourceElementKey','BlockfaceName','PaidParkingRate','longitude','latitude'])
    df = df.fillna('')
    for c in df.columns:
        if c != 'ParkingSpaceCount':
            df[c] = df[c].astype('category').cat.codes
    return(df)
# Model evaluation for knn and random forest
class classmodelev():
    def __init__(self, df):
        self.y = df.PaidParkingArea
        self.X = df.drop(columns = ['PaidParkingArea'])
        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(self.X, self.y, test_size=0.25, random_state=5)
    def knnpr(self):
        error = []
        for k in range(1,20):
            Knn = knn(n_neighbors=k)
            Knn.fit(self.X_train,self.Y_train)
            e = sum(Knn.predict(self.X_test)==self.Y_test)/len(Knn.predict(self.X_test)==self.Y_test)
            error.append(e)
        print('Best k:', error.index(max(error))+1)
        print('Error rate:',max(error))
        Knn = knn(n_neighbors=error.index(max(error))+1)
        Knn.fit(self.X_train,self.Y_train)
        return(Knn)
    def rfpr(self):
        RandFor =  rf(random_state=0)
        RandFor.fit(self.X, self.y)
        e = sum(RandFor.predict(self.X_test)==self.Y_test)/len(RandFor.predict(self.X_test)==self.Y_test)
        print('Error rate:',e)
        return(RandFor)

In [3]:
df2018 = knnprep(2018)
df2018.head()

Unnamed: 0,SideOfStreet,ParkingTimeLimitCategory,ParkingSpaceCount,PaidParkingArea,PaidParkingSubArea,ParkingCategory
0,4,1,2,6,2,1
1,3,2,8,3,4,1
2,5,1,6,8,7,1
3,7,2,14,18,2,1
4,0,1,8,13,0,1


# Model evaluation for each year

In [4]:
performance2018 = classmodelev(df2018)
print('knn')
model2018knn = performance2018.knnpr()
print('Random Forest')
model2018rf = performance2018.rfpr()

knn
Best k: 4
Error rate: 0.5396825396825397
Random Forest
Error rate: 0.8412698412698413


In [5]:
df2019 = knnprep(2019)
performance2019 = classmodelev(df2019)
print('knn')
model2019knn = performance2019.knnpr()
print('Random Forest')
model2019rf = performance2019.rfpr()

knn
Best k: 6
Error rate: 0.5800524934383202
Random Forest
Error rate: 0.8713910761154856


In [6]:
df2020 = knnprep(2020)
performance2020 = classmodelev(df2020)
print('knn')
model2020knn = performance2020.knnpr()
print('Random Forest')
model2020rf = performance2020.rfpr()

knn
Best k: 1
Error rate: 0.5989010989010989
Random Forest
Error rate: 0.8324175824175825


# Cross Year Validation

In [7]:
cv_y_knn = model2018knn.predict(performance2019.X_test)==performance2019.Y_test
print(sum(cv_y_knn)/len(cv_y_knn))
cv_y_knn = model2018knn.predict(performance2020.X_test)==performance2020.Y_test
print(sum(cv_y_knn)/len(cv_y_knn))
cv_y_rf = model2018rf.predict(performance2019.X_test)==performance2019.Y_test
print(sum(cv_y_rf)/len(cv_y_rf))
cv_y_rf = model2018rf.predict(performance2020.X_test)==performance2020.Y_test
print(sum(cv_y_rf)/len(cv_y_rf))

0.6377952755905512
0.6181318681318682
0.8503937007874016
0.7747252747252747


In [8]:
cv_y_knn = model2019knn.predict(performance2020.X_train)==performance2020.Y_train
print(sum(cv_y_knn)/len(cv_y_knn))
cv_y_knn = model2019knn.predict(performance2018.X_train)==performance2018.Y_train
print(sum(cv_y_knn)/len(cv_y_knn))
cv_y_rf = model2019rf.predict(performance2020.X_train)==performance2020.Y_train
print(sum(cv_y_rf)/len(cv_y_rf))
cv_y_rf = model2019rf.predict(performance2018.X_train)==performance2018.Y_train
print(sum(cv_y_rf)/len(cv_y_rf))

0.6584022038567493
0.6534040671971706
0.8512396694214877
0.830238726790451


In [9]:
cv_y_knn = model2020knn.predict(performance2018.X_train)==performance2018.Y_train
print(sum(cv_y_knn)/len(cv_y_knn))
cv_y_knn = model2020knn.predict(performance2019.X_train)==performance2019.Y_train
print(sum(cv_y_knn)/len(cv_y_knn))
cv_y_rf = model2020rf.predict(performance2018.X_train)==performance2018.Y_train
print(sum(cv_y_rf)/len(cv_y_rf))
cv_y_rf = model2020rf.predict(performance2019.X_train)==performance2019.Y_train
print(sum(cv_y_rf)/len(cv_y_rf))

0.74447391688771
0.7620297462817148
0.8222811671087533
0.8355205599300087
