In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler as sc
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression as lm
from sklearn.linear_model import LogisticRegression as logm
from sklearn.metrics import mean_absolute_error as mse
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.neighbors import KNeighborsClassifier as knn

# Function Design for Model
## Include data preparation and model eveluation

In [2]:
def prep(y):
    y_ = str(y)
    dataname = '../Data/'+y_+'_ParkingStat_SouthLakeUnion_Group.csv'
    df = pd.read_csv(dataname)
    df['Year'] = y
    return(df)

In [3]:
df = prep(2018)
for y in [2019,2020]:
    df_t = prep(y)
    df.append(df_t)
df['SourceElementKey'] = df['SourceElementKey'].astype('category').cat.codes
df['ParkingTimeLimitCategory'] = df['ParkingTimeLimitCategory'].astype('category').cat.codes
df = df.drop(columns = 'AvgPaidOccupancy')
df.head()

Unnamed: 0,SourceElementKey,Month,Hour,Weekday,ParkingSpaceCount,ParkingRatio,ParkingTimeLimitCategory,Year
0,0,1,8,2,12,0.477315,7,2018
1,0,1,9,2,12,0.522685,7,2018
2,0,1,10,2,12,0.553704,7,2018
3,0,1,11,2,12,0.565278,7,2018
4,0,1,12,2,12,0.54213,7,2018


In [4]:
scaler = sc()
scaler.fit(df.drop(columns = 'ParkingRatio'))
X = scaler.transform(df.drop(columns = 'ParkingRatio'))
y = df.ParkingRatio
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.25, random_state=5)

# Linear

In [5]:
Liner = lm()
Liner.fit(X_train, Y_train)

LinearRegression()

In [6]:
np.sqrt(mse(Liner.predict(X_test),Y_test))

0.4460976148248531

In [7]:
from sklearn.neural_network import MLPRegressor

In [8]:
regr = MLPRegressor(random_state=20, max_iter=1000,batch_size=150,activation='tanh',shuffle=True)
regr.fit(X_train, Y_train)

MLPRegressor(activation='tanh', batch_size=150, max_iter=1000, random_state=20)

In [9]:
np.sqrt(mse(regr.predict(X_test),Y_test))

0.4256925023157514

In [10]:
df2 = prep(2018)
df_t = prep(2019)
df2.append(df_t)
df2['SourceElementKey'] = df2['SourceElementKey'].astype('category').cat.codes
df2['ParkingTimeLimitCategory'] = df2['ParkingTimeLimitCategory'].astype('category').cat.codes
df2 = df2.drop(columns = 'AvgPaidOccupancy')
df2.head()

Unnamed: 0,SourceElementKey,Month,Hour,Weekday,ParkingSpaceCount,ParkingRatio,ParkingTimeLimitCategory,Year
0,0,1,8,2,12,0.477315,7,2018
1,0,1,9,2,12,0.522685,7,2018
2,0,1,10,2,12,0.553704,7,2018
3,0,1,11,2,12,0.565278,7,2018
4,0,1,12,2,12,0.54213,7,2018


In [11]:
scaler = sc()
scaler.fit(df2.drop(columns = 'ParkingRatio'))
X = scaler.transform(df2.drop(columns = 'ParkingRatio'))
y = df2.ParkingRatio
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.25, random_state=5)

In [12]:
Liner = lm()
Liner.fit(X_train, Y_train)

LinearRegression()

In [13]:
np.sqrt(mse(Liner.predict(X_test),Y_test))

0.4460976148248531

In [14]:
regr = MLPRegressor(random_state=20, max_iter=1000,batch_size=150,activation='tanh',shuffle=True)
regr.fit(X_train, Y_train)

MLPRegressor(activation='tanh', batch_size=150, max_iter=1000, random_state=20)

In [15]:
np.sqrt(mse(regr.predict(X_test),Y_test))

0.4256925023157514

In [16]:
df.columns

Index(['SourceElementKey', 'Month', 'Hour', 'Weekday', 'ParkingSpaceCount',
       'ParkingRatio', 'ParkingTimeLimitCategory', 'Year'],
      dtype='object')

In [17]:
df_max = df.groupby(['SourceElementKey','Month','Weekday',
            'Year']).max('ParkingRatio')['Hour'].reset_index()
df_max['maxflag'] = 1
df_max.head()

Unnamed: 0,SourceElementKey,Month,Weekday,Year,Hour,maxflag
0,0,1,2,2018,17,1
1,0,1,3,2018,17,1
2,0,1,4,2018,17,1
3,0,1,5,2018,17,1
4,0,1,6,2018,17,1


In [18]:
df = df.merge(df_max, on = ['SourceElementKey','Month','Weekday','Year','Hour'],how = 'left')
df = df.fillna(0)

In [19]:
df['maxflag'] = df['maxflag'].astype('category').cat.codes
df.head()

Unnamed: 0,SourceElementKey,Month,Hour,Weekday,ParkingSpaceCount,ParkingRatio,ParkingTimeLimitCategory,Year,maxflag
0,0,1,8,2,12,0.477315,7,2018,0
1,0,1,9,2,12,0.522685,7,2018,0
2,0,1,10,2,12,0.553704,7,2018,0
3,0,1,11,2,12,0.565278,7,2018,0
4,0,1,12,2,12,0.54213,7,2018,0


In [22]:
scaler = sc()
scaler.fit(df.drop(columns = ['ParkingRatio','maxflag']))
X = scaler.transform(df.drop(columns = ['ParkingRatio','maxflag']))
y = df.maxflag
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.25, random_state=5)

In [23]:
logis = logm()
logis.fit(X_train, Y_train)

LogisticRegression()

In [24]:
sum(logis.predict(X_test)==Y_test)/len(logis.predict(X_test)==Y_test)

0.9977386172471802

In [25]:
confusion_matrix(Y_test, logis.predict(X_test))

array([[32629,    23],
       [   59,  3550]])

In [26]:
Knn = knn(4)
Knn.fit(X_train, Y_train)

KNeighborsClassifier(n_neighbors=4)

In [27]:
sum(Knn.predict(X_test)==Y_test)/len(Knn.predict(X_test)==Y_test)

0.9898789332892088

In [28]:
confusion_matrix(Y_test, Knn.predict(X_test))

array([[32621,    31],
       [  336,  3273]])

In [29]:
Randf = rf()
Randf.fit(X_train, Y_train)

RandomForestClassifier()

In [30]:
sum(Randf.predict(X_test)==Y_test)/len(Randf.predict(X_test)==Y_test)

0.9988968864620391

In [31]:
confusion_matrix(Y_test, Randf.predict(X_test))

array([[32623,    29],
       [   11,  3598]])