In [34]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
# from yellowbrick.cluster import KElbowVisualizer
import glob
import os

In [3]:
parking_base_folder = '/Users/harsh/Desktop/Parking CSV Data/2023'

subfolders = [f.path for f in os.scandir(parking_base_folder) if f.is_dir()]

# Alternative way to get subfolders
# folders = ['01', '02', '03']
# subfolders = [os.path.join(parking_base_folder, f) for f in folders]

In [4]:
dfs = []
for subfolder in subfolders:
    all_files = glob.glob(os.path.join(subfolder, '*.csv'))
    all_files.sort(key=lambda x: os.path.basename(x))
    
    for file in all_files:
        df = pd.read_csv(file)
        dfs.append(df)

In [5]:
parking_final_df = pd.concat(dfs, ignore_index=True)

In [6]:
parking_final_df['occupied_lots'] = parking_final_df['total_lots'] - parking_final_df['available_lots']

In [7]:
parking_final_df.shape

(114693019, 7)

In [8]:
parking_final_df.head()

Unnamed: 0,fetch_timestamp,car_park_number,total_lots,available_lots,lot_type,update_timestamp,occupied_lots
0,2023-03-01T00:05:28,HE12,105,70,C,2023-03-01T00:04:13,35
1,2023-03-01T00:05:28,HLM,583,480,C,2023-03-01T00:04:24,103
2,2023-03-01T00:05:28,RHM,329,117,C,2023-03-01T00:04:13,212
3,2023-03-01T00:05:28,BM29,97,70,C,2023-03-01T00:04:38,27
4,2023-03-01T00:05:28,Q81,96,78,C,2023-03-01T00:04:12,18


In [9]:
info_base_folder = '/Users/harsh/Desktop/Pattern Recognition Systems/Project/Data/HDB'
parking_info_df = pd.read_csv(os.path.join(info_base_folder, 'HDBCarparkInformation.csv'))
parking_info_df.drop(labels=['gantry_height'], axis=1, inplace=True)

In [10]:
carpark_type = pd.Categorical(parking_info_df['car_park_type']).codes
carpark_system = pd.Categorical(parking_info_df['type_of_parking_system']).codes
carpark_short_term = pd.Categorical(parking_info_df['short_term_parking']).codes
carpark_free = pd.Categorical(parking_info_df['free_parking']).codes
carpark_night = pd.Categorical(parking_info_df['night_parking']).codes
carpark_deck = pd.Categorical(parking_info_df['car_park_decks']).codes
carpark_basement = pd.Categorical(parking_info_df['car_park_basement']).codes

parking_info_df['car_park_type'] = carpark_type
parking_info_df['type_of_parking_system'] = carpark_system
parking_info_df['short_term_parking'] = carpark_short_term
parking_info_df['free_parking'] = carpark_free
parking_info_df['night_parking'] = carpark_night
parking_info_df['car_park_decks'] = carpark_deck
parking_info_df['car_park_basement'] = carpark_basement

In [11]:
parking_info_df.rename(columns={'car_park_no': 'car_park_number'}, inplace=True)

In [12]:
parking_info_df.head()

Unnamed: 0,car_park_number,address,x_coord,y_coord,car_park_type,type_of_parking_system,short_term_parking,free_parking,night_parking,car_park_decks,car_park_basement
0,ACB,BLK 270/271 ALBERT CENTRE BASEMENT CAR PARK,30314.7936,31490.4942,0,1,3,0,1,1,1
1,ACM,BLK 98A ALJUNIED CRESCENT,33758.4143,33695.5198,4,1,3,2,1,5,0
2,AH1,BLK 101 JALAN DUSUN,29257.7203,34500.3599,5,1,3,2,1,0,0
3,AK19,BLOCK 253 ANG MO KIO STREET 21,28185.4359,39012.6664,5,0,1,0,0,0,0
4,AK31,BLK 302/348 ANG MO KIO STREET 31,29482.029,38684.1754,5,0,2,0,0,0,0


In [13]:
# merge the 2 results and then drop the columns that are not needed

resultant_df = pd.merge(parking_final_df, parking_info_df, on='car_park_number', how='inner')
resultant_df.drop(['fetch_timestamp', 'lot_type', 'address'], axis=1, inplace=True)
resultant_df.head()

Unnamed: 0,car_park_number,total_lots,available_lots,update_timestamp,occupied_lots,x_coord,y_coord,car_park_type,type_of_parking_system,short_term_parking,free_parking,night_parking,car_park_decks,car_park_basement
0,HE12,105,70,2023-03-01T00:04:13,35,26367.5806,30069.2434,5,1,3,2,1,0,0
1,HLM,583,480,2023-03-01T00:04:24,103,29354.6692,29687.508,4,1,3,0,1,11,0
2,RHM,329,117,2023-03-01T00:04:13,212,26359.4531,29876.1692,4,1,3,2,1,10,0
3,BM29,97,70,2023-03-01T00:04:38,27,26194.9184,29563.3295,0,1,3,0,1,1,1
4,Q81,96,78,2023-03-01T00:04:12,18,23531.2041,32206.3235,5,1,3,0,1,0,0


In [14]:
resultant_df['update_timestamp'] = pd.to_datetime(resultant_df['update_timestamp'])

resultant_df['update_year'] = resultant_df['update_timestamp'].dt.year
resultant_df['update_month'] = resultant_df['update_timestamp'].dt.month
resultant_df['update_day'] = resultant_df['update_timestamp'].dt.day
resultant_df['update_hour'] = resultant_df['update_timestamp'].dt.hour
resultant_df['update_minute'] = resultant_df['update_timestamp'].dt.minute
resultant_df['update_second'] = resultant_df['update_timestamp'].dt.second
resultant_df.drop('update_timestamp', axis=1, inplace=True)

resultant_df.head()

Unnamed: 0,car_park_number,total_lots,available_lots,occupied_lots,x_coord,y_coord,car_park_type,type_of_parking_system,short_term_parking,free_parking,night_parking,car_park_decks,car_park_basement,update_year,update_month,update_day,update_hour,update_minute,update_second
0,HE12,105,70,35,26367.5806,30069.2434,5,1,3,2,1,0,0,2023,3,1,0,4,13
1,HLM,583,480,103,29354.6692,29687.508,4,1,3,0,1,11,0,2023,3,1,0,4,24
2,RHM,329,117,212,26359.4531,29876.1692,4,1,3,2,1,10,0,2023,3,1,0,4,13
3,BM29,97,70,27,26194.9184,29563.3295,0,1,3,0,1,1,1,2023,3,1,0,4,38
4,Q81,96,78,18,23531.2041,32206.3235,5,1,3,0,1,0,0,2023,3,1,0,4,12


In [15]:
# X = resultant_df.drop('car_park_number', axis=1)
# y = resultant_df['car_park_number']

In [16]:
resultant_df_small = resultant_df[:10000]
X = resultant_df_small.drop('car_park_number', axis=1)
y = resultant_df_small['car_park_number']

In [17]:
X.head()

Unnamed: 0,total_lots,available_lots,occupied_lots,x_coord,y_coord,car_park_type,type_of_parking_system,short_term_parking,free_parking,night_parking,car_park_decks,car_park_basement,update_year,update_month,update_day,update_hour,update_minute,update_second
0,105,70,35,26367.5806,30069.2434,5,1,3,2,1,0,0,2023,3,1,0,4,13
1,583,480,103,29354.6692,29687.508,4,1,3,0,1,11,0,2023,3,1,0,4,24
2,329,117,212,26359.4531,29876.1692,4,1,3,2,1,10,0,2023,3,1,0,4,13
3,97,70,27,26194.9184,29563.3295,0,1,3,0,1,1,1,2023,3,1,0,4,38
4,96,78,18,23531.2041,32206.3235,5,1,3,0,1,0,0,2023,3,1,0,4,12


In [18]:
y_encoded = LabelEncoder().fit_transform(y)

In [19]:
np.unique(y_encoded).shape

(1913,)

In [21]:
# mutual info of each feature with classification output
from sklearn.feature_selection import mutual_info_classif

mi_scores = mutual_info_classif(X, y)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
mi_scores = mi_scores.sort_values(ascending=False)
mi_scores

y_coord                   8.106012
x_coord                   8.066020
total_lots                7.048319
occupied_lots             4.794506
available_lots            4.509299
car_park_type             4.267610
update_day                4.038055
update_year               3.987687
update_month              3.948502
type_of_parking_system    3.858566
short_term_parking        3.843631
night_parking             3.725494
free_parking              3.436559
car_park_decks            3.307474
update_second             2.885489
update_hour               0.496842
update_minute             0.348116
car_park_basement         0.119907
Name: MI Scores, dtype: float64

In [22]:
# correlation of features with each other
X.corr()

Unnamed: 0,total_lots,available_lots,occupied_lots,x_coord,y_coord,car_park_type,type_of_parking_system,short_term_parking,free_parking,night_parking,car_park_decks,car_park_basement,update_year,update_month,update_day,update_hour,update_minute,update_second
total_lots,1.0,0.855071,0.799269,-0.066114,0.309094,-0.172757,-0.02413,0.202029,0.246948,0.234276,0.325903,-0.032134,-0.047855,0.079342,-0.096118,-0.104974,-0.051731,0.030361
available_lots,0.855071,1.0,0.371821,-0.063306,0.207339,-0.188283,-0.010975,0.23022,0.174871,0.23612,0.373913,-0.031039,-0.090324,0.074502,-0.041633,-0.041476,-0.011971,0.040429
occupied_lots,0.799269,0.371821,1.0,-0.044992,0.313065,-0.091064,-0.03048,0.094865,0.239435,0.14576,0.150094,-0.021554,0.019012,0.055697,-0.123828,-0.139866,-0.078741,0.007497
x_coord,-0.066114,-0.063306,-0.044992,1.0,0.035153,-0.046214,-0.0374,-0.078405,-0.094815,-0.068371,-0.029927,0.047752,-0.01788,0.011879,0.04882,0.036255,0.03215,0.083779
y_coord,0.309094,0.207339,0.313065,0.035153,1.0,-0.160883,-0.048819,0.094569,0.079511,0.091798,0.237718,0.019906,-0.031972,0.04936,-0.120395,-0.167656,-0.083087,0.094044
car_park_type,-0.172757,-0.188283,-0.091064,-0.046214,-0.160883,1.0,-0.018547,-0.093538,0.036595,-0.119602,-0.461372,-0.708096,0.007462,-0.00108,0.002776,0.008891,0.0155,0.004625
type_of_parking_system,-0.02413,-0.010975,-0.03048,-0.0374,-0.048819,-0.018547,1.0,0.045648,0.029848,0.063622,0.027948,0.008386,0.136036,-0.077019,-0.026817,-0.027571,-0.029002,-0.025234
short_term_parking,0.202029,0.23022,0.094865,-0.078405,0.094569,-0.093538,0.045648,1.0,0.316773,0.940542,0.290905,-0.075367,-0.006537,-0.002295,-0.008113,-0.019831,-0.015421,0.040724
free_parking,0.246948,0.174871,0.239435,-0.094815,0.079511,0.036595,0.029848,0.316773,1.0,0.389271,0.228296,-0.177247,-0.031958,0.045809,-0.077349,-0.058566,-0.035733,0.058322
night_parking,0.234276,0.23612,0.14576,-0.068371,0.091798,-0.119602,0.063622,0.940542,0.389271,1.0,0.310725,-0.066978,0.004082,0.001772,-0.015215,-0.025427,-0.021011,0.028935


In [26]:
k_means = None
for k in range(2, 19):
    k_means = KMeans(n_clusters=k, n_init=5, max_iter=100, random_state=42)
    y_pred = k_means.fit_predict(X)
    
    silhouette_avg = silhouette_score(X, y_pred)
    print("n_clusters =", k, "Average silhouette_score =", silhouette_avg)

n_clusters = 2 Average silhouette_score = 0.4514876876261954
n_clusters = 3 Average silhouette_score = 0.4476132344192992
n_clusters = 4 Average silhouette_score = 0.48341005701563916
n_clusters = 5 Average silhouette_score = 0.4825903749358634
n_clusters = 6 Average silhouette_score = 0.5260770891071724
n_clusters = 7 Average silhouette_score = 0.511470968590754
n_clusters = 8 Average silhouette_score = 0.5209311821600884
n_clusters = 9 Average silhouette_score = 0.530852827090965
n_clusters = 10 Average silhouette_score = 0.5306200578744013
n_clusters = 11 Average silhouette_score = 0.5289636540276971
n_clusters = 12 Average silhouette_score = 0.5266252573529934
n_clusters = 13 Average silhouette_score = 0.517936792698994
n_clusters = 14 Average silhouette_score = 0.5048527594092851
n_clusters = 15 Average silhouette_score = 0.5023795495093933
n_clusters = 16 Average silhouette_score = 0.48027411982034984
n_clusters = 17 Average silhouette_score = 0.4910443108295518
n_clusters = 18 A

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded)

In [37]:
X_train.shape

(7000, 18)

In [38]:
y_train.shape

(7000,)

In [39]:
X_test.shape

(3000, 18)

In [40]:
y_test.shape

(3000,)

In [None]:
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(X_train, y_train)

In [None]:
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

In [None]:
print(f"Accuracy: {accuracy * 100:.2f}%")