# Data Preparation

In [None]:
!pip install python-dotenv

In [None]:
import glob
import os
import pandas as pd
from dotenv import load_dotenv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from google.colab import drive
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
import os

In [None]:
# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# to load whole folder of historical data for year 2023:
# drive.mount('/content/drive')
# historical_parking_base_folder = '/content/drive/My Drive/NUS-ISS AIS Projects/Project 2/Data/Parking CSV Data/2023'
# %cd {historical_parking_base_folder}

# Raw historical availability data, read from a single CSV in the Colab workspace.
historical_parking_df = pd.read_csv('/content/parking_data.csv')

# Columns that prepare_resultant_df_v2 min-max scales; that function reads this
# name as a module-level global, so this cell must run before it is called.
numerical_features = ['total_lots', 'available_lots']
# NOTE(review): this module-level list is not referenced anywhere else in the
# visible notebook (prepare_parking_info_df_v2 defines its own local list).
categorical_features = [
    'fetch_timestamp',
    'car_park_number',
    'lot_type',
    'update_timestamp'
]
# Preview the first rows of the raw frame.
historical_parking_df.head()

#### For parsing through subfolders

In [None]:
"""def fetch_data_from_subfolders():
    subfolders = [f.path for f in os.scandir(historical_parking_base_folder) if f.is_dir()]

    # read csv in a dataframe and put all the DataFrames in a list
    dfs = []
    for subfolder in subfolders:
        all_files = glob.glob(os.path.join(subfolder, '*.csv'))
        all_files.sort(key=lambda x: os.path.basename(x))

        for file in all_files:
            df = pd.read_csv(file)
            dfs.append(df)

    # concat all the dataframes
    return pd.concat(dfs, ignore_index=True)"""

In [None]:
"""def get_train_test_X_y(resultant_df, target=None, test_size=0.2):
    # prepare X & y (classification)
    X, y_encoded = get_X_y_encoded(resultant_df, target)

    # train and test split for X & y
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=test_size, random_state=42, stratify=y_encoded)

    print("Shape of X_train = ", X_train.shape)
    print("Shape of X_test = ", X_test.shape)
    print("Shape of y_train = ", y_train.shape)
    print("Shape of y_test = ", y_test.shape)

    return X_train, X_test, y_train, y_test"""

In [None]:
"""def get_X_y_encoded(resultant_df, target=None):
    if target is None:
        target = ['car_park_number']

    X = resultant_df.drop(columns=target).to_numpy()
    y = resultant_df[target]

    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    return X, y_encoded"""

#### For Single File From Year 2023

In [None]:
# NOTE(review): load_dotenv() was already called in an earlier cell; presumably
# repeated here so this cell can be run on its own — confirm it is still needed.
load_dotenv()

def prepare_historical_parking_df_v2(csv_path='/content/parking_data.csv', max_rows=10000):
    """Load historical car-park availability data and add day-of-week features.

    The CSV is read afresh on every call (the module-level frame of the same
    name is not used). ``update_timestamp`` is parsed to datetimes and the
    weekday (Monday=0) is encoded cyclically as ``sin_day_of_week`` and
    ``cos_day_of_week``. No hour or month features are produced: the original
    implementation computed them and dropped them again before returning.

    Args:
        csv_path: Path of the CSV to read. It must contain an
            ``update_timestamp`` column.
        max_rows: Maximum number of leading rows to return. ``None`` returns
            every row.

    Returns:
        A new DataFrame holding the original columns plus the two day-of-week
        columns, limited to the first ``max_rows`` rows. It is an independent
        copy, so it does not keep the full dataset alive in memory.
    """
    # historical_parking_df = fetch_data_from_subfolders()

    historical_parking_df = pd.read_csv(csv_path)

    # feature engineering - updated timestamp
    update_times = pd.to_datetime(historical_parking_df['update_timestamp'])
    historical_parking_df['update_timestamp'] = update_times

    # cyclic encoding of the weekday so that Sunday (6) sits next to Monday (0)
    day_of_week = update_times.dt.weekday
    historical_parking_df['sin_day_of_week'] = np.sin(2 * np.pi * day_of_week / 7)
    historical_parking_df['cos_day_of_week'] = np.cos(2 * np.pi * day_of_week / 7)

    # The diagnostics describe the full frame, i.e. before the row cap is applied.
    print("Historical parking data shape = ", historical_parking_df.shape)
    print("Historical parking data top 5")
    print(historical_parking_df.head())

    # .copy() detaches the result from the full frame, so callers can add
    # columns freely and the untruncated data can be garbage collected.
    return historical_parking_df.iloc[:max_rows].copy()

In [None]:
# Rebind the raw frame loaded earlier to the feature-engineered, row-capped version.
historical_parking_df = prepare_historical_parking_df_v2()

historical_parking_df.head()

In [None]:
def prepare_parking_info_df_v2(folder_path = None, max_rows = 10000):
    """Load the static HDB car-park information and one-hot encode it.

    Drops ``address`` and ``gantry_height``, renames ``car_park_no`` to
    ``car_park_number`` and replaces the categorical columns with their
    one-hot encoded equivalents.

    Args:
        folder_path: Folder containing ``HDBCarparkInformation.csv``. ``None``
            (the default) reads ``/content/HDBCarparkInformation.csv``.
        max_rows: Maximum number of leading rows to return. ``None`` returns
            every row.

    Returns:
        A DataFrame with the remaining static columns followed by the one-hot
        columns, limited to the first ``max_rows`` rows.
    """
    csv_name = 'HDBCarparkInformation.csv'
    if folder_path is None:
        csv_path = '/content/' + csv_name
    else:
        csv_path = os.path.join(folder_path, csv_name)
    parking_info_df = pd.read_csv(csv_path)

    categorical_features = ['car_park_type', 'type_of_parking_system', 'short_term_parking',
    'free_parking', 'night_parking', 'car_park_basement']

    parking_info_df = (parking_info_df
           .drop(columns=['address', 'gantry_height'])
           .rename(columns={'car_park_no': 'car_park_number'}))

    encoder = OneHotEncoder()
    encoded_features = pd.DataFrame(encoder.fit_transform(parking_info_df[categorical_features]).toarray(),
            columns=encoder.get_feature_names_out())
    # reset_index keeps the base frame positionally aligned with encoded_features
    parking_info_df = parking_info_df.drop(columns=categorical_features).reset_index(drop=True)

    print("Car park static info shape = ", parking_info_df.shape)
    print("Car park static info top 5")
    print(parking_info_df.head())

    # Cap the rows only after joining the two halves: truncating just the base
    # frame first would make concat pad the missing rows with NaN.
    combined_df = pd.concat([parking_info_df, encoded_features], axis=1)
    return combined_df.iloc[:max_rows]

In [None]:
def prepare_resultant_df_v2():
    """Join the historical and static car-park data and scale the lot counts.

    Returns:
        The first 10000 rows of the inner join on ``car_park_number``, with the
        columns named in the module-level ``numerical_features`` min-max scaled.
    """
    # static car lot info is prepared first, then the historical parking data
    static_info = prepare_parking_info_df_v2()
    history = prepare_historical_parking_df_v2()

    # The slice is chained on purpose so no reference to the full join is kept.
    resultant_df = history.merge(static_info, on='car_park_number', how='inner').iloc[:10000]

    scaled_values = MinMaxScaler().fit_transform(resultant_df[numerical_features])
    resultant_df[numerical_features] = scaled_values

    print("Resultant dataframe shape = ", resultant_df.shape)
    print("Resultant dataframe top 5")
    print(resultant_df.head())

    return resultant_df

In [None]:
# __name__ is '__main__' when this runs as a notebook cell or as a script, so the
# full preparation pipeline is executed and the resulting frame is printed.
if __name__ == '__main__':
    print(prepare_resultant_df_v2())

# Data Pre-Processing

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import glob

In [None]:
"""parking_base_folder = historical_parking_base_folder

subfolders = [f.path for f in os.scandir(parking_base_folder) if f.is_dir()]


dfs = []
for subfolder in subfolders:
    all_files = glob.glob(os.path.join(subfolder, '*.csv'))
    all_files.sort(key=lambda x: os.path.basename(x))

    for file in all_files:
        df = pd.read_csv(file)
        dfs.append(df)


!free -h

parking_final_df = pd.concat(dfs, ignore_index=True)"""

In [None]:
# Build a fresh prepared historical frame for the analysis below (this re-reads the CSV).
parking_final_df = prepare_historical_parking_df_v2()

In [None]:
# Occupied lots = total lots minus the lots currently available; the column is added in place.
parking_final_df['occupied_lots'] = parking_final_df['total_lots'] - parking_final_df['available_lots']

In [None]:
parking_final_df.head()

In [None]:
# Static car-park information, with each categorical column replaced by its integer category code.
parking_info_df = pd.read_csv('/content/HDBCarparkInformation.csv')

columns_to_encode = [
    'car_park_type',
    'type_of_parking_system',
    'short_term_parking',
    'free_parking',
    'night_parking',
    'car_park_decks',
    'car_park_basement',
]
for column_name in columns_to_encode:
    parking_info_df[column_name] = pd.Categorical(parking_info_df[column_name]).codes

# Drop the unused column and align the key name with the historical data.
parking_info_df = (
    parking_info_df
    .drop(columns=['gantry_height'])
    .rename(columns={'car_park_no': 'car_park_number'})
)

In [None]:
parking_info_df.head()

In [None]:
# Inner-join the historical and static frames on the car park id, then discard
# the columns that are not used as features.
resultant_df = (
    parking_final_df
    .merge(parking_info_df, on='car_park_number', how='inner')
    .drop(columns=['fetch_timestamp', 'lot_type', 'address'])
)
resultant_df.head()

In [None]:
# Split the update timestamp into its calendar/clock components and drop the original column.
update_times = pd.to_datetime(resultant_df['update_timestamp'])

resultant_df = resultant_df.assign(
    update_year=update_times.dt.year,
    update_month=update_times.dt.month,
    update_day=update_times.dt.day,
    update_hour=update_times.dt.hour,
    update_minute=update_times.dt.minute,
    update_second=update_times.dt.second,
).drop(columns='update_timestamp')

resultant_df.head()

In [None]:
# Spot-check: show the merged rows of a single car park ('HE12').
filtered_df = resultant_df[resultant_df['car_park_number'] == 'HE12']
filtered_df.head()

In [None]:
# Only the first 100 merged rows are used for the feature analysis and the classifier below.
# NOTE(review): the later train_test_split stratifies on the encoded target, and
# scikit-learn requires at least two samples per class for a stratified split —
# confirm this 100-row slice satisfies that.
resultant_df_small = resultant_df[:100]
# Features: every column except the target. Target: the car park identifier.
X = resultant_df_small.drop('car_park_number', axis=1)
y = resultant_df_small['car_park_number']

In [None]:
X.head()

In [None]:
# Map each car park identifier to an integer class label.
y_encoded = LabelEncoder().fit_transform(y)

In [None]:
# Number of distinct classes (car parks) present in the sample.
np.unique(y_encoded).shape

In [None]:
# mutual info of each feature with classification output
from sklearn.feature_selection import mutual_info_classif

# mutual_info_classif is stochastic for continuous features; fixing
# random_state makes the ranking reproducible across runs.
mi_scores = pd.Series(
    mutual_info_classif(X, y, random_state=42),
    name="MI Scores",
    index=X.columns,
).sort_values(ascending=False)
mi_scores

In [None]:
# correlation of features with each other
X.corr()

In [None]:
# Visualization of correlation
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation between the feature columns
correlation_matrix = X.corr()

# Draw the matrix as an annotated heatmap on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix of Features')
plt.show()

# Support Vector Classifier

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [None]:
# Hold out 20% of the samples for testing, keeping the class proportions equal in both splits
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

# Linear-kernel SVC (C=1.0, one-vs-rest decision function). SVMs are not scale
# invariant and the features span very different ranges, so they are
# standardised first. Inside the pipeline the scaler is fitted on the training
# split only, which keeps test-set statistics out of training.
# You can try other kernels like 'rbf' or 'poly'
svc_model = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC(kernel='linear', C=1.0, decision_function_shape='ovr')),
])

# Fit the scaler and the classifier on the training data
svc_model.fit(X_train, y_train)

# Make predictions on the test set (the fitted scaler is applied automatically)
y_pred = svc_model.predict(X_test)

In [None]:
# Overall accuracy on the held-out split
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Per-class precision / recall / F1, followed by the confusion matrix
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")