In [1]:
# Import Necessary Libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Define Data Preparation Functions
def fetch_data_from_single_file(file_path):
    """Fetch data from a single CSV file."""
    return pd.read_csv(file_path)

def replace_timestamp_with_cyclic_features(target_df, timestamp_key):
    """Convert date and time features into cyclic features."""
    target_df[timestamp_key] = pd.to_datetime(target_df[timestamp_key])

    target_df['month'] = target_df[timestamp_key].dt.month
    target_df['day_of_week'] = target_df[timestamp_key].dt.weekday
    target_df['hour'] = target_df[timestamp_key].dt.hour

    # Create cyclic features from month, day_of_week, hour
    target_df['sin_hour'] = np.sin(2 * np.pi * target_df['hour'] / 24)
    target_df['cos_hour'] = np.cos(2 * np.pi * target_df['hour'] / 24)
    target_df['sin_day_of_week'] = np.sin(2 * np.pi * target_df['day_of_week'] / 7)
    target_df['cos_day_of_week'] = np.cos(2 * np.pi * target_df['day_of_week'] / 7)
    target_df['sin_month'] = np.sin(2 * np.pi * target_df['month'] / 12)
    target_df['cos_month'] = np.cos(2 * np.pi * target_df['month'] / 12)

    return target_df.drop(columns=[timestamp_key, 'month', 'day_of_week', 'hour'])

def prepare_historical_parking_df_v2(file_path):
    """Prepare historical parking data from a single CSV file, limiting to the first 1000 rows."""
    historical_parking_df = fetch_data_from_single_file(file_path)

    # Drop not required features if they exist
    columns_to_drop = ['fetch_timestamp', 'lot_type']
    historical_parking_df = historical_parking_df.drop(columns=[col for col in columns_to_drop if col in historical_parking_df.columns], errors='ignore')

    # Limit to first 1000 rows
    historical_parking_df = historical_parking_df.head(1000)

    # Replace timestamp with cyclic values for its features
    historical_parking_df = replace_timestamp_with_cyclic_features(
        historical_parking_df.copy(),
        'update_timestamp'
    )

    print("Historical parking data shape = ", historical_parking_df.shape)
    print("Historical parking data top 5")
    print(historical_parking_df.head())

    return historical_parking_df

def prepare_parking_info_df_v2(folder_path=None):
    """Prepare parking information DataFrame."""
    parking_info_df = pd.read_csv('/content/parking_data.csv')  # Adjust this path as necessary

    parking_info_df = (parking_info_df
           .drop(columns=['address', 'gantry_height'], errors='ignore')
           .rename(columns={'car_park_no': 'car_park_number'}))

    encoder = OneHotEncoder()
    encoded_features = pd.DataFrame(encoder.fit_transform(parking_info_df[['lot_type']]).toarray(),
            columns=encoder.get_feature_names_out())
    parking_info_df = parking_info_df.drop(columns=['lot_type'], errors='ignore').reset_index(drop=True)

    print("Car park static info shape = ", parking_info_df.shape)
    print("Car park static info top 5")
    print(parking_info_df.head())

    return pd.concat([parking_info_df, encoded_features], axis=1)

def prepare_resultant_df_v2(file_path):
    """Prepare the resultant DataFrame for modeling."""
    parking_info_df = prepare_parking_info_df_v2()
    historical_parking_df = prepare_historical_parking_df_v2(file_path)

    # Prepare a resultant DataFrame
    merged_df = pd.merge(historical_parking_df, parking_info_df, on='car_park_number', how='inner')

    # Identify columns for scaling
    numerical_features = ['total_lots_x', 'available_lots_x']  # Adjust to match merged DataFrame
    if not all(col in merged_df.columns for col in numerical_features):
        print(f"Warning: Some numerical features are missing from the merged DataFrame: {numerical_features}")

    # Scale numerical features
    scaler = MinMaxScaler()
    merged_df[numerical_features] = scaler.fit_transform(merged_df[numerical_features])

    print("Resultant dataframe shape = ", merged_df.shape)
    print("Resultant dataframe top 5")
    print(merged_df.head())

    return merged_df

# Load and Prepare the Data
single_csv_file_path = '/content/parking_data.csv'  # Ensure this is the correct path to your CSV
resultant_df = prepare_resultant_df_v2(single_csv_file_path)

# Slice to use only the first 1000 rows
resultant_df = resultant_df.head(1000)

# Display the shape and first few rows of the DataFrame
print("Resultant DataFrame Shape:", resultant_df.shape)
print(resultant_df.head())

# Check data types
print("Data types in resultant_df:\n", resultant_df.dtypes)

Car park static info shape =  (652935, 5)
Car park static info top 5
       fetch_timestamp car_park_number  total_lots  available_lots  \
0  2023-01-01T00:00:27            HE12         105              43   
1  2023-01-01T00:00:27             HLM           1               0   
2  2023-01-01T00:00:27             RHM         329             160   
3  2023-01-01T00:00:27            BM29          97              80   
4  2023-01-01T00:00:27             Q81           1               0   

      update_timestamp  
0  2022-12-31T23:59:59  
1  2023-01-01T00:00:07  
2  2022-12-31T23:59:59  
3  2023-01-01T00:00:00  
4  2023-01-01T00:00:08  
Historical parking data shape =  (1000, 9)
Historical parking data top 5
  car_park_number  total_lots  available_lots  sin_hour  cos_hour  \
0            HE12         105              43 -0.258819  0.965926   
1             HLM           1               0  0.000000  1.000000   
2             RHM         329             160 -0.258819  0.965926   
3          

In [6]:
# Prepare Features and Target Variable
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Define features and target
X = resultant_df.drop('available_lots_x', axis=1)  # Adjust 'available_lots_x' if it's not the target
y = resultant_df['available_lots_x']  # Adjust 'available_lots_x' if it's not the target

# Convert 'available_lots_x' to categorical for classification (adjust bins as needed)
y = pd.cut(y, bins=[-1, 0, 10, 50, 100], labels=['Low', 'Medium', 'High', 'Full'])

# Encode categorical features (if any)
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()
if categorical_columns:
    encoder = OneHotEncoder(sparse_output=False, drop='first')
    encoded_features = encoder.fit_transform(X[categorical_columns])
    encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(categorical_columns))
    X = pd.concat([X.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)
    X = X.drop(categorical_columns, axis=1)  # Drop original categorical columns

print("Remaining columns in X before training:", X.columns)
print(X.dtypes)  # Check the datatypes

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the Support Vector Classifier
model = SVC(kernel='linear', C=1, decision_function_shape='ovr')  # Adjust parameters as needed
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))

Remaining columns in X before training: Index(['total_lots_x', 'sin_hour', 'cos_hour', 'sin_day_of_week',
       'cos_day_of_week', 'sin_month', 'cos_month', 'total_lots_y',
       'available_lots_y', 'lot_type_C',
       ...
       'update_timestamp_2023-01-01T23:34:52',
       'update_timestamp_2023-01-01T23:35:04',
       'update_timestamp_2023-01-01T23:39:52',
       'update_timestamp_2023-01-01T23:40:04',
       'update_timestamp_2023-01-01T23:44:52',
       'update_timestamp_2023-01-01T23:45:04',
       'update_timestamp_2023-01-01T23:50:04',
       'update_timestamp_2023-01-01T23:50:22',
       'update_timestamp_2023-01-01T23:54:04',
       'update_timestamp_2023-01-01T23:54:22'],
      dtype='object', length=1065)
total_lots_x                            float64
sin_hour                                float64
cos_hour                                float64
sin_day_of_week                         float64
cos_day_of_week                         float64
                            