In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import joblib
import numpy as np
from bokeh.plotting import figure, output_file, save
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [6]:
# Read the raw parking records from Google Drive into a DataFrame
dataset_path = '/content/drive/MyDrive/dataset.csv'
df = pd.read_csv(dataset_path)

In [7]:
# Preview the first five rows to sanity-check the loaded columns
df.head(n=5)

Unnamed: 0,ID,SystemCodeNumber,Capacity,Latitude,Longitude,Occupancy,VehicleType,TrafficConditionNearby,QueueLength,IsSpecialDay,LastUpdatedDate,LastUpdatedTime
0,0,BHMBCCMKT01,577,26.144536,91.736172,61,car,low,1,0,04-10-2016,07:59:00
1,1,BHMBCCMKT01,577,26.144536,91.736172,64,car,low,1,0,04-10-2016,08:25:00
2,2,BHMBCCMKT01,577,26.144536,91.736172,80,car,low,2,0,04-10-2016,08:59:00
3,3,BHMBCCMKT01,577,26.144536,91.736172,107,car,low,2,0,04-10-2016,09:32:00
4,4,BHMBCCMKT01,577,26.144536,91.736172,150,bike,low,2,0,04-10-2016,09:59:00


In [8]:
# Build one timestamp per record from the separate date and time strings.
# Dates are day-first (e.g. 04-10-2016 is 4 October 2016); values that cannot
# be parsed become NaT and those rows are dropped.
timestamp_text = df['LastUpdatedDate'] + ' ' + df['LastUpdatedTime']
df['DateTime'] = pd.to_datetime(timestamp_text, dayfirst=True, errors='coerce')
df.dropna(subset=['DateTime'], inplace=True)


In [9]:
# Feature engineering: calendar features derived from the timestamp, plus
# occupancy expressed as a percentage of the lot's capacity
timestamp_parts = df['DateTime'].dt
df['Hour'] = timestamp_parts.hour
df['DayOfWeek'] = timestamp_parts.dayofweek

weekend_days = [5, 6]  # dayofweek numbering: Monday=0 ... Saturday=5, Sunday=6
df['IsWeekend'] = df['DayOfWeek'].isin(weekend_days).astype(int)

df['OccupancyPercent'] = df['Occupancy'].div(df['Capacity']).mul(100)

In [10]:
# One-hot encode the categorical columns; drop_first removes one level per
# column so the remaining indicator columns are not redundant
categorical_cols = ['VehicleType', 'TrafficConditionNearby']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [11]:
# Output directory: every artifact written by this notebook (plots, CSVs,
# the pickled model) is saved under this folder
output_dir = '/content/drive/MyDrive/parking_analysis_results'
# exist_ok=True makes re-running this cell safe when the folder already exists
os.makedirs(output_dir, exist_ok=True)


In [25]:
# Debug check of the 'ID' column as a candidate lot identifier.
# This cell builds its own sample from df instead of relying on a
# `lot_sample` variable created by a later cell, so it also works on a fresh
# kernel (Restart & Run All). 'DynamicPrice' is shown only if it already
# exists, because that column is added further down the notebook.
lot_ids = df['ID'].unique()
id_preview = df[df['ID'] == lot_ids[0]].sort_values('DateTime')
preview_cols = [col for col in ['DateTime', 'DynamicPrice'] if col in id_preview.columns]

print("Unique Lot IDs:", lot_ids)
print("Sample selected ID:", lot_ids[0])
print("Lot sample size:", len(id_preview))
print(id_preview[preview_cols].head())


Unique Lot IDs: [    0     1     2 ... 18365 18366 18367]
Sample selected ID: 0
Lot sample size: 1
             DateTime  DynamicPrice
0 2016-10-04 07:59:00       10.1602


In [12]:
# EDA: mean occupancy percentage for each hour of the day, saved as a PNG
hourly_occupancy = df.groupby('Hour')['OccupancyPercent'].mean()

fig, ax = plt.subplots(figsize=(12, 6))
hourly_occupancy.plot(ax=ax)
ax.set_title('Average Hourly Parking Occupancy')
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Occupancy Percentage')
ax.grid()
fig.savefig(os.path.join(output_dir, 'hourly_occupancy.png'))
plt.close(fig)  # release the figure; it is only needed on disk

In [13]:
# Peak hours: the five hours of the day with the highest mean occupancy
mean_occupancy_by_hour = df.groupby('Hour')['Occupancy'].mean()
peak_hours = mean_occupancy_by_hour.sort_values(ascending=False).head(5)

print("\nTop 5 Peak Hours:\n", peak_hours)

peak_hours_path = os.path.join(output_dir, 'peak_hours.csv')
peak_hours.to_csv(peak_hours_path)


Top 5 Peak Hours:
 Hour
14    870.914862
13    859.265977
12    849.285930
15    811.083416
11    805.390220
Name: Occupancy, dtype: float64


In [14]:
# Feature selection: fixed numeric columns first, followed by every one-hot
# indicator column generated for the two categorical variables
base_features = ['Capacity', 'Hour', 'DayOfWeek', 'IsWeekend', 'IsSpecialDay', 'QueueLength']
dummy_prefixes = ('VehicleType_', 'TrafficConditionNearby_')
encoded_features = [col for col in df.columns if col.startswith(dummy_prefixes)]
feature_cols = base_features + encoded_features

In [15]:

# Features and target (raw occupancy count), then a reproducible 80/20 split
X, y = df[feature_cols], df['Occupancy']

split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [16]:
# Train a gradient-boosting regressor on the training split
gbr_params = dict(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,  # fixed seed so the fit is reproducible
)
model = GradientBoostingRegressor(**gbr_params)
model.fit(X_train, y_train)

In [17]:
# Evaluate on the held-out split. Predictions are floored at zero because a
# negative occupancy count is not meaningful.
y_pred = model.predict(X_test).clip(min=0)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

In [18]:
# Report the hold-out metrics, one per line, rounded to two decimals
report_lines = [
    f"\nModel Performance:",
    f"MAE: {mae:.2f}",
    f"RMSE: {rmse:.2f}",
    f"R² Score: {r2:.2f}",
]
print("\n".join(report_lines))


Model Performance:
MAE: 76.18
RMSE: 123.81
R² Score: 0.96


In [19]:
# Feature importance: rank the model's inputs from most to least important
# and persist the ranking as a CSV
importance = (
    pd.Series(model.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
)
importance_path = os.path.join(output_dir, 'feature_importance.csv')
importance.to_csv(importance_path)

In [20]:
def calculate_dynamic_price(occupancy, capacity, base_price=10,
                            surge_factor=0.5, min_price=5, max_price=20):
    """Compute an occupancy-based price clamped to a fixed range.

    The price grows linearly with the occupancy ratio:
    ``base_price * (1 + surge_factor * occupancy / capacity)``, and is then
    limited to ``[min_price, max_price]``.

    Args:
        occupancy: Number of occupied spaces (actual or predicted).
        capacity: Total number of spaces. A falsy capacity (0 or None) is
            treated as an occupancy ratio of 0 instead of dividing by zero.
        base_price: Price charged when the lot is empty.
        surge_factor: Fractional increase over ``base_price`` at 100% occupancy.
        min_price: Lower bound of the returned price.
        max_price: Upper bound of the returned price.

    Returns:
        The price as a Python float. The result is always a float, including
        when it is clamped to one of the (possibly integer) bounds.

    Raises:
        ValueError: If ``min_price`` is greater than ``max_price``.
    """
    if min_price > max_price:
        raise ValueError("min_price must not exceed max_price")

    occ_ratio = occupancy / capacity if capacity else 0
    price = base_price * (1 + surge_factor * occ_ratio)
    # float() keeps the return type uniform; without it a clamped result would
    # come back as whatever type the bound has (int for the defaults)
    return float(min(max(price, min_price), max_price))
# Score every row with the trained model (floored at zero) and derive a price
# for each row from its predicted occupancy and the lot capacity
predicted_all = model.predict(X).clip(min=0)
df['PredictedOccupancy'] = predicted_all
df['DynamicPrice'] = [
    calculate_dynamic_price(occ, cap)
    for occ, cap in zip(df['PredictedOccupancy'], df['Capacity'])
]

In [27]:
# Inspect the available columns and the identifier columns of the first rows
print(df.columns)

id_preview_cols = ['ID', 'SystemCodeNumber', 'Capacity', 'DateTime']
print(df.loc[:, id_preview_cols].head(10))

Index(['ID', 'SystemCodeNumber', 'Capacity', 'Latitude', 'Longitude',
       'Occupancy', 'QueueLength', 'IsSpecialDay', 'LastUpdatedDate',
       'LastUpdatedTime', 'DateTime', 'Hour', 'DayOfWeek', 'IsWeekend',
       'OccupancyPercent', 'VehicleType_car', 'VehicleType_cycle',
       'VehicleType_truck', 'TrafficConditionNearby_high',
       'TrafficConditionNearby_low', 'PredictedOccupancy', 'DynamicPrice'],
      dtype='object')
   ID SystemCodeNumber  Capacity            DateTime
0   0      BHMBCCMKT01       577 2016-10-04 07:59:00
1   1      BHMBCCMKT01       577 2016-10-04 08:25:00
2   2      BHMBCCMKT01       577 2016-10-04 08:59:00
3   3      BHMBCCMKT01       577 2016-10-04 09:32:00
4   4      BHMBCCMKT01       577 2016-10-04 09:59:00
5   5      BHMBCCMKT01       577 2016-10-04 10:26:00
6   6      BHMBCCMKT01       577 2016-10-04 10:59:00
7   7      BHMBCCMKT01       577 2016-10-04 11:25:00
8   8      BHMBCCMKT01       577 2016-10-04 11:59:00
9   9      BHMBCCMKT01       577 2

In [28]:
# Plot the dynamic price over time for the lot that has the most records.
# 'SystemCodeNumber' is used as the lot identifier; the earlier debug output
# shows that each 'ID' value selects a single row, so 'ID' cannot group a lot.
# `figure`, `output_file` and `save` come from the notebook's import cell, so
# they are not re-imported here.
selected_lot_col = 'SystemCodeNumber'  # or replace with correct column
selected_id = df[selected_lot_col].value_counts().idxmax()
lot_sample = df[df[selected_lot_col] == selected_id].sort_values('DateTime')

# A line needs at least two points; otherwise report and skip the plot
if len(lot_sample) < 2:
    print(f"❌ Not enough data to plot for Lot ID {selected_id}. Only {len(lot_sample)} record(s).")
else:
    plot_path = os.path.join(output_dir, 'dynamic_pricing_plot.html')
    output_file(plot_path)  # save(p) below writes to this registered path

    p = figure(
        title=f"Dynamic Pricing Over Time (Lot ID: {selected_id})",
        x_axis_type='datetime',
        width=800,
        height=400
    )
    p.line(
        lot_sample['DateTime'],
        lot_sample['DynamicPrice'],
        line_width=2,
        color='green',
        legend_label='Dynamic Price',
    )
    p.xaxis.axis_label = 'Time'
    p.yaxis.axis_label = 'Price ($)'
    p.legend.location = 'top_left'

    save(p)


In [22]:
# Persist the fitted model next to the other analysis artifacts
model_path = os.path.join(output_dir, 'occupancy_model.pkl')
joblib.dump(model, model_path)

['/content/drive/MyDrive/parking_analysis_results/occupancy_model.pkl']

In [23]:
# Next day sample prediction: three hypothetical time slots for one lot.
# Only the hour and queue length vary between slots; the remaining inputs
# are the same for every slot.
sample_hours = [8, 12, 17]
sample_queue_lengths = [2, 5, 3]
n_samples = len(sample_hours)

next_day_data = {
    'Capacity': [50] * n_samples,
    'Hour': sample_hours,
    'DayOfWeek': [2] * n_samples,
    'IsWeekend': [0] * n_samples,
    'IsSpecialDay': [0] * n_samples,
    'QueueLength': sample_queue_lengths,
}

In [24]:
# Build the prediction frame with exactly the training feature layout.
# reindex puts the columns in the same order as X and fills any feature that
# is missing from the sample (the one-hot indicator columns) with 0. This
# avoids depending on dict insertion order, on a hard-coded row count, and on
# mutating next_day_data, so the cell can be re-run safely.
next_day_df = pd.DataFrame(next_day_data).reindex(columns=X.columns, fill_value=0)
predicted_occupancy = model.predict(next_day_df).clip(min=0)

# Price each slot against its own capacity rather than a hard-coded value
predicted_price = [
    calculate_dynamic_price(occ, cap)
    for occ, cap in zip(predicted_occupancy, next_day_df['Capacity'])
]

prediction_df = pd.DataFrame({
    'Hour': next_day_data['Hour'],
    'PredictedOccupancy': predicted_occupancy,
    'PredictedPrice': predicted_price
})
prediction_df.to_csv(os.path.join(output_dir, 'sample_predictions.csv'), index=False)
print("\nPredicted Next Day Occupancy & Prices:\n", prediction_df)



Predicted Next Day Occupancy & Prices:
    Hour  PredictedOccupancy  PredictedPrice
0     8          166.147185              20
1    12          357.531399              20
2    17          311.031263              20
