# My exercises on loading a dataset and doing some basic analysis

In [None]:
import pandas as pd
import numpy as np
import sklearn
import datetime

# Location of the raw CSV for the Birmingham parking dataset.
BIRMINGHAM_PARKING_DATASET_URL = 'https://raw.githubusercontent.com/reddyprasade/Machine-Learning-Problems-DataSets/master/Regression/Parking%20Birmingham.csv'

# Read every cell as text; the real dtypes are applied once the frame exists.
bir_csv = np.genfromtxt(
    BIRMINGHAM_PARKING_DATASET_URL,
    delimiter=',',
    dtype=str,
)

# Row 0 carries the column names, the remaining rows carry the records.
# The first column is sliced off in both cases.
# NOTE(review): presumably column 0 is an exported row index - confirm against the CSV.
header = bir_csv[0, 1:]
data = bir_csv[1:, 1:]

# Build the frame from the text cells, then cast each column to its real type.
bir_dataframe = pd.DataFrame(data, columns=header)
bir_dataframe = bir_dataframe.astype(
    {
        'SystemCodeNumber': str,
        'Capacity': int,
        'Occupancy': int,
        'LastUpdated': 'datetime64[s]',
    }
)
bir_dataframe.head()

The data is grouped by `SystemCodeNumber` to see how the occupancy rate varies from one parking lot to another.

In [None]:
import matplotlib.pyplot as plt

# Plot, for every parking lot, its occupancy as a percentage of capacity over time.
#
# The DPI must be passed to `plt.figure` when the figure is created: assigning
# `plt.figure.dpi = 300` only sets an attribute on the `figure` function object
# and leaves the rendered figure at the default resolution.
plt.figure(figsize=(20, 5), dpi=300)

grouped_by_lots = bir_dataframe.groupby('SystemCodeNumber')

# Iterating a GroupBy yields (key, sub-frame) pairs, so the lot code can be
# used directly as the legend label without a separate `get_group` lookup.
for lot_code, lot_timeseries in grouped_by_lots:
    occupancy_percentage = (lot_timeseries['Occupancy'] / lot_timeseries['Capacity']) * 100
    plt.plot(
        lot_timeseries['LastUpdated'],
        occupancy_percentage,
        label=lot_code,
    )

# Anchor the legend's lower-left corner to the top-left of the axes and spread
# the entries over several columns. Clamp to at least one column so that a
# dataset with fewer than four lots does not request zero legend columns.
plt.legend(
    loc=3,
    ncols=max(1, grouped_by_lots.ngroups // 4),
    bbox_to_anchor=(0.0, 1.0),
)
plt.show()

The plot shows gaps in the time series where the occupancy drops significantly. This is likely due to the parking lots being closed at certain times or to limited traffic in these areas. Next, average daily occupancies will be calculated to see whether there are any patterns in the data.

In [None]:
# For every parking lot, split its readings by calendar day and collect the
# occupancy-percentage values of each day into one table.
for group in grouped_by_lots.groups.keys():
    # Copy the lot's rows so the column changes below do not touch the
    # frame held by the GroupBy object.
    current_group_timeseries = grouped_by_lots.get_group(group).copy()
    # Replace the single timestamp column by separate date and time-of-day columns.
    current_group_timeseries['LastUpdatedDay'] = current_group_timeseries['LastUpdated'].dt.date
    current_group_timeseries['LastUpdatedTime'] = current_group_timeseries['LastUpdated'].dt.time
    current_group_timeseries.drop('LastUpdated', axis=1, inplace=True)
    # Occupancy expressed as a percentage of the lot's capacity.
    current_group_timeseries['OccupancyPercentage'] = (current_group_timeseries['Occupancy'] / current_group_timeseries['Capacity']) * 100
    grouped_by_days = current_group_timeseries.groupby('LastUpdatedDay')
    # One entry per day: that day's occupancy percentages as an array.
    aggregated_values = []
    for day in grouped_by_days.groups.keys():
        current_group_timeseries_day = grouped_by_days.get_group(day)
        aggregated_values.append(current_group_timeseries_day["OccupancyPercentage"].values)
    # One row per day, one column per reading position within the day.
    # NOTE(review): columns are positional, not keyed by 'LastUpdatedTime'; if days
    # have different numbers of readings the rows will not line up by time - confirm.
    # NOTE(review): `aggregated_df` is rebuilt on every lot iteration; it is
    # presumably consumed further down inside this loop - confirm.
    aggregated_df = pd.DataFrame(aggregated_values)