In [None]:
#import the required libraries
import os

import numpy as np
import pandas as pd

#Define the data directories: the raw input CSVs are read from data_directory
#and the parsed dataset is written to output_directory
data_directory = './data'
output_directory = './parsed_data'

In [None]:
#Load the hourly pedestrian counts and the sensor location table.
#Both files are read with dtype=str, so every column arrives as text.
counts_path = data_directory + '/Pedestrian_Counting_System_-_Monthly__counts_per_hour_.csv'
locations_path = data_directory + '/Pedestrian_Counting_System_-_Sensor_Locations.csv'

df = pd.read_csv(counts_path, dtype=str)

#Keep only the sensor identifier and its coordinates from the location table
location_df = pd.read_csv(locations_path, dtype=str)[['sensor_id', 'latitude', 'longitude']]

In [None]:
#Exclude sensors whose locations fall outside the Melbourne CBD
exclude = ['75', '10', '50', '46', '71', '78', '51', '69']

#Boolean mask marking the rows that belong to an excluded sensor
is_excluded = df['Sensor_ID'].isin(exclude)
df = df[~is_excluded]

In [None]:
#Add time based data enrichment
#Parse the timestamps once and reuse the result for every derived column
timestamps = pd.to_datetime(df.Date_Time)

#Convert the time stamps into unix time (whole seconds since 1970-01-01).
#Dividing the offset from the epoch by one second does not depend on the
#resolution of the parsed datetimes, whereas casting to int64 and dividing by
#10**9 is only correct for nanosecond data.
#NOTE(review): assumes Date_Time parses to timezone-naive values - confirm.
unix_time = (timestamps - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)

#Extract the hour from the parsed timestamps.
#assign returns a new frame, so nothing is written onto the filtered slice of
#the raw data (which would raise a SettingWithCopyWarning).
df = df.assign(unix_time=unix_time, hour=timestamps.dt.hour)

#Retain the required columns
df = df[['Sensor_ID','Hourly_Counts','unix_time','hour']]

In [None]:
#Attach each sensor's coordinates by joining on the sensor identifier
merged = df.merge(location_df, how='inner', left_on='Sensor_ID', right_on='sensor_id')

#The right-hand key duplicates Sensor_ID, so drop it before renaming
merged = merged.drop(columns='sensor_id')

#Switch the column names to a single lower-case convention
df = merged.rename(columns={
    'Sensor_ID': 'sensor_id',
    'Hourly_Counts': 'count',
    'latitude': 'lat',
    'longitude': 'lon',
})

In [None]:
#Keep only the most recent observations, measured back from the latest reading.
#NOTE(review): the earlier comments called this window "six months", but the code
#has always kept 100 days; the 100-day behaviour is preserved here - confirm intent.
window_days = 100
seconds_per_day = 86400

max_time = df.unix_time.max()
#Cut-off (exclusive): rows at or before this unix time are dropped
min_time = max_time - (seconds_per_day * window_days)

#Take an explicit copy so the column assignments below act on an independent
#frame rather than on a slice of the previous one
df = df[df.unix_time > min_time].copy()

df['date'] = pd.to_datetime(df['unix_time'],unit='s')
df = df.sort_values(by = 'date', ascending = False)

#lat/lon originate from a CSV read with dtype=str, so they are text here;
#convert them for the product only and leave the stored columns untouched
df['position'] = pd.to_numeric(df.lon) * pd.to_numeric(df.lat)
df['month'] = df.date.dt.month
df['day_of_week'] = df.date.dt.dayofweek

In [None]:
#Save the parsed dataset
#to_csv does not create missing folders, so make sure the output directory exists first
os.makedirs(output_directory, exist_ok=True)
df.to_csv(output_directory+'/data.csv', index = False)