# PTV data aggregation

This note will aggregate all public transport data (train, bus and tram) that we have filtered in raw/PTV folder. Will combine all data to determine station count includiong bus, train and tram station in each suburbs/SAL region 

In [1]:
import pandas as pd

## Read in the bus, train and tram data 

In [2]:
tram_stops_with_SAL = pd.read_csv("../data/raw/PTV/tram_stops_with_SAL.csv")
train_stops_with_SAL = pd.read_csv("../data/raw/PTV/train_stops_with_SAL.csv")
bus_stops_with_SAL = pd.read_csv("../data/raw/PTV/bus_stops_with_SAL.csv")

## First we will group by SAL name in each tram and train and bus stops adn count number of transport stops in the region 
For train and bus we will consider different type of train and tram seperately 

#### Group the Train data by SAL and train_type 

In [3]:
# Group the data by 'SAL_NAME21',  SAL_CODE21 and 'train_type' and count the number of 'TRAIN_STOP_ID' for each group
SAL_train_count = train_stops_with_SAL.groupby(['SAL_NAME21', 'SAL_CODE21', 'train_type'])['TRAIN_STOP_ID'].count().reset_index()
# Rename the 'TRAIN_STOP_ID' column to 'Count'
SAL_train_count = SAL_train_count.rename(columns={'TRAIN_STOP_ID': 'Count'})
# Pivot the table to have 'METRO_TRAIN' and 'REGIONAL_TRAIN' as columns
SAL_train_count = SAL_train_count.pivot(index=['SAL_NAME21', 'SAL_CODE21'], columns='train_type', values='Count').fillna(0)
SAL_train_count = SAL_train_count.reset_index()
SAL_train_count.reset_index(drop=True, inplace=True)
# Rename the columns to remove the 'train_type' column name 
SAL_train_count.columns.name = None

# change count type
SAL_train_count['METRO_TRAIN'] = SAL_train_count['METRO_TRAIN'].astype(int)
SAL_train_count['REGIONAL_TRAIN'] = SAL_train_count['REGIONAL_TRAIN'].astype(int)


In [4]:
SAL_train_count

Unnamed: 0,SAL_NAME21,SAL_CODE21,METRO_TRAIN,REGIONAL_TRAIN
0,Abbotsford,20002,2,0
1,Albion,20021,1,0
2,Alphington,20034,1,0
3,Altona,20035,2,0
4,Ararat,20053,0,1
...,...,...,...,...
240,Windsor,22805,1,0
241,Wodonga,22819,0,1
242,Woodend,22833,0,1
243,Yarragon,22913,0,1


#### Group by SAL for bus count

In [5]:
# Group the data by 'SAL_NAME21' and 'bus_type' and count the number of 'BUS_STOP_ID' for each group
SAL_bus_count = bus_stops_with_SAL.groupby(['SAL_NAME21', 'SAL_CODE21', 'bus_type'])['BUS_STOP_ID'].count().reset_index()

# Pivot the table to have 'REGIONAL_BUS', 'METRO_BUS', and 'SKYBUS_STOP' as columns
SAL_bus_count = SAL_bus_count.pivot(index=['SAL_NAME21', 'SAL_CODE21'], columns='bus_type', values='BUS_STOP_ID').fillna(0)

# Reset the index to have a regular integer index
SAL_bus_count.reset_index(inplace=True)

# Rename the columns for clarity
SAL_bus_count.columns.name = None

In [6]:
# convert to interger count 
SAL_bus_count['METRO_BUS'] = SAL_bus_count['METRO_BUS'].astype(int)
SAL_bus_count['REGIONAL_BUS'] = SAL_bus_count['REGIONAL_BUS'].astype(int)
SAL_bus_count['REGIONAL_COACH'] = SAL_bus_count['REGIONAL_COACH'].astype(int)
SAL_bus_count['SKYBUS'] = SAL_bus_count['SKYBUS'].astype(int)

SAL_bus_count

Unnamed: 0,SAL_NAME21,SAL_CODE21,METRO_BUS,REGIONAL_BUS,REGIONAL_COACH,SKYBUS
0,Abbotsford,20002,13,0,0,0
1,Aberfeldie,20003,20,0,0,0
2,Acheron,20005,2,0,0,0
3,Aintree,20011,20,0,0,0
4,Aireys Inlet,20013,0,4,4,0
...,...,...,...,...,...,...
1128,Yarroweyah,22921,0,0,2,0
1129,Yea,22924,0,0,1,0
1130,Yendon,22927,0,0,2,0
1131,Yinnar,22934,0,5,0,0


#### Group by SAL for tram count

In [7]:
# Group the data by 'SAL_NAME21' and count the number of 'TRAM_STOP_ID' for each group
SAL_tram_count = tram_stops_with_SAL.groupby(['SAL_NAME21', 'SAL_CODE21'])['TRAM_STOP_ID'].count().reset_index()

# Rename the 'TRAM_STOP_ID' column to 'Count' for clarity
SAL_tram_count = SAL_tram_count.rename(columns={'TRAM_STOP_ID': 'Tram_Count'})

In [8]:
SAL_tram_count

Unnamed: 0,SAL_NAME21,SAL_CODE21,Tram_Count
0,Abbotsford,20002,5
1,Airport West,20015,9
2,Albert Park,20018,21
3,Armadale,20066,34
4,Ascot Vale,20075,22
...,...,...,...
82,Toorak,22547,26
83,Travancore,22572,4
84,Vermont South,22628,4
85,West Melbourne,22757,8


## Merge all three count for public transports in each SAL

In [9]:
# Convert 'SAL_CODE21' column to str in all three dataframes
SAL_train_count['SAL_CODE21'] = SAL_train_count['SAL_CODE21'].astype(str)
SAL_tram_count['SAL_CODE21'] = SAL_tram_count['SAL_CODE21'].astype(str)
SAL_bus_count['SAL_CODE21'] = SAL_bus_count['SAL_CODE21'].astype(str)

# Merge the dataframes on 'SAL_NAME21' with an outer join
SAL_PTV_count = SAL_train_count.merge(SAL_tram_count, on=['SAL_NAME21', 'SAL_CODE21'], how='outer').merge(SAL_bus_count, on=['SAL_NAME21', 'SAL_CODE21'], how='outer')

# Fill NaN values with 0 for tram and bus counts
SAL_PTV_count = SAL_PTV_count.fillna(0)


In [10]:
SAL_PTV_count

Unnamed: 0,SAL_NAME21,SAL_CODE21,METRO_TRAIN,REGIONAL_TRAIN,Tram_Count,METRO_BUS,REGIONAL_BUS,REGIONAL_COACH,SKYBUS
0,Abbotsford,20002,2.0,0.0,5.0,13.0,0.0,0.0,0.0
1,Albion,20021,1.0,0.0,0.0,19.0,0.0,0.0,0.0
2,Alphington,20034,1.0,0.0,0.0,28.0,0.0,0.0,0.0
3,Altona,20035,2.0,0.0,0.0,65.0,0.0,0.0,0.0
4,Ararat,20053,0.0,1.0,0.0,0.0,63.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...
1137,Yarroweyah,22921,0.0,0.0,0.0,0.0,0.0,2.0,0.0
1138,Yea,22924,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1139,Yendon,22927,0.0,0.0,0.0,0.0,0.0,2.0,0.0
1140,Yinnar,22934,0.0,0.0,0.0,0.0,5.0,0.0,0.0


In [11]:
# Check for duplicate values in 'SAL_NAME21'
duplicates = SAL_PTV_count[SAL_PTV_count['SAL_NAME21'].duplicated(keep=False)]

# If 'duplicates' DataFrame is not empty, it means there are two or more rows with the same value in 'Column1'
if not duplicates.empty:
    print("There is duplicate in SAL_NAME21")
else:
    print("There are no duplicate in SAL_NAME21")

There are no duplicate in SAL_NAME21


### Create new feature to total count for public transport station in each suburb, and one that does not consider any regional transportation

In [12]:
# create new column find total public transport station
SAL_PTV_count['Total_Public_Transport_Count'] = SAL_PTV_count[['METRO_TRAIN', 'REGIONAL_TRAIN', 'Tram_Count', 'METRO_BUS', 'REGIONAL_BUS', 'REGIONAL_COACH', 'SKYBUS']].sum(axis=1)


# Create new column find total public transportation station without regional
SAL_PTV_count['Total_Public_Transport_Count_No_Regional'] = SAL_PTV_count[['METRO_TRAIN', 'Tram_Count', 'METRO_BUS']].sum(axis=1)


In [13]:
SAL_PTV_count

Unnamed: 0,SAL_NAME21,SAL_CODE21,METRO_TRAIN,REGIONAL_TRAIN,Tram_Count,METRO_BUS,REGIONAL_BUS,REGIONAL_COACH,SKYBUS,Total_Public_Transport_Count,Total_Public_Transport_Count_No_Regional
0,Abbotsford,20002,2.0,0.0,5.0,13.0,0.0,0.0,0.0,20.0,20.0
1,Albion,20021,1.0,0.0,0.0,19.0,0.0,0.0,0.0,20.0,20.0
2,Alphington,20034,1.0,0.0,0.0,28.0,0.0,0.0,0.0,29.0,29.0
3,Altona,20035,2.0,0.0,0.0,65.0,0.0,0.0,0.0,67.0,67.0
4,Ararat,20053,0.0,1.0,0.0,0.0,63.0,2.0,0.0,66.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
1137,Yarroweyah,22921,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0
1138,Yea,22924,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1139,Yendon,22927,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0
1140,Yinnar,22934,0.0,0.0,0.0,0.0,5.0,0.0,0.0,5.0,0.0


In [14]:
# Save this to curated data
SAL_PTV_count.to_csv('../data/curated/PTV_count_with_SAL.csv',index=False)