# PTV data
This notebook is to download ptv data, which included the public transport information about trains, trams, buses in Victoria state. Then we will preprocess the data and use the raw file for our analysis.

Important Notice! 

The url for PTV data is temporary generate and share by Victoria Datashare (https://datashare.maps.vic.gov.au/). If The link is expired, please follow the instruction in README.MD file to reorder the dataset


Please Run SA2_&_SAL_shapefile_preprocessing.ipynb first before run this notebook 

In [1]:
import os 
import fiona
import geopandas as gpd
import pandas as pd

## Run the download script to download ptv data file 


url = "https://s3.ap-southeast-2.amazonaws.com/cl-isd-prd-datashare-s3-delivery/Order_HI08BK.zip" 

In [2]:
%run ../scripts/PTV_Download.py


Begin download PTV data
complete download PTV data
Begin unzip PTV data
complete unzip PTV data


## Exaimine the datafile

In [3]:
ptv_layers = fiona.listlayers('../data/landing/PTV DATA/ll_gda94/filegdb/whole_of_dataset/victoria/PTV.gdb')
ptv_layers

['PTV_METRO_BUS_STOP',
 'PTV_METRO_TRAIN_STATION',
 'PTV_TRAIN_CORRIDOR_CENTRELINE',
 'PTV_TRAIN_STATION_BIKE_STORAGE',
 'PTV_METRO_BUS_ROUTE',
 'PTV_TRAIN_TRACK_CENTRELINE',
 'PTV_METRO_TRAM_ROUTE',
 'PTV_METRO_TRAM_STOP',
 'PTV_REGIONAL_BUS_ROUTE',
 'PTV_TRAM_TRACK_CENTRELINE',
 'PTV_TRAIN_STATION_PLATFORM',
 'PTV_REGIONAL_COACH_STOP',
 'PTV_REGIONAL_TRAIN_STATION',
 'PTV_REGIONAL_COACH_ROUTE',
 'PTV_REGIONAL_BUS_STOP',
 'PTV_SKYBUS_ROUTE',
 'PTV_SKYBUS_STOP',
 'PTV_TRAIN_CARPARK']

# Extract Train Data 

In [4]:
# load and preprocess train data
## including: 1. metro train, 2.regional train

gdf_REGIONAL_TRAIN= gpd.read_file('../data/landing/PTV DATA/ll_gda94/filegdb/whole_of_dataset/victoria/PTV.gdb', 
layer='PTV_REGIONAL_TRAIN_STATION')

gdf_METRO_TRAIN = gpd.read_file('../data/landing/PTV DATA/ll_gda94/filegdb/whole_of_dataset/victoria/PTV.gdb', 
layer='PTV_METRO_TRAIN_STATION')

df_mtrain_stops = pd.DataFrame({'TRAIN_STOP_ID': gdf_METRO_TRAIN['STOP_ID'], 'STOP_NAME': gdf_METRO_TRAIN['STOP_NAME'], 
'geometry': gdf_METRO_TRAIN['geometry'], 'type': pd.Series(["METRO_TRAIN" for x in range(len(gdf_METRO_TRAIN.index))])})

df_rtrain_stops = pd.DataFrame({'TRAIN_STOP_ID': gdf_REGIONAL_TRAIN['STOP_ID'], 'STOP_NAME': gdf_REGIONAL_TRAIN['STOP_NAME'],
'geometry': gdf_REGIONAL_TRAIN['geometry'], 'type': pd.Series(["REGIONAL_TRAIN" for x in range(len(gdf_REGIONAL_TRAIN.index))])})

df_train_stops = pd.concat([df_mtrain_stops, df_rtrain_stops], axis=0)

# Save to landing
df_train_stops.to_csv('../data/landing/PTV DATA/train_stops.csv',index=False)

## Match the public transport station location with the SAL region/suburb based on the geometry location (we define SAL region to be suburb)

## Import SAL data 

In [5]:
file_path = "../data/raw/victoria_region_gdf/SAL_region_gdf.geojson"
# Read the GeoPandas DataFrame from the specified file
SAL_gpd = gpd.read_file(file_path)

## Merge the Train stop data with SAL data by geometry join

In [6]:
#convert the train_data to geopandas
train_stops_gdf = gpd.GeoDataFrame(
    df_train_stops,
    geometry=df_train_stops['geometry']
)

In [7]:
# join by geometry join
train_stops_with_SAL = gpd.sjoin(train_stops_gdf, SAL_gpd, how='left', op='within')

  if (await self.run_code(code, result,  async_=asy)):
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4283
Right CRS: EPSG:7844

  train_stops_with_SAL = gpd.sjoin(train_stops_gdf, SAL_gpd, how='left', op='within')


In [8]:
train_stops_with_SAL

Unnamed: 0,TRAIN_STOP_ID,STOP_NAME,geometry,type,index_right,SAL_NAME21,SAL_CODE21,SHAPE_Area
0,19970,Royal Park Railway Station (Parkville),POINT (144.95230 -37.78119),METRO_TRAIN,2037.0,Parkville,22038,0.000409
1,19971,Flemington Bridge Railway Station (North Melbo...,POINT (144.93932 -37.78814),METRO_TRAIN,1965.0,North Melbourne,21966,0.000241
2,19972,Macaulay Railway Station (North Melbourne),POINT (144.93617 -37.79427),METRO_TRAIN,1965.0,North Melbourne,21966,0.000241
3,19973,North Melbourne Railway Station (West Melbourne),POINT (144.94257 -37.80742),METRO_TRAIN,2756.0,West Melbourne,22757,0.000673
4,19974,Clifton Hill Railway Station (Clifton Hill),POINT (144.99542 -37.78866),METRO_TRAIN,573.0,Clifton Hill,20574,0.000178
...,...,...,...,...,...,...,...,...
105,47642,Epsom Railway Station (Epsom),POINT (144.32104 -36.70634),REGIONAL_TRAIN,878.0,Epsom,20879,0.000901
106,47647,Wyndham Vale Railway Station (Manor Lakes),POINT (144.60873 -37.87289),REGIONAL_TRAIN,1596.0,Manor Lakes,21597,0.001641
107,47648,Tarneit Railway Station (Tarneit),POINT (144.69471 -37.83217),REGIONAL_TRAIN,2450.0,Tarneit,22451,0.003909
108,48804,Cobblebank Railway Station (Cobblebank),POINT (144.60411 -37.71255),REGIONAL_TRAIN,589.0,Cobblebank,20590,0.000771


##### Check if there is unmatched data

In [9]:
train_stops_with_SAL[train_stops_with_SAL['SAL_NAME21'].isna()]

Unnamed: 0,TRAIN_STOP_ID,STOP_NAME,geometry,type,index_right,SAL_NAME21,SAL_CODE21,SHAPE_Area
8,20287,Albury Railway Station (Albury (NSW)),POINT (146.92452 -36.08426),REGIONAL_TRAIN,,,,


Notice this is stop in outside victoria, hence we will remove it 

In [10]:
## Remove rows where "SA2_NAME21" is None

train_stops_with_SAL = train_stops_with_SAL.dropna(subset=['SAL_NAME21'])


In [11]:
train_stops_with_SAL[train_stops_with_SAL['SAL_NAME21'].isna()]

Unnamed: 0,TRAIN_STOP_ID,STOP_NAME,geometry,type,index_right,SAL_NAME21,SAL_CODE21,SHAPE_Area


### Feature selection

In [12]:
# remove unnecessary columns
train_stops_with_SAL = train_stops_with_SAL.drop(columns=['STOP_NAME', 'geometry', 'index_right', 'SHAPE_Area'])

In [13]:
train_stops_with_SAL

Unnamed: 0,TRAIN_STOP_ID,type,SAL_NAME21,SAL_CODE21
0,19970,METRO_TRAIN,Parkville,22038
1,19971,METRO_TRAIN,North Melbourne,21966
2,19972,METRO_TRAIN,North Melbourne,21966
3,19973,METRO_TRAIN,West Melbourne,22757
4,19974,METRO_TRAIN,Clifton Hill,20574
...,...,...,...,...
105,47642,REGIONAL_TRAIN,Epsom,20879
106,47647,REGIONAL_TRAIN,Manor Lakes,21597
107,47648,REGIONAL_TRAIN,Tarneit,22451
108,48804,REGIONAL_TRAIN,Cobblebank,20590


In [14]:
# Rename the train type column
train_stops_with_SAL = train_stops_with_SAL.rename(columns={'type': 'train_type'})

In [15]:
# Save the this to raw data 

# make directory in raw data file to store filtered ptv data
output_relative_dir = '../data/raw/PTV'

if not os.path.exists(output_relative_dir):
    os.makedirs(output_relative_dir)
    

train_stops_with_SAL.to_csv('../data/raw/PTV/train_stops_with_SAL.csv',index=False)

# Similalry with Bus 

In [16]:
# load and preprocess bus data
## including: 1. regional bus, 2. metro bus, 3. skybus, 4. regional coach

gdf_REGIONAL_BUS= gpd.read_file('../data/landing/PTV DATA/ll_gda94/filegdb/whole_of_dataset/victoria/PTV.gdb', 
layer='PTV_REGIONAL_BUS_STOP')

gdf_METRO_BUS= gpd.read_file('../data/landing/PTV DATA/ll_gda94/filegdb/whole_of_dataset/victoria/PTV.gdb', 
layer='PTV_METRO_BUS_STOP')

gdf_SKYBUS= gpd.read_file('../data/landing/PTV DATA/ll_gda94/filegdb/whole_of_dataset/victoria/PTV.gdb', 
layer='PTV_SKYBUS_STOP')

gdf_REGIONAL_COACH= gpd.read_file('../data/landing/PTV DATA/ll_gda94/filegdb/whole_of_dataset/victoria/PTV.gdb', 
layer='PTV_REGIONAL_COACH_STOP')

df_REGIONAL_BUS_stops = pd.DataFrame({'BUS_STOP_ID': gdf_REGIONAL_BUS['STOP_ID'], 'STOP_NAME': gdf_REGIONAL_BUS['STOP_NAME'], 
'geometry': gdf_REGIONAL_BUS['geometry'], 'type': pd.Series(["REGIONAL_BUS" for x in range(len(gdf_REGIONAL_BUS.index))])})

df_METRO_BUS = pd.DataFrame({'BUS_STOP_ID': gdf_METRO_BUS['STOP_ID'], 'STOP_NAME': gdf_METRO_BUS['STOP_NAME'], 
'geometry': gdf_METRO_BUS['geometry'], 'type': pd.Series(["METRO_BUS" for x in range(len(gdf_METRO_BUS.index))])})

df_SKYBUS = pd.DataFrame({'BUS_STOP_ID': gdf_SKYBUS['STOP_ID'], 'STOP_NAME': gdf_SKYBUS['STOP_NAME'], 
'geometry': gdf_SKYBUS['geometry'], 'type': pd.Series(["SKYBUS" for x in range(len(gdf_SKYBUS.index))])})

df_REGIONAL_COACH = pd.DataFrame({'BUS_STOP_ID': gdf_REGIONAL_COACH['STOP_ID'], 'STOP_NAME': gdf_REGIONAL_COACH['STOP_NAME'],
'geometry': gdf_REGIONAL_COACH['geometry'], 'type': pd.Series(["REGIONAL_COACH" for x in range(len(gdf_REGIONAL_COACH.index))])})

df_bus_stops = pd.concat([df_REGIONAL_BUS_stops, df_METRO_BUS, df_SKYBUS, df_REGIONAL_COACH], axis=0)
# Save to landing PTV
df_bus_stops.to_csv('../data/landing/PTV DATA/bus_stops.csv',index=False)

In [17]:
df_bus_stops

Unnamed: 0,BUS_STOP_ID,STOP_NAME,geometry,type
0,23195,Andrew St/Union St (Kilmore),POINT (144.95228 -37.29049),REGIONAL_BUS
1,23197,Wellington Square SC/Queen St (Wallan),POINT (144.97802 -37.41299),REGIONAL_BUS
2,23202,Aquamoves/Tom Collins Dr (Shepparton),POINT (145.39444 -36.38603),REGIONAL_BUS
3,23224,West End Caravan Park/Murray Valley Hwy (Yarra...,POINT (145.97705 -36.02218),REGIONAL_BUS
4,23225,Jane Rd/Fiona Dr (Yarrawonga),POINT (146.02596 -36.00671),REGIONAL_BUS
...,...,...,...,...
796,18671,east of Pechell St/High St (Yea),POINT (145.42398 -37.21116),REGIONAL_COACH
797,18672,General Store/Goulburn Valley Hwy (Molesworth),POINT (145.53905 -37.16731),REGIONAL_COACH
798,18673,General Store/Goulburn Valley Hwy (Molesworth),POINT (145.53916 -37.16735),REGIONAL_COACH
799,18674,Wrights St/Maroondah Hwy (Yarck),POINT (145.61708 -37.10119),REGIONAL_COACH


## Merge the Bus data with SAL region based on their geometry location 

#### first convert the df_bus_stops to geopandas df

In [18]:
#convert the train_data to geopandas
bus_stops_gdf = gpd.GeoDataFrame(
    df_bus_stops,
    geometry=df_bus_stops['geometry']
)

Merge bus_stops_gdf with our victoria region shapefile 

In [19]:
# merge bus stop data with SAL by geometry join
bus_stops_with_SAL = gpd.sjoin(bus_stops_gdf, SAL_gpd, how='left', op='within')

  if (await self.run_code(code, result,  async_=asy)):
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4283
Right CRS: EPSG:7844

  bus_stops_with_SAL = gpd.sjoin(bus_stops_gdf, SAL_gpd, how='left', op='within')


#### Check if there is unmatched data

In [20]:
bus_stops_with_SAL[bus_stops_with_SAL['SAL_NAME21'].isna()]

Unnamed: 0,BUS_STOP_ID,STOP_NAME,geometry,type,index_right,SAL_NAME21,SAL_CODE21,SHAPE_Area
5,23227,Erne St/Sturt St (Mulwala (NSW)),POINT (146.00407 -35.98670),REGIONAL_BUS,,,,
550,20887,General Store/Mallee Hwy (Tooleybuc (NSW)),POINT (143.33680 -35.02790),REGIONAL_BUS,,,,
778,17419,Albury Railway Station (Albury (NSW)),POINT (146.92433 -36.08424),REGIONAL_BUS,,,,
2697,36979,Bridgekeepers Cottage/Murray St (Tooleybuc (NSW)),POINT (143.33631 -35.02999),REGIONAL_BUS,,,,
3055,37425,Ebden St/Wodonga Pl (South Albury (NSW)),POINT (146.90880 -36.08883),REGIONAL_BUS,,,,
...,...,...,...,...,...,...,...,...
5953,49415,Heriot St/Hague St (Lavington (NSW)),POINT (146.92719 -36.03872),REGIONAL_BUS,,,,
5954,49416,Hume Public School/Cheyenne Dr (Lavington (NSW)),POINT (146.94649 -36.04909),REGIONAL_BUS,,,,
5955,49417,Murray High School/Kaitlers Rd (Springdale Hei...,POINT (146.94546 -36.03331),REGIONAL_BUS,,,,
5956,49418,Murray High School/Kaitlers Rd (Lavington (NSW)),POINT (146.94555 -36.03350),REGIONAL_BUS,,,,


notice how these unmatched bus data were all regional bus with bus stop outside victoria state. 

we will simply disgard these data, assuming that these bus stop has no effect on housing renting in victoria

In [21]:
## Remove rows where "SAL_NAME21" is None
bus_stops_with_SAL = bus_stops_with_SAL.dropna(subset=['SAL_NAME21'])

In [22]:
bus_stops_with_SAL[bus_stops_with_SAL['SAL_NAME21'].isna()]

Unnamed: 0,BUS_STOP_ID,STOP_NAME,geometry,type,index_right,SAL_NAME21,SAL_CODE21,SHAPE_Area


### Feature selection on bus stop data

In [23]:
# remove unnecessary columns
bus_stops_with_SAL = bus_stops_with_SAL.drop(columns=['STOP_NAME', 'geometry', 'index_right', 'SHAPE_Area'])

In [24]:
# Rename the bus type column
bus_stops_with_SAL = bus_stops_with_SAL.rename(columns={'type': 'bus_type'})

In [25]:
bus_stops_with_SAL

Unnamed: 0,BUS_STOP_ID,bus_type,SAL_NAME21,SAL_CODE21
0,23195,REGIONAL_BUS,Kilmore,21352
1,23197,REGIONAL_BUS,Wallan,22661
2,23202,REGIONAL_BUS,Shepparton,22275
3,23224,REGIONAL_BUS,Yarrawonga,22919
4,23225,REGIONAL_BUS,Yarrawonga,22919
...,...,...,...,...
796,18671,REGIONAL_COACH,Yea,22924
797,18672,REGIONAL_COACH,Molesworth,21721
798,18673,REGIONAL_COACH,Molesworth,21721
799,18674,REGIONAL_COACH,Yarck,22908


In [26]:
# # Save the this in raw/PTV folder with other PTV datas
bus_stops_with_SAL.to_csv('../data/raw/PTV/bus_stops_with_SAL.csv',index=False)

# Tram

In [27]:
# load and preprocess tram data
gdf_METRO_TRAM = gpd.read_file('../data/landing/PTV DATA/ll_gda94/filegdb/whole_of_dataset/victoria/PTV.gdb', layer='PTV_METRO_TRAM_STOP')

df_tram_stops = pd.DataFrame(gdf_METRO_TRAM)

df_tram_stops.rename(columns={'STOP_ID': 'TRAM_STOP_ID'}, inplace=True)

# Save to landing/PTV data
df_tram_stops.to_csv('../data/landing/PTV DATA/tram_stops.csv',index=False)

In [28]:
df_tram_stops

Unnamed: 0,TRAM_STOP_ID,LATITUDE,STOP_NAME,LONGITUDE,TICKETZONE,ROUTES_USING_STOP,geometry
0,18730,-37.744359,134-Merribell Ave/Nicholson St (Coburg),144.977728,1,1,POINT (144.97773 -37.74436)
1,18732,-37.811375,44-Deepdene Park/Whitehorse Rd (Balwyn),145.068671,1,109,POINT (145.06867 -37.81138)
2,18733,-37.811750,45-Hardwicke St/Whitehorse Rd (Balwyn),145.071785,1,109,POINT (145.07179 -37.81175)
3,18734,-37.812242,46-Balwyn Cinema/Whitehorse Rd (Balwyn),145.075930,1,109,POINT (145.07593 -37.81224)
4,18735,-37.812919,47-Balwyn Rd/Whitehorse Rd (Balwyn),145.081524,12,109,POINT (145.08152 -37.81292)
...,...,...,...,...,...,...,...
1660,6037,-37.767614,34-Bent St/High St (Northcote),144.999096,1,86,POINT (144.99910 -37.76761)
1661,6038,-37.769413,33-Arthurton Rd/High St (Northcote),144.998900,1,86,POINT (144.99890 -37.76941)
1662,6039,-37.771138,32-Mitchell St/High St (Northcote),144.998569,1,86,POINT (144.99857 -37.77114)
1663,6040,-37.774712,31-Northcote Town Hall/High St (Northcote),144.997837,1,86,POINT (144.99784 -37.77471)


#### Check for duplicate values in 'Tram_STOP_ID'

In [29]:
# Check for duplicate values in 'BUS_STOP_ID'
duplicates = df_tram_stops[df_tram_stops['TRAM_STOP_ID'].duplicated(keep=False)]

# If 'duplicates' DataFrame is not empty, it means there are two or more rows with the same value in 'TRAM_STOP_ID'
if not duplicates.empty:
    print("There are duplicate value in TRAM_STOP_ID")
    print(duplicates)
else:
    print("There are no duplicate value in TRAM_STOP_ID")

There are no duplicate value in TRAM_STOP_ID


### Merge the Bus stop data with SAL data
first convert the df_tram_stops to geopandas df

In [30]:
#convert the train_data to geopandas
tram_stops_gdf = gpd.GeoDataFrame(
    df_tram_stops,
    geometry=df_tram_stops['geometry']
)

Merge with the victoria region shapefile 

In [31]:
# merge tram data with SAL by geomery location
tram_stops_with_SAL = gpd.sjoin(tram_stops_gdf, SAL_gpd, how='left', op='within')

  if (await self.run_code(code, result,  async_=asy)):
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4283
Right CRS: EPSG:7844

  tram_stops_with_SAL = gpd.sjoin(tram_stops_gdf, SAL_gpd, how='left', op='within')


In [32]:
tram_stops_with_SAL

Unnamed: 0,TRAM_STOP_ID,LATITUDE,STOP_NAME,LONGITUDE,TICKETZONE,ROUTES_USING_STOP,geometry,index_right,SAL_NAME21,SAL_CODE21,SHAPE_Area
0,18730,-37.744359,134-Merribell Ave/Nicholson St (Coburg),144.977728,1,1,POINT (144.97773 -37.74436),595,Coburg,20596,0.000707
1,18732,-37.811375,44-Deepdene Park/Whitehorse Rd (Balwyn),145.068671,1,109,POINT (145.06867 -37.81138),727,Deepdene,20728,0.000090
2,18733,-37.811750,45-Hardwicke St/Whitehorse Rd (Balwyn),145.071785,1,109,POINT (145.07179 -37.81175),727,Deepdene,20728,0.000090
3,18734,-37.812242,46-Balwyn Cinema/Whitehorse Rd (Balwyn),145.075930,1,109,POINT (145.07593 -37.81224),122,Balwyn,20123,0.000449
4,18735,-37.812919,47-Balwyn Rd/Whitehorse Rd (Balwyn),145.081524,12,109,POINT (145.08152 -37.81292),122,Balwyn,20123,0.000449
...,...,...,...,...,...,...,...,...,...,...,...
1660,6037,-37.767614,34-Bent St/High St (Northcote),144.999096,1,86,POINT (144.99910 -37.76761),1970,Northcote,21971,0.000615
1661,6038,-37.769413,33-Arthurton Rd/High St (Northcote),144.998900,1,86,POINT (144.99890 -37.76941),1970,Northcote,21971,0.000615
1662,6039,-37.771138,32-Mitchell St/High St (Northcote),144.998569,1,86,POINT (144.99857 -37.77114),1970,Northcote,21971,0.000615
1663,6040,-37.774712,31-Northcote Town Hall/High St (Northcote),144.997837,1,86,POINT (144.99784 -37.77471),1970,Northcote,21971,0.000615


#### Check if there is unmatched tram data 

In [33]:
tram_stops_with_SAL[tram_stops_with_SAL['SAL_NAME21'].isna()]

Unnamed: 0,TRAM_STOP_ID,LATITUDE,STOP_NAME,LONGITUDE,TICKETZONE,ROUTES_USING_STOP,geometry,index_right,SAL_NAME21,SAL_CODE21,SHAPE_Area


#### Apply feature selection on tram stop data 

In [34]:
# remove unnecessary columns
tram_stops_with_SAL = tram_stops_with_SAL.drop(columns=['LATITUDE', 'LONGITUDE', 'STOP_NAME', 'geometry', 'index_right', 'SHAPE_Area', 'TICKETZONE', 'ROUTES_USING_STOP'])

In [35]:
tram_stops_with_SAL

Unnamed: 0,TRAM_STOP_ID,SAL_NAME21,SAL_CODE21
0,18730,Coburg,20596
1,18732,Deepdene,20728
2,18733,Deepdene,20728
3,18734,Balwyn,20123
4,18735,Balwyn,20123
...,...,...,...
1660,6037,Northcote,21971
1661,6038,Northcote,21971
1662,6039,Northcote,21971
1663,6040,Northcote,21971


In [36]:
# Save the this to raw/PTV folder
tram_stops_with_SAL.to_csv('../data/raw/PTV/tram_stops_with_SAL.csv',index=False)