# Preprocess school data
This notebook is to preprocess the school data. At the end, we will get the total number of schools in each suburb.

### Import packages

In [1]:
import pandas as pd

# read csv
csv_file_path = '../data/raw/schoollocations2022.csv'
school = pd.read_csv(csv_file_path,  encoding='ISO-8859-1')

In [4]:
import geopandas as gpd
from shapely.geometry import Point
import folium


## Read the file

In [2]:
# drop columns
columns_to_drop = ['Entity_Type','SCHOOL_NO', 'School_Status','Postal_Address_Line_1','Postal_Address_Line_2','Postal_Town','Postal_State','Postal_Postcode','Full_Phone_No','LGA_ID','LGA_Name']
school_drop = school.drop(columns=columns_to_drop)

In [3]:
school_drop

Unnamed: 0,Education_Sector,School_Name,School_Type,Address_Line_1,Address_Line_2,Address_Town,Address_State,Address_Postcode,X,Y
0,Government,Alberton Primary School,Primary,21 Thomson Street,,Alberton,VIC,3971,146.66660,-38.61771
1,Government,Allansford and District Primary School,Primary,Frank Street,,Allansford,VIC,3277,142.59039,-38.38628
2,Government,Avoca Primary School,Primary,118 Barnett Street,,Avoca,VIC,3467,143.47565,-37.08450
3,Government,Avenel Primary School,Primary,40 Anderson Street,,Avenel,VIC,3664,145.23472,-36.90137
4,Government,Warrandyte Primary School,Primary,5-11 Forbes Street,,Warrandyte,VIC,3113,145.21398,-37.74268
...,...,...,...,...,...,...,...,...,...,...
2294,Independent,Lysterfield Lake College,Primary,20 ABeckett Road,,NARRE WARREN NORTH,VIC,3804,145.31604,-37.97748
2295,Independent,Plenty River College,Secondary,"Unit 5, 9 Danaher Drive",,SOUTH MORANG,VIC,3752,145.08148,-37.64875
2296,Catholic,Holy Cross Catholic Primary School,Primary,2-14 Carlingford Rd,,MICKLEHAM,VIC,3064,144.90520,-37.53046
2297,Independent,Sidrah Gardens School,Primary,434-442 Belgrave-Hallam Road,,NARRE WARREN NORTH,VIC,3804,145.31589,-37.97324


In [5]:
# Get the shpaefile for SA2 region
suburbs = gpd.read_file("../data/landing/SAL_2021_AUST_GDA2020_SHP/SAL_2021_AUST_GDA2020.shp")


In [6]:
# select the subset that we care about
suburbs = suburbs.loc[suburbs['STE_NAME21']=='Victoria']
suburbs = suburbs[['SAL_CODE21','SAL_NAME21','geometry']]

In [7]:
# Convert the dataframe to a geodataframe
geometry = [Point(xy) for xy in zip(school_drop['X'], school_drop['Y'])]
school_drop_gdf = gpd.GeoDataFrame(school_drop, geometry=geometry)

In [8]:
# Use sjoin to find out which suburb each point belongs to
joined = gpd.sjoin(school_drop_gdf, suburbs, how="left", op="within")

# If your suburbs shapefile has a column named 'suburb_name' that specifies the name of the suburb
school_drop_gdf['SAL_NAME'] = joined['SAL_NAME21']
school_drop_gdf['SAL_CODE'] = joined['SAL_CODE21']

Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: EPSG:7844

  joined = gpd.sjoin(school_drop_gdf, suburbs, how="left", op="within")


In [11]:
school_drop_gdf

Unnamed: 0,Education_Sector,School_Name,School_Type,Address_Line_1,Address_Line_2,Address_Town,Address_State,Address_Postcode,X,Y,geometry,SAL_NAME,SAL_CODE
0,Government,Alberton Primary School,Primary,21 Thomson Street,,Alberton,VIC,3971,146.66660,-38.61771,POINT (146.66660 -38.61771),Alberton (Vic.),20019
1,Government,Allansford and District Primary School,Primary,Frank Street,,Allansford,VIC,3277,142.59039,-38.38628,POINT (142.59039 -38.38628),Allansford,20028
2,Government,Avoca Primary School,Primary,118 Barnett Street,,Avoca,VIC,3467,143.47565,-37.08450,POINT (143.47565 -37.08450),Avoca (Vic.),20087
3,Government,Avenel Primary School,Primary,40 Anderson Street,,Avenel,VIC,3664,145.23472,-36.90137,POINT (145.23472 -36.90137),Avenel,20086
4,Government,Warrandyte Primary School,Primary,5-11 Forbes Street,,Warrandyte,VIC,3113,145.21398,-37.74268,POINT (145.21398 -37.74268),Warrandyte,22702
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2294,Independent,Lysterfield Lake College,Primary,20 ABeckett Road,,NARRE WARREN NORTH,VIC,3804,145.31604,-37.97748,POINT (145.31604 -37.97748),Narre Warren North,21895
2295,Independent,Plenty River College,Secondary,"Unit 5, 9 Danaher Drive",,SOUTH MORANG,VIC,3752,145.08148,-37.64875,POINT (145.08148 -37.64875),South Morang,22311
2296,Catholic,Holy Cross Catholic Primary School,Primary,2-14 Carlingford Rd,,MICKLEHAM,VIC,3064,144.90520,-37.53046,POINT (144.90520 -37.53046),Mickleham,21675
2297,Independent,Sidrah Gardens School,Primary,434-442 Belgrave-Hallam Road,,NARRE WARREN NORTH,VIC,3804,145.31589,-37.97324,POINT (145.31589 -37.97324),Narre Warren North,21895


In [9]:
school_drop_gdf.to_csv("../data/curated/school_suburbs_SA2.csv", index=False)

In [10]:
school_drop_gdf.to_file("../data/curated/school_suburbs_SA2.shp")

  pd.Int64Index,
  school_drop_gdf.to_file("../data/curated/school_suburbs_SA2.shp")


In [52]:
# Group the data by 'SA2_NAME21' and count the number of 'School_Name' for each group
SA2_school_count = school_drop_gdf.groupby(['SAL_NAME', 'SAL_CODE'])['School_Name'].count().reset_index()

# Rename the 'School_Name' column to 'Count' for clarity
SA2_school_count = SA2_school_count.rename(columns={'School_Name': 'School_Count'})

In [53]:
SA2_school_count

Unnamed: 0,SAL_NAME,SAL_CODE,School_Count
0,Abbotsford (Vic.),20002,2
1,Aberfeldie,20003,4
2,Aintree,20011,1
3,Aireys Inlet,20013,1
4,Airly,20014,1
...,...,...,...
916,Yea,22924,3
917,Yering,22930,1
918,Yinnar,22934,1
919,Yinnar South,22935,1


In [54]:
# Check for duplicate values in 'School_Name'
duplicates = SA2_school_count[SA2_school_count['SAL_NAME'].duplicated(keep=False)]

# If 'duplicates' DataFrame is not empty, it means there are two or more rows with the same value in 'Column1'
if not duplicates.empty:
    print("There is duplicate in SAL_NAME")
else:
    print("There are no duplicate in SAL_NAME")

There are no duplicate in SAL_NAME


In [55]:
SA2_school_count.to_csv('../data/curated/School_count_with_SA2.csv',index=False)