In [29]:
# Importing necessary packages

import pandas as pd
import numpy as np
import geocoder
from geopy.geocoders import Nominatim
import warnings
warnings.filterwarnings("ignore")

In [30]:
#Loading the kindergarten csv file
df = pd.read_csv('kindergarten.csv')

# creating a dataframe df1 with only latitude and longitude columns
df1 = df[['Latitude','Longitude']]

# showing only first 5 rows
df1.head(5)


Unnamed: 0,Latitude,Longitude
0,-38.037796,145.266534
1,-37.811354,145.081836
2,-37.831777,144.825369
3,-37.051792,143.731878
4,-37.561997,144.715655


In [31]:
# Checking any records have null values

print(df1.isnull().sum())

Latitude     1
Longitude    1
dtype: int64


In [32]:
# Deleting the records where both latitude and longitude are null

df1 = df1.dropna(axis=0, subset=['Latitude','Longitude'])
print(df1.isnull().sum())

Latitude     0
Longitude    0
dtype: int64


In [33]:
# reverse geocoding using geolocator.reverse to get the address from latitude and longitude
geolocator = Nominatim(user_agent="monash university research project_moms care")

df1['address'] = df1.apply(lambda row: geolocator.reverse((row['Latitude'], row['Longitude']),timeout=10), axis=1)
df1

Unnamed: 0,Latitude,Longitude,address
0,-38.037796,145.266534,"(Hallam Road, Hampton Park, City of Casey, Vic..."
1,-37.811354,145.081836,"(100, Balwyn Road, Balwyn, City of Boroondara,..."
2,-37.831777,144.825369,"(Grand Star Receptions, 499, Grieve Parade, Al..."
3,-37.051792,143.731878,"(Alma Street, Maryborough, Shire of Central Go..."
4,-37.561997,144.715655,"(Spavin Drive, Sunbury, City of Hume, Victoria..."
5,-37.871206,145.011624,"(Stop 44: Otira Road, Balaclava Road, Caulfiel..."
6,-37.982668,145.086776,"(Acacia Avenue, Mentone, City of Kingston, Vic..."
7,-37.814672,144.997483,"(17, Belgium Avenue, Richmond Housing Estate, ..."
8,-37.948480,145.144315,"(Balmoral Avenue, Springvale, City of Greater ..."
9,-37.838788,144.919549,"(The Boulevard, Garden City, Port Melbourne, C..."


In [35]:
#Exporting to csv
df1.to_csv('kindergarten_final.csv',index = False)

In [54]:
#Loading the childcare_final csv file
df1 = pd.read_csv('kindergarten_final.csv')
df1

Unnamed: 0,Latitude,Longitude,address
0,-38.037796,145.266534,"Hallam Road, Hampton Park, City of Casey, Vict..."
1,-37.811354,145.081836,"100, Balwyn Road, Balwyn, City of Boroondara, ..."
2,-37.831777,144.825369,"Grand Star Receptions, 499, Grieve Parade, Alt..."
3,-37.051792,143.731878,"Alma Street, Maryborough, Shire of Central Gol..."
4,-37.561997,144.715655,"Spavin Drive, Sunbury, City of Hume, Victoria,..."
5,-37.871206,145.011624,"Stop 44: Otira Road, Balaclava Road, Caulfield..."
6,-37.982668,145.086776,"Acacia Avenue, Mentone, City of Kingston, Vict..."
7,-37.814672,144.997483,"17, Belgium Avenue, Richmond Housing Estate, R..."
8,-37.948480,145.144315,"Balmoral Avenue, Springvale, City of Greater D..."
9,-37.838788,144.919549,"The Boulevard, Garden City, Port Melbourne, Ci..."


In [55]:
# splitting the address column into address,country,postcode,state,city council and suburb columns
df1[['address','country']] = df1['address'].str.rsplit(pat=',',expand=True, n=1)

In [56]:
df1[['address','postcode']] = df1['address'].str.rsplit(pat=',',expand=True, n=1)
df1[['address','state']] = df1['address'].str.rsplit(pat=',',expand=True, n=1)
df1[['address','city council']] = df1['address'].str.rsplit(pat=',',expand=True, n=1)
df1[['address','suburb']] = df1['address'].str.rsplit(pat=',',expand=True, n=1)
df1.head(20)

Unnamed: 0,Latitude,Longitude,address,country,postcode,state,city council,suburb
0,-38.037796,145.266534,Hallam Road,Australia,3976,Victoria,City of Casey,Hampton Park
1,-37.811354,145.081836,"100, Balwyn Road",Australia,3103,Victoria,City of Boroondara,Balwyn
2,-37.831777,144.825369,"Grand Star Receptions, 499, Grieve Parade",Australia,3025,Victoria,City of Hobsons Bay,Altona North
3,-37.051792,143.731878,"Alma Street, Maryborough",Australia,3465,Victoria,Loddon Mallee,Shire of Central Goldfields
4,-37.561997,144.715655,Spavin Drive,Australia,3429,Victoria,City of Hume,Sunbury
5,-37.871206,145.011624,"Stop 44: Otira Road, Balaclava Road",Australia,3161,Victoria,City of Glen Eira,Caulfield North
6,-37.982668,145.086776,Acacia Avenue,Australia,3194,Victoria,City of Kingston,Mentone
7,-37.814672,144.997483,"17, Belgium Avenue, Richmond Housing Estate",Australia,3121,Victoria,City of Yarra,Richmond
8,-37.94848,145.144315,Balmoral Avenue,Australia,3171,Victoria,City of Greater Dandenong,Springvale
9,-37.838788,144.919549,"The Boulevard, Garden City",Australia,3207,Victoria,City of Port Phillip,Port Melbourne


In [57]:
# joining suburb into address column
df1['address'] = df1['address'] + ',' + df1['suburb']
df1

Unnamed: 0,Latitude,Longitude,address,country,postcode,state,city council,suburb
0,-38.037796,145.266534,"Hallam Road, Hampton Park",Australia,3976,Victoria,City of Casey,Hampton Park
1,-37.811354,145.081836,"100, Balwyn Road, Balwyn",Australia,3103,Victoria,City of Boroondara,Balwyn
2,-37.831777,144.825369,"Grand Star Receptions, 499, Grieve Parade, Alt...",Australia,3025,Victoria,City of Hobsons Bay,Altona North
3,-37.051792,143.731878,"Alma Street, Maryborough, Shire of Central Gol...",Australia,3465,Victoria,Loddon Mallee,Shire of Central Goldfields
4,-37.561997,144.715655,"Spavin Drive, Sunbury",Australia,3429,Victoria,City of Hume,Sunbury
5,-37.871206,145.011624,"Stop 44: Otira Road, Balaclava Road, Caulfield...",Australia,3161,Victoria,City of Glen Eira,Caulfield North
6,-37.982668,145.086776,"Acacia Avenue, Mentone",Australia,3194,Victoria,City of Kingston,Mentone
7,-37.814672,144.997483,"17, Belgium Avenue, Richmond Housing Estate, R...",Australia,3121,Victoria,City of Yarra,Richmond
8,-37.948480,145.144315,"Balmoral Avenue, Springvale",Australia,3171,Victoria,City of Greater Dandenong,Springvale
9,-37.838788,144.919549,"The Boulevard, Garden City, Port Melbourne",Australia,3207,Victoria,City of Port Phillip,Port Melbourne


In [58]:
# converting postcode to int datatype and filling null values with '0'
# replacing 0 with np.nan
df1.postcode = pd.to_numeric(df1.postcode, errors='coerce').fillna(0).astype(np.int64)
df1['postcode'].dtype
df1['postcode'] = df1['postcode'].replace(0, np.nan)

In [59]:
# checking for null values
print(df1.isnull().sum())

Latitude          0
Longitude         0
address          12
country           0
postcode        157
state             0
city council      1
suburb           12
dtype: int64


In [60]:
# filling the null values of postcode column
# groupby suburb, and transform('first') 
df1['postcode'] = df1.groupby("suburb")["postcode"].transform("first")

In [61]:
print(df1.isnull().sum())

Latitude          0
Longitude         0
address          12
country           0
postcode        124
state             0
city council      1
suburb           12
dtype: int64


In [62]:
# dropping the unrequired columns
df1 = df1.drop(['country','state','city council'], axis=1)

In [63]:
# creating a dataframe df2 with only place name column
df2 = df[['Place Name']]

In [64]:
# merging two dataframes df1 and df2
df3=df1.join(df2)
df3

Unnamed: 0,Latitude,Longitude,address,postcode,suburb,Place Name
0,-38.037796,145.266534,"Hallam Road, Hampton Park",3976.0,Hampton Park,123KIDS
1,-37.811354,145.081836,"100, Balwyn Road, Balwyn",3103.0,Balwyn,3 APPLES CHILDCARE CENTRE & KINDERGARTEN
2,-37.831777,144.825369,"Grand Star Receptions, 499, Grieve Parade, Alt...",3025.0,Altona North,5 STAR CHILDCARE & EARLY LEARNING CENTRE
3,-37.051792,143.731878,"Alma Street, Maryborough, Shire of Central Gol...",3465.0,Shire of Central Goldfields,A G LEECH KINDERGARTEN
4,-37.561997,144.715655,"Spavin Drive, Sunbury",3429.0,Sunbury,A STEP AHEAD EARLY LEARNING CENTRE
5,-37.871206,145.011624,"Stop 44: Otira Road, Balaclava Road, Caulfield...",3161.0,Caulfield North,ABELES LIBERMAN PRESCHOOL
6,-37.982668,145.086776,"Acacia Avenue, Mentone",3194.0,Mentone,ACACIA AVENUE PRESCHOOL
7,-37.814672,144.997483,"17, Belgium Avenue, Richmond Housing Estate, R...",3121.0,Richmond,ACACIA CHILDRENS CENTRE
8,-37.948480,145.144315,"Balmoral Avenue, Springvale",3171.0,Springvale,ACTIVE LEARNING CHILDCARE: BALMORAL AVENUE
9,-37.838788,144.919549,"The Boulevard, Garden City, Port Melbourne",3207.0,Port Melbourne,ADA MARY ABECKETT CHILDRENS CENTRE INC


In [65]:
#Renaming columns
df3.columns = ['lat', 'long', 'address', 'postcode', 'suburb','name']

In [66]:
# checking the number of null values in the csv file in all the columns
print(df3.isnull().sum())

lat           0
long          0
address      12
postcode    124
suburb       12
name          0
dtype: int64


In [67]:
# delete the records where suburb has null values
df3 = df3.dropna(axis=0, subset=['suburb'])
print(df3.isnull().sum())

lat           0
long          0
address       0
postcode    112
suburb        0
name          0
dtype: int64


In [72]:
# converting postcode to int datatype and filling null values with '0'
df3.postcode = pd.to_numeric(df3.postcode, errors='coerce').fillna(0).astype(np.int64)
df3['postcode'].dtype

dtype('int64')

In [69]:
# dropping the suburb column
df3 = df3.drop(['suburb'], axis=1)

In [None]:
df3 = df3.drop_duplicates(subset=['name'], keep='first')
df3

In [70]:
#Exporting to csv
df3.to_csv('kindergarten_final_file.csv',index = False)