In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from geopy.geocoders import Nominatim
import math
import time
import csv
import re


In [2]:
df_aed = pd.read_parquet('aed_locations.parquet.gzip')

In [3]:
df_aed['lat'] = None
df_aed['lon'] = None

In [181]:
df_aed.loc[6093,'postal_code']

2200.0

Cleaning the data

In [4]:
#cleaning abbreviations
for i in range(len(df_aed)):
    text = df_aed.loc[i, 'address']
    if (not text == None):
        if (re.search('Ch.',text)):
            new_text = text.replace('Ch.', 'Chaussée')
            df_aed.loc[i, 'address'] = new_text
        if (re.search('Blvd.',text)):
            new_text = text.replace('Blvd.', 'Boulevard')
            df_aed.loc[i, 'address'] = new_text
        if (re.search('sestweg',text)):
            new_text = text.replace('sestweg', 'steenweg')
            df_aed.loc[i, 'address'] = new_text 


Make structured query instead

In [5]:
geolocator = Nominatim(user_agent='Isabel')

2350 missing addresses

In [7]:
counter = 0
not_found = 0
for number in range(len(df_aed)):
    #making structure query for addresses with numbers and unstructured query for addresses without numbers
    structured_query = {}
    if not pd.isna(df_aed['postal_code'][number]):
        structured_query['postalcode']= int(df_aed.loc[number,'postal_code'])         
    structured_query['country']= 'Belgium'
    if not pd.isna(df_aed['address'][number]):
        if not pd.isna(df_aed['number'][number]):
            structured_query['street'] = f"{int(df_aed.loc[number,'number'])} {df_aed.loc[number,'address']}"
            location=geolocator.geocode(structured_query, timeout=15)
        else:
            #making unstructured search string
            address = ''
            if not pd.isna(df_aed['address'][number]):
                address = str(df_aed['address'][number])
                if not pd.isna(df_aed['number'][number]):
                    address = address+" "+str(int(df_aed['number'][number]))
                if not pd.isna(df_aed['postal_code'][number]):
                    address = address +" ,"+str(int(df_aed['postal_code'][number]))
                if not pd.isna(df_aed['municipality'][number]):
                    address = address +" ,"+str(df_aed['municipality'][number])
                address = address + ' , Belgium'
            location=geolocator.geocode(address, timeout=15)
    if location == None:
        not_found += 1
        if (not_found % 10 == 0):
            print('number of not found addresses:'+ str(not_found))
    else:
        df_aed.loc[counter,'lon'] =location.longitude
        df_aed.loc[counter,'lat'] =location.latitude
    counter += 1
    if (counter % 50 ==0):
        print('sleeping....')
        time.sleep(5)
        print('awake and count is: '+str(counter))


sleeping....
awake and count is: 50
number of not found addresses:10
sleeping....
awake and count is: 100
number of not found addresses:20
sleeping....
awake and count is: 150
number of not found addresses:30
sleeping....
awake and count is: 200
number of not found addresses:40
sleeping....
awake and count is: 250
number of not found addresses:50
sleeping....
awake and count is: 300
sleeping....
awake and count is: 350
number of not found addresses:60
sleeping....
awake and count is: 400
sleeping....
awake and count is: 450
number of not found addresses:70
sleeping....
awake and count is: 500
sleeping....
awake and count is: 550
number of not found addresses:80
sleeping....
awake and count is: 600
number of not found addresses:90
sleeping....
awake and count is: 650
number of not found addresses:100
sleeping....
awake and count is: 700
sleeping....
awake and count is: 750
number of not found addresses:110
sleeping....
awake and count is: 800
sleeping....
awake and count is: 850
number 

In [8]:
not_found

2114

In [9]:
df_aed.loc[df_aed['lon'].isnull()].iloc[1:20]

Unnamed: 0,id,type,address,number,postal_code,municipality,province,location,public,available,hours,lat,lon
15,97.0,,Sint-Pieterskerstraat,,1090.0,Brussel,Bruxelles-Brussel,,,,,,
16,98.0,,Wemmelsteenweg,100.0,1090.0,Brussel,Bruxelles-Brussel,,,,,,
27,167.0,,Vanhalstraat,1.0,1500.0,Halle,Vlaams-Brabant,,y,,,,
30,218.0,,Cours des mineurs,,4000.0,Liège,Liège,,,,,,
38,226.0,,Rue J. Stassart,15.0,4367.0,Crisnée,Liège,,,,,,
46,234.0,,Rue av. Peltzer,40.0,4020.0,Liège,Liège,,,,,,
47,235.0,,Route du Canada,157.0,4910.0,La Reid,Liège,,,,,,
73,349.0,,Haven 1968 Sint Annalaan,1.0,9130.0,Kallo,Oost-Vlaanderen,,Y,,,,
74,354.0,,Bld Leopold II,44.0,1080.0,Bruxelles,Bruxelles-Brussel,,,,,,
76,356.0,,Bld. Du jardin Botanique,,1000.0,Bruxelles,Bruxelles-Brussel,,,,,,


In [10]:
file_path = 'aed_coordinates2.csv'

In [11]:
# Write the list to a CSV file
df_aed.to_csv(file_path, index=False)
print("CSV file saved successfully.")

CSV file saved successfully.


In [103]:
df_aed['lat'].min()

-62.1881163

handle the aed addresses that were not read the first time, use a different geolocator


In [79]:
px.set_mapbox_access_token('pk.eyJ1Ijoia2V6aWFoZHV0dCIsImEiOiJjbHZnaXo3Y3cwcW16Mmpudnh6anRzZXp5In0.JawqtWmD9uBcGjDAt8C5zg')
center_lat = 50.5  # Latitude of the center of the country
center_lon = 4.3517  # Longitude of the center of the country
zoom_level = 7       # Zoom level (1-20)
# Create the base map
map_base = px.scatter_mapbox(lat=[center_lat], lon=[center_lon], zoom=zoom_level)

# Update the layout of the base map
map_base.update_layout(mapbox_style="light")

# Add the scatter plot layer with the coordinate locations
aed_location_layer = px.scatter_mapbox(df_aed, lat='lat', lon='lon')

# Add the scatter plot layer to the base map
map_with_aed_points = map_base.add_trace(aed_location_layer.data[0])

# Show the map
map_with_aed_points.show()