In [1]:
import pandas as pd
import requests
import time
import re
from geopy.geocoders import Nominatim

In [2]:
df_aed = pd.read_parquet('aed_locations.parquet.gzip')
geolocator = Nominatim(user_agent='Isabel')

In [3]:
class AEDcoordinates_cleaner:
    def __init__(self, dataset):
        self.dataset = dataset.copy()
        self.dataset['lat'] = None
        self.dataset['lon'] = None
        self.abbrev = False
        self.addresses_made = False
        self.cleaning_numbers = False

    def cleaning_house_numbers(self):
        self.dataset = self.dataset.dropna(subset =['number']).reset_index(drop= True)
        self.dataset = self.dataset.dropna(subset =['address']).reset_index(drop= True)
        self.dataset = self.dataset.dropna(subset =['postal_code']).reset_index(drop= True)
        self.cleaning_numbers = True
        print('cleaned data')

    def cleaning_abbreviations(self):
        self.dataset['address'] = self.dataset['address'].astype(str)  # Ensure all entries are strings
        replacements = {
        r'Ch\.': 'Chaussée',
        r'Blvd\.': 'Boulevard',
        'Bld': 'Boulevard',
        'Bvd': 'Boulevard',
        'sestweg': 'sesteenweg',
        r'Av\.' : 'Avenue'
        }
        self.dataset['address'] = self.dataset['address'].replace(replacements, regex=True)
        self.abbrev = True
        print('cleaned abbreviations')

    def make_address(self):
        self.dataset['full_address'] = ''
        for number in range(len(self.dataset)):
            address = ''
            if not pd.isna(self.dataset['address'][number]):
                address = str(self.dataset['address'][number])
                if not pd.isna(self.dataset['number'][number]):
                    address = address+" "+str(int(self.dataset['number'][number]))
                if not pd.isna(self.dataset['postal_code'][number]):
                    address = address +" ,"+str(int(self.dataset['postal_code'][number]))       
                if not pd.isna(self.dataset['municipality'][number]):
                    address = address +" ,"+str(self.dataset['municipality'][number])
            self.dataset.loc[number,'full_address'] = address
        self.addresses_made = True
        print('made addresses')
    
    def is_initialized(self):
        if self.abbrev == False:
            self.cleaning_abbreviations()
        if self.addresses_made == False:
            self.make_address()
        if self.cleaning_numbers == False:
            self.cleaning_house_numbers()
        print('is initialized')

    def use_Nominatim_geocoder(self):
        self.is_initialized()
        self.not_found = 0
        for number in range(len(self.dataset)):
            structured_query = {}
            structured_query['postalcode']= int(self.dataset.loc[number,'postal_code'])         
            structured_query['country']= 'Belgium'
            structured_query['street'] = f"{int(self.dataset.loc[number,'number'])} {self.dataset.loc[number,'address']}"
            location=geolocator.geocode(structured_query, timeout=15)
            
            if location is not None:
                self.dataset.loc[number,'lon'] =location.longitude
                self.dataset.loc[number,'lat'] =location.latitude
            else:
                self.not_found +=1
            if (number % 50 ==0):
                time.sleep(5)

    def use_openroute_geocoder(self, limit):
        self.is_initialized()
        self.counter = 1
        for row in range(len(self.dataset)):
            if self.counter <= limit:
                if pd.isna(self.dataset.loc[row, 'lat']): 
                    url_start = 'https://api.openrouteservice.org/geocode/search?'
                    key = 'api_key=5b3ce3597851110001cf6248d11d604663d24d9395ef7a513daed710&text='
                    url_end = '&boundary.country=BE'
                    address = self.dataset.loc[row, 'full_address']
                    new_address = address.replace(' ', '%')
                    url = url_start+ key+ new_address+url_end
                    response = requests.get(url)
                    if response.status_code == 200:
                        if (self.counter % 50 ==0):
                            time.sleep(10)
                        self.counter+= 1
                        data = response.json()
                        #get the coordinates
                        if not data['features'] == []:
                            coordinates = data['features'][0]['geometry']['coordinates']
                            #save longitude and latitude in the dataframe
                            self.dataset.loc[row,'lon'] =coordinates[0]
                            self.dataset.loc[row,'lat'] =coordinates[1]
            else:
                print('limit reached')
                break


In [4]:
#split the dataset in smaller portions to ensure the geocoders run smoothly
df_aed_1 = df_aed[:3000]
df_aed_2 = df_aed[3001:6000].reset_index(drop=True)
df_aed_3 = df_aed[6001:9000].reset_index(drop=True)
df_aed_4 = df_aed[9001:12000].reset_index(drop=True)
df_aed_5 = df_aed[12001:].reset_index(drop=True)

In [5]:
aed_coordinates1 = AEDcoordinates_cleaner(df_aed_1)

aed_coordinates2 = AEDcoordinates_cleaner(df_aed_2)

aed_coordinates3 = AEDcoordinates_cleaner(df_aed_3)

aed_coordinates4 = AEDcoordinates_cleaner(df_aed_4)

aed_coordinates5 = AEDcoordinates_cleaner(df_aed_5)

In [6]:
aed_coordinates1.use_Nominatim_geocoder()
print('1 done')
print(aed_coordinates1.not_found)
aed_coordinates2.use_Nominatim_geocoder()
print('2 done')
print(aed_coordinates2.not_found)

cleaned abbreviations
made addresses
cleaned data
is initialized
1 done
330
cleaned abbreviations
made addresses
cleaned data
is initialized
2 done
321


In [7]:
aed_coordinates3.use_Nominatim_geocoder()
print('3 done')
print(aed_coordinates3.not_found)
aed_coordinates4.use_Nominatim_geocoder()
print('4 done')
print(aed_coordinates4.not_found)
aed_coordinates5.use_Nominatim_geocoder()
print('5 done')
print(aed_coordinates5.not_found)

cleaned abbreviations
made addresses
cleaned data
is initialized
3 done
395
cleaned abbreviations
made addresses
cleaned data
is initialized
4 done
274
cleaned abbreviations
made addresses
cleaned data
is initialized
5 done
238


In [8]:
aed_coordinates1.not_found+aed_coordinates2.not_found+aed_coordinates3.not_found+aed_coordinates4.not_found+aed_coordinates5.not_found

1558

In [9]:
limit = 5000 #maximum 5000
aed_coordinates1.use_openroute_geocoder(limit)

is initialized


In [10]:
limit = limit -aed_coordinates1.not_found
if limit > 0:
    aed_coordinates2.use_openroute_geocoder(limit)
    print('2 done')
limit = limit -aed_coordinates2.not_found
if limit > 0:
    aed_coordinates3.use_openroute_geocoder(limit)
    print('3 done')
limit = limit -aed_coordinates3.not_found
if limit > 0:
    aed_coordinates4.use_openroute_geocoder(limit)
    print('4 done')
limit = limit -aed_coordinates4.not_found
if limit > 0:
    aed_coordinates5.use_openroute_geocoder(limit)

is initialized
2 done
is initialized
3 done
is initialized
4 done
is initialized


In [11]:
df_aed_w_coords = pd.DataFrame()
aed_coordinates_list = [aed_coordinates1.dataset, aed_coordinates2.dataset, aed_coordinates3.dataset, aed_coordinates4.dataset, aed_coordinates5.dataset]
df_aed_w_coords = pd.concat(aed_coordinates_list, ignore_index=True)
filepath = 'all_aed_coordinates.csv'
df_aed_w_coords.to_csv(filepath, index=False)

In [12]:
filepath = 'all_aed_coordinates.csv'

df_aed = pd.read_csv(filepath)

In [13]:
df_aed_missing = df_aed.loc[df_aed['lat'].isna()]

In [15]:
len(df_aed_missing)

964

In [16]:
df_aed_missing = df_aed.loc[df_aed['lat'].isna()]
df_aed_missing_nonumber = df_aed_missing.dropna(subset=['number']).reset_index()
df_aed_missing_nonumber.loc[df_aed_missing_nonumber['municipality']=='Bruxelles']['address'].value_counts()

address
Station Gare du Midi        9
Avenure de Tervueren        8
Boulevard Simon Bolivas     6
Boulevard Simon Bolivas     4
Bouleard Simon Bolivar      3
                           ..
Drève de Prieuré            1
Rue de la Cible             1
Rue de la Perle ar)         1
Chée de Vilvorde            1
Boulevard Simon bolivas     1
Name: count, Length: 70, dtype: int64

In [20]:
df_aed_missing_nonumber.loc[df_aed_missing_nonumber['municipality']=='Brugge']['address'].value_counts()

address
Malesteenweg            1
Kasernevest             1
Buiten de smedenvest    1
Name: count, dtype: int64