This code identifies rows in a file with geodata that do not contain values for latitudes and longitudes.

It performs two actions:

1) Missing geodata can in some cases be copied from other rows that mention the same place name. The script does this wherever possible.

2) If no geodata are known for a place name, the script writes the rows with the missing information to a new Excel file.

In [1]:
# import relevant packages
import pandas as pd
import os

# Load the address table from the local Excel workbook.
# dtype=str makes pandas read every column as a string.

# Alternative remote input (currently disabled):
#infile="https://github.com/MonikaBarget/GeoHumTutorials/blob/master/Colab_Geocoding/Addresses_AP3.xlsx?raw=true"

input_workbook = 'Ortsontologie-Geocoded-geprüft_updatesJuly2024.xlsx'
addresses_df = pd.read_excel(input_workbook, dtype=str)
display(addresses_df)

Unnamed: 0,place_name_old,place_name,place_editorial_comment,place_suffix,place_community,place_region_1,place_region_2,place_continent,place_variant_1,place_variant_2,...,place_source,place_address_full,place_geonames_id,place_geonames_address,place_lat_geonames,place_lng_geonames,place_lat_google,place_lng_google,place_google_address,Column 22
0,Aachen,Aachen,,,,,,Europa,,,...,ProfAPI,"Aachen, Europe",3247449,Aachen,50.77664,6.08342,50.7753455,6.083886800000001,"Aachen, Germany",
1,Aquisgr,Aachen,,,,,,Europa,,,...,Universitätsmatrikeln,"Aachen, Europe",3247449,Aachen,50.77664,6.08342,50.7753455,6.083886800000001,"Aachen, Germany",
2,Aquisgr.,Aachen,,,,,,Europa,,,...,Universitätsmatrikeln,"Aachen, Europe",3247449,Aachen,50.77664,6.08342,50.7753455,6.083886800000001,"Aachen, Germany",
3,Aquisgran,Aachen,,,,,,Europa,,,...,Universitätsmatrikeln,"Aachen, Europe",3247449,Aachen,50.77664,6.08342,50.7753455,6.083886800000001,"Aachen, Germany",
4,Aquisgranensis,Aachen,,,,,,Europa,,,...,Universitätsmatrikeln,"Aachen, Europe",3247449,Aachen,50.77664,6.08342,50.7753455,6.083886800000001,"Aachen, Germany",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4257,Gröleck,Gröleck,unbekannt,,,,,,,,...,,,,,,,,,,
4258,Hartöller,Hartöller,unbekannt,,,,,,,,...,,,,,,,,,,
4259,Hesselbach,"Hesselbach, Odenwald",,,,,,,,,...,,,,,,,,,,
4260,Noviomagiensis,Speyer,unklar,,,,,,,,...,,,,,,,,,,


In [2]:
# Fill missing geodata based on identical place names.
# Sorting by place name is not required by groupby itself, but it determines
# the row order of the dataframe that is saved below.
df = addresses_df.sort_values('place_name')

# Columns to fill
geodata_columns = ['place_lat_google', 'place_lng_google', 'place_lat_geonames', 'place_lng_geonames']

for col in geodata_columns:
    # Within each place-name group, propagate known values forward, then backward.
    # Series.ffill()/bfill() replace fillna(method=...), which is deprecated since pandas 2.1.
    group_filled = df.groupby('place_name')[col].transform(lambda values: values.ffill().bfill())
    # groupby skips rows whose place_name is missing and transform yields NaN for them.
    # Only fill the gaps, so geodata already present on such rows is not overwritten with NaN.
    df[col] = df[col].where(df[col].notna(), group_filled)

# Save the updated dataframe to a new Excel file
updated_file = "Ortsontologie_Geocoded_known-geodata-added.xlsx"
df.to_excel(updated_file, index=False)

print(f"Updated data saved to {updated_file}")

Updated data saved to Ortsontologie_Geocoded_known-geodata-added.xlsx


In [3]:
# Collect the rows in which all four coordinate columns are empty, i.e. rows
# that have neither a Geonames nor a Google geolocation.
# Note: a row holding only part of a coordinate pair (e.g. a latitude without
# a longitude) is not selected by this condition.
coordinate_columns = [
    'place_lat_geonames',
    'place_lng_geonames',
    'place_lat_google',
    'place_lng_google',
]
all_coordinates_missing = df[coordinate_columns].isna().all(axis=1)
missing_geolocation_df = df[all_coordinates_missing]

# Write the selection to its own Excel workbook
missing_geolocation_file_path = "AP3_Missing_Geolocation_Data.xlsx"
missing_geolocation_df.to_excel(missing_geolocation_file_path, index=False)

print("Done!")

Done!
