# Geoparsing locations of Madagascar from AidData's dataset collection

In [2]:
#Importing modules
import pandas as pd
import geopandas as gpd
import folium
import json
import numpy as np

from geopy.geocoders import GeoNames

The data package I had was altered during the formation process so the field that I was meant to inspect didn't work as it should - the  UTF-8 encoding had dropped out so the Spacy wasn't able to read it. Fortunately, the original data set was still easily accessible and I just reread it and combined it with manually collected data, which was _again_ fortunately the part of the data that didn't contain any special characters.

In the merging process the data didn't work as expected in the first place. Data was read in CSV and xlsx so it probably caused the problem - some fields were in integer form and some of them, in float. For example, years attained some extra zeros after merging process. I decided to handle only necessary fields, not all of them, so the data has some errors. Let's see how far it can take us like this.

There were also some problems with column names, because duplicates caused suffixes to indicate the origin of the column name, which had to be handled for the latter parts of the processing.

## Finally, we have appropriate data to geocode!

In [58]:
# Reading the cleaned data
fgcode = pd.read_csv(r'D:\UH_Madagascar\Data\geocoded_by_IA\combination_spacy_geocoders_origdata_cleannames.csv')
#fgcode = pd.read_csv(r'name_explosion.csv') #This one goes for every roeeee

#Filling Nan-values in column 
fgcode['place_names'] = fgcode['place_names'].fillna(0)

# Selecting rows with other than 0-values
fgcode_g = fgcode[(fgcode['g_lat'] != 0)]# & (fgcode['id'] <=6999)] #he last index with 1300 head is 1299-12773, so next time between 1299 and 12773
fgcode_n = fgcode[(fgcode['n_lat'] != 0)]
fgcode_p = fgcode[(fgcode['p_lat'] != 0)]

# Free version of GeoNames has daily quota limitations, which reguired some filtering
#fgcode = fgcode[(fgcode['place_names'] >= 7000)]# & (fgcode['id'] <=6999)] #he last index with 1300 head is 1299-12773, so next time between 1299 and 12773
fgcode_n

  fgcode = pd.read_csv(r'D:\UH_Madagascar\Data\geocoded_by_IA\combination_spacy_geocoders_origdata_cleannames.csv')


Unnamed: 0,aiddata_id,aiddata_2_id,year,donor,donor_iso,donor_region,implementing_agency,financing_agency,crs_bi_multi,recipient,...,ent_label,coordinates_pel,p_lon,p_lat,coordinates_nom,n_lon,n_lat,coordinates_gn,g_lon,g_lat
9,56986129,27739963,2006,France,FR,Europe,Public sector,AFD,1,Madagascar,...,LOC,"[[-18.628414, 46.704055]]",-18.628414,46.704055,"[[-18.9249604, 46.4416422]]",-18.924960,46.441642,"[[-20.0, 47.0]]",-20.00000,47.00000
23,56955232,18646796,2002,European Communities (EC),0,Europe,0,CEC,1,Madagascar,...,GPE,"[[-18.628414, 46.704055]]",-18.628414,46.704055,"[[-18.9249604, 46.4416422]]",-18.924960,46.441642,"[[-20.0, 47.0]]",-20.00000,47.00000
29,56972700,28299995,2006,Norway,NO,Europe,Fredskorpset,MFA,1,Madagascar,...,GPE,"[[-18.628414, 46.704055], [-18.146694, 49.3954...",-18.628414,46.704055,"[[-20.53211125, 47.2427255833]]",-20.532111,47.242726,[],0.00000,0.00000
47,56859894,31285615,2007,France,FR,Europe,Public sector,AFD,1,Madagascar,...,LOC,"[[-18.628414, 46.704055]]",-18.628414,46.704055,"[[-18.9249604, 46.4416422]]",-18.924960,46.441642,"[[-20.0, 47.0]]",-20.00000,47.00000
58,56665431,38909239,2008,European Communities (EC),0,Europe,"PUBLIC SECTOR (donor, recipient, other)",EDF,1,Madagascar,...,LOC,"[[-21.316258, 47.872655]]",-21.316258,47.872655,"[[-22.81679105, 47.83403105]]",-22.816791,47.834031,"[[-21.3, 47.9]]",-21.30000,47.90000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15622,0,0,2018,0,0,0,0,0,1,0,...,LOC,"[[-18.628414, 46.704055]]",-18.628414,46.704055,"[[-18.9249604, 46.4416422]]",-18.924960,46.441642,"[[-20.0, 47.0]]",-20.00000,47.00000
15625,0,0,2018,0,0,0,0,0,1,0,...,LOC,"[[-22.20984, 45.977182], [-21.45267, 47.08569]]",-22.209840,45.977182,"[[-21.456444, 47.085149]]",-21.456444,47.085149,"[[-21.45267, 47.08569]]",-21.45267,47.08569
15626,0,0,2018,0,0,0,0,0,1,0,...,LOC,"[[-18.628414, 46.704055]]",-18.628414,46.704055,"[[-18.9249604, 46.4416422]]",-18.924960,46.441642,"[[-20.0, 47.0]]",-20.00000,47.00000
15651,0,0,2018,0,0,0,0,0,1,0,...,LOC,"[[-18.628414, 46.704055]]",-18.628414,46.704055,"[[-18.9249604, 46.4416422]]",-18.924960,46.441642,"[[-20.0, 47.0]]",-20.00000,47.00000


In [50]:
# Geocoder definition for GeoNames
# Username and registration in geonames-portal is mandatory.
# Also, it works correctly if the placenames are stored in lists '[]', 
# otherwise it tries to recognise separate letters.

from geopy.distance import geodesic


geolocator = GeoNames(username = "username")

# Madagascar center and buffer in kilometers

madagascar_center = (-20.0, 47.0)
buffer_distance_km = 900

# Function to check if coordinates are within the bounding area
def is_within_buffer(coords, center, buffer_km):
    """Determines whether the coordinates are within the buffer or not. 
    Returns false if not.
    """
    # Ensure valid coordinates
    if not coords:
        return False
    # Calculate distance from the center
    distance = geodesic(coords, center).km
    return distance <= buffer_km

# Function to geocode place names and filter by bounding area
def geocode_place(place_names):
    """Seeks for place names that are connected to the coordinates/locations in API.
    Then checks that the coordinates are filling the condition using
    the former definition.
    Returns an error message on failure
    """
    filtered_coordinates = []
    for place in place_names:
        try:
            location = geolocator.geocode(place, timeout=20) # Timeout added so the reties do not hit api limits
            if location:
                coords = (location.latitude, location.longitude)
                if is_within_buffer(coords, madagascar_center, buffer_distance_km):
                    filtered_coordinates.append(coords)
        except Exception as e:
            print(f"Error geocoding {place}: {e}")
    return filtered_coordinates

I noticed that many place names are not detected - most of them are in Madagascar. I was wondering whether the NLP finds commonly used names with lower effort (with less iterations) and then 'prioritizes' these, or gives hits for these and could not process the rest. -- After indspection noticed that I am using small versions of both NLP's ("en_core_web_sm") instead of medium or large ("en_core_web_md"/"en_core_web_lg")

Examples:
- aiddata_id: 94543904 > Mahajanga
- 906000839711 > Melaky
- 74583066 > Vavatenina

At first the code above found places all over the world, so I decided to frame it little bit, so it would select only locations inside a buffer of 500 km from Madagascar center and if the Biodiversity relatedness is classified as '1', meaning that it is highly related. However, the definitions should be tweaked little bit more.

In [64]:
# Geocode the 'place_names' column and filter coordinates
# Returns data into new column

fgcode['coordinates_nom'] = fgcode['place_names'].apply(geocode_place)

In [67]:
# Returned coordinates are stored in a form which might loose its structure if saved in csv or text-form.
# It would e the best to store them in json, so the coordinate tupples do not loose the structure if analysed afterwards.
# I saved the results into own file after each run.

import json 

fgcode.to_json("gn_expl_coords_1k.json", orient = "records", indent = 4)

In [79]:
# After reading these the coordinate data contains proper features for MultiPoints. 
# Though, they will be exploded and combined!

rows2 = pd.read_json(r'gn_expl_coords_2_5k.json')
rows3 = pd.read_json(r'gn_expl_coords_7k.json')
rows4 = pd.read_json(r'gn_expl_coords_10k.json')
rows5 = pd.read_json(r'gn_expl_coords_12k.json')
rows6 = pd.read_json(r'gn_expl_coords_1k.json')



### V Merge doesn't work here - it destroys coordinate tuple datatype..

 -> Do we need them or are we exploding lists anyway?? They could still be identified by id, yet not combined as the order can't be maintained

In [None]:
combined_df

In [54]:
rows1 = [rows2, rows3, rows4, rows5, rows6]
rows1

[     aiddata_id  aiddata_2_id  year                      donor donor_iso  \
 0      52934868      24423141  2005                     Norway        NO   
 1      52934868      24423141  2005                     Norway        NO   
 2      52947947      35447495  2008                     France        FR   
 3      52965543      35685717  2008                    Germany        DE   
 4      52965543      35685717  2008                    Germany        DE   
 ..          ...           ...   ...                        ...       ...   
 671    51552253      22762665  2004              United States        US   
 672    51596814      28132081  2006                      Italy        IT   
 673    51703192      16511106  2001              United States        US   
 674    51990763      15218315  2000  European Communities (EC)         0   
 675    52124936      30380235  2006  European Communities (EC)         0   
 
                 donor_region  \
 0                     Europe   
 1      

In [58]:
# This changes the type of nested lists - they are not recognised as lists of coordinate pairs for some reason after concat
geonams = pd.concat([pd.DataFrame(table) for table in rows1], ignore_index=True)


Unnamed: 0,aiddata_id,aiddata_2_id,year,donor,donor_iso,donor_region,implementing_agency,financing_agency,crs_bi_multi,recipient,...,2.Ecosystem_name,WB_location,WB_GeoLocID,WB_GeoLocName,WB_Latitude,WB_Longitude,WB_Country,id,place_names,coordinates_gn
0,52934868,24423141,2005,Norway,NO,Europe,DET NORSKE MISJONSSELSKAP,MFA,1,Madagascar,...,0,0,0,0,0,0,0,5418,[MADAGASCAR],"[[-20.0, 47.0]]"
1,52934868,24423141,2005,Norway,NO,Europe,DET NORSKE MISJONSSELSKAP,MFA,1,Madagascar,...,0,0,0,0,0,0,0,5418,[MADAGASCAR],"[[-20.0, 47.0]]"
2,52947947,35447495,2008,France,FR,Europe,0,MINEFI,1,Madagascar,...,0,0,0,0,0,0,0,5419,[APD],[]
3,52965543,35685717,2008,Germany,DE,Europe,Federal Ministry for Economic Cooperation and ...,BMZ,1,Madagascar,...,0,0,0,0,0,0,0,5422,[Entwicklung],[]
4,52965543,35685717,2008,Germany,DE,Europe,Federal Ministry for Economic Cooperation and ...,BMZ,1,Madagascar,...,0,0,0,0,0,0,0,5422,[Sub-Sahara Afrika],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3906,116717842,0,2012,Italy,IT,Europe,Malagasy Red Cross,MISC,1,Madagascar,...,0,0,0,0,0,0,0,1984,[Midongy],"[[-23.3431, 46.99673]]"
3907,116796362,0,2012,Switzerland,CH,Europe,0,SDC,1,Madagascar,...,0,0,0,0,0,0,0,1990,[Madagascar],"[[-20.0, 47.0]]"
3908,116796362,0,2012,Switzerland,CH,Europe,0,SDC,1,Madagascar,...,0,0,0,0,0,0,0,1990,[Menabe],"[[-20.2, 45.07]]"
3909,116796362,0,2012,Switzerland,CH,Europe,0,SDC,1,Madagascar,...,0,0,0,0,0,0,0,1990,[Vakinankaratra],"[[-19.7, 46.9]]"


In [59]:
geonams.to_json("gn_expl_coords_comb.json", orient = "records", indent = 4)

In [None]:
# All geocoded datasets are now exploded one by one so there is no multiplyed coordinates 
# CHECK CODE 'Cleaning_Annotation_data' for coordinate separation - no the data could be combined 
expl_noms = noms.explode('coordinates_nom')
expl_pel = pelias.explode('coordinates_pel')
expl_geonams = geonams.explode('coordinates_gn')

In [None]:
# This was the first idea to combine coordinates into one dataframe - could be the final dataset, but does not work before explosions
combined_df = pd.merge(pelias, combination, 
                           on= ['id'],
                               suffixes = ('', '_x'),
                               how='outer')


combined_df = combined_df[[c for c in combined_df.columns if not c.endswith('_x')]]

In [None]:
#combined_df = combined_df.drop_duplicates(subset=['id'])

In [None]:
# Saving the progress, or the results in .json, so the lists are stored properly instead of csv format converting them into strings
import json
combined_df.to_json("extr_coords_merge.json", orient = "records", indent = 4)

In [None]:
# Calculating places

total_places_count = combined_df.copy()
total_places_count['coordinates_gn'] = combined_df['coordinates_gn'].astype('str')

total_places_count = total_places_count[total_places_count['coordinates_gn'] != '[]']
total_places_count = total_places_count.fillna('0')
#print(f"Total places detected: {total_places_count}")
total_places_count 

In [19]:
fgcode['place_names']

9             Madagascar
23            MADAGASCAR
28       North and South
29       Madagascar YWCA
30            Kenya YMCA
              ...       
15834              TrodT
15835             pr(nTe
15836          la France
15837              Niger
15838             succFs
Name: place_names, Length: 3928, dtype: object

In [4]:
import matplotlib

In [60]:
# Create a Folium map with only MDG locations for all data
import folium
madagascar_center = (-20.0, 47.0)

m = folium.Map(location = madagascar_center, zoom_start = 5.45, tiles = "CartoDB positron")

# Creating feature groups for each geocoder
Pelias = folium.FeatureGroup('Pelias').add_to(m)
Nominatim = folium.FeatureGroup('Nominatim').add_to(m)
GeoNames = folium.FeatureGroup('GeoNames').add_to(m)

for _, row in fgcode_g.iterrows():
        folium.CircleMarker(
            location = [row['g_lon'], row['g_lat']],
            color = "#7f31cc",
            fill = True,
            opacity = 0.3,
            radius = 5,
            fill_opacity = 0.3,
            popup=f"Places: {row['place_names']}",
        ).add_to(GeoNames)

for _, row in fgcode_p.iterrows():
        folium.CircleMarker(
            location=[row['p_lon'], row['p_lat']],
            color = "#f5be40",
            fill = True,
            opacity = 0.3,
            radius = 5,
            fill_opacity = 0.3,
            popup=f"Places: {row['place_names']}",
        ).add_to(Pelias)

for _, row in fgcode_n.iterrows():
        folium.CircleMarker(
            location=[row['n_lon'], row['n_lat']],
            color = "#4091f5",
            fill = True,
            opacity = 0.3,
            radius = 5,
            fill_opacity = 0.3,
            popup=f"Places: {row['place_names']}",
        ).add_to(Nominatim)

folium.LayerControl().add_to(m)

<folium.map.LayerControl at 0x1c72fa8a7d0>

In [61]:
m

In [7]:
# This is for coordinate tuples that disappear after saving data as csv
# Create a Folium map with only MDG locations
import folium
madagascar_center = (-20.0, 47.0)

# Initialize Folium map centered on the first valid location
m = folium.Map(location = madagascar_center, zoom_start = 5.45)


# Add markers for each coordinate tuple 

for _, row in fgcode.iterrows():
    for coord in row['coordinates_gn']:  # Each `coord` is a tuple (lon, lat)
        folium.Marker(
            location=[coord[0], coord[1]],
            color="#3186cc",
            popup=f"Places: {', '.join(row['place_names'])}"
        ).add_to(m)


folium.LayerControl().add_to(m)


#print("Filtered map has been saved as Aiddata_mdg_map_filtered.html")
m



IndexError: string index out of range

In [62]:
# Save map to a HTML file
m.save("Aiddata_mdg_map_all.html")