# Geoparsing locations of Madagascar from AidData's dataset collection

In [2]:
# Large packages of Spacy were included in 10th of Jan, 2025

#!python -m spacy download en_core_web_lg
#!python -m spacy download fr_core_news_lg

#!python -m spacy download en_core_web_trf
#!python -m spacy download fr_core_news_trf
# ^^ Transformers had problems with bert tokenizer - maybe some issues in versioning ?


For some reason I had several compatability problems when modules of Spacy and Pandas were overlapping. I tried to replace pandas with Openpyxl, which didn't work in the process. Eventually, the problem just disappeared.


In [1]:
#Importing modules
import pandas as pd
import geopandas as gpd
import folium

#import openpyxl
import requests  # To make Google Maps Geocoding API requests
import spacy #NLP
from spacy.language import Language
from spacy_langdetect import LanguageDetector  # For language detection
import numpy as np
import json

from geopy.geocoders import Nominatim # OSM geocoding

from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
# from tokenizers import BertWordPieceTokenizer #Bert tokenizer would be reasonable to incude into the process
# pip install mordecai3 # Mordecai should be considered as well
#from spacy.tokens import Doc

The data package I had was altered during the formation process so the field that I was meant to inspect didn't work as it should - the  UTF-8 encoding had dropped out so the Spacy wasn't able to read it. Fortunately, the original data set was still easily accessible and I just reread it and combined it with manually collected data, which was _again_ fortunately the part of the data that didn't contain any special characters.

In the merging process the data didn't work as expected in the first place. Data was read in CSV and xlsx so it probably caused the problem - some fields were in integer form and some of them, in float. For example, years attained some extra zeros after merging process. I decided to handle only necessary fields, not all of them, so the data has some errors. Let's see how far it can take us like this.

There were also some problems with column names, because duplicates caused suffixes to indicate the origin of the column name, which had to be handled for the latter parts of the processing.

## Geocoding

First, we just have to detect the place names from that descriptive text field. In the last round, on 5th on Dec, I excluded all the other values from the classification of Biodiversity relatedness, so only related fields are in. The procedure increased hits from around 2000 to 400.

In [None]:
# Creating uniformed id for each funding incidence
fvalida = pd.read_csv(r'D:\UH_Madagascar\combined_df_rem_w_br.csv', encoding='utf-8')
fvalida = fvalida.fillna(0)

fvalida['id'] = fvalida.index + 1
fvalida['id']

fvalida.to_csv('combined_df_rem_w_br_wids.csv', index=False, encoding='utf-8')

In [2]:
#This is for full data - if only geocoding, use the set with exploded place names for full geoparsing use full set
fgcode = pd.read_csv(r'D:\UH_Madagascar\combined_df_rem_w_br_wids.csv', encoding='utf-8')

# Additionally, replace Nans with '0' and exclude them
fgcode.fillna("", inplace = True)

fgcode

  fgcode = pd.read_csv(r'D:\UH_Madagascar\combined_df_rem_w_br_wids.csv', encoding='utf-8')


Unnamed: 0,aiddata_id,aiddata_2_id,year,donor,donor_iso,donor_region,implementing_agency,financing_agency,crs_bi_multi,recipient,...,1.Ecosystem_name,2. Ecosystem_code,2.Ecosystem_name,WB_location,WB_GeoLocID,WB_GeoLocName,WB_Latitude,WB_Longitude,WB_Country,id
0,57106484,20554281,2003,Japan,JP,Far East Asia,0,JICA,1.0,Madagascar,...,0,0.0,0,0,0,0,0,0,0,1
1,57103097,10374723,1995,European Communities (EC),0,Europe,0,EDF,1.0,Madagascar,...,0,0.0,0,0,0,0,0,0,0,2
2,57051787,37965681,2008,United States,US,North & Central America,"PUBLIC SECTOR (donor, recipient, other)",MCC,1.0,Madagascar,...,0,0.0,0,0,0,0,0,0,0,3
3,57053923,29709047,2006,United States,US,North & Central America,NGO in Donor Country,AID,1.0,Madagascar,...,0,0.0,0,0,0,0,0,0,0,4
4,57055442,17327719,2002,Germany,DE,Europe,0,Found,1.0,Madagascar,...,0,0.0,0,0,0,0,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14268,0,0,2018,0,0,0,0,0,1.0,0,...,0,0.0,0,0,0,0,0,0,0,14269
14269,0,0,2018,0,0,0,0,0,1.0,0,...,0,0.0,0,0,0,0,0,0,0,14270
14270,0,0,2018,0,0,0,0,0,1.0,0,...,0,0.0,0,0,0,0,0,0,0,14271
14271,0,0,2018,0,0,0,0,0,1.0,0,...,0,0.0,0,0,0,0,0,0,0,14272


In [3]:

# Function to load spaCy models
def get_lang_detector(nlp, name):
    return LanguageDetector()

# Loading Natural Language processing libraries of English and French
# NLP is AI based library, which contains language models and pretrained pipelines 

#nlp_en = spacy.load("en_core_web_lg", disable=['tagger', 'lemmatizer', 'textcat'])
#nlp_fr = spacy.load("fr_core_news_lg", disable=['tagger', 'lemmatizer', 'textcat'])
nlp_en = spacy.load("en_core_web_lg")
nlp_fr = spacy.load("fr_core_news_lg")

# Add language detection to english model
Language.factory("language_detector", func=get_lang_detector)
nlp_en.add_pipe('language_detector', last=True)

# Add language detection to english model
Language.factory("language_detector", func=get_lang_detector)
nlp_fr.add_pipe('language_detector', last=True)


<spacy_langdetect.spacy_langdetect.LanguageDetector at 0x21e1f5d71d0>

In [6]:
pip show spacy

Name: spacy
Version: 3.8.4
Summary: Industrial-strength Natural Language Processing (NLP) in Python
Home-page: https://spacy.io
Author: Explosion
Author-email: contact@explosion.ai
License: MIT
Location: C:\Hyapp\Anaconda3-2023.07-2\Lib\site-packages
Requires: catalogue, cymem, jinja2, langcodes, murmurhash, numpy, packaging, preshed, pydantic, requests, setuptools, spacy-legacy, spacy-loggers, srsly, thinc, tqdm, typer, wasabi, weasel
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [4]:
# Function for place name extraction

places_df = pd.DataFrame({
    "text": fgcode['long_description']
})

# Function to extract place names
def extract_place_names(text):
    """Extracts place names from the text string. 
    Returns empty object on failure.
    """
    if not isinstance(text, str) or not text.strip():
        return []
    
    # Detect language - English
    """Determines by language recognition whether to use
    English or French library. 
    Returns empty object on failure
    """
    doc = nlp_en(text)
    lang = doc._.language['language']
    
    # Process with the corresponding spaCy model

    if lang == 'en':
        doc = nlp_en(text)
    elif lang == 'fr':
        doc = nlp_fr(text)
    else:
        doc = nlp_en(text)  # For unsupported languages // comparison did not show any differences
        #return = [] #used to be empty list
        
    # Extract named entities (GPE= Geopolitical Entity, LOC= Location)
    place_names = [ent.text for ent in doc.ents if ent.label_ in {"GPE", "LOC"}]
    ent_start = [ent.start_char for ent in doc.ents if ent.label_ in {"GPE", "LOC"}]
    ent_end = [ent.end_char for ent in doc.ents if ent.label_ in {"GPE", "LOC"}]
    ent_label = [ent.label_ for ent in doc.ents if ent.label_ in {"GPE", "LOC"}]
    return place_names, ent_start, ent_end, ent_label


I noticed that many place names are not detected - most of them are in Madagascar. I was wondering whether the NLP finds commonly used names with lower effort (with less iterations) and then 'prioritizes' these, or gives hits for these and could not process the rest. -- After indspection noticed that I am using small versions of both NLP's ("en_core_web_sm") instead of medium or large ("en_core_web_md"/"en_core_web_lg")

Examples:
- aiddata_id: 94543904 > Mahajanga
- 906000839711 > Melaky
- 74583066 > Vavatenina

In [5]:
# Apply the function to the text column
# Return results into three new columns
fgcode[['place_names', 'ent_start', 'ent_end', 'ent_label']] = fgcode['long_description'].apply(lambda x: pd.Series(extract_place_names(x)))
# Print the resulted DataFrame
fgcode

In [6]:
fgcode

Unnamed: 0,aiddata_id,aiddata_2_id,year,donor,donor_iso,donor_region,implementing_agency,financing_agency,crs_bi_multi,recipient,...,WB_GeoLocID,WB_GeoLocName,WB_Latitude,WB_Longitude,WB_Country,id,place_names,ent_start,ent_end,ent_label
0,57106484,20554281,2003,Japan,JP,Far East Asia,0,JICA,1.0,Madagascar,...,0,0,0,0,0,1,[],[],[],[]
1,57103097,10374723,1995,European Communities (EC),0,Europe,0,EDF,1.0,Madagascar,...,0,0,0,0,0,2,[],[],[],[]
2,57051787,37965681,2008,United States,US,North & Central America,"PUBLIC SECTOR (donor, recipient, other)",MCC,1.0,Madagascar,...,0,0,0,0,0,3,[],[],[],[]
3,57053923,29709047,2006,United States,US,North & Central America,NGO in Donor Country,AID,1.0,Madagascar,...,0,0,0,0,0,4,[],[],[],[]
4,57055442,17327719,2002,Germany,DE,Europe,0,Found,1.0,Madagascar,...,0,0,0,0,0,5,[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14268,0,0,2018,0,0,0,0,0,1.0,0,...,0,0,0,0,0,14269,[],[],[],[]
14269,0,0,2018,0,0,0,0,0,1.0,0,...,0,0,0,0,0,14270,[],[],[],[]
14270,0,0,2018,0,0,0,0,0,1.0,0,...,0,0,0,0,0,14271,[],[],[],[]
14271,0,0,2018,0,0,0,0,0,1.0,0,...,0,0,0,0,0,14272,[],[],[],[]


In [16]:
# Exploding formed lists of names

fgcode_expl = fgcode.explode(['place_names', 'ent_start', 'ent_end', 'ent_label'])

# For easing out the processing nuls where dropped out at this point
#fgcode_expl = fgcode_expl.dropna(subset=['place_names'])


In [20]:
# Consider filling nans only in columns to be used - replace 'column_name'
fgcode_expl = fgcode_expl['column_name'].fillna(0)
fgcode_expl

Unnamed: 0,aiddata_id,aiddata_2_id,year,donor,donor_iso,donor_region,implementing_agency,financing_agency,crs_bi_multi,recipient,...,WB_GeoLocID,WB_GeoLocName,WB_Latitude,WB_Longitude,WB_Country,id,place_names,ent_start,ent_end,ent_label
0,57106484,20554281,2003,Japan,JP,Far East Asia,0,JICA,1.0,Madagascar,...,0,0,0,0,0,1,0,0,0,0
1,57103097,10374723,1995,European Communities (EC),0,Europe,0,EDF,1.0,Madagascar,...,0,0,0,0,0,2,0,0,0,0
2,57051787,37965681,2008,United States,US,North & Central America,"PUBLIC SECTOR (donor, recipient, other)",MCC,1.0,Madagascar,...,0,0,0,0,0,3,0,0,0,0
3,57053923,29709047,2006,United States,US,North & Central America,NGO in Donor Country,AID,1.0,Madagascar,...,0,0,0,0,0,4,0,0,0,0
4,57055442,17327719,2002,Germany,DE,Europe,0,Found,1.0,Madagascar,...,0,0,0,0,0,5,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14268,0,0,2018,0,0,0,0,0,1.0,0,...,0,0,0,0,0,14269,0,0,0,0
14269,0,0,2018,0,0,0,0,0,1.0,0,...,0,0,0,0,0,14270,0,0,0,0
14270,0,0,2018,0,0,0,0,0,1.0,0,...,0,0,0,0,0,14271,0,0,0,0
14271,0,0,2018,0,0,0,0,0,1.0,0,...,0,0,0,0,0,14272,0,0,0,0


In [24]:
# Saving the progress - in json if there are list structures that has to be recognised or additionally to csv
fgcode_expl.to_json("spacy_explson_id_all_rows.json", orient = "records", indent = 4)

In [22]:
# Nominatim works in lists so I had to form locations back to lists

fgcode_expl['place_names'] = fgcode_expl['place_names'].str.split(', ')
fgcode_expl


Unnamed: 0,aiddata_id,aiddata_2_id,year,donor,donor_iso,donor_region,implementing_agency,financing_agency,crs_bi_multi,recipient,...,WB_GeoLocID,WB_GeoLocName,WB_Latitude,WB_Longitude,WB_Country,id,place_names,ent_start,ent_end,ent_label
0,57106484,20554281,2003,Japan,JP,Far East Asia,0,JICA,1.0,Madagascar,...,0,0,0,0,0,1,,0,0,0
1,57103097,10374723,1995,European Communities (EC),0,Europe,0,EDF,1.0,Madagascar,...,0,0,0,0,0,2,,0,0,0
2,57051787,37965681,2008,United States,US,North & Central America,"PUBLIC SECTOR (donor, recipient, other)",MCC,1.0,Madagascar,...,0,0,0,0,0,3,,0,0,0
3,57053923,29709047,2006,United States,US,North & Central America,NGO in Donor Country,AID,1.0,Madagascar,...,0,0,0,0,0,4,,0,0,0
4,57055442,17327719,2002,Germany,DE,Europe,0,Found,1.0,Madagascar,...,0,0,0,0,0,5,,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14268,0,0,2018,0,0,0,0,0,1.0,0,...,0,0,0,0,0,14269,,0,0,0
14269,0,0,2018,0,0,0,0,0,1.0,0,...,0,0,0,0,0,14270,,0,0,0
14270,0,0,2018,0,0,0,0,0,1.0,0,...,0,0,0,0,0,14271,,0,0,0
14271,0,0,2018,0,0,0,0,0,1.0,0,...,0,0,0,0,0,14272,,0,0,0


In [6]:
#fgcode.to_csv('data_n_dtctd_names_br_id_en.csv', index=False) 


In [23]:
fgcode_expl.to_csv('testi11.csv', index=False) 

At first the code above found places all over the world, so I decided to frame it little bit, so it would select only locations inside a buffer of 500 km from Madagascar center and if the Biodiversity relatedness is classified as '1', meaning that it is highly related. However, the definitions should be tweaked little bit more.

In [21]:
from geopy.distance import geodesic

# Initialize the geocoder
geolocator = Nominatim(user_agent="place_mapper")

# Madagascar center and buffer in kilometers
madagascar_center = (-20.0, 47.0)
buffer_distance_km = 900

# Function to check if coordinates are within the bounding area
def is_within_buffer(coords, center, buffer_km):
    """Determines whether the coordinates are within the buffer or not. 
    Returns false if not.
    """
    # Ensure valid coordinates
    if not coords:
        return False
    # Calculate distance from the center
    distance = geodesic(coords, center).km
    return distance <= buffer_km

# Function to geocode place names and filter by bounding area
def geocode_place(place_names):
    """Seeks for place names that are connected to the coordinates/locations in API.
    Then checks that the coordinates are filling the condition using
    the former definition.
    Returns an error message on failure
    """
    filtered_coordinates = []
    for place in place_names:
        try:
            location = geolocator.geocode(place, timeout=10) # Timeout added so the reties does not 
            if location:
                coords = (location.latitude, location.longitude)
                if is_within_buffer(coords, madagascar_center, buffer_distance_km):
                    filtered_coordinates.append(coords)
        except Exception as e:
            print(f"Error geocoding {place}: {e}")
    return filtered_coordinates



In [22]:
# Geocode the 'place_names' column and filter coordinates
fgcode_expl['coordinates_nom'] = fgcode_expl['place_names'].apply(geocode_place)

In [23]:
fgcode_expl

Unnamed: 0,aiddata_id,aiddata_2_id,year,donor,donor_iso,donor_region,implementing_agency,financing_agency,crs_bi_multi,recipient,...,WB_GeoLocName,WB_Latitude,WB_Longitude,WB_Country,id,place_names,ent_start,ent_end,ent_label,coordinates_nom
9,56986129,27739963,2006,France,FR,Europe,Public sector,AFD,1.0,Madagascar,...,0,0,0,0,10,[Madagascar],32,42,LOC,"[(-18.9249604, 46.4416422)]"
23,56955232,18646796,2002,European Communities (EC),0,Europe,0,CEC,1.0,Madagascar,...,0,0,0,0,24,[MADAGASCAR],27,37,GPE,"[(-18.9249604, 46.4416422)]"
28,56972700,28299995,2006,Norway,NO,Europe,Fredskorpset,MFA,1.0,Madagascar,...,0,0,0,0,29,[North and South],98,113,LOC,[]
28,56972700,28299995,2006,Norway,NO,Europe,Fredskorpset,MFA,1.0,Madagascar,...,0,0,0,0,29,[Madagascar YWCA],206,221,GPE,"[(-20.53211125, 47.2427255833445)]"
28,56972700,28299995,2006,Norway,NO,Europe,Fredskorpset,MFA,1.0,Madagascar,...,0,0,0,0,29,[Kenya YMCA],223,233,GPE,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14240,0,0,2018,0,0,0,0,0,1.0,0,...,0,0,0,0,14241,[TrodT],289,294,LOC,[]
14240,0,0,2018,0,0,0,0,0,1.0,0,...,0,0,0,0,14241,[pr(nTe],684,690,LOC,[]
14240,0,0,2018,0,0,0,0,0,1.0,0,...,0,0,0,0,14241,[la France],695,704,LOC,[]
14240,0,0,2018,0,0,0,0,0,1.0,0,...,0,0,0,0,14241,[Niger],923,928,LOC,[]


In [24]:
# Saving the progress, or the results in .json, so the lists are stored properly instead of csv format converting them into strings
# If no lists are used, then csv is suitable, too

fgcode_expl.to_json("nominatim_coords_explson_id.json", orient = "records", indent = 4)

In [25]:
fgcode_expl.to_csv("nominatim_coords_explson_id.csv", index=False)

In [18]:
# Here we select only rows with detected place names (and coordinates now that they have been found, too)

total_places_count = fgcode.copy()
total_places_count['coordinates_nom'] = fgcode['coordinates_nom'].astype('str')

total_places_count = total_places_count[total_places_count['coordinates_nom'] != '[]']
total_places_count = total_places_count.fillna('0')
#print(f"Total places detected: {total_places_count}")
total_places_count 

Unnamed: 0,aiddata_id,aiddata_2_id,year,donor,donor_iso,donor_region,implementing_agency,financing_agency,crs_bi_multi,recipient,...,2.Ecosystem_name,WB_location,WB_GeoLocID,WB_GeoLocName,WB_Latitude,WB_Longitude,WB_Country,id,place_names,coordinates_nom
9,56986129,27739963,2006,France,FR,Europe,Public sector,AFD,1.0,Madagascar,...,0,0,0,0,0,0,0,10,[Madagascar],"[(-18.9249604, 46.4416422)]"
28,56972700,28299995,2006,Norway,NO,Europe,Fredskorpset,MFA,1.0,Madagascar,...,0,0,0,0,0,0,0,29,"[North and South, Madagascar YWCA, Kenya YMCA,...","[(-20.53211125, 47.2427255833445)]"
43,56859894,31285615,2007,France,FR,Europe,Public sector,AFD,1.0,Madagascar,...,0,0,0,0,0,0,0,44,[Madagascar],"[(-18.9249604, 46.4416422)]"
54,56665431,38909239,2008,European Communities (EC),0,Europe,"PUBLIC SECTOR (donor, recipient, other)",EDF,1.0,Madagascar,...,0,0,0,0,0,0,0,55,"[Vatovavy Fitovinany, Côte Est, Madagascar]","[(-22.81679105, 47.83403105), (-18.9249604, 46..."
116,56054399,28299352,2006,Norway,NO,Europe,Norconsult,NORAD,1.0,Madagascar,...,0,0,0,0,0,0,0,117,[Madagascar],"[(-18.9249604, 46.4416422)]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14044,0,0,2018,0,0,0,0,0,1.0,0,...,0,0,0,0,0,0,0,14045,"[Sarodrano, Revik, Madagascar]","[(-24.070266, 46.0918266), (-18.9249604, 46.44..."
14048,0,0,2018,0,0,0,0,0,1.0,0,...,0,0,0,0,0,0,0,14049,[Madagascar],"[(-18.9249604, 46.4416422)]"
14051,0,0,2018,0,0,0,0,0,1.0,0,...,0,0,0,0,0,0,0,14052,"[Fianarantsoa, Madagascar]","[(-21.456444, 47.085149), (-18.9249604, 46.441..."
14076,0,0,2018,0,0,0,0,0,1.0,0,...,0,0,0,0,0,0,0,14077,[Madagascar],"[(-18.9249604, 46.4416422)]"


In [None]:
total_places_count['coordinates'] = total_places_count['coordinates'].astype('str')

coords = total_places_count[total_places_count['coordinates'] != '[]']
coords

In [None]:
# Create a Folium map with only MDG locations
import folium
madagascar_center = (-20.0, 47.0)

# Initialize Folium map centered on the first valid location
m = folium.Map(location = madagascar_center, zoom_start = 5.45)


# Add markers for each coordinate tuple 

for _, row in fgcode.iterrows():
    for coord in row['coordinates']:  # Each `coord` is a tuple (lon, lat)
        folium.Marker(
            location=[coord[0], coord[1]],
            popup=f"Places: {', '.join(row['place_names'])}"
        ).add_to(m)


folium.LayerControl().add_to(m)


#print("Filtered map has been saved as Aiddata_mdg_map_filtered.html")
m



In [None]:
# Save map to a HTML file
m.save("Aiddata_mdg_map_filtered_nominatim.html")

#### As visually demonstrated above, geocoded locations with SpaCy language recognition doesn't take us further than that. I decided to extend the research with additional data that I will provide to the language detector. In best scenario it would then find more locations from the dataset.
