# Geocoding locations of Madagascar from AidData set
### PART 2.5 - Pelias

In [1]:
#Importing modules
import pandas as pd
import geopandas as gpd
import folium
import json
import numpy as np

#import openpyxl
import requests  # To make Google Maps Geocoding API requests
#import spacy #NLP
#from spacy.language import Language
#from spacy_langdetect import LanguageDetector  # For language detection

#from geopy.geocoders import Nominatim # OSM geocoding
from geopy.geocoders import Pelias 
from geopy.distance import geodesic
#from langdetect import detect
#from langdetect.lang_detect_exception import LangDetectException
# from tokenizers import BertWordPieceTokenizer #Bert tokenizer would be reasonable to incude into the process
# pip install mordecai3 # Mordecai should be considered as well
#from spacy.tokens import Doc

## Finally, we have appropriate data to geocode!

I noticed that many place names are not detected - most of them are in Madagascar. I was wondering whether the NLP finds commonly used names with lower effort (with fewer iterations) and then 'prioritizes' these, or gives hits for these and cannot process the rest. -- After inspection, I noticed that I am using the small versions of both NLP models ("en_core_web_sm") instead of the medium or large ones ("en_core_web_md"/"en_core_web_lg").

Examples:
- aiddata_id: 94543904 > Mahajanga
- 906000839711 > Melaky
- 74583066 > Vavatenina

In [2]:
# All geocoded results are collected into one unified dataset,
# so start by loading the JSON export that this notebook builds on.
spacy_results_path = r'D:\UH_Madagascar\spacy_e_s_explson_id.json'
fgcode = pd.read_json(spacy_results_path, encoding='utf-8')

fgcode

Unnamed: 0,aiddata_id,aiddata_2_id,year,donor,donor_iso,donor_region,implementing_agency,financing_agency,crs_bi_multi,recipient,...,WB_GeoLocID,WB_GeoLocName,WB_Latitude,WB_Longitude,WB_Country,id,place_names,ent_start,ent_end,ent_label
0,56986129,27739963,2006,France,FR,Europe,Public sector,AFD,1,Madagascar,...,0,0,0,0,0,10,[Madagascar],32,42,LOC
1,56955232,18646796,2002,European Communities (EC),0,Europe,0,CEC,1,Madagascar,...,0,0,0,0,0,24,[MADAGASCAR],27,37,GPE
2,56972700,28299995,2006,Norway,NO,Europe,Fredskorpset,MFA,1,Madagascar,...,0,0,0,0,0,29,[North and South],98,113,LOC
3,56972700,28299995,2006,Norway,NO,Europe,Fredskorpset,MFA,1,Madagascar,...,0,0,0,0,0,29,[Madagascar YWCA],206,221,GPE
4,56972700,28299995,2006,Norway,NO,Europe,Fredskorpset,MFA,1,Madagascar,...,0,0,0,0,0,29,[Kenya YMCA],223,233,GPE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3916,0,0,2018,0,0,0,0,0,1,0,...,0,0,0,0,0,14241,[TrodT],289,294,LOC
3917,0,0,2018,0,0,0,0,0,1,0,...,0,0,0,0,0,14241,[pr(nTe],684,690,LOC
3918,0,0,2018,0,0,0,0,0,1,0,...,0,0,0,0,0,14241,[la France],695,704,LOC
3919,0,0,2018,0,0,0,0,0,1,0,...,0,0,0,0,0,14241,[Niger],923,928,LOC


In [5]:
#fgcode = fgcode.fillna('0')
#fgcode['place_names'] = fgcode['place_names'].astype('string')


In [21]:
# Exclude rows without a place name (missing value or the 0 placeholder),
# so the number of rows decreases.
# Only the column is filled for the comparison; `fgcode` itself has to stay a
# DataFrame because the following steps index it by column name.
has_place_name = fgcode['place_names'].fillna(0) != 0
# .copy() so the column assignment below writes to an independent frame
# instead of a slice of the unfiltered one.
fgcode = fgcode.loc[has_place_name].copy()

# Create lists: comma-separated strings are split into a list of names.
# Values that are already lists are kept as they are, because `.str.split`
# would replace every non-string value with NaN.
fgcode['place_names'] = fgcode['place_names'].apply(
    lambda names: names.split(', ') if isinstance(names, str) else names
)

fgcode

Unnamed: 0,aiddata_id,aiddata_2_id,year,donor,donor_iso,donor_region,implementing_agency,financing_agency,crs_bi_multi,recipient,...,2. Ecosystem_code,2.Ecosystem_name,WB_location,WB_GeoLocID,WB_GeoLocName,WB_Latitude,WB_Longitude,WB_Country,id,place_names
9,56986129,27739963,2006,France,FR,Europe,Public sector,AFD,1.0,Madagascar,...,0.0,0,0,0,0,0,0,0,10,[Madagascar]
23,56955232,18646796,2002,European Communities (EC),0,Europe,0,CEC,1.0,Madagascar,...,0.0,0,0,0,0,0,0,0,24,[MADAGASCAR]
28,56972700,28299995,2006,Norway,NO,Europe,Fredskorpset,MFA,1.0,Madagascar,...,0.0,0,0,0,0,0,0,0,29,[North and South]
29,56972700,28299995,2006,Norway,NO,Europe,Fredskorpset,MFA,1.0,Madagascar,...,0.0,0,0,0,0,0,0,0,29,[Madagascar YWCA]
30,56972700,28299995,2006,Norway,NO,Europe,Fredskorpset,MFA,1.0,Madagascar,...,0.0,0,0,0,0,0,0,0,29,[Kenya YMCA]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15817,0,0,2018,0,0,0,0,0,1.0,0,...,0.0,0,0,0,0,0,0,0,14241,[TrodT]
15818,0,0,2018,0,0,0,0,0,1.0,0,...,0.0,0,0,0,0,0,0,0,14241,[pr(nTe]
15819,0,0,2018,0,0,0,0,0,1.0,0,...,0.0,0,0,0,0,0,0,0,14241,[la France]
15820,0,0,2018,0,0,0,0,0,1.0,0,...,0.0,0,0,0,0,0,0,0,14241,[Niger]


In [3]:
# Testing that the domain is reachable and that the geocoder is running there correctly

import pprint

url = "http://vm5121.kaj.pouta.csc.fi:4000/v1/search"
params = {'text': 'Antemoro'}
# The timeout keeps the cell from waiting forever if the server does not answer.
response = requests.get(url, params=params, timeout=10)

if response.status_code == 200:
    try:
        response_json = response.json()
        pprint.pprint(response_json, indent=4, width=80)
    # An undecodable body is reported by response.json() with a ValueError
    # subclass, so catching ValueError does not depend on the requests version.
    except ValueError:
        print("Response not valid json.")
else:
    print(f"Request failed with status code {response.status_code}")

{   'features': [],
    'geocoding': {   'attribution': 'http://vm5121.kaj.pouta.csc.fi:4000/attribution',
                     'engine': {   'author': 'Mapzen',
                                   'name': 'Pelias',
                                   'version': '1.0'},
                     'query': {   'lang': {   'defaulted': True,
                                              'iso6391': 'en',
                                              'iso6393': 'eng',
                                              'name': 'English',
                                              'via': 'default'},
                                  'layers': [   'venue',
                                                'street',
                                                'country',
                                                'macroregion',
                                                'region',
                                                'county',
                                                'localad

In [28]:
# Calling Pelias - geocoder
# Madagascar center and buffer in kilometers
madagascar_center = (-20.0, 47.0)
buffer_distance_km = 900

# URL of the Pelias search endpoint that geocode_pelias() sends its requests to
geolocator = "http://vm5121.kaj.pouta.csc.fi:4000/v1/search"

def geocode_pelias(place_name, madagascar_center = (-20.0, 47.0), buffer_distance_km = 900):
    """Geocode a place name with the Pelias search endpoint in `geolocator`.

    Parameters
    ----------
    place_name : str, or list/tuple of str
        Text to search for. A list or tuple is joined with ", " into one
        query string so that the request carries a single `text` value.
        Anything else (None, NaN, empty text) yields an empty result without
        sending a request.
    madagascar_center : tuple of (latitude, longitude), optional
        Reference point for the distance filter.
    buffer_distance_km : number, optional
        Maximum geodesic distance in kilometers from `madagascar_center`.
        If either this or `madagascar_center` is falsy, no filtering is done
        and every returned feature is kept.

    Returns
    -------
    list of (latitude, longitude) tuples
        One tuple per returned feature that passes the distance filter.
        Empty when nothing matches or when the request/response fails; in the
        failure case the error is printed instead of raised, so one bad name
        does not abort a whole `.apply()` run.
    """
    # The 'place_names' column holds lists; collapse them to one query string.
    if isinstance(place_name, (list, tuple)):
        place_name = ", ".join(str(part) for part in place_name)
    # Nothing meaningful to search for: skip the network round trip.
    if not isinstance(place_name, str) or not place_name.strip():
        return []

    # Loop-invariant: the filter is active only when both values are set.
    apply_buffer = bool(madagascar_center and buffer_distance_km)
    params = {"text": place_name}
    try:
        response = requests.get(geolocator, params=params, timeout=10)
        # Report HTTP errors instead of silently treating them as "no results".
        response.raise_for_status()
        data = response.json()

        filtered_coordinates = []
        for feature in data.get("features") or []:
            # The response stores coordinates as [longitude, latitude].
            longitude = feature["geometry"]["coordinates"][0]
            latitude = feature["geometry"]["coordinates"][1]
            coords = (latitude, longitude)

            if not apply_buffer:
                filtered_coordinates.append(coords)
            elif geodesic(madagascar_center, coords).km <= buffer_distance_km:
                filtered_coordinates.append(coords)

        return filtered_coordinates
    except (requests.RequestException, ValueError, KeyError,
            IndexError, TypeError, AttributeError) as e:
        # Network/HTTP failures, invalid JSON, or an unexpected response layout.
        print(f"Error geocoding {place_name}: {e}")
        return []

In [29]:
# Run the geocoder on every entry of 'place_names' and store the returned
# coordinate lists in a new 'coordinates_pel' column.

place_name_column = fgcode['place_names']
fgcode['coordinates_pel'] = place_name_column.apply(geocode_pelias)

In [24]:
# Checking results: display the frame to inspect the geocoded rows

fgcode

Unnamed: 0,aiddata_id,aiddata_2_id,year,donor,donor_iso,donor_region,implementing_agency,financing_agency,crs_bi_multi,recipient,...,2.Ecosystem_name,WB_location,WB_GeoLocID,WB_GeoLocName,WB_Latitude,WB_Longitude,WB_Country,id,place_names,coordinates_pel
9,56986129,27739963,2006,France,FR,Europe,Public sector,AFD,1.0,Madagascar,...,0,0,0,0,0,0,0,10,[Madagascar],"[(-18.628414, 46.704055)]"
23,56955232,18646796,2002,European Communities (EC),0,Europe,0,CEC,1.0,Madagascar,...,0,0,0,0,0,0,0,24,[MADAGASCAR],"[(-18.628414, 46.704055)]"
28,56972700,28299995,2006,Norway,NO,Europe,Fredskorpset,MFA,1.0,Madagascar,...,0,0,0,0,0,0,0,29,[North and South],[]
29,56972700,28299995,2006,Norway,NO,Europe,Fredskorpset,MFA,1.0,Madagascar,...,0,0,0,0,0,0,0,29,[Madagascar YWCA],"[(-18.628414, 46.704055), (-18.146694, 49.3954..."
30,56972700,28299995,2006,Norway,NO,Europe,Fredskorpset,MFA,1.0,Madagascar,...,0,0,0,0,0,0,0,29,[Kenya YMCA],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15817,0,0,2018,0,0,0,0,0,1.0,0,...,0,0,0,0,0,0,0,14241,[TrodT],[]
15818,0,0,2018,0,0,0,0,0,1.0,0,...,0,0,0,0,0,0,0,14241,[pr(nTe],[]
15819,0,0,2018,0,0,0,0,0,1.0,0,...,0,0,0,0,0,0,0,14241,[la France],[]
15820,0,0,2018,0,0,0,0,0,1.0,0,...,0,0,0,0,0,0,0,14241,[Niger],[]


In [29]:
# Saving the data as CSV (relative path, index column not written)
# NOTE(review): list-valued columns end up as plain text in a CSV file - confirm that this is the intended storage format

fgcode.to_csv('data_n_dtctd_plcs_pelias_w_coords_1_3.csv', index=False)

In [None]:
import matplotlib

In [31]:
# Create a Folium map with only MDG locations
# (`folium` is already imported in the first cell of the notebook)


# Initialize the Folium map centered on Madagascar
m = folium.Map(location = madagascar_center, zoom_start = 5.45)


# Add one marker per coordinate tuple.
# The geocoding step stores its results in 'coordinates_pel'; this notebook
# never creates a 'coordinates' column, so reading that name raises a KeyError.
for place_names, coordinates in zip(fgcode['place_names'], fgcode['coordinates_pel']):
    for latitude, longitude in coordinates:  # Each entry is a (lat, lon) tuple
        folium.Marker(
            location=[latitude, longitude],
            popup=f"Places: {place_names}"  # TODO: popup is not working well, maybe use reverse geocoding?
        ).add_to(m)


folium.LayerControl().add_to(m)


#print("Filtered map has been saved as Aiddata_mdg_map_filtered.html")
m



In [32]:
# Saving the map `m` as an HTML file (relative path)
m.save("Aiddata_mdg_map_filtered_spacy_lrg_pelias_1_3.html")

In [None]:
# Inner-join the geocoded frame with the Nominatim results on the project id.
# Columns that exist in both frames keep their name for `fgcode` and get the
# '_x' suffix for `nominatim_results`.
# NOTE(review): `nominatim_results` is not defined in the visible part of this
# notebook - it has to be loaded before this cell can run.
# NOTE(review): the displayed frames show repeated 'aiddata_id' values, so this
# join can multiply rows - confirm that 'aiddata_id' alone is the intended key.
google_nominatim_merged = pd.merge(
    fgcode,
    nominatim_results,
    on=['aiddata_id'],
    suffixes=('', '_x'),
    how='inner',
)
