# Geoparsing


The goal of this notebook is to apply geoparsing to the corpus : 
- build a referential for continents and countries in French
- retrive LOC entites from a text
- implement functions to identify continents, countries and cities from a list of LOC entities
- structure the set of entities as follows : continents/countries/regions/cities /miscellaneous.

__Imports__

In [1]:
import geonamescache
import geopy
import pandas as pd

import spacy
try: 
    print("fr_core_news_sm loaded")
    nlp = spacy.load("fr_core_news_sm") # load pre-trained models for French
except:
    print("fr loaded")
    nlp=spacy.load('fr') # fr calls fr_core_news_sm 
    


fr_core_news_sm loaded


In [2]:
# data
news_df=pd.read_csv("labeled_articles_clean.csv")

news_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,year,title,text,url,geo,topic,clean_text,clean_title,pre_title,pre_text,geo_code,topic_code
0,0,0,1988,Tintin dans l'espace,Trois semaines à bord de la station soviétique...,https://www.lexpress.fr/informations/tintin-da...,fr,sc,semain bord station soviet jean-loup chrétien ...,tintin espac,tintin espac,semain bord station soviet jean-loup chrétien ...,3,5
1,1,1,1988,Le faux suicide de Robert Boulin,1979 : son corps est découvert en forêt de Ram...,https://www.lexpress.fr/actualite/politique/le...,fr,ju,corp découvert forêt rambouillet fauss pist né...,faux suicid robert boulin,faux suicid robert boulin,corp découvert forêt rambouillet fauss pist né...,3,2
2,2,2,1988,Des pierres contre les certitudes,"Rideaux de fer baissés, silhouettes furtives, ...",https://www.lexpress.fr/actualite/monde/proche...,me,po,rideau baiss silhouet furtiv jérusalem arab to...,pierr contr certitud,pierr contr certitud,rideau baiss silhouet furtiv jérusalem arab to...,5,4
3,3,3,1988,"Otages: soudain, mercredi soir...",""" Je lui ai dit: ""Ça suffit"", et j'ai raccroch...",https://www.lexpress.fr/informations/otages-so...,me,ju,suff raccroch trop souvent échaud trop souvent...,otag soudain mercred soir,otag soudain mercred soir,suff raccroch trop souvent échaud trop souvent...,5,2
4,4,4,1988,Les secrets de la planète rouge,"S'il existe, dans le système solaire, un seul ...",https://www.lexpress.fr/actualite/sciences/les...,spa,sc,exist system solair seul endroit exobiolog dés...,secret planet roug,secret planet roug,exist system solair seul endroit exobiolog dés...,6,5


In [3]:
# data from opendata.gouv

country_df = pd.read_csv("./liste_197_etats_2020.csv", encoding = "ISO-8859-1", delimiter=';')
country_df.head(10)

Unnamed: 0,NOM,NOM_ALPHA,CODE,ARTICLE,NOM_LONG,CAPITALE
0,Afghanistan,Afghanistan,AFG,l',République islamique d'Afghanistan,Kaboul
1,Afrique du Sud,Afrique du Sud,ZAF,l',République d'Afrique du Sud,Prétoria
2,Albanie,Albanie,ALB,l',République d'Albanie,Tirana
3,Algérie,Algérie,DZA,l',République algérienne démocratique et populaire,Alger
4,Allemagne,Allemagne,DEU,l',République fédérale d'Allemagne,Berlin
5,Andorre,Andorre,AND,l',Principauté d'Andorre,Andorre-la-Vieille
6,Angola,Angola,AGO,l',République d'Angola,Luanda
7,Antigua-et-Barbuda,Antigua-et-Barbuda,ATG,,Antigua-et-Barbuda,Saint John's
8,Arabie saoudite,Arabie saoudite,SAU,l',Royaume d'Arabie saoudite,Riyad
9,Argentine,Argentine,ARG,l',République argentine,Buenos Aires


## 1. Build tools

### 1.1 Build Continent and Country lists in French

using geonamescache

In [4]:
gc = geonamescache.GeonamesCache()

In [5]:
# Basic examples for getting started

continents = gc.get_continents()
print(continents.keys())
print(continents['EU'])

countries = gc.get_countries()
print(countries.keys())
print(countries['GB'])


dict_keys(['AF', 'AS', 'EU', 'NA', 'OC', 'SA', 'AN'])
{'lng': '9.14062', 'geonameId': 6255148, 'timezone': {'gmtOffset': 1, 'timeZoneId': 'Europe/Vaduz', 'dstOffset': 2}, 'bbox': {'east': 41.73303985595703, 'south': 27.6377894797159, 'north': 80.76416015625, 'west': -24.532675386662543, 'accuracyLevel': 0}, 'toponymName': 'Europe', 'asciiName': 'Europe', 'astergdem': 439, 'fcl': 'L', 'population': 741000000, 'wikipediaURL': 'en.wikipedia.org/wiki/Europe', 'adminName5': '', 'srtm3': 443, 'adminName4': '', 'adminName3': '', 'alternateNames': [{'isPreferredName': True, 'name': '유럽', 'lang': 'ko'}, {'name': 'ยุโรป', 'lang': 'th'}, {'name': 'ヨーロッパ', 'lang': 'ja'}, {'name': 'an Eoraip', 'lang': 'ga'}, {'name': 'Avrupa', 'lang': 'tr'}, {'name': 'Châu Âu', 'lang': 'vi'}, {'name': 'Eiropa', 'lang': 'lv'}, {'isColloquial': True, 'name': 'El viejo continente', 'lang': 'es'}, {'name': 'Eropa', 'lang': 'id'}, {'name': 'Eurohpá', 'lang': 'se'}, {'isPreferredName': True, 'name': 'Euroopa', 'lang': 'e

In [64]:
# Build dictionnary
# keys = continent names in French
# values = dictionnaries with keys 'continantCode', 'country_codes', 'country_code_iso3', 'country_names'

my_continents_fr = dict() # continents.keys()
my_continent_codes = dict() # basic continent_code : french name

for continent_code in continents.keys():
    for dico in continents[continent_code]['alternateNames']:
        # get the french name ?
        if dico['lang']=='fr':
            name_fr = dico['name']
            cont_dico = {'continentCode':continent_code}
            my_continents_fr[name_fr]=cont_dico
            my_continent_codes[continent_code]=name_fr
            if name_fr!='Antarctique':
                my_continents_fr[name_fr]['country_codes']=continents[continent_code]['cc2'].split(',')
            else:
                my_continents_fr[name_fr]['country_codes']=[]
                            
for key in my_continents_fr.keys():
    my_continents_fr[key]['country_code_iso3']=[]
    my_continents_fr[key]['country_names']=[]
    set_noms_fr = set()
    for country_code in my_continents_fr[key]['country_codes']:
        iso3 = countries[country_code]['iso3']
        my_continents_fr[key]['country_code_iso3'].append(iso3)
        
        try:            
            set_noms_fr.add(country_df[country_df['CODE']==iso3]['NOM'].tolist()[0])
            set_noms_fr.add(country_df[country_df['CODE']==iso3]['NOM_ALPHA'].tolist()[0])
            set_noms_fr.add(country_df[country_df['CODE']==iso3]['NOM_LONG'].tolist()[0])            
        except:
            print(countries[country_code])
        set_noms_fr.add(countries[country_code]['name'])
            
    my_continents_fr[key]['country_names']=list(set_noms_fr)

my_continents_fr

{'geonameid': 935317, 'name': 'Reunion', 'iso': 'RE', 'iso3': 'REU', 'isonumeric': 638, 'fips': 'RE', 'continentcode': 'AF', 'capital': 'Saint-Denis', 'areakm2': 2517, 'population': 776948, 'tld': '.re', 'currencycode': 'EUR', 'currencyname': 'Euro', 'phone': '262', 'postalcoderegex': '^((97|98)(4|7|8)\\d{2})$', 'languages': 'fr-RE', 'neighbours': ''}
{'geonameid': 3370751, 'name': 'Saint Helena', 'iso': 'SH', 'iso3': 'SHN', 'isonumeric': 654, 'fips': 'SH', 'continentcode': 'AF', 'capital': 'Jamestown', 'areakm2': 410, 'population': 7460, 'tld': '.sh', 'currencycode': 'SHP', 'currencyname': 'Pound', 'phone': '290', 'postalcoderegex': '^(STHL1ZZ)$', 'languages': 'en-SH', 'neighbours': ''}
{'geonameid': 7909807, 'name': 'South Sudan', 'iso': 'SS', 'iso3': 'SSD', 'isonumeric': 728, 'fips': 'OD', 'continentcode': 'AF', 'capital': 'Juba', 'areakm2': 644329, 'population': 8260490, 'tld': '', 'currencycode': 'SSP', 'currencyname': 'Pound', 'phone': '211', 'postalcoderegex': '', 'languages': '

{'Afrique': {'continentCode': 'AF',
  'country_codes': ['AO',
   'BF',
   'BI',
   'BJ',
   'BW',
   'CD',
   'CF',
   'CG',
   'CI',
   'CM',
   'CV',
   'DJ',
   'DZ',
   'EG',
   'ER',
   'ET',
   'GA',
   'GH',
   'GM',
   'GN',
   'GQ',
   'GW',
   'KE',
   'KM',
   'LR',
   'LS',
   'LY',
   'MA',
   'MG',
   'ML',
   'MR',
   'MU',
   'MW',
   'MZ',
   'NA',
   'NE',
   'NG',
   'RE',
   'RW',
   'SC',
   'SD',
   'SH',
   'SL',
   'SN',
   'SO',
   'SS',
   'ST',
   'SZ',
   'TD',
   'TG',
   'TN',
   'TZ',
   'UG',
   'YT',
   'ZA',
   'ZM',
   'ZW'],
  'country_code_iso3': ['AGO',
   'BFA',
   'BDI',
   'BEN',
   'BWA',
   'COD',
   'CAF',
   'COG',
   'CIV',
   'CMR',
   'CPV',
   'DJI',
   'DZA',
   'EGY',
   'ERI',
   'ETH',
   'GAB',
   'GHA',
   'GMB',
   'GIN',
   'GNQ',
   'GNB',
   'KEN',
   'COM',
   'LBR',
   'LSO',
   'LBY',
   'MAR',
   'MDG',
   'MLI',
   'MRT',
   'MUS',
   'MWI',
   'MOZ',
   'NAM',
   'NER',
   'NGA',
   'REU',
   'RWA',
   'SYC',
   'SDN',
  

In [16]:
# Final lists for continents and countries

list_countries_fr = [] # contains country names in French and in English

for key in my_continents_fr.keys():
    list_countries_fr+=[string.lower() for string in my_continents_fr[key]['country_names']]

print(list_countries_fr)

list_continents_fr = [key.lower() for key in my_continents_fr.keys()] # continent names in French
print(list_continents_fr)

print(my_continent_codes) # basic dic : code : name

['benin', 'algérie', 'république togolaise', 'burkina', 'république gabonaise', 'kenya', 'zambia', 'angola', 'guinée-bissao', 'lésotho', 'mauritania', 'république de sierra leone', 'rwanda', 'république du bénin', 'equatorial guinea', 'eswatini', 'royaume du lesotho', "république arabe d'égypte", 'république du mozambique', 'république du soudan', 'république de zambie', 'république de maurice', 'érythrée', 'zambie', 'sudan', 'sao tome and principe', 'république fédérale de somalie', 'egypt', 'gambia', 'djibouti', 'guinée équatoriale', 'maurice', 'zimbabwe', 'éthiopie', 'état de libye', 'république des seychelles', 'mali', 'sénégal', 'république du botswana', 'république du zimbabwé', 'south africa', 'tunisie', 'burkina faso', 'reunion', 'kénya', 'cameroon', 'mauritanie', 'république du kénya', 'république du rwanda', 'maroc', 'république islamique de mauritanie', 'république du ghana', 'république du libéria', 'gambie', 'comoros', 'lesotho', 'union des comores', "république de côte d'

In [128]:
# utility function : from a list of entities, look for continent names and country names
# store them in a dictionnary

def continent_info(input_list, ref = ['afrique', 'asie', 'europe', 'amérique du nord', 'océanie', 'amérique du sud', 'antarctique', 'amérique']):
    geo_dic={key:[] for key in ['cont', 'country', 'country_code', 'state', 'city', 'misc']}
    for string in set(input_list):
        if string in ref:
            geo_dic['cont'].append(string)
    return geo_dic

def country_info(input_list, ref = list_countries_fr):
    geo_dic={key:[] for key in ['cont', 'country', 'country_code', 'state', 'city', 'misc']}
    for string in set(input_list):
        if string in ref:
            geo_dic['cont'].append(string)
    return geo_dic


print(continent_info(['israël', 'genève', 'new york', 'abou ammar', 'alger', 'israël', 'aviv', 'koweït', 'palestine', 'terre', 'jordanie', 'liban', 'beyrouth', 'syrie', 'israël', 'liban', 'damas', 'jérusalem', 'syrie', 'beyrouth', 'tunis', 'egypte', 'israël', 'syrie', 'damas', 'tripoli', 'iran', 'jordanie', 'egypte', 'palestine', 'israël', 'genève', 'israël', 'rusé', 'habile', 'new york', 'kenya', 'amérique du sud', 'genève', 'israël', 'new york', 'jérusalem', 'diplomatique', 'washington']))
print(continent_info(['afrique', 'asie', 'afrique']))
print(country_info(['israël', 'genève', 'new york', 'abou ammar', 'alger', 'israël', 'aviv', 'koweït', 'palestine', 'terre', 'jordanie', 'liban', 'beyrouth', 'syrie', 'israël', 'liban', 'damas', 'jérusalem', 'syrie', 'beyrouth', 'tunis', 'egypte', 'israël', 'syrie', 'damas', 'tripoli', 'iran', 'jordanie', 'egypte', 'palestine', 'israël', 'genève', 'israël', 'rusé', 'habile', 'new york', 'kenya', 'amérique du sud', 'genève', 'israël', 'new york', 'jérusalem', 'diplomatique', 'washington']))
print(country_info(['afrique', 'asie', 'afrique']))

my_ref_zones = ['afrique', 'asie', 'europe', 'amérique du nord', 'amérique', 'océanie', 'amérique du sud', 'amérique latine', 'antarctique', 'moyen-orient']

def continent_only(input_list, ref = my_ref_zones):
    cont_list=set()
    for string in set(input_list):
        if string in ref:
            cont_list.add(string)
    return cont_list

def country_only(input_list, ref = list_countries_fr):
    country_list=set()
    for string in set(input_list):
        if string in ref:
            country_list.add(string)
    return country_list

print(continent_only(['israël', 'genève', 'new york', 'abou ammar', 'alger', 'israël', 'aviv', 'koweït', 'palestine', 'terre', 'jordanie', 'liban', 'beyrouth', 'syrie', 'israël', 'liban', 'damas', 'jérusalem', 'syrie', 'beyrouth', 'tunis', 'egypte', 'israël', 'syrie', 'damas', 'tripoli', 'iran', 'jordanie', 'egypte', 'palestine', 'israël', 'genève', 'israël', 'rusé', 'habile', 'new york', 'kenya', 'amérique du sud', 'genève', 'israël', 'new york', 'jérusalem', 'diplomatique', 'washington']))
print(continent_only(['afrique', 'asie', 'afrique', 'amérique latine']))
print(country_only(['israël', 'genève', 'new york', 'abou ammar', 'alger', 'israël', 'aviv', 'koweït', 'palestine', 'terre', 'jordanie', 'liban', 'beyrouth', 'syrie', 'israël', 'liban', 'damas', 'jérusalem', 'syrie', 'beyrouth', 'tunis', 'egypte', 'israël', 'syrie', 'damas', 'tripoli', 'iran', 'jordanie', 'egypte', 'palestine', 'israël', 'genève', 'israël', 'rusé', 'habile', 'new york', 'kenya', 'amérique du sud', 'genève', 'israël', 'new york', 'jérusalem', 'diplomatique', 'washington']))
print(country_only(['afrique', 'asie', 'afrique', 'amérique latine']))


{'cont': ['amérique du sud'], 'country': [], 'country_code': [], 'state': [], 'city': [], 'misc': []}
{'cont': ['afrique', 'asie'], 'country': [], 'country_code': [], 'state': [], 'city': [], 'misc': []}
{'cont': ['koweït', 'israël', 'syrie', 'jordanie', 'iran', 'kenya', 'liban'], 'country': [], 'country_code': [], 'state': [], 'city': [], 'misc': []}
{'cont': [], 'country': [], 'country_code': [], 'state': [], 'city': [], 'misc': []}
{'amérique du sud'}
{'amérique latine', 'afrique', 'asie'}
{'koweït', 'israël', 'jordanie', 'iran', 'kenya', 'syrie', 'liban'}
set()


### 1.2 Write utility functions to retrieve info about cities

using geopy

In [77]:
from geonamescache.mappers import country

iso_to_cont_mapper = country(from_key='iso', to_key='continentcode')

# test
print(my_continent_codes[iso_to_cont_mapper('DE')]) 

print(countries['DE'])

Europe
{'geonameid': 2921044, 'name': 'Germany', 'iso': 'DE', 'iso3': 'DEU', 'isonumeric': 276, 'fips': 'GM', 'continentcode': 'EU', 'capital': 'Berlin', 'areakm2': 357021, 'population': 81802257, 'tld': '.de', 'currencycode': 'EUR', 'currencyname': 'Euro', 'phone': '49', 'postalcoderegex': '^(\\d{5})$', 'languages': 'de', 'neighbours': 'CH,PL,NL,DK,BE,CZ,LU,FR,AT'}


In [144]:
def city_info(city_list):
    from geopy.geocoders import Nominatim
    from geopy.exc import GeocoderTimedOut    
    geopy.geocoders.options.default_user_agent = "my-application2"
    geolocator = Nominatim(timeout=2)

    loc_list=[]
    loc_dic = {key:[] for key in ['cont', 'country', 'country_code', 'state', 'city', 'misc']}

    for city in city_list : 
        print('\n', city)
        try:
            location = geolocator.geocode(city, addressdetails=True, language="fr")
            if location:
                print(location.latitude, location.longitude)
                loc_list.append(location)
                
                address = location.raw['address'] 
                print(address)
                
                if len(set(['shop', 'amenity', 'building', 'neighbourhood', 'leisure', 'hamlet', 'locality', 'isolated_dwelling'])&address.keys())>0:
                    pass     # discard the entity because of high probability of mistake           
                    
                elif 'city' in address.keys(): # city, or road in a city
                    loc_dic['misc'].append(address.get('tourism', '').lower() )
                    loc_dic['city'].append(address.get('city', '').lower() )
                    loc_dic['state'].append(address.get('state', '') .lower() )
                    loc_dic['country'].append(address.get('country', '').lower()  )
                    cc = address.get('country_code', '').upper()
                    loc_dic['country_code'].append(cc)
                    loc_dic['cont'].append(my_continent_codes[iso_to_cont_mapper(cc)].lower() )
                elif 'town' in address.keys(): # town, or road in a town 
                    loc_dic['misc'].append(address.get('tourism', '').lower() )
                    loc_dic['city'].append(address.get('town', '').lower()  )                                   
                    loc_dic['state'].append(address.get('state', '') .lower() )
                    loc_dic['country'].append(address.get('country', '').lower()  )
                    cc = address.get('country_code', '').upper()
                    loc_dic['country_code'].append(cc)
                    loc_dic['cont'].append(my_continent_codes[iso_to_cont_mapper(cc)].lower() )
                elif 'road' in address.keys():
                    loc_dic['misc'].append(address.get('road', '').lower() )
                elif 'place' in address.keys():
                    loc_dic['state'].append(address.get('place', '').lower())
                    loc_dic['state'].append(address.get('state', '') .lower() )
                    loc_dic['country'].append(address.get('country', '').lower()  )
                    cc = address.get('country_code', '').upper()
                    loc_dic['country_code'].append(cc)
                    loc_dic['cont'].append(my_continent_codes[iso_to_cont_mapper(cc)].lower() )                    
                elif 'region' in address.keys():
                    loc_dic['state'].append(address.get('region', '').lower())
                    loc_dic['state'].append(address.get('state', '') .lower() )
                    loc_dic['country'].append(address.get('country', '').lower()  )
                    cc = address.get('country_code', '').upper()
                    loc_dic['country_code'].append(cc)
                    loc_dic['cont'].append(my_continent_codes[iso_to_cont_mapper(cc)].lower() )                    
                elif 'village' in address.keys():
                    loc_dic['misc'].append(address.get('tourism', '').lower() )
                    loc_dic['misc'].append(address.get('village', '').lower() )
                    loc_dic['city'].append(address.get('municipality', '').lower()  )
                    loc_dic['state'].append(address.get('state', '') .lower() )
                    loc_dic['country'].append(address.get('country', '').lower()  )
                    cc = address.get('country_code', '').upper()
                    loc_dic['country_code'].append(cc)
                    loc_dic['cont'].append(my_continent_codes[iso_to_cont_mapper(cc)].lower() )                    
                elif 'waterway' in address.keys():
                    loc_dic['misc'].append(address.get('waterway', '').lower() )
                    loc_dic['country'].append(address.get('country', '').lower()  )
                    cc = address.get('country_code', '').upper()
                    loc_dic['country_code'].append(cc)
                    loc_dic['cont'].append(my_continent_codes[iso_to_cont_mapper(cc)].lower() )                                    
                else : # name of a state already, or a "boundary", or a country
                    loc_dic['state'].append(address.get('state', '') .lower() )
                    loc_dic['state'].append(address.get('boundary', '') .lower() ) # region-like zone
                    loc_dic['country'].append(address.get('country', '').lower()  )
                    cc = address.get('country_code', '').upper()
                    loc_dic['country_code'].append(cc)
                    loc_dic['cont'].append(my_continent_codes[iso_to_cont_mapper(cc)].lower() )
                   
        except GeocoderTimedOut as e:
            print("Error: geocode failed on input %s with message %s" %(city, e))
            loc_dic['misc'].append(city.lower())

    loc_dic = {key : set([string for string in loc_dic[key] if len(string)>0]) for key in loc_dic.keys()}
    print('\n---- List of Locations ---')
    print(loc_list)
    print('\n---- Dictionnary ---')
    print(loc_dic)
    return loc_dic

city_list=['Verrières-le-Buisson', "Munich", "Palais de l'Elysée", "Reichstag", 'france', 'la seine', 'cisjordanie', 'bretagne', 'chabot', 'bourgogne', 'boulevard maillot', 
           'trac', 'valium', 'nezvran', 'franc-maçon', 'pr lebreton', 'moyen-orient', 
           'gaza', 'sympathie', 'schultz', 'la terre', 'genou']

test = city_info(city_list)
test


 Verrières-le-Buisson
48.7467819 2.2653844
{'town': 'Verrières-le-Buisson', 'municipality': 'Palaiseau', 'county': 'Essonne', 'state': 'Île-de-France', 'country': 'France', 'postcode': '91370', 'country_code': 'fr'}

 Munich
48.1371079 11.5753822
{'city': 'Munich', 'state': 'Bavière', 'country': 'Allemagne', 'country_code': 'de'}

 Palais de l'Elysée
48.87037435 2.316068734550804
{'tourism': "Palais de l'Élysée", 'road': 'Avenue de Marigny', 'neighbourhood': 'Quartier de la Madeleine', 'suburb': 'Paris 8e Arrondissement', 'city': 'Paris', 'municipality': 'Paris', 'county': 'Paris', 'state': 'Île-de-France', 'country': 'France', 'postcode': '75008', 'country_code': 'fr'}

 Reichstag
52.4676201 13.5280284
{'tourism': 'Reichstag', 'house_number': '81', 'road': 'An der Wuhlheide', 'suburb': 'Oberschöneweide', 'borough': 'Treptow-Köpenick', 'city': 'Berlin', 'district': 'Rixdorf', 'state': 'Berlin', 'postcode': '12459', 'country': 'Allemagne', 'country_code': 'de'}

 france
46.603354 1.888

{'cont': {'asie', 'europe'},
 'country': {'allemagne', 'france', 'palestinian territory'},
 'country_code': {'DE', 'FR', 'PS'},
 'state': {'bande de gaza',
  'bavière',
  'berlin',
  'bourgogne',
  'bretagne',
  'cisjordanie',
  'judea et samaria',
  'île-de-france'},
 'city': {'berlin', 'munich', 'neuilly-sur-seine', 'verrières-le-buisson'},
 'misc': {'la seine', 'reichstag'}}

In [145]:
# summary of remaining issues 

city_info(["zorglub", "amérique latine", "mars", "soleil"])


 zorglub

 amérique latine

 mars
45.022172 4.3221548
{'village': 'Mars', 'municipality': 'Tournon-sur-Rhône', 'county': 'Ardèche', 'state': 'Auvergne-Rhône-Alpes', 'country': 'France', 'postcode': '07320', 'country_code': 'fr'}

 soleil
46.1178584 5.0808015
{'hamlet': 'Le Soleil', 'village': 'Saint-André-le-Bouchoux', 'municipality': 'Bourg-en-Bresse', 'county': 'Ain', 'state': 'Auvergne-Rhône-Alpes', 'country': 'France', 'postcode': '01240', 'country_code': 'fr'}

---- List of Locations ---
[Location(Mars, Tournon-sur-Rhône, Ardèche, Auvergne-Rhône-Alpes, France métropolitaine, 07320, France, (45.022172, 4.3221548, 0.0)), Location(Le Soleil, Saint-André-le-Bouchoux, Bourg-en-Bresse, Ain, Auvergne-Rhône-Alpes, France métropolitaine, 01240, France, (46.1178584, 5.0808015, 0.0))]

---- Dictionnary ---
{'cont': {'europe'}, 'country': {'france'}, 'country_code': {'FR'}, 'state': {'auvergne-rhône-alpes'}, 'city': {'tournon-sur-rhône'}, 'misc': {'mars'}}


{'cont': {'europe'},
 'country': {'france'},
 'country_code': {'FR'},
 'state': {'auvergne-rhône-alpes'},
 'city': {'tournon-sur-rhône'},
 'misc': {'mars'}}

__Full pipeline__

For getting structured info about geographic locations from a list of extracted entities

1. continents
2. countries
3. cities on what remains --> higher level fields are added too
4. fusion of sets

In [146]:
def geo_info(entity_list, ref_cont = my_ref_zones, ref_countries = list_countries_fr): # to add : , **kwargs
    cont_set = continent_only(entity_list, ref=ref_cont)
    country_set = country_only(entity_list, ref=ref_countries)
    city_list = list(set(entity_list)-(cont_set|country_set))
    geo_dico = city_info(city_list)
    geo_dico['cont']=geo_dico['cont']|cont_set
    geo_dico['country']=geo_dico['country']|country_set
    return geo_dico

# test

test = ['amérique latine', 'france', 'paris', 'reichstag', 'mars']

geo_info(test)

{'amérique latine'}

 paris
48.8566969 2.3514616
{'city': 'Paris', 'municipality': 'Paris', 'county': 'Paris', 'state': 'Île-de-France', 'country': 'France', 'country_code': 'fr'}

 reichstag
52.4676201 13.5280284
{'tourism': 'Reichstag', 'house_number': '81', 'road': 'An der Wuhlheide', 'suburb': 'Oberschöneweide', 'borough': 'Treptow-Köpenick', 'city': 'Berlin', 'district': 'Rixdorf', 'state': 'Berlin', 'postcode': '12459', 'country': 'Allemagne', 'country_code': 'de'}

 mars
45.022172 4.3221548
{'village': 'Mars', 'municipality': 'Tournon-sur-Rhône', 'county': 'Ardèche', 'state': 'Auvergne-Rhône-Alpes', 'country': 'France', 'postcode': '07320', 'country_code': 'fr'}

---- List of Locations ---
[Location(Paris, Île-de-France, France métropolitaine, France, (48.8566969, 2.3514616, 0.0)), Location(Reichstag, 81, An der Wuhlheide, Oberschöneweide, Treptow-Köpenick, Berlin, Rixdorf, Berlin, 12459, Allemagne, (52.4676201, 13.5280284, 0.0)), Location(Mars, Tournon-sur-Rhône, Ardèche, Auver

{'cont': {'amérique latine', 'europe'},
 'country': {'allemagne', 'france'},
 'country_code': {'DE', 'FR'},
 'state': {'auvergne-rhône-alpes', 'berlin', 'île-de-france'},
 'city': {'berlin', 'paris', 'tournon-sur-rhône'},
 'misc': {'mars', 'reichstag'}}

### 1.3 Entity Extraction

Using spacy 

In [86]:
# as a function

def entity_extractor(text_series, ent_type='LOC'):
    """
    Input:
    ------
    text_series : pandas series containing strings to process
    ent_type : type of entity to extract, LOC by default
    
    Output:
    ------
    ent_list : list of list of extracted entities, same length as the input text_series
    """
    
    nlp = spacy.load("fr") # reload
    # Create pipe containing all titles
    bodies=list(nlp.pipe(text_series.apply(lambda x: x.lower()), disable=["tagger", "parser"]) ) 
    
    ent_list = []
    for doc in bodies: 
        ent_list.append([ent.text for ent in doc.ents if ent.label_ == ent_type])

    return ent_list

# test
LOC_list = entity_extractor(news_df.text)
print(LOC_list[:10])

[['bourgogne', 'moskova', 'bordeaux', 'chabot', 'de france', 'houston', 'bretagne'], ['forêt de rambouillet', 'république de versailles', 'forêt de rambouillet', 'thiais', 'pr lebreton', 'moelle', 'autopsie', 'versailles', 'perray-en-yvelines', 'yvelines', 'versailles', 'valium', 'contre-autopsie', 'pr lebreton', 'matignon', 'libourne', 'paris', 'hôtel lutétia', 'matignon', 'rue de rivoli', 'etat', 'paris', 'matignon', 'caen', 'gagne', 'de france', 'franc-maçon', 'boulevard maillot', 'montréal', 'royan', 'moyen-orient', 'trac', 'neuilly', 'barsac'], ['chaldean street', 'reagan', 'shultz', 'cisjordanie', 'gaza', 'israël', 'israël', 'amérique', 'israël', 'israël', 'réfléchir', 'israël', 'gaza', 'etat', 'samarie', 'israël', 'israël', 'jérusalem', '»', 'israël', 'etat', 'sympathie', 'israël', 'jérusalem', 'israël'], ['beyrouth', 'jean-paul', 'liban', 'paris', 'val-de-grâce', 'liban', 'bastia', 'téhéran', 'libanais', "côte-d'ivoire", 'boulos ancache', 'bienveillante', 'syrie', 'damas', 'por

### 1.4 Comparison of Geo Dicts

Idea : 
- ensemble comparison using Jaccard similarity
- weighted sum, according to the concept's position in the geographical hierarchy

In practice:
- Dice similarity
- Jaccard similarity
- coeff de recouvrement --> not suitable because gives top similarity to a singleton containing any element from the other set!

Operations on sets : | for union, & for intersection, – for difference, ^ for symmetric difference.

In [91]:
def jaccard_sim(set1, set2):
    return len(set1&set2)/len(set1|set2)

def dice_sim(set1, set2):
    return 2*len(set1&set2)/(len(set1)+len(set2))

def recov_coeff(set1, set2):
    return len(set1&set2)/min([len(set1), len(set2)])

# test

set1 = {'Bade-Wurtemberg','Île-de-France', 'Bretagne', 'Guerrero',}

set2 = {'Bade-Wurtemberg','Île-de-France', 'Bavière', 'Berlin', 'Aquitaine', }

set3 = {'Bade-Wurtemberg','Île-de-France', 'Bavière'}

print(jaccard_sim(set1, set2))
print(jaccard_sim(set2, set3))
print(jaccard_sim(set1, set3))

print(dice_sim(set1, set2))
print(dice_sim(set2, set3))
print(dice_sim(set1, set3))

print(recov_coeff(set1, set2))
print(recov_coeff(set2, set3))
print(recov_coeff(set1, set3))

0.2857142857142857
0.6
0.4
0.4444444444444444
0.75
0.5714285714285714
0.5
1.0
0.6666666666666666


In [101]:
def geo_sim(dico1, dico2, weights = {'cont':1, 'country':2, 'state':3, 'city':6, 'misc':1}, similarity=dice_sim):
    """
    Input:
    ------
    dico1 : dict with keys 'cont', 'country','country_code', 'state', 'city', 'misc' 
    dico2 : idem
    similarity : str, 'jaccard or 'dice'
    
    Output:
    ------
    sim : float, similarity score for geographic locations contained in the dictionnaries
    """
            
    dico_sim = dict()
    for key in ['cont', 'country', 'state', 'city', 'misc']:
        dico_sim[key]=similarity(dico1[key], dico2[key])
    
    w_sum = sum(list(weights.values()))
    weights = {key : weights[key]/w_sum for key in weights.keys()}
    
    sim=sum([dico_sim[key]*weights[key] for key in dico_sim.keys()])
    
    return sim
            



1.0
0.82603550295858
0.3710622710622711


In [None]:
# test 

dico1 = {'cont': {'Amérique du Nord', 'Europe'},
 'country': {'Allemagne', 'France', 'Mexique'},
 'country_code': {'DE', 'FR', 'MX'},
 'state': {'Bade-Wurtemberg',
  'Bavière',
  'Berlin',
  'Bretagne',
  'Guerrero',
  'Île-de-France'},
 'city': {'Acapulco',
  'Berlin',
  'Munich',
  'Paris',
  'Quimper',
  'Stuttgart',
  'Verrières-le-Buisson'},
 'misc': {"Palais de l'Élysée", 'Reichstag'}}

dico2 = {'cont': {'Amérique du Nord', 'Europe', 'Afrique'},
 'country': {'Allemagne', 'France', 'Mexique'},
 'state': {'Bade-Wurtemberg',
  'Bavière',
  'Guerrero',
  'Île-de-France'},
 'city': {'Acapulco',
  'Berlin',
  'Munich',
  'Paris',
  'Stuttgart',
  'Verrières-le-Buisson'},
 'misc': set()}

dico3 = {'cont': { 'Europe', 'Afrique'},
 'country': {'France', 'Union Soviétique'},
 'state': {
  'Île-de-France'},
 'city': {'Stuttgart',
  'Verrières-le-Buisson'},
 'misc': {'station MIR', 'espace'}}

print(geo_sim(dico1, dico1))
print(geo_sim(dico1, dico2))
print(geo_sim(dico1, dico3))

## 2. Apply geoparsing to the news articles

using the utility functions defined above

In [134]:
LOC_stopwords = ['etat', 'état', 'pays', 'continent', 'endroit', 'lieu', 'de france', 'état-', 'major', 'état-major']

def remove_mistakes(input_list, stopwords=LOC_stopwords):
    return [w for w in input_list if w not in stopwords]

# test 

remove_mistakes(['etat', 'usa', 'barbades'])

['usa', 'barbades']

In [150]:
# test 1

dic_list=[]
for ent_list in LOC_list[:10]:
    ent_list = remove_mistakes(ent_list)
    geo_dic = geo_info(ent_list)
    dic_list.append(geo_dic)
    

set()

 houston
29.7589382 -95.3676974
{'city': 'Houston', 'county': 'Harris County', 'state': 'Texas', 'country': "États-Unis d'Amérique", 'country_code': 'us'}

 bretagne
48.2640845 -2.9202408
{'state': 'Bretagne', 'country': 'France', 'country_code': 'fr'}

 chabot
45.9557459 3.5723579
{'hamlet': 'Chabot', 'village': 'Châteldon', 'municipality': 'Thiers', 'county': 'Puy-de-Dôme', 'state': 'Auvergne-Rhône-Alpes', 'country': 'France', 'postcode': '63290', 'country_code': 'fr'}

 moskova
55.7504461 37.6174943
{'city': 'Moscou', 'state': 'Moscou', 'region': 'District fédéral central', 'country': 'Russie', 'country_code': 'ru'}

 bordeaux
44.841225 -0.5800364
{'city': 'Bordeaux', 'municipality': 'Bordeaux', 'county': 'Gironde', 'state': 'Nouvelle-Aquitaine', 'country': 'France', 'country_code': 'fr'}

 bourgogne
47.27808725 4.222486304306048
{'boundary': 'Bourgogne', 'country': 'France', 'country_code': 'fr'}

---- List of Locations ---
[Location(Houston, Harris County, Texas, États-Unis

44.0969405 6.232955593333333
{'building': 'La Sympathie', 'road': 'Rue Pierre Magnan', 'neighbourhood': 'Les Épinettes', 'town': 'Digne-les-Bains', 'municipality': 'Digne-les-Bains', 'county': 'Alpes-de-Haute-Provence', 'state': "Provence-Alpes-Côte d'Azur", 'country': 'France', 'postcode': '04000', 'country_code': 'fr'}

 chaldean street
43.7931066 -79.3051282
{'road': 'Chaldean Street', 'neighbourhood': "L'Amoreaux", 'quarter': 'Scarborough—Agincourt', 'suburb': 'Scarborough', 'city': 'Toronto', 'state_district': 'Golden Horseshoe', 'state': 'Ontario', 'postcode': 'M1W 2A3', 'country': 'Canada', 'country_code': 'ca'}

 réfléchir

 cisjordanie
32.0254688 35.2888075
{'place': 'Cisjordanie', 'village': 'ترمسعيا', 'county': 'منطقة ب', 'state': 'Judea et Samaria', 'postcode': '63620', 'country': 'Palestinian Territory', 'country_code': 'ps'}

 jérusalem
31.79592425 35.21198075969497
{'city': 'Jérusalem', 'state': 'District de Jérusalem', 'country': 'Israël', 'country_code': 'il'}

 »

 sh

31.94696655 35.27386547291496
{'boundary': 'Territoires Palestiniens', 'state': 'Judea et Samaria', 'country': 'Palestinian Territory', 'country_code': 'ps'}

 abou ammar
30.0010226 31.1885874
{'road': 'حارة سيد عمار', 'neighbourhood': 'مساكن أبو الفتوح', 'city': 'Gizeh', 'postcode': '12555', 'country': 'Égypte', 'country_code': 'eg'}

 rusé
43.8480413 25.9542057
{'city': 'Roussé', 'municipality': 'Municipalité de Roussé', 'state': 'Oblast de Roussé', 'country': 'Bulgarie', 'country_code': 'bg'}

 genève
46.2017559 6.1466014
{'city': 'Genève', 'state': 'Genève', 'country': 'Suisse', 'country_code': 'ch'}

 jérusalem
31.79592425 35.21198075969497
{'city': 'Jérusalem', 'state': 'District de Jérusalem', 'country': 'Israël', 'country_code': 'il'}

 tripoli
32.896672 13.1777923
{'city': 'Tripoli', 'district': 'سوق الجمعة', 'state': 'Tripoli', 'country': 'Libye', 'country_code': 'ly'}

 aviv
32.0961451 34.9514955
{'suburb': 'אביב', 'city': 'ראש העין', 'county': 'נפת פתח תקווה', 'state': 'Dis

45.3080981 0.8355407
{'hamlet': 'La Kabylie', 'city_district': 'Sorges', 'city': 'Sorges et Ligueux en Périgord', 'municipality': 'Périgueux', 'county': 'Dordogne', 'state': 'Nouvelle-Aquitaine', 'country': 'France', 'country_code': 'fr'}

 algériens
36.3950057 3.8872999770958083
{'leisure': 'Camp Scouts Musulmans Algériens', 'town': 'Bouira', 'county': 'Daïra Bouira', 'state': 'Bouira', 'country': 'Algérie', 'country_code': 'dz'}

 ouanza
4.285022 22.34449
{'village': 'Ouanza-Bilengo', 'state': 'Mbomou', 'country': 'République Centrafricaine', 'country_code': 'cf'}

 annaba
36.8982165 7.7549272
{'city': 'Annaba', 'county': 'Daïra Annaba', 'state': 'Annaba', 'postcode': '23000', 'country': 'Algérie', 'country_code': 'dz'}

 palais de justice
47.5894285 1.333323713408308
{'amenity': 'Palais de Justice', 'road': 'Place de la République', 'town': 'Blois', 'municipality': 'Blois', 'county': 'Loir-et-Cher', 'state': 'Centre-Val de Loire', 'country': 'France', 'postcode': '41000', 'country_c

In [151]:
# test 2 - waterways

dic_list=[]
for ent_list in [['la seine', 'garonne', 'rhône']]:
    geo_dic = geo_info(ent_list)
    dic_list.append(geo_dic)


set()

 la seine
48.7308806 2.444754
{'waterway': 'La Seine', 'country': 'France', 'postcode': '78230', 'country_code': 'fr'}

 garonne
44.2161598 0.5195403
{'waterway': 'La Garonne', 'country': 'France', 'postcode': '33190', 'country_code': 'fr'}

 rhône
45.8802348 4.564533629559522
{'county': 'Rhône', 'state_district': 'Circonscription départementale du Rhône', 'state': 'Auvergne-Rhône-Alpes', 'country': 'France', 'country_code': 'fr'}

---- List of Locations ---
[Location(La Seine, France métropolitaine, 78230, France, (48.7308806, 2.444754, 0.0)), Location(La Garonne, France métropolitaine, 33190, France, (44.2161598, 0.5195403, 0.0)), Location(Rhône, Circonscription départementale du Rhône, Auvergne-Rhône-Alpes, France métropolitaine, France, (45.8802348, 4.564533629559522, 0.0))]

---- Dictionnary ---
{'cont': {'europe'}, 'country': {'france'}, 'country_code': {'FR'}, 'state': {'auvergne-rhône-alpes'}, 'city': set(), 'misc': {'la seine', 'la garonne'}}


## 3. Filter and compare on the basis of geoparsing

Idea : 
* input article, by the user --[geoparsing]--> geo_dico --> dataframe continent/country
* filter on continents and/or countries
* compute similarity between the input article and all remaining articles
* keep k1 best
* compute text similarity using word embeddings
* retrieve k2 best for reco