# Find Location
In this notebook we create a function to geolocate a string.
Twitter users can give a location in their profile description.
We associate the string they give with a real location, in order to know which country they come from.

Users can enter any fanciful location, so we need to exclude fanciful names from our data to avoid mismatches such as 'my town' -> Town, Saint Lucia.

Moreover, we exclude all parts of the text that are not locations, such as numbers, links and stopwords.

In [7]:
import pandas as pd
import json
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

In [2]:
# List of fanciful substrings that often cause mismatches in the geolocation.
# The encoding is given explicitly so that non-ASCII entries are read the same way
# on every platform instead of depending on the locale's default encoding
# (assumes the file is UTF-8 -- TODO confirm).
# NOTE(review): the next statement in this cell rebinds strange_places to an inline
# literal, so the values read here are currently overwritten.
with open("/home/jlenti/Codes/strange_places.txt", "r", encoding="utf-8") as f:
    strange_places = [line.strip() for line in f]

# Inline copy of the same list; this assignment replaces whatever was read from the file.
# Entries are names that should never be accepted as a location on their own.
strange_places =  ["he", "his","she","her","they","them","none","null","na","nan","secret","zone",
                   "hell","landing","rainbow","south","north","east","west","southeast","northeast",
                   "southwest","northwest","southern","northern","eastern","western","southeastern","northeastern",
                   "southwestern","northwestern","se","ne","sw","nw","earth","venus", "flat", "satan", 
                   "evil", "devil", "god", "jesus", "end", "mars", "blue", "moon",
                   "happy", "star", "war", "home", "earth", "en", "blm", "sua", "suo", "area", "university",
                   "cr", "ela", "terra", "cali", "capital", "street", "arg", "way", "galaxy", "le",
                   "casa", "maison", "house", "sun", "juno", "paradise", "dark", "hope", "valley", "deep",
                   "center", "centre", "side", "place", "hell", "heaven", "universe", "land", "northern",
                   "southern", "western", "eastern", "plain", "go", "ct", "nation", "dont", "stan",
                   "universo", "11", "kingdom", "et", "ph", "reading", "cama", "hospital", "calle",
                   "baby", "rep", "village", "metro", "disney", "club", "im", "atlantis", "tree",
                   "nord", "ovest", "est", "sud", "lua","twin","corazon","cosmos","still","camp","black",
                   "research","ave", "lakes","central","di","perk","deus","valhalla","national","dm",
                   "garden","freedom", "centro", "cpx", "teu","este","Αττική","org","earth","gru", "tn",
                   "republic", "eu", "www", "com", "gab", "park", "puta", "middle", "north", "south",
                   "west", "east", "tu", "cu", "ande", "onde", "mind", "love", "un", "villa","red","ac",
                   "union","coast","time","europa","ai", "ro","day","kl","ilha","rsa","vila", "du",
                   "hai","globe","cal","marte","oz","dot", "jupiter","top","green","ur","baker","bay",
                   "frc","cloud","jhb","sky","wall","oeste", "occidental", "falcon","sic","ops",
                   "ish","oto","school","outer","army","patriot", "uai","nova","au","peace",
                   "castle","fim","gulf coast","gulf","heart","côte","front","tower","ass","bunker",
                   "ocean","vale","real","provence","barra","beatriz","chornobyl","chernobyl",
                   "paradis","infierno","el infierno","parte","chaos","golf","ramos","comuna","angel",
                   "daniel", "candy","far","restaurant","cielo", "costa","Los Sueños", "chicken",
                   "saints", "abajo", "coral","core","viento","alice","first","estate","jaz",
                   "pianeta","twilight","avon", "pacífico", "la cara","el sur","plane","basi",
                   "field", "festà", "hub","rotten","lado","plant","el corazón","corazón","elle","canto",
                   "hanging","rua","dema","alto","auto","fucking","interior","liz","lives",
                   "mitten","kimberley", "lord", "mail","sunnydale","apex","brave","zu", "disco",
                   "universidad","600","piloto","cara","group","liga","branco","agua","weed","aries",
                   "lickely","entra","du", "promised land","feder","eccles","vive","misery","anna",
                   "cares", "el sol", "sol", "superior","sauna","pode","gravity","ole","magma",
                   "municipio","africa","asia","ponte","finger","campus","belt","troll","belong",
                   "industrial","sure","circle","lados","cis","lodge","liberal","utopia","police",
                   "bubble","federa","stranger","orion","luna","la luna","von","maya","aus","false","no name",
                   "ku","spread","main","ki","grid","hometown","grace","mal","tudo","sweet home","friends",
                   "kamino","camino","bethlehem","minor","libre","else","quarto","mansion","strong",
                   "treasure","sky","book","course","conta","golden city","la patria","nog","pais",
                   "lov","sloan","court","arena","santa","erre","isles","ranch","ville","luna","welcome","pals",
                   "bath", "dos", "bed", "beach", "fan", "gotham", "road", "via", "tlc", "hole","river",
                   "gone","troy","neo","mons","mont","alley","big","justice","train","low","big",
                   "line","curva","mtl","ddd","seca","land of oz","alone","cal","look","vaal","mama","nea","sur",
                   "aqui","alla","donald","time","chalé","boca","naast","pink","eye","justin",
                   "cave", "see","base","martha","mer","neptune","Colony","amor","hills","mile","mile high city",
                   "lead","ball","vida","lot","ball","made","phone","bairro","till","redondo","eua",
                   "post","op","sale","uai","porra","idk","ton","isle","sad","arab","triangle","springs",
                   "sei","college","plan","plana","bad","care","light","cac","leste","esp","quinto","casino",
                   "feliz","isla","loren","ward","grand","pond","ans","ad","empire","dog","mid","isola",
                   "gt","granada","south central","gran","rva","atx","hard","leo","ever","carmen",
                   "des","der","shi","hot","shore","dei","trent","cone","dome","ali","tell","linares",
                   "libertad","ile","ile-de-france","chair","alene","mountain","vc","turn","name",
                   "nice","bp","ts","coast","bm","lv","praia","madriz","ask","vv","si","tour","lu",
                   "live", "user", "ig", "tw", "back", "never", "lol", "mine", "town", "village", "piedmont",
                   "minha casa", "ماتيرا", "إيطاليا","ng","sovereign","carrera","catalana","puta","contre",
                   "mandela", "pas","station","sunday","cell","dove","atlantida","pale","ile de france","cb",
                   "pt","bs","ela","elu"
                  ]

# find_names lower-cases the text before looking names up, so an entry written with
# capital letters ("Colony", "Los Sueños", "Αττική") could never equal a matched token.
# Lower-case every entry so the whole list is effective.
strange_places = [place.lower() for place in strange_places]


In [51]:
# locdictionary associates each name (string) with the list of locations (geonameids)
# it can refer to.
# The file is opened in a `with` block so the handle is closed once parsing is done,
# and decoded as UTF-8 explicitly because the keys include non-ASCII names.
with open("/home/jlenti/Codes-cp/geonameslocator/locdictionary.json", encoding="utf-8") as f:
    locdictionary = json.load(f)
# NOTE(review): this displays the whole dictionary; a small preview would keep the
# notebook output short.
locdictionary

{' lungsod ng olongapo': [1697172],
 ' naʿwrt\u200e': [294099],
 ' the commons': [2965243],
 ' ناعورة\u200e': [294099],
 ' သာယာဝတ': [11154303],
 ' သာယာဝတီ': [11154303],
 '1 bezirk': [11746594],
 '1 de mayo': [3840276],
 '1 decembrie': [664644, 856315],
 '1 wiener gemeindebezirk': [2775259],
 '10 de abril': [8859815,
  8859815,
  8871951,
  8871951,
  8872906,
  8872906,
  8886385,
  8886385],
 '10 de marzo': [8877742, 8877742],
 '10 wiener gemeindebezirk': [2779776],
 '100 mile house': [5881639],
 '101 laid': [6697610],
 '101073pnas0801507105': [2186224],
 '105 km': [11874839, 11874839],
 '105 км': [11874839],
 '105ij kilometr': [11874839],
 '105ий кілометр': [11874839],
 '10bezirk': [2766447],
 '10e arrondissement de budapest': [3049646],
 '10eme arrondissement': [6618616],
 '10ème arrondissement': [6618616],
 '11': [654706],
 '11 de abril': [8901069, 8901069],
 '11 myl suid': [7670862],
 '11 wiener gemeindebezirk': [2765027],
 '11875574 korostyshiv urban united territorial community'

In [52]:
# Number of distinct names (keys) available in the lookup dictionary.
len(locdictionary)

1887100

In [54]:
# Table of every location we are able to geolocate, indexed by geonameid, with its country.
# The population is needed too: when a string is associated with two locations,
# we choose the most populated one.
# NOTE(review): with pandas' default NA handling the literal string "NA" is parsed as
# missing, which would blank the country code "NA" if the TSV stores it -- confirm.
locationdata = (
    pd.read_csv(
        "/home/jlenti/Codes-cp/geonameslocator/countriesdatap.tsv",
        sep="\t",
        index_col="geonameid",
        low_memory=False,
    )
    # Two geonameids are removed from the table; the reason is not recorded here.
    .drop(index=[2855707, 2769324])
)
locationdata.head()

Unnamed: 0_level_0,name,asciiname,alternatenames,latitude,longitude,featureclass,featurecode,countrycode,cc2,admin1code,admin2code,admin3code,admin4code,population,elevation,dem,timezone,modificationdate
geonameid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
290557,United Arab Emirates,United Arab Emirates,"['United Arab Emirates', ""'Alepea Fakatahataha...",23.75,54.5,A,PCLI,AE,,0.0,,,,9630959,,96,Asia/Dubai,2020-03-29
290594,Umm Al Quwain City,Umm Al Quwain City,"['Umm Al Quwain City', 'Oumm al Qaiwain', 'Oum...",25.56473,55.55517,P,PPLA,AE,,7.0,,,,62747,,2,Asia/Dubai,2019-10-24
290595,Imārat Umm al Qaywayn,Imarat Umm al Qaywayn,"['Imārat Umm al Qaywayn', 'Imarat Umm al Qaywa...",25.5,55.75,A,ADM1,AE,,7.0,,,,56253,,52,Asia/Dubai,2020-07-08
291074,Ras Al Khaimah City,Ras Al Khaimah City,"['Ras Al Khaimah City', 'Julfa', 'Khaimah', 'R...",25.78953,55.9432,P,PPLA,AE,,5.0,,,,351943,,2,Asia/Dubai,2019-09-09
291075,Imārat Ra’s al Khaymah,Imarat Ra's al Khaymah,"['Imārat Ra’s al Khaymah', ""Imarat Ra's al Kha...",25.66667,56.0,A,ADM1,AE,,5.0,,,,187535,,18,Asia/Dubai,2020-07-08


In [53]:
# Number of rows (locations) in the location table.
len(locationdata)

568309

In [None]:
# Other mismatches are caused by numbers or stopwords, so matched names equal to
# any of the entries below are discarded.
engstopwords = stopwords.words("english")
esstopwords = stopwords.words("spanish")
ptstopwords = stopwords.words("portuguese")

# Number words. The correct spelling "forty" was missing (only the misspelling
# "fourty" was listed), so it was never filtered; both are kept so that the
# misspelt form users may type is still excluded.
numbers = {'zero','one','two','three','four','five','six','seven','eight','nine','ten',
           'eleven','twelve','thirteen','fourteen','fifteen','sixteen','seventeen','eighteen',
           'nineteen','twenty','thirty','forty','fourty','fifty','sixty','seventy','eighty','ninety',
           'hundred','thousand','million','billion','zillion'}

# Digit strings "0" .. "99".
num = [str(n) for n in range(100)]

In [8]:

# Turn tokens into a sequence of n-grams
def word_ngrams(tokens, ngrams):
    """Expand a token list into all word n-grams of length 1 up to ``ngrams``.

    Parameters
    ----------
    tokens : list of str
        Word tokens, in text order.
    ngrams : int
        Largest n-gram size to produce.

    Returns
    -------
    list of str
        When ``ngrams`` is 1 the input list itself is returned untouched.
        Otherwise a new list holding every n-gram (words joined by a single
        space), grouped by size: all 1-grams first, then 2-grams, and so on,
        never longer than the number of tokens.
    """
    if ngrams == 1:
        return tokens

    token_count = len(tokens)
    largest_size = min(ngrams, token_count)
    return [
        " ".join(tokens[start:start + size])
        for size in range(1, largest_size + 1)
        for start in range(token_count - size + 1)
    ]

# Tokenizer used by find_names to split a string into word tokens, from which the
# candidate place names inside a longer string are built.
# The pattern \w+ keeps runs of word characters, so punctuation and whitespace act as separators.
tokenizer = RegexpTokenizer(r'\w+')
def find_names(text):
    """Return the set of known place names found inside ``text``.

    The text is lower-cased and tokenized, every n-gram of up to 5 words is
    looked up in ``locdictionary``, and any matched name that is contained
    (as a plain string substring) in another matched name is discarded, so
    only the longest overlapping names remain.

    Parameters
    ----------
    text : str
        Free-text location string.

    Returns
    -------
    set of str
        Matched names, lower-cased; empty when nothing matches.
    """
    candidates = word_ngrams(tokenizer.tokenize(text.lower()), 5)
    matched = {candidate for candidate in candidates if candidate in locdictionary}

    # Keep a name only if no *other* matched name contains it.
    return {
        name
        for name in matched
        if not any(name in other for other in matched if other != name)
    }

def _rows_for_ids(geonameids):
    """Return the locationdata rows for ``geonameids``, skipping ids absent from its index."""
    # .loc with a list raises KeyError if any label is missing; some ids listed in
    # locdictionary may not be in locationdata (two ids are dropped when it is loaded).
    known_ids = [geonameid for geonameid in geonameids if geonameid in locationdata.index]
    return locationdata.loc[known_ids]


# Finds the best match for the text
# For speed reasons, returns only the geonamesid which can be used to index into countriesdatap
def find_best_match(text):
    """Return the geonameid that best matches ``text``, or None when nothing matches.

    Steps:
    1. Rewrite a few strings known to be mismatched (e.g. "Ile-de-France" -> "Paris").
    2. Collect the known place names in the text with ``find_names``.
    3. If the names map to more than one geonameid and one of them is a country
       (featurecode "PCLI"), restrict every other match to that country.
    4. For each name that is not a stopword, a number or a "strange place",
       keep its most populous candidate.
    5. Return the least populous of those candidates.

    Parameters
    ----------
    text : str
        Free-text location string.

    Returns
    -------
    geonameid (an index label of ``locationdata``) or None
    """
    # correct common mismatches
    if "Ile-de-France" in text:
        text = "Paris"
    if "Repùblica Catalana" in text:
        text = "Barcelona"
    if ("Buenos Aire" in text) or ("Castelar" in text):
        text = "Buenos Aires"
    if "Islamabad" in text:
        text = "Pakistan"
    if "Brasi" in text:
        text = "Brasil"
    mymatches = find_names(text)

    # Put the ids of all the matches in one list
    matchedids = []
    for foundloc in mymatches:
        matchedids.extend(locdictionary[foundloc])

    # If there is more than 1 match, and there is a match to a country, constrain all
    # other matches to that country.
    # NOTE(review): names filtered out below (stopwords, numbers, strange places) still
    # take part in this country detection -- confirm that is intended.
    foundcountry = None
    if len(matchedids) > 1:
        matchedlocations = _rows_for_ids(matchedids)
        is_country = matchedlocations.featurecode == "PCLI"
        if is_country.any():
            foundcountry = matchedlocations[is_country].iloc[0].countrycode

    # For each word match, find the most populous one
    matcharray = []
    for foundloc in mymatches:
        # Handle special cases
        if len(foundloc) < 2:
            continue
        if (foundloc in engstopwords or foundloc in esstopwords or foundloc in ptstopwords
                or foundloc in num or foundloc in numbers or foundloc in strange_places):
            continue
        matchedlocations = _rows_for_ids(locdictionary[foundloc])
        if foundcountry:  # Remove matches that are not in the country we found
            matchedlocations = matchedlocations[matchedlocations.countrycode == foundcountry]
        if matchedlocations.shape[0] > 0:
            foundmatch = matchedlocations.sort_values(by="population", ascending=False).index[0]
            matcharray.append(foundmatch)

    # Among all the matches, find the least populous one
    if not matcharray:
        return None
    allmatches = locationdata.loc[matcharray]
    return allmatches.sort_values(by="population", ascending=True).index[0]

def find_location(text):
    """Geolocate a free-text location string.

    Parameters
    ----------
    text : str or None
        Location string as written by the user. Anything that is not a string
        (None, or a float NaN coming from a pandas column) is treated as missing.

    Returns
    -------
    A two-item sequence (location name, country code): the "name" and
    "countrycode" entries of the matching ``locationdata`` row when a location
    is found, otherwise the list ["null", "null"].
    """
    # Missing or non-text input cannot be geolocated.
    if not isinstance(text, str):
        return ["null", "null"]
    # If the string seems to be a link or a tag, do not try to geolocate it.
    if ".com" in text or "@" in text:
        return ["null", "null"]
    try:
        # find_best_match keeps, for each matched name, its most populous candidate
        # and returns the least populous of those (or None when nothing matches).
        loc_match = find_best_match(text)
        if loc_match is None:
            return ["null", "null"]
        # geolocate the location from the geonameid
        location = locationdata.loc[loc_match]
        # keep only location and countrycode
        return location[["name", "countrycode"]]
    except Exception:
        # Best effort: any lookup failure is reported as "no location"; unlike a bare
        # `except:`, this lets KeyboardInterrupt / SystemExit propagate.
        return ["null", "null"]

In [43]:
# Geolocate a few sample strings to check the behaviour of find_location.
examples = []

for location in ["roma", "Rome", "Buenos Aires", "Abuja", "naples", "Lima, Peru", "Bloomington, Indiana, US",
                 "Indiana", "United States", "Moon", "Luna", "Antananarivo", "LOL", "my bedroom"]:
    # Call find_location once per string (it was called twice) and unpack the
    # (name, country code) pair instead of indexing the result by integer position.
    real_location, user_country_code = find_location(location)
    examples.append([location, real_location, user_country_code])

In [44]:
# Show each example string next to the location name and country code found for it.
pd.DataFrame(examples, columns = ["user_location", "real_location", "user_country_code"])

Unnamed: 0,user_location,real_location,user_country_code
0,roma,Città metropolitana di Roma Capitale,IT
1,Rome,Città metropolitana di Roma Capitale,IT
2,Buenos Aires,Buenos Aires,AR
3,Abuja,Abuja,NG
4,naples,Napoli,IT
5,"Lima, Peru",,
6,"Bloomington, Indiana, US",Bloomington,US
7,Indiana,Indiana,US
8,United States,United States,US
9,Moon,,
