In [1]:
import json
import os
from functools import partial

import pandas as pd
from backoff import on_exception, expo
from geopy import Photon, Point
from geopy.exc import GeocoderUnavailable, GeocoderTimedOut
from geopy.geocoders import Nominatim
from ratelimit import limits, sleep_and_retry
from tqdm import tqdm
import pickle

In [2]:
# Sample free-text location descriptions; `read_locations` falls back to these
# when it is called without an input file. Duplicates and "NONE" are intentional
# entries of the sample and are kept as-is.
DUMMY_LOCATION_STRS = [
    "Berlin, Deutschland",
    "France",
    "Paris et plein Centre",
    "Punjab, Pakistan",
    "South Africa",
    "Boulder, Colorado",
    "NONE",
    "London",
    "United States",
    "Ethiopia",
    "Ethiopia",
    "TΧ",
    "NONE",
    "Paris, Mulhouse, sur la route",
    "London",
    "Barcelona",
    "Liverpool",
    "Bay Area, CA",
]

In [3]:
# Relative directory used throughout the notebook (via os.path.join) for the
# pickled geocoding checkpoints and the exported CSV/pickle results.
DATA_PATH = 'data'

In [4]:
# @sleep_and_retry
# @on_exception(expo, GeocoderTimedOut, max_tries=8)
# @limits(calls=1, period=1)
def get_location(location_str, geolocator, **kwargs):
    """
    forward-geocodes a free-text location description
    :param location_str: the text to look up
    :param geolocator: object exposing a `geocode(query, **kwargs)` method
    :param kwargs: forwarded unchanged to `geolocator.geocode`
    :return: whatever `geolocator.geocode` returns, or None when it raises `GeocoderUnavailable`;
             any other exception (e.g. `GeocoderTimedOut`) propagates to the caller
    """
    result = None
    try:
        result = geolocator.geocode(location_str, **kwargs)
    except GeocoderUnavailable:
        # service not reachable: report "no match" instead of failing
        pass
    return result

In [5]:
# @sleep_and_retry
# @on_exception(expo, GeocoderTimedOut, max_tries=8)
# @limits(calls=1, period=1)
def reverse_location(location_obj, geolocator, **kwargs):
    """
    reverse-geocodes the coordinates of a previously resolved location
    :param location_obj: object whose `raw['geometry']['coordinates']` holds a (lon, lat) pair
    :param geolocator: object exposing a `reverse(point, **kwargs)` method
    :param kwargs: forwarded unchanged to `geolocator.reverse`
    :return: whatever `geolocator.reverse` returns; None when reading the coordinates
             raises `AttributeError` (e.g. `location_obj` is None) or when the
             geolocator raises `GeocoderUnavailable`
    """
    try:
        # the raw payload stores (lon, lat) while Point is built as (lat, lon)
        longitude, latitude = location_obj.raw['geometry']['coordinates']
        point = Point(latitude, longitude)
    except AttributeError:
        return None

    try:
        reversed_obj = geolocator.reverse(point, **kwargs)
    except GeocoderUnavailable:
        reversed_obj = None
    return reversed_obj

In [6]:
def read_locations(users_path=None):
    """
    loads the location descriptions to geocode
    :param users_path: name of a JSON-lines file inside `DATA_PATH`; when None the
                       built-in `DUMMY_LOCATION_STRS` sample is used instead
    :return: a dataframe with a column `location_str` containing the description of the location
             (file input: indexed by `id`, rows without a location dropped)
    """
    if users_path is not None:
        raw_users = pd.read_json(os.path.join(DATA_PATH, users_path), lines=True)
        renamed = raw_users.rename(columns={'location':'location_str'})
        return renamed.set_index('id').dropna(subset=['location_str'])

    n_dummy = len(DUMMY_LOCATION_STRS)
    return pd.DataFrame(DUMMY_LOCATION_STRS, columns=['location_str'], index=range(n_dummy))


In [7]:
# users = read_locations(os.path.join(DATA_PATH, 'followers_rehydrated.jsonl'))
# # pd.read_json('data/followers_rehydrated.jsonl', lines=True, nrows=100)
# users=pd.read_json('data/followers_rehydrated.jsonl',
#                             lines=True
#                             ).rename(columns={'location':'location_str'}
#                                      ).set_index('id').dropna(subset=['location_str'])

In [8]:
# Parse the JSON-lines export one record at a time so that a malformed line is
# reported and skipped instead of aborting the whole load.
followers = []
# explicit UTF-8: the profiles contain non-ASCII text, so the read must not
# depend on the platform's default encoding
with open(os.path.join(DATA_PATH, 'followers_rehydrated.jsonl'), encoding='utf8') as f:
    for line in f:
        try:
            followers.append(json.loads(line))
        except json.JSONDecodeError:
            # only parse failures are swallowed; a bare `except:` would also
            # swallow KeyboardInterrupt and make the cell impossible to stop
            print(f"can't parse {line.strip()}")

In [9]:
# one row per follower, indexed by `id`; `location` is exposed as `location_str`
# and followers without a location are dropped
users = (
    pd.DataFrame(followers)
    .rename(columns={'location':'location_str'})
    .set_index('id')
    .dropna(subset=['location_str'])
)

In [10]:
del followers

In [11]:
users.head(2)

Unnamed: 0_level_0,name,profile_image_url,location_str,public_metrics,protected,pinned_tweet_id,username,description,created_at,verified,entities,url,withheld
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
3070230547,京犬@ポケモン,https://pbs.twimg.com/profile_images/852969926...,日本,"{'followers_count': 2422, 'following_count': 2...",False,7.993935999179571e+17,inu_2525,ゲーム垢用。仲良ししてくださいね！ #モンハン女子,2015-03-09T19:15:39.000Z,False,"{'description': {'hashtags': [{'start': 18, 'e...",,
1454016580874997763,Gift So,https://pbs.twimg.com/profile_images/145401840...,Republic of the Philippines,"{'followers_count': 6, 'following_count': 93, ...",False,,hermajestygifty,Unwrap the 🎁. ...,2021-10-29T09:26:00.000Z,False,,,


In [19]:
# read the data
# locations = read_locations(os.path.join(DATA_PATH, 'followers_rehydrated.jsonl'))
# NOTE: this binds a second name to the same DataFrame object as `users`; no copy is made
locations = users
# initialize the geolocator service
# (alternative Nominatim configurations against a local server, kept for reference)
# geolocator = Nominatim(user_agent="twitter_poll_bio_geocoding_v0.0.1",
#                        domain='localhost:8080/nominatim', scheme='http')
# geolocator = Nominatim(user_agent="twitter_poll_bio_geocoding_v0.0.1",
#                        domain='localhost:8080', scheme='http')
# active configuration: Photon geocoder reached over plain HTTP at localhost:2322
geolocator = Photon(user_agent="twitter_poll_bio_geocoding_v0.0.1",
                       domain='localhost:2322', scheme='http')

In [20]:
unique_locations = set(locations.location_str.unique())

In [21]:
len(locations)

6003783

In [22]:
len(unique_locations)

1131028

In [23]:
del locations

In [24]:
def _load_checkpoint(filename, default):
    """
    restores one piece of geocoding state saved by an earlier run
    :param filename: name of the pickle file inside `DATA_PATH`
    :param default: value to return when the file does not exist yet
    :return: the unpickled object, or `default` if there is no such file
    """
    path = os.path.join(DATA_PATH, filename)
    if not os.path.exists(path):
        return default
    # NOTE(security): pickle.load can execute arbitrary code; only load
    # checkpoint files that this notebook wrote itself.
    with open(path, 'rb') as f:
        return pickle.load(f)


# location string -> geocoding result (may be None when nothing matched)
resolved = _load_checkpoint('resolved.pkl', dict())
# location strings that were already attempted
tested = _load_checkpoint('tested.pkl', set())
# location strings whose lookup raised GeocoderTimedOut
timed_out = _load_checkpoint('timed_out.pkl', set())
# location strings whose lookup raised any other exception
excepted = _load_checkpoint('excepted.pkl', set())


In [None]:
def _save_checkpoint():
    """Write the current forward-geocoding state (results and bookkeeping sets) to `DATA_PATH`."""
    for filename, payload in (('resolved.pkl', resolved),
                              ('tested.pkl', tested),
                              ('timed_out.pkl', timed_out),
                              ('excepted.pkl', excepted)):
        with open(os.path.join(DATA_PATH, filename), 'wb+') as f:
            pickle.dump(payload, f)


# sorting turns the set into a list with a deterministic iteration order
unique_locations = sorted(unique_locations)
# wrap the list itself rather than the enumerate object: tqdm can then read its
# length and display a total / ETA instead of a bare iteration counter
for i, location in enumerate(tqdm(unique_locations)):
    if location in tested:
        continue
    if (i % 1000) == 0:
        # periodic checkpoint so an interrupted run can be resumed
        _save_checkpoint()
    try:
        resolved[location] = get_location(location, geolocator)
    except GeocoderTimedOut:
        timed_out.add(location)
    except Exception:
        # best effort: remember the failing string and keep going
        excepted.add(location)
    # Deliberately NOT in a `finally`: if the request is interrupted
    # (KeyboardInterrupt / kernel stop) the location has no recorded outcome and
    # must stay untested so that it is retried when the run is resumed.
    tested.add(location)

6711it [07:22, 49.17it/s]

In [None]:
# save data: one pickle per piece of geocoding state
for filename, payload in (('resolved.pkl', resolved),
                          ('tested.pkl', tested),
                          ('timed_out.pkl', timed_out),
                          ('excepted.pkl', excepted)):
    with open(os.path.join(DATA_PATH, filename), 'wb+') as f:
        pickle.dump(payload, f)

In [None]:
len(timed_out)

In [None]:
# In-memory state for the reverse-geocoding pass; re-running this cell resets it.
# location string -> result of `reverse_location`
resolved_reverse = dict()
# location strings for which a reverse lookup was already attempted
tested_reverse = set()
# location strings whose reverse lookup raised GeocoderTimedOut
timed_out_reverse = set()

In [None]:
for location, location_obj in tqdm(resolved.items()):
    if location in tested_reverse:
        continue
    if not location_obj:
        # forward geocoding found no match, so there is nothing to reverse
        continue
    try:
        resolved_reverse[location] = reverse_location(location_obj, geolocator)
    except GeocoderTimedOut:
        timed_out_reverse.add(location)
    except ValueError:
        # show the offending payload for inspection and keep going
        print(location_obj.raw)
    # Deliberately NOT in a `finally`: when an unhandled exception or an
    # interrupt aborts the loop, the location has no recorded outcome and must
    # stay untested so that re-running the cell retries it.
    tested_reverse.add(location)

# save data: one pickle per piece of reverse-geocoding state
for filename, payload in (('resolved_reverse.pkl', resolved_reverse),
                          ('tested_reverse.pkl', tested_reverse),
                          ('timed_out_reverse.pkl', timed_out_reverse)):
    with open(os.path.join(DATA_PATH, filename), 'wb+') as f:
        pickle.dump(payload, f)

In [None]:
# `locations` was deleted in an earlier cell, so re-bind it to the users frame
# first; without this the cell raises NameError on Restart & Run All.
# (No copy is made: this adds the column to the same object `users` refers to.)
locations = users
# attach the forward-geocoding result to every user via its location string;
# strings missing from `resolved` become NaN
locations['location_obj'] = locations.location_str.map(resolved)

In [None]:
# # apply the geolocator
# locations['location_obj'] = locations.location_str.apply(partial(get_location, geolocator=geolocator))

Not every location string finds a match on Nominatim, especially when it is a natural-language description rather than a place name.

Perhaps we could first run a named-entity recognition (NER) pass to extract the geographic entities, and geocode only those.

In [None]:
locations[locations.location_obj.isna()]

The information returned by the `geocode` function is quite limited, because it tries to fit the specific string.
For example, states are rarely included in the object. Reverse geocoding helps with generalizing in this case.

In [65]:
locations['location_obj_reversed'] = locations.location_str.map(resolved_reverse)

In [63]:
# # apply the reverse geolocator
# locations['location_obj_reversed'] = locations.location_obj.apply(partial(reverse_location, geolocator=geolocator))

In [66]:
# unpack address information: one column per address field, one row per user
# that has a reverse-geocoding result
reverse_hits = locations.location_obj_reversed.dropna()
# Build the frame once from the list of address dicts. Creating a pd.Series per
# row inside `.apply` gives the same table but is far slower on a frame this size.
addresses = pd.DataFrame([hit.raw['address'] for hit in reverse_hits],
                         index=reverse_hits.index)
addresses.head()

Unnamed: 0_level_0,county,state,ISO3166-2-lvl4,country,country_code,road,neighbourhood,city,postcode,amenity,...,archipelago,subward,ward,square,allotments,landuse,community,zone,ISO3166-2-lvl10,banner
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1599787232,Decatur County,Kansas,US-KS,United States,us,,,,,,...,,,,,,,,,,
1549733678233837568,Greater Bridgeport COG,Connecticut,US-CT,United States,us,Governor John Davis Lodge Turnpike,Bassickville Historic District,Bridgeport,6601.0,,...,,,,,,,,,,
1586767903973511168,Riverside County,California,US-CA,United States,us,Kalmia Street,,Murrieta,92562.0,,...,,,,,,,,,,
1578080496906641408,Adams County,Colorado,US-CO,United States,us,Civic Center Drive,,Thornton,80229.0,Thornton Civic Center,...,,,,,,,,,,
1550149641823469568,Decatur County,Kansas,US-KS,United States,us,,,,,,...,,,,,,,,,,


In [67]:
# merge with the original dataframe on the index.
# pd.merge defaults to an inner join, so users without a row in `addresses`
# are dropped from `locations` here.
# NOTE(review): this rebinds `locations`, so running the cell twice merges the
# address columns a second time -- re-run from the top instead.
locations = pd.merge(locations, addresses, left_index=True, right_index=True)

In [68]:
locations[['location_str', 'country_code']]

Unnamed: 0_level_0,location_str,country_code
id,Unnamed: 1_level_1,Unnamed: 2_level_1
10285,"Orlando, FL",us
451943,Perugia - Italy,it
614543,Biella - Italy,it
687203,London-ish,al
785595,"Dallas, Texas",us
...,...,...
1613973600650027008,"Fresno, CA",us
1613993985361727488,"Bandung, Jawa Barat",id
1614036014535680000,Egypt,eg
1614049376480935936,ATX,kz


Some errors appear bizarre and are hard to correct: in the table above, "London-ish" resolves to Albania (`al`) and "ATX" to Kazakhstan (`kz`).

In [71]:
locations.loc[687203].location_obj.raw

{'place_id': 52010716,
 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright',
 'osm_type': 'node',
 'osm_id': 4602247639,
 'boundingbox': ['41.3311355', '41.3312355', '19.8173253', '19.8174253'],
 'lat': '41.3311855',
 'lon': '19.8173753',
 'display_name': 'London, Bulevardi Zogu i Parë, Qendra, Njësia Bashkiake Nr. 9, Tiranë, Bashkia Tiranë, Qarku i Tiranës, Shqipëria Qendrore, 1005, Shqipëria',
 'class': 'amenity',
 'type': 'cafe',
 'importance': 0.11000999999999997,
 'icon': 'https://nominatim.openstreetmap.org/ui/mapicons/food_cafe.p.20.png'}

In [75]:
def latlong(location_obj):
    """
    extracts the coordinates stored in a geocoding result
    :param location_obj: object whose `raw` mapping has the keys 'lat' and 'lon'
    :return: a Series with the entries `lat` and `lon`, taken unchanged from `raw`;
             both are None when the lookup raises `AttributeError`
             (e.g. `location_obj` is None or NaN and has no `raw`)
    """
    try:
        raw = location_obj.raw
        coords = {'lat':raw['lat'], 'lon':raw['lon']}
    except AttributeError:
        coords = {'lat':None, 'lon':None}
    return pd.Series(coords)

locations = pd.merge(locations, locations.location_obj.apply(latlong), left_index=True, right_index=True)

In [76]:
# save data
os.makedirs(DATA_PATH, exist_ok=True)

# columns of the compact export
country_code_cols = ['location_str', 'country_code']
# columns of the detailed export: selected address fields plus coordinates
address_cols = [
    'location_str', 'railway',
    'road', 'suburb', 'borough', 'city', 'ISO3166-2-lvl4', 'postcode',
    'country', 'country_code', 'village', 'municipality', 'county',
    'ISO3166-2-lvl6', 'state', 'region', 'subdistrict', 'house_number',
    'tourism', 'neighbourhood', 'ISO3166-2-lvl8', 'state_district',
    'quarter', 'city_district', 'lat', 'lon',
]

locations[country_code_cols].to_csv(os.path.join(DATA_PATH, 'bio_country_codes.csv'))
locations[address_cols].to_csv(os.path.join(DATA_PATH, 'bio_locations.csv'))
# the pickle keeps every column, including the geocoder result objects
locations.to_pickle(os.path.join(DATA_PATH, 'bio_locations.pkl'))


In [79]:
locations.groupby('country_code').size().sort_values(ascending=False)

country_code
us    17169
gb     2261
ca     1078
it      843
in      821
      ...  
bt        1
mc        1
bz        1
mg        1
cv        1
Length: 177, dtype: int64

In [83]:
locations[locations.country_code=='us'].groupby('state').size().sort_values(ascending=False)

state
Georgia                         2986
Kansas                          2907
California                      1928
Florida                         1609
Massachusetts                    742
New York                         719
North Carolina                   361
New Jersey                       361
Ohio                             343
Arizona                          337
Illinois                         315
Pennsylvania                     272
Nevada                           260
Tennessee                        230
Michigan                         191
District of Columbia             182
Oregon                           164
Washington                       160
Indiana                          155
Virginia                         147
Missouri                         147
Colorado                         146
Utah                             143
Alabama                          132
Maryland                         130
Oklahoma                         130
Wisconsin                       