# Geocoding Data
After we have got data from Step 1, please procceed with making sure that there is one source of truth about location: different locations registered on Twitter users are in the most the same city or prefecture. The task is to union all of them to one.

In [None]:
import pandas as pd
from core.geocoder import process_locations
from core.utils import convert_to_datetime
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

df = convert_to_datetime(pd.read_excel('./src/data/2009_elections.xlsx'))
print(f'📊 {len(df)} tweets to process')

location_df = df[df['user_location'].notna()]
print(f'📊 {len(location_df)} not null locations tweets to process\n---')
print(f'📊 {len(location_df["user_location"].unique())} unique locations to process')

In [None]:
location_df['user_location'] = location_df['user_location'].str.strip()
location_df['user_location'] = location_df['user_location'].str.lower()
location_df['user_location'] = location_df['user_location'].str.replace('iphone: ', '')
location_df['user_location'] = location_df['user_location'].str.replace('⇄', '')
location_df['user_location'] = location_df['user_location'].str.replace('⇔', '')
location_df['user_location'] = location_df['user_location'].str.replace('⇔', '')
location_df['user_location'] = location_df['user_location'].str.replace('→', '')
location_df['user_location'] = location_df['user_location'].str.strip()

In [None]:
result_dict, errors_loc_list = process_locations(location_df, num_threads=3)
# save it to file in mappings folder
#...

# Solving mistakes and errors
Loading file

In [33]:
import json
import googlemaps

with open('./src/mappings/raw/2009_elections_loc_mapping.json', 'r', encoding='utf-8') as f:
    loc_mapping = json.load(f)

with open('./src/mappings/raw/2009_elections_loc_mapping_errors.txt', 'r', encoding='utf-8') as f:
    loc_mapping_errors = f.read().splitlines()
    print(f'📊 {len(loc_mapping_errors)} errors to process')

📊 4439 errors to process


In [None]:
from dotenv import load_dotenv
import os 

load_dotenv()  # take environment variables from .env.

gmaps = googlemaps.Client(key=os.getenv('GOOGLEMAPS_API_KEY'))

geocoded_errors = {}

for error in loc_mapping_errors:
    try:
        geocode_result = gmaps.geocode(error)
        if geocode_result:
            geocoded_errors[error] = geocode_result[0]['formatted_address']
        else:
            geocoded_errors[error] = None
    except Exception as e:
        print(f"An error occurred while geocoding '{error}': {e}")
        geocoded_errors[error] = 'Error'

print(geocoded_errors)

An error occurred while geocoding '': HTTP Error: 400
{'四谷湯河原千里中央': 'Yugawara, Ashigarashimo District, Kanagawa, Japan', 'hakodate,tokyo ....etc,japan': 'Hakodate, Hokkaido, Japan', 'ウェブの方からきました': None, '日本のどこか': 'Japan', 'のびのびしてていいよ♡': None, '化外の地': None, 'さにわ8年目/思い出以外の訃報は基本rt無/日常はインスタ': None, 'たぶん？東京': 'Tokyo, Japan', 'kyoto,osaka,inagi': 'Osaka, Japan', '滋賀県 shiga-pref.': 'Shiga, Japan', 'kusatsu-shi, shiga-pref, japan': 'Kusatsu, Shiga, Japan', '名古屋あたりをうろうろ': 'Nagoya, Aichi, Japan', '妖怪の山': None, '藤沢市片瀬海岸か市ヶ谷か': 'Katasekaigan, Fujisawa, Kanagawa 251-0035, Japan', 'munchen酩酊街': 'Munich, Germany', 'near nagoya': 'Nagoya, Aichi, Japan', '東京都千代田区神田神保町2-11-15 住友商事神保町ビル2f': 'Japan, 〒101-0051 Tokyo, Chiyoda City, Kanda Jinbōchō, 2-chōme−11−５ 住友商事神保町ビル 2f', 'あの日みたいにあの場所': None, 'コンサドーレ村': None, 'japan/日本 関東南部': 'Nanbu, Narita, Chiba 286-0806, Japan', 'west of tokyo, kanagawa pref.': 'Kanagawa Ward, Yokohama, Kanagawa, Japan', '大手町 川崎 等々力 ライブ会場や握手会': 'Todoroki, Nakahara Ward, Kawasaki, Kana

In [None]:
# save it to file in mappings folder
import json 
with open('./src/mappings/2009_elections_loc_mapping_errors_solved.json', 'w', encoding='utf-8') as f:
    json.dump(geocoded_errors, f, ensure_ascii=False, indent=4)

In [42]:
with open('./src/mappings/raw/2009_elections_loc_mapping_errors_solved.json', 'r', encoding='utf-8') as f:
    solved_errors = json.load(f)

# get all None values
remaining_errors = {k: v for k, v in solved_errors.items() if v is None}
print(f'📊 {len(remaining_errors)} errors to process')
remaining_errors

📊 2302 errors to process


{'ウェブの方からきました': None,
 'のびのびしてていいよ♡': None,
 '化外の地': None,
 'さにわ8年目/思い出以外の訃報は基本rt無/日常はインスタ': None,
 '妖怪の山': None,
 'あの日みたいにあの場所': None,
 'コンサドーレ村': None,
 '病院のベッド': None,
 'ｅｎｊ地方スモックタウンばんそうこタワー': None,
 '未来と過去の間': None,
 '示せ風呂の力を。示せ空気の力を。新型コロナは空気の問題です。': None,
 'バナッハ空間': None,
 '会社のトイレ': None,
 'おおおたくく': None,
 '原発の60キロ圏内らしい': None,
 'land des feuers': None,
 'ここにいるよ。': None,
 '定期券が必要ない範囲で暮らしてます': None,
 'もっとも雨が多く降るとされている地点': None,
 'tokyoという片隅': None,
 '淡口醤油の国': None,
 'localhost:8080': None,
 '多分、天邪鬼な県': None,
 '有閑喫茶あにまーれ': None,
 'mi6サーカスの会議室': None,
 'ためされるだいち': None,
 '眠らない場所、職場': None,
 'デジモンワールド': None,
 'アブダルセンチャンインカ': None,
 'そこら辺り': None,
 '＊ようもうのなかにいる＊': None,
 '君の右どなり': None,
 '好きなブシェミ：コン・エアー': None,
 'はぐれパソパラ軍': None,
 'achlußluss, listenbourg': None,
 '惑星ベジータ山手線沿線': None,
 'あそこらへん': None,
 'そのだけいば\u3000ひめじけいば': None,
 'エオルゼア': None,
 'ラストフロア': None,
 '積ん読山のふもと': None,
 '武州多摩郡某村。': None,
 'スパッツは現在。スパッツは未来。スパッツは跳躍26次元。': None,
 '「周囲皆窮鼠」な場所': None,
 'はてラボ': None,
 '偵都ヨコハマ': 

# Unified mapping

In [36]:
import json

with open('./src/mappings/raw/2009_elections_loc_mapping_errors_solved.json', 'r', encoding='utf-8') as f:
    solved_errors = json.load(f)
    print(f'📊 {len(solved_errors)} errors')
    solved_errors = {k: v for k, v in solved_errors.items() if v is not None}
    print(f'📊 {len(solved_errors)} solved errors')

with open('./src/mappings/raw/2009_elections_loc_mapping.json', 'r', encoding='utf-8') as f:
    loc_mapping = json.load(f)
    print(f'📊 {len(loc_mapping)} locations in mapping')
    loc_mapping = {k: v for k, v in loc_mapping.items() if v is not None}
    print(f'📊 {len(loc_mapping)} not-null locations in mapping')

📊 4436 errors
📊 2134 solved errors
📊 9208 locations in mapping
📊 4776 not-null locations in mapping


In [37]:
# merge the two dictionaries
loc_mapping.update(solved_errors)
print(f'📊 {len(loc_mapping)} not-null locations in mapping after merging')

📊 6910 not-null locations in mapping after merging


In [39]:
# Save it to file
import json
with open('./src/mappings/2009_loc_map.json', 'w', encoding='utf-8') as f:
    json.dump(loc_mapping, f, ensure_ascii=False, indent=4)