# Geocoding Data
After obtaining the data in Step 1, proceed by making sure there is a single source of truth for location: the different location strings registered by Twitter users mostly refer to the same city or prefecture. The task is to unify them into one canonical location.

In [1]:
import pandas as pd
from core.geocoder import process_locations_gmaps
from core.utils import convert_to_datetime
import warnings
# Silence the FutureWarning and UserWarning categories for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

# Load the tweet export and pass it through the project's datetime conversion helper.
df = convert_to_datetime(pd.read_excel('./src/data/2009_elections.xlsx'))
print(f'📊 {len(df)} tweets to process')

# Keep only tweets that carry a user location. The explicit .copy() makes
# location_df an independent frame, so the column assignments performed in the
# cleaning step do not operate on a slice of df (SettingWithCopyWarning).
location_df = df[df['user_location'].notna()].copy()
print(f'📊 {len(location_df)} not null locations tweets to process\n---')
# Missing values were filtered out above, so nunique() equals len(unique()).
print(f'📊 {location_df["user_location"].nunique()} unique locations to process')

📅 Converted tweet_date to datetime
📊 103519 tweets to process
📊 81183 not null locations tweets to process
---
📊 9473 unique locations to process


In [None]:
# Substrings stripped from every location, applied in this order
# ('iphone: ' must precede 'iphone' so the trailing ': ' is removed as well).
noise_tokens = ('iphone: ', 'iphone', '⇄', '⇔', '→')

# Normalise whitespace and case first, then remove the noise tokens literally
# (regex=False keeps the behaviour independent of the pandas version default).
cleaned_location = location_df['user_location'].str.strip().str.lower()
for token in noise_tokens:
    cleaned_location = cleaned_location.str.replace(token, '', regex=False)
cleaned_location = cleaned_location.str.strip()

# assign() returns a new frame instead of writing into a slice of df,
# which avoids SettingWithCopyWarning and keeps the cell safe to re-run.
location_df = location_df.assign(user_location=cleaned_location)

# Drop rows for which the string accessor produced a missing value.
# NOTE(review): values reduced to an empty string '' are NOT missing and are
# kept here; consider filtering them out before the billed geocoding step.
location_df = location_df[location_df['user_location'].notna()]
len(location_df['user_location'].unique())

## Run API calls (warning: billing intensive)

In [None]:
import json

# Billing-intensive step: sends the cleaned locations to the geocoder helper.
# The helper's return value is parsed as JSON text below (original note: the
# output encoding defaults to UTF-8).
geocoded_output = process_locations_gmaps(location_df)

to_export = json.loads(geocoded_output)
# Write into mappings/raw/, which is where the filtering step loads the
# un-filtered geocoder output from; the explicit encoding avoids depending on
# the platform default.
with open('./src/mappings/raw/2009_loc_map_gmaps.json', 'w', encoding='utf-8') as f:
    json.dump(to_export, f, indent=4)

## Filtering

In [3]:
# json is used by the export at the end of this cell; import it here so the
# cell does not depend on the billing-intensive API cell having been run.
import json

from core.utils import load_json

# Load the raw (un-filtered) geocoder output produced by the API step.
output = load_json('src/mappings/raw/2009_loc_map_gmaps.json')

def filter_japan_only(json_data):
    """Keep only the geocoding results that resolve to Japan.

    An entry is kept when its value is a dict whose "address_components"
    list contains at least one component with ``short_name == "JP"`` and
    ``"country"`` among its ``"types"``.

    Args:
        json_data: Mapping of key -> geocoding result. Values that are not
            dicts, or that have no (or empty) "address_components", are
            skipped.

    Returns:
        dict: A new dict holding the matching entries; the values are the
        same objects as in ``json_data`` (not copied).
    """
    def _is_japan_country(component):
        # Malformed components (non-dict, or "types" missing / null) are
        # treated as "not Japan" instead of raising.
        if not isinstance(component, dict):
            return False
        component_types = component.get("types") or ()
        return component.get("short_name") == "JP" and "country" in component_types

    filtered_data = {}
    for key, value in json_data.items():
        if not isinstance(value, dict):
            continue
        address_components = value.get("address_components")
        if not address_components:
            continue
        if any(_is_japan_country(component) for component in address_components):
            filtered_data[key] = value
    return filtered_data


# Restrict the geocoded mapping to entries resolved to Japan and report the count.
loaded = filter_japan_only(output)
print(f'📊 {len(loaded)} filtered by JP geocoded locations')

# Persist the filtered mapping; ensure_ascii=False writes non-ASCII characters
# as-is, hence the explicit UTF-8 file encoding.
jp_map_path = './src/mappings/2009_loc_map_gmaps_jp.json'
with open(jp_map_path, mode='w', encoding='utf-8') as jp_file:
    json.dump(loaded, jp_file, indent=4, ensure_ascii=False)

📄 Loaded src/mappings/raw/2009_loc_map_gmaps.json as JSON | Length: 9225
📊 5336 filtered by JP geocoded locations
