In [39]:
import geopandas as gpd
from shapely.geometry import Point
import pandas as pd
from thefuzz import process
import re


# Root of the cleanup pipeline's CSV folders; the processing loop reads from
# "5_nga_aggregate" and writes to "6_geojson" and "7_geojson_review" under it.
DIR = "/Users/tlahtolli/dev/drone_warfare/data/cleanup"
# Root of the geoBoundaries GeoJSON files, one sub-folder per three-letter
# country code.
GEOJSON_DIR = "/Users/tlahtolli/dev/drone_warfare/data/geoboundaries"

# Two-letter country codes to process; each one must have an entry in
# geojson_country_code.
countries_list = ['AF', 'PK', 'SO', 'YE']

In [57]:
# Two-letter country code -> three-letter code used in the geoBoundaries
# folder and file names.
geojson_country_code = dict([
    ('AF', 'AFG'),
    ('PK', 'PAK'),
    ('SO', 'SOM'),
    ('YE', 'YEM'),
])

# Admin-level label used in the aggregate CSV names -> level tag used in the
# geoBoundaries file names.
geojson_level = dict([
    ('Adm_0', 'ADM0'),
    ('Adm_1', 'ADM1'),
    ('Adm_2', 'ADM2'),
    ('Adm_3', 'ADM3'),
    ('Adm_4', 'ADM4'),
])

def create_sort_name(name):
    """Build a normalized comparison key from a place name.

    Every character that is not an ASCII letter or digit is removed and the
    remainder is upper-cased, e.g. ``"Sa'dah Province"`` becomes
    ``"SADAHPROVINCE"``.

    Args:
        name: The place name as a string.

    Returns:
        The upper-cased key, containing only ``A-Z`` and ``0-9``.
    """
    # Whitespace is dropped by the pattern itself. The earlier version kept all
    # whitespace in the regex and then removed only ' ', so tabs, newlines and
    # non-breaking spaces leaked into the key and broke exact comparisons.
    return re.sub(r'[^a-zA-Z0-9]', '', name).upper()

def fuzzy_search(query, choices):
    """Return thefuzz's candidate matches for ``query`` among ``choices``.

    This is a thin wrapper: the value of ``process.extract(query, choices)``,
    called with that function's default settings, is handed back unchanged.
    """
    return process.extract(query, choices)

# For every country: dump each admin level's geoBoundaries attributes to CSV
# and, below country level, attach fuzzy-match candidates to the aggregated
# names so they can be reviewed by hand.
for country in countries_list:
    print(f'Processing {country}')
    levels = ['Adm_0', 'Adm_1', 'Adm_2', 'Loc']
    if country == 'PK':
        # PK additionally carries two deeper admin levels.
        levels += ['Adm_3', 'Adm_4']

    geo_code = geojson_country_code[country]

    for level in levels:
        # 'Loc' and 'Adm_4' are never matched against a boundary file. Skip
        # them before touching the disk: loading their aggregate CSV only to
        # discard it wasted I/O and made the run depend on unused files.
        if level in ('Loc', 'Adm_4'):
            continue

        geo_level = geojson_level[level]

        geojson_file = (
            f'{GEOJSON_DIR}/{geo_code}/'
            f'geoBoundaries-{geo_code}-{geo_level}_simplified.geojson'
        )
        gdf_geojson = gpd.read_file(geojson_file)
        # Raw export of the boundary attributes, written before any renaming.
        gdf_geojson.to_csv(f'{DIR}/6_geojson/{country}_{level}_geojson.csv', index=False)

        if level == 'Adm_0':
            # Country level only needs the export above; no names to match,
            # so the aggregate CSV is not read for it either.
            continue

        if 'shapeName' not in gdf_geojson.columns:
            # Fallback for files whose name column is 'PROV_34_NA'.
            gdf_geojson = gdf_geojson.rename(columns={'PROV_34_NA': 'shapeName'})
        gdf_geojson['sort_name'] = gdf_geojson['shapeName'].apply(create_sort_name)

        # Read the aggregate only now that it is known to be needed.
        df = pd.read_csv(f'{DIR}/5_nga_aggregate/{country}_{level}_agg.csv')

        # Every row is matched against the same candidate names, so look the
        # column up once instead of once per row.
        candidates = gdf_geojson['sort_name']
        df['options'] = df[f'matched_sort_{level}'].apply(
            lambda sort_name: fuzzy_search(sort_name, candidates)
        )
        # The index is kept in the review file (unlike the raw export above).
        df.to_csv(f'{DIR}/7_geojson_review/{country}_{level}_review.csv', index=True)

Processing AF
Processing PK
Processing SO
Processing YE
