In [30]:
import geopandas as gpd
from shapely.geometry import Point
import pandas as pd
from thefuzz import process
import re


# Working directory of the cleanup pipeline; the export loop reads the
# reviewed CSVs from and writes the GeoJSON output to sub-folders of it.
DIR = '/Users/tlahtolli/dev/drone_warfare/data/cleanup'

# Folder holding the geoBoundaries GeoJSON files, one sub-folder per country.
GEOJSON_DIR = '/Users/tlahtolli/dev/drone_warfare/data/geoboundaries'

# Two-letter codes of the countries to process, in processing order.
countries_list = [
    'AF',
    'PK',
    'SO',
    'YE',
]

In [33]:
# Two-letter country code -> three-letter code used in the geoBoundaries
# folder and file names.
geojson_country_code = {
    'AF': 'AFG',
    'PK': 'PAK',
    'SO': 'SOM',
    'YE': 'YEM',
}

# Level label used in this project's file names ('Adm_0' .. 'Adm_3') ->
# level label used in the geoBoundaries file names ('ADM0' .. 'ADM3').
geojson_level = {f'Adm_{depth}': f'ADM{depth}' for depth in range(4)}

def create_sort_name(name):
    """Return *name* reduced to an upper-case key for name matching.

    Every character that is not an ASCII letter, a digit or whitespace is
    removed, plain spaces are then dropped, and the result is upper-cased.
    Whitespace other than the space character (tabs, newlines) is kept.
    """
    stripped = re.sub(r'[^a-zA-Z0-9\s]', '', name)
    without_spaces = stripped.replace(' ', '')
    return without_spaces.upper()

def fuzzy_search(query, choices, threshold=80):
    """Return the entry of *choices* closest to *query*, or *query* itself.

    Args:
        query: The string to look up.
        choices: Candidate strings handed to ``process.extractOne``.
        threshold: Minimum score the best candidate must reach to be
            accepted. Defaults to 80.

    Returns:
        The best-scoring candidate when its score is at least *threshold*;
        otherwise *query* unchanged. *query* is also returned when
        ``extractOne`` reports no match at all.
    """
    match = process.extractOne(query, choices)

    # extractOne can return None instead of a result tuple (e.g. when there
    # are no choices to compare against); indexing it would raise TypeError.
    if match is None:
        return query

    # Index instead of unpacking: the result has a third element (the key)
    # when the choices are dict-like.
    best_choice, score = match[0], match[1]
    return best_choice if score >= threshold else query

def map_columns(level):
    """Build the column-rename mapping for one administrative level.

    The mapping sends the level-specific ``matched_sort_<level>...`` columns
    and the long casualty column headers to short output names.

    Args:
        level: Level label as it appears in the column names, e.g. 'Adm_1'.

    Returns:
        A dict of ``{original column name: output column name}``.
    """
    prefix = f'matched_sort_{level}'

    renames = {prefix: 'sort_name'}

    # Columns whose names do not depend on the level.
    renames.update({
        'Strike ID': 'strike_count',
        'Date': 'dates',
        'Minimum total people killed': 'min_total_killed',
        'Maximum total people killed': 'max_total_killed',
        'Minimum civilians reported killed': 'min_civilians_killed',
        'Maximum civilians reported killed': 'max_civilians_killed',
        'Minimum children reported killed': 'min_children_killed',
        'Maximum children reported killed': 'max_children_killed',
        'Minimum reported injured': 'min_injured',
        'Maximum reported injured': 'max_injured',
    })

    # Level-prefixed columns keep their suffix as the output name.
    for suffix in ('ufi', 'adm1', 'full_name', 'full_nm_nd', 'lat_dd', 'long_dd'):
        renames[f'{prefix}_{suffix}'] = suffix

    return renames

def get_reviewed(row):
    """Pick the name to join on for one review row.

    Args:
        row: A mapping-like row with 'reviewed' and 'shapeName_sort' entries.

    Returns:
        ``row['reviewed']`` when it holds a usable override, i.e. it is not
        missing, not a single space and not the marker 'unclear'; otherwise
        ``row['shapeName_sort']``.
    """
    reviewed = row['reviewed']

    # Test for missing values first so they never reach the comparisons.
    if pd.notna(reviewed) and reviewed not in (' ', 'unclear'):
        # The override must be returned explicitly; without the return the
        # function falls through and yields None for every reviewed row.
        return reviewed

    return row['shapeName_sort']
            
# Write one GeoJSON file per country and administrative level.
#
# Adm_0 is the country boundary file re-saved as-is. For the other levels the
# reviewed CSV is joined to the boundary polygons on a normalised name key.
for country in countries_list:
    print(f'Processing {country}')

    # Adm_3 is only processed for 'PK'.
    levels = ['Adm_0', 'Adm_1', 'Adm_2']
    if country == 'PK':
        levels.append('Adm_3')

    geo_code = geojson_country_code[country]

    for level in levels:
        output_geojson = f'{DIR}/8_geojson_output/{country}_{level}.geojson'

        if level == 'Adm_0':
            gdf = gpd.read_file(f'{GEOJSON_DIR}/{geo_code}/geoBoundaries-{geo_code}-ADM0_simplified.geojson')
            gdf_cols = map_columns(level)
            gdf = gdf.rename(columns=gdf_cols)
            gdf.to_file(output_geojson, driver='GeoJSON')
            continue

        df = pd.read_csv(f'{DIR}/7_geojson_review/{country}_{level}_review.csv')

        # Point-based levels. Not reachable with the `levels` list built
        # above; kept for when 'Loc' / 'Adm_4' are added to it.
        if level in ('Loc', 'Adm_4'):
            # shapely points are (x, y), which is (longitude, latitude) for
            # EPSG:4326 data, so longitude has to come first.
            df['geometry'] = df.apply(
                lambda row: Point(
                    row[f'matched_sort_{level}_long_dd'],
                    row[f'matched_sort_{level}_lat_dd'],
                ),
                axis=1,
            )
            gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326")
            gdf_cols = map_columns(level)
            gdf = gdf.rename(columns=gdf_cols)
            gdf.to_file(output_geojson, driver='GeoJSON')
            continue

        geo_level = geojson_level[level]

        geojson_file = f'{GEOJSON_DIR}/{geo_code}/geoBoundaries-{geo_code}-{geo_level}_simplified.geojson'
        gdf_geojson = gpd.read_file(geojson_file)
        # A boundary file that names its shapes in 'PROV_34_NA' gets that
        # column renamed to 'shapeName'; other files are unaffected.
        gdf_geojson = gdf_geojson.rename(columns={'PROV_34_NA': 'shapeName'})

        # Adm_0 was handled (and skipped) above, so every level reaching this
        # point needs the name-based join.
        gdf_geojson['shapeName_sort'] = gdf_geojson['shapeName'].apply(create_sort_name)

        # Look the candidate names up once instead of once per CSV row.
        choices = gdf_geojson['shapeName_sort']
        df['shapeName_sort'] = df[f'matched_sort_{level}'].apply(lambda x: fuzzy_search(x, choices))
        # A usable manual 'reviewed' value takes precedence over the fuzzy match.
        df['shapeName_sort'] = df.apply(get_reviewed, axis=1)

        # Both join keys are cast to str so the merge compares like with like.
        df['shapeName_sort'] = df['shapeName_sort'].astype(str)
        gdf_geojson['shapeName_sort'] = gdf_geojson['shapeName_sort'].astype(str)
        print(level)

        # Clean up the DataFrame
        df = df.rename(columns=map_columns(level))
        df = df.drop(columns=['reviewed', 'options'])

        # Left merge keeps every CSV row; rows whose key matches no boundary
        # shape end up without a geometry.
        result = df.merge(gdf_geojson[['shapeName_sort', 'geometry']], on='shapeName_sort', how='left')

        # Convert the merged DataFrame to a GeoDataFrame
        result = gpd.GeoDataFrame(result, geometry='geometry', crs=gdf_geojson.crs)

        result.to_file(output_geojson, driver='GeoJSON')

Processing AF
Adm_1
Adm_2
Processing PK
Adm_1
Adm_2
Adm_3
Processing SO
Adm_1
Adm_2
Processing YE
Adm_1
Adm_2


'/Users/tlahtolli/dev/drone_warfare/data/cleanup/6_geojson'