In [1]:
import geopandas as gpd
from shapely.geometry import Point
import pandas as pd
from thefuzz import process
import re


# Root of the project data tree and the sub-tree holding the numbered
# clean-up stages (7_geojson_review, 8_geojson_adm_levels, ...).
DATA_DIR = "/Users/tlahtolli/dev/drone_warfare/data/"
CLEANUP_DIR = "/Users/tlahtolli/dev/drone_warfare/data/cleanup/"

# Three-letter country codes processed by the cells below; they also appear
# in the geoBoundaries file names and the joined CSV file names.
countries = ['AFG', 'PAK', 'SOM', 'YEM']

# Tab-separated NGA administrative-regions table.
admin_nga = pd.read_csv(
    f'{DATA_DIR}/nga/Administrative_Regions/Administrative_Regions.txt',
    sep='\t',
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [2]:
# Three-letter country code -> two-letter code used as the prefix of the
# review CSV file names ("<code>_<level>_review.csv").
geojson_country_code = dict(
    AFG='AF',
    PAK='PK',
    SOM='SO',
    YEM='YE',
)

# Admin-level label used in the review files -> level token used in the
# geoBoundaries GeoJSON file names.
geojson_level = dict(
    Adm_0='ADM0',
    Adm_1='ADM1',
    Adm_2='ADM2',
    Adm_3='ADM3',
)

def map_columns(level):
    """Build the column-rename mapping for one admin level.

    Args:
        level: Level label (e.g. 'Adm_1', 'Loc') embedded in the
            ``matched_sort_<level>...`` column names of the review data.

    Returns:
        dict mapping original column names to the short output names,
        suitable for ``DataFrame.rename(columns=...)``.
    """
    prefix = f'matched_sort_{level}'

    # The matched name itself becomes the sort key column.
    renames = {prefix: 'sort_name'}

    # Strike / casualty columns are independent of the level.
    renames.update({
        'Strike ID': 'strike_count',
        'Date': 'dates',
        'Minimum total people killed': 'min_total',
        'Maximum total people killed': 'max_total',
        'Minimum civilians reported killed': 'min_civilians',
        'Maximum civilians reported killed': 'max_civilians',
        'Minimum children reported killed': 'min_children',
        'Maximum children reported killed': 'max_children',
        'Minimum reported injured': 'min_injured',
        'Maximum reported injured': 'max_injured',
    })

    # Level-specific attribute columns keep their suffix as the new name.
    for suffix in ('ufi', 'adm1', 'full_name', 'full_nm_nd', 'lat_dd', 'long_dd'):
        renames[f'{prefix}_{suffix}'] = suffix

    return renames
            
def fix_dates_to_ISO(dates_str):
    """Convert a stringified list of day/month/year dates to ISO order.

    Args:
        dates_str: Text form of a list of ``D/M/Y`` dates, e.g.
            ``"['01/02/2015', '13/11/2016']"``. Brackets, single quotes and
            spaces are stripped before the string is split on commas.

    Returns:
        ``str()`` of a list of ``YYYY-MM-DD`` strings, e.g.
        ``"['2015-02-01', '2016-11-13']"``. An empty list string yields
        ``"[]"``.

    Raises:
        IndexError: if a non-empty entry has fewer than three
            ``/``-separated parts.
    """
    cleaned = dates_str
    for junk in ('[', ']', ' ', '\''):
        cleaned = cleaned.replace(junk, '')

    def fix_date(date):
        parts = date.split('/')
        # Zero-pad day and month so the result is valid ISO-8601 even when
        # the input was written as e.g. 1/2/2015; padded input is unchanged.
        return f'{parts[2]}-{parts[1].zfill(2)}-{parts[0].zfill(2)}'

    # Skip empty tokens: "[]" (or a trailing comma) produces '' after the
    # split, which would otherwise fail when indexing the date parts.
    return str([fix_date(date) for date in cleaned.split(',') if date])

# For every country and admin level, combine the reviewed strike data with
# geometry and write "<country>_<level>_joined.csv" to 9_geojson_review:
#   * Loc   -> one point per row, built from the matched lat/long columns
#   * Adm_0 -> the country polygon, attached via the 'shapeISO' column
#   * other -> boundary polygons spatially joined with the matched points
# NOTE(review): only the spatial-join branch converts 'Date' to ISO order;
# the Loc and Adm_0 branches leave it untouched -- confirm this is intended.
for country in countries:
    print(f'Processing {country}')
    levels = ['Adm_0', 'Adm_1', 'Adm_2', 'Loc']
    if country == 'PAK':
        levels.append('Adm_3')

    # Review files are named with the two-letter code, outputs with the
    # three-letter one.
    old_code = geojson_country_code[country]

    for level in levels:
        review_csv = f'{CLEANUP_DIR}/7_geojson_review/{old_code}_{level}_review.csv'
        try:
            df = pd.read_csv(review_csv)
        except FileNotFoundError:
            # A missing review file means there is nothing to do for this
            # level. Any other failure (parse error, permissions, ...) is a
            # real problem and is allowed to propagate.
            print(f'  Skipping {level}: {review_csv} not found')
            continue

        if level == 'Loc':
            # shapely Points are (x, y) == (longitude, latitude).
            df['geometry'] = df.apply(
                lambda row: Point(row[f'matched_sort_{level}_long_dd'], row[f'matched_sort_{level}_lat_dd']),
                axis=1,
            )
            gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326")
            result = gdf.rename(columns=map_columns(level))
        else:
            geo_level = geojson_level[level]
            geojson_file = f'{CLEANUP_DIR}/8_geojson_adm_levels/geoBoundaries-{country}-{geo_level}_simplified.geojson'
            geoboundaries_geo_df = gpd.read_file(geojson_file)
            # Normalise the name column for files that carry 'PROV_34_NA'
            # instead of 'shapeName'; a no-op when that column is absent.
            geoboundaries_geo_df = geoboundaries_geo_df.rename(columns={'PROV_34_NA': 'shapeName'})

            if level == 'Adm_0':
                df['shapeISO'] = country
                # Merge DataFrames on 'shapeISO' column
                df = df.merge(geoboundaries_geo_df[['shapeISO', 'geometry']], on='shapeISO', how='left')
                result = df.rename(columns=map_columns(level))
                result = gpd.GeoDataFrame(result, geometry='geometry', crs="EPSG:4326")
            else:
                df['geometry'] = df.apply(
                    lambda row: Point(row[f'matched_sort_{level}_long_dd'], row[f'matched_sort_{level}_lat_dd']),
                    axis=1,
                )
                gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326")
                # Left join keeps every boundary polygon, including those
                # that contain no matched point. `predicate` replaces the
                # deprecated `op` keyword.
                result = gpd.sjoin(geoboundaries_geo_df, gdf, how='left', predicate='contains')

                # Fix dates
                result['Date'] = result['Date'].apply(lambda x: fix_dates_to_ISO(x) if pd.notna(x) else x)

                # Clean up the DataFrame
                result = result.rename(columns=map_columns(level))

                # Polygons without a matched point have NaN in the strike
                # columns: use 0 for the count and '[]' for the others.
                result['strike_count'] = result['strike_count'].fillna(0)
                list_columns = ['dates', 'min_total', 'max_total', 'min_civilians', 'max_civilians', 'min_children', 'max_children', 'min_injured', 'max_injured']
                for col in list_columns:
                    result[col] = result[col].apply(lambda x: '[]' if pd.isna(x) else x)
        result.to_csv(f'{CLEANUP_DIR}/9_geojson_review/{country}_{level}_joined.csv', index=False)

Processing AFG


  if await self.run_code(code, result, async_=asy):
  if await self.run_code(code, result, async_=asy):


Processing PAK


  if await self.run_code(code, result, async_=asy):
  if await self.run_code(code, result, async_=asy):
  if await self.run_code(code, result, async_=asy):
  if await self.run_code(code, result, async_=asy):
  if await self.run_code(code, result, async_=asy):


Processing SOM
Processing YEM


  if await self.run_code(code, result, async_=asy):
  if await self.run_code(code, result, async_=asy):


In [15]:
import os

# Convert every "<country>_<level>_joined.csv" in 9_geojson_review into a
# GeoJSON file, written to both 10_geojson_output and DATA_DIR/geojson.
joined_dir = f'{CLEANUP_DIR}/9_geojson_review/'

# Columns to drop if they exist in the DataFrame
columns_to_drop = ['Level', 'reviewed', 'options', 'index_right', 'Unnamed: 0', 'ufi', 'adm1', 'full_name', 'full_nm_nd', 'lat_dd', 'long_dd', 'sort_name', 'OBJECTID_1']

for filename in os.listdir(joined_dir):
    # Match on the extension only, not on any name that contains "csv".
    if not filename.endswith('.csv'):
        continue
    print(filename)

    # os.listdir returns bare entry names, so resolve them against the
    # listed directory instead of relying on the kernel's working directory.
    df = pd.read_csv(os.path.join(joined_dir, filename))

    # The CSV stores geometry as WKT text; parse it back into geometries.
    gdf = gpd.GeoDataFrame(
        df.loc[:, [c for c in df.columns if c != "geometry"]],
        geometry=gpd.GeoSeries.from_wkt(df["geometry"]),
        crs="epsg:4326",
    )

    # errors='ignore' skips names that are not present in this file.
    gdf = gdf.drop(columns=columns_to_drop, errors='ignore')

    # "<country>_<level>_joined.csv" -> "<country>_<level>"
    output_filename = filename.split('_joined')[0]
    gdf.to_file(f'{CLEANUP_DIR}/10_geojson_output/{output_filename}.geojson', driver='GeoJSON')
    gdf.to_file(f'{DATA_DIR}/geojson/{output_filename}.geojson', driver='GeoJSON')


PAK_Adm_3_joined.csv
AFG_Adm_0_joined.csv
SOM_Loc_joined.csv
PAK_Adm_1_joined.csv
AFG_Adm_2_joined.csv
YEM_Adm_1_joined.csv
SOM_Adm_1_joined.csv
SOM_Adm_2_joined.csv
AFG_Adm_1_joined.csv
YEM_Adm_2_joined.csv
PAK_Adm_2_joined.csv
YEM_Adm_0_joined.csv
PAK_Loc_joined.csv
SOM_Adm_0_joined.csv
AFG_Loc_joined.csv
PAK_Adm_0_joined.csv
YEM_Loc_joined.csv
