In [272]:
import pandas as pd
from thefuzz import process

# Root directory prepended to every data path in this notebook (used as f'{DIR}/...').
# NOTE(review): with an empty string the resulting paths start at '/'; set this
# to the project data folder before running.
DIR = ""

In [74]:
# Per-country strike tables produced by the previous (2_cleanup) step.
af = pd.read_csv(f'{DIR}/2_cleanup/af.csv')
pk = pd.read_csv(f'{DIR}/2_cleanup/pk.csv')
so = pd.read_csv(f'{DIR}/2_cleanup/so.csv')
ye = pd.read_csv(f'{DIR}/2_cleanup/ye.csv')

# Per-country NGA place-name tables; these files are tab-separated, which is
# the default delimiter of read_table.
af_nga = pd.read_table(f'{DIR}/nga/Afghanistan/Afghanistan.txt')
pk_nga = pd.read_table(f'{DIR}/nga/Pakistan/Pakistan.txt')
so_nga = pd.read_table(f'{DIR}/nga/Somalia/Somalia.txt')
ye_nga = pd.read_table(f'{DIR}/nga/Yemen/Yemen.txt')

# NGA administrative-regions table (also tab-separated).
admin_nga = pd.read_table(f'{DIR}/nga/Administrative_Regions/Administrative_Regions.txt')

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [371]:
class LocationCleaner:
    """Normalize the location columns of a strike table against NGA place names.

    The raw table (``bij_df``) is never modified; all work happens on
    ``target_df``, a copy with snake_case column names. Raw location names are
    replaced by the best fuzzy match among NGA names (``full_nm_nd``), or by
    one of two sentinel labels:

    * ``'multiple'`` - the raw name contains one of ``MULTIPLE_KEYWORDS``;
    * ``'unclear'``  - the raw name is missing / not a string, contains one of
      ``UNCLEAR_KEYWORDS``, or has no NGA match scoring at least
      ``MATCH_THRESHOLD``.

    Typical use is ``LocationCleaner(bij, nga, admin).normalize_all()`` and
    then reading ``target_df``.
    """

    # Minimum score returned by thefuzz for a match to be accepted.
    MATCH_THRESHOLD = 75

    # Sentinel labels written in place of a location name.
    UNCLEAR = 'unclear'
    MULTIPLE = 'multiple'

    # Substrings (case-sensitive) that classify a raw name without matching.
    # NOTE(review): '-' marks every hyphenated name as unclear - confirm that
    # this is intended for names that legitimately contain a hyphen.
    MULTIPLE_KEYWORDS = (' and ', '/', 'Multiple', 'Across', 'border', 'Various')
    UNCLEAR_KEYWORDS = ('-', 'Unknown', 'Unclear', ' or ')

    # NGA attributes copied onto target_df as '<level>_<field>' columns.
    NGA_FIELDS = ('sort_name', 'full_name', 'lat_dd', 'long_dd')

    def __init__(self, bij_df, nga_df, admin_df):
        """Store the input tables and build ``target_df``.

        Args:
            bij_df: raw strike table; read for its location columns
                (e.g. ``'Adm_1'``) and copied into ``target_df``.
            nga_df: NGA place-name table; the methods read its ``desig_cd``,
                ``name_rank``, ``adm1``, ``full_nm_nd``, ``sort_name``,
                ``lat_dd`` and ``long_dd`` columns.
            admin_df: administrative-regions table. It is only stored; no
                method of this class reads it.
        """
        self.bij_df = bij_df
        self.nga_df = nga_df
        self.admin_df = admin_df
        self.target_df = self.create_target_df()

    def create_target_df(self):
        """Return a renamed copy of ``bij_df``; columns not listed are kept as-is."""
        column_mapping = {
            'Strike ID': 'strike_id',
            'Date': 'date',
            'Adm_1': 'adm_1',
            'Adm_2': 'adm_2',
            'Adm_3': 'adm_3',
            'Minimum total people killed': 'min_total_ppl',
            'Maximum total people killed': 'max_total_ppl',
            'Minimum civilians reported killed': 'min_civilians',
            'Maximum civilians reported killed': 'max_civilians',
            'Minimum children reported killed': 'min_children',
            'Maximum children reported killed': 'max_children',
            'Minimum reported injured': 'min_injured',
            'Maximum reported injured': 'max_injured'
        }
        # rename() returns a new frame, so bij_df itself is left untouched.
        return self.bij_df.rename(columns=column_mapping)

    def filter_valid_location(self, locations):
        """Drop missing values and sentinel labels from an iterable of names."""
        invalid = {'nan', 'None', self.UNCLEAR, self.MULTIPLE}
        return [x for x in locations if str(x) not in invalid]

    def create_nga_subset(self, desig_cd='ADM1', adm_1_code=None):
        """Return the primary-name (``name_rank == 1``) NGA rows for a designation.

        Args:
            desig_cd: value required in the ``desig_cd`` column.
            adm_1_code: when truthy, additionally require ``adm1`` to equal it.
        """
        filter_condition = (self.nga_df['desig_cd'] == desig_cd) & (self.nga_df['name_rank'] == 1)
        if adm_1_code:
            filter_condition &= (self.nga_df['adm1'] == adm_1_code)
        return self.nga_df[filter_condition]

    def get_unique_nga_locations(self, desig_cd='ADM1', adm_1_code=None):
        """Return the unique ``full_nm_nd`` values of ``create_nga_subset(...)``."""
        return self.create_nga_subset(desig_cd, adm_1_code)['full_nm_nd'].unique()

    def get_unique_bij_locations(self, bij_lvl='Adm_1'):
        """Return the unique raw values of column ``bij_lvl`` in ``bij_df``."""
        return self.bij_df[bij_lvl].unique()

    def get_unique_target_secondary_locations(self, target_lvl='adm_2', adm_1_code=None):
        """Return the valid, unique ``target_lvl`` names recorded under one adm_1 code.

        Requires the ``adm_1_code`` column, which ``add_nga_data`` creates for
        ``target_lvl='adm_1'``. Returns ``[]`` when ``adm_1_code`` is falsy.
        """
        if not adm_1_code:
            return []
        unique_secondary = self.target_df.loc[self.target_df['adm_1_code'] == adm_1_code, target_lvl].unique()
        return self.filter_valid_location(unique_secondary)

    def fuzzy_match(self, value, full_name):
        """Return the best match for ``value`` among ``full_name``, or ``'unclear'``.

        Args:
            value: raw location name. Anything that is not a string
                (e.g. NaN) yields ``'unclear'``.
            full_name: iterable of candidate names; non-string entries are
                ignored.

        Returns:
            The matched candidate when its score is at least
            ``MATCH_THRESHOLD``, otherwise ``'unclear'``.
        """
        if not isinstance(value, str):
            return self.UNCLEAR
        try:
            candidates = [name for name in full_name if isinstance(name, str)]
            # Nothing to compare against -> no match (skip the library call).
            match = process.extractOne(value, candidates) if candidates else None
        except (TypeError, ValueError):
            # Best effort: unusable candidates must not abort the whole run.
            return self.UNCLEAR
        if match is None or match[1] < self.MATCH_THRESHOLD:
            return self.UNCLEAR
        return match[0]

    def _keyword_label(self, name):
        """Return a sentinel label decided by keywords alone, or None if matching is needed."""
        if not isinstance(name, str):
            return self.UNCLEAR
        # 'multiple' takes precedence over 'unclear' when both kinds of keyword occur.
        if any(keyword in name for keyword in self.MULTIPLE_KEYWORDS):
            return self.MULTIPLE
        if any(keyword in name for keyword in self.UNCLEAR_KEYWORDS):
            return self.UNCLEAR
        return None

    def _resolve_name(self, name, nga_locations):
        """Map one raw name to a sentinel label or its fuzzy match."""
        label = self._keyword_label(name)
        # Only fuzzy-match names the keyword rules did not already decide.
        return label if label is not None else self.fuzzy_match(name, nga_locations)

    def create_names_dict(self, bij_locations, nga_locations):
        """Return ``{raw name: normalized name}`` for every entry of ``bij_locations``."""
        return {location: self._resolve_name(location, nga_locations) for location in bij_locations}

    def create_secondary_names_dict(self, target_lvl='adm_2', desig_cd='ADM2'):
        """Return ``{adm_1_code: {raw name: normalized name}}`` for a secondary level.

        Args:
            target_lvl: ``target_df`` column holding the secondary names.
            desig_cd: NGA designation code to match against.

        The result also carries the entries ``'multiple': 'multiple'`` and
        ``'unclear': 'unclear'``, whose values are strings rather than dicts.
        """
        unique_adm_1_codes = self.nga_df.loc[self.nga_df['adm1'].notnull(), 'adm1'].unique()
        secondary_names_dict = {adm_1_code: {} for adm_1_code in unique_adm_1_codes}

        for adm_1_code, names in secondary_names_dict.items():
            bij_secondary_locations = self.get_unique_target_secondary_locations(target_lvl, adm_1_code)
            if not bij_secondary_locations:
                continue
            # Candidates are restricted to NGA names inside the same adm_1 unit.
            nga_secondary_locations = self.get_unique_nga_locations(desig_cd, adm_1_code)
            for bij_location in bij_secondary_locations:
                names[bij_location] = self._resolve_name(bij_location, nga_secondary_locations)

        secondary_names_dict[self.MULTIPLE] = self.MULTIPLE
        secondary_names_dict[self.UNCLEAR] = self.UNCLEAR

        return secondary_names_dict

    def normalize_names(self, bij_lvl, target_lvl, desig_cd='ADM1'):
        """Replace ``target_df[target_lvl]`` with normalized names.

        The mapping is built from the raw values of ``bij_df[bij_lvl]`` and the
        NGA names for ``desig_cd``. Non-string values become ``'unclear'``; a
        string that is absent from ``bij_df[bij_lvl]`` raises ``KeyError``.
        """
        unique_bij_locations = self.get_unique_bij_locations(bij_lvl)
        unique_nga_locations = self.get_unique_nga_locations(desig_cd)

        names_dict = self.create_names_dict(unique_bij_locations, unique_nga_locations)

        def lookup(raw):
            # NaN is not a reliable dict key, so missing values are handled explicitly.
            if not isinstance(raw, str):
                return self.UNCLEAR
            return names_dict[raw]

        self.target_df[target_lvl] = self.target_df[target_lvl].apply(lookup)

    def normalize_secondary_names(self, target_lvl='adm_2', desig_cd='ADM2'):
        """Replace ``target_df[target_lvl]`` with names normalized within each adm_1 unit.

        Rows whose ``adm_1_code`` or secondary name has no mapping get ``'unclear'``.
        """
        secondary_names_dict = self.create_secondary_names_dict(target_lvl, desig_cd)

        def resolve(adm_1_code, name):
            names = secondary_names_dict.get(adm_1_code)
            if isinstance(names, str):
                # Sentinel keys map straight to their own label.
                return names
            if not names:
                return self.UNCLEAR
            return names.get(name, self.UNCLEAR)

        self.target_df[target_lvl] = [
            resolve(adm_1_code, name)
            for adm_1_code, name in zip(self.target_df['adm_1_code'], self.target_df[target_lvl])
        ]

    def create_data_dict(self, desig_cd='ADM1', adm_1_code=None):
        """Return ``{full_nm_nd: attributes}`` for the rows of ``create_nga_subset(...)``.

        Each value holds ``adm_1_code``, ``sort_name``, ``full_name``,
        ``lat_dd`` and ``long_dd``. When a name occurs more than once the last
        row wins.
        """
        nga_subset = self.create_nga_subset(desig_cd, adm_1_code)
        columns = ['full_nm_nd', 'adm1', 'sort_name', 'lat_dd', 'long_dd']
        data_dict = {}
        for full_nm_nd, adm1, sort_name, lat_dd, long_dd in nga_subset[columns].itertuples(index=False, name=None):
            data_dict[full_nm_nd] = {
                'adm_1_code': adm1,
                'sort_name': sort_name,
                'full_name': full_nm_nd,
                'lat_dd': lat_dd,
                'long_dd': long_dd,
            }
        return data_dict

    def create_secondary_data_dict(self, target_lvl='adm_2', desig_cd='ADM2'):
        """Return ``{adm_1_code: {name: attributes}}`` for names present in ``target_df``."""
        unique_adm_1_codes = self.nga_df.loc[self.nga_df['adm1'].notnull(), 'adm1'].unique()
        secondary_data_dict = {adm_1_code: {} for adm_1_code in unique_adm_1_codes}

        for adm_1_code in unique_adm_1_codes:
            target_secondary_locations = self.get_unique_target_secondary_locations(target_lvl, adm_1_code)
            if not target_secondary_locations:
                continue
            adm_1_data_dict = self.create_data_dict(desig_cd, adm_1_code)

            for target_location in target_secondary_locations:
                location_data = adm_1_data_dict.get(target_location)
                if location_data:
                    secondary_data_dict[adm_1_code][target_location] = location_data

        return secondary_data_dict

    def add_nga_data(self, desig_cd='ADM1', target_lvl='adm_1', adm_1_code=None):
        """Add ``'<target_lvl>_<field>'`` columns (see ``NGA_FIELDS``) to ``target_df``.

        For ``target_lvl == 'adm_1'`` the ``adm_1_code`` column is added too.
        Names without NGA data get ``None``.
        """
        data_dict = self.create_data_dict(desig_cd, adm_1_code)

        def get_data(key, data):
            return data_dict.get(key, {}).get(data, None)

        if target_lvl == 'adm_1':
            self.target_df['adm_1_code'] = self.target_df[target_lvl].apply(lambda x: get_data(x, 'adm_1_code'))

        for data in self.NGA_FIELDS:
            self.target_df[f'{target_lvl}_{data}'] = self.target_df[target_lvl].apply(lambda x, data=data: get_data(x, data))

    def add_nga_data_secondary(self, desig_cd='ADM2', target_lvl='adm_2'):
        """Add ``'<target_lvl>_<field>'`` columns, looked up within each row's adm_1 unit."""
        secondary_data_dict = self.create_secondary_data_dict(target_lvl, desig_cd)

        # Resolve every row's attribute record once, then fan it out per field.
        records = [
            secondary_data_dict.get(adm_1_code, {}).get(name, {})
            for adm_1_code, name in zip(self.target_df['adm_1_code'], self.target_df[target_lvl])
        ]

        for data in self.NGA_FIELDS:
            self.target_df[f'{target_lvl}_{data}'] = [record.get(data, None) for record in records]

    def normalize_all(self):
        """Run the full pipeline: adm_1 names and data, then adm_2 names and data.

        The order matters: the adm_2 steps rely on the ``adm_1_code`` column
        produced by ``add_nga_data`` for adm_1.
        """
        self.normalize_names('Adm_1', 'adm_1', 'ADM1')
        self.add_nga_data('ADM1', 'adm_1')
        self.normalize_secondary_names('adm_2', 'ADM2')
        self.add_nga_data_secondary('ADM2', 'adm_2')

In [372]:
# Only the Yemen run is active; the Afghanistan run and both CSV exports are
# kept here disabled.
# af_util = LocationCleaner(af, af_nga, admin_nga)
# af_util.normalize_all()
# af_util.target_df.to_csv(f'{DIR}/3_location_cleanup/af.csv', index=False)
ye_util = LocationCleaner(bij_df=ye, nga_df=ye_nga, admin_df=admin_nga)
ye_util.normalize_all()  # fills ye_util.target_df in place
# ye_util.target_df.to_csv(f'{DIR}/3_location_cleanup/ye.csv', index=False)




In [373]:
# Preview the first rows (up to 20) of the normalized Yemen table.
ye_util.target_df.head(20)

Unnamed: 0,strike_id,date,adm_2,adm_1,min_total_ppl,max_total_ppl,min_civilians,max_civilians,min_children,max_children,...,adm_3,adm_1_code,adm_1_sort_name,adm_1_full_name,adm_1_lat_dd,adm_1_long_dd,adm_2_sort_name,adm_2_full_name,adm_2_lat_dd,adm_2_long_dd
0,YEM001,03/11/2002,unclear,Ma'rib,6,6,0,0,0,0,...,Unknown,YE-MA,MARIB,Ma'rib,15.416667,45.35,,,,
1,YEM002,17/12/2009,unclear,Abyan,55,58,44,44,21,22,...,Unknown,YE-AB,ABYAN,Abyan,13.786202,46.141766,,,,
2,YEM003,17/12/2009,Arhab,San`a',0,0,0,0,0,0,...,Unknown,YE-SN,SANA,San`a',15.260691,44.424895,ARHAB,Arhab,15.790939,44.246602
3,YEM004,24/12/2009,unclear,Shabwah,30,34,0,0,0,0,...,Unknown,YE-SH,SHABWAH,Shabwah,15.0,47.0,,,,
4,YEM005,12/01/2010,unclear,Shabwah,1,2,0,0,0,0,...,Unknown,YE-SH,SHABWAH,Shabwah,15.0,47.0,,,,
5,YEM006,15/01/2010,unclear,unclear,0,6,0,0,0,0,...,Unknown,,,,,,,,,
6,YEM007,20/01/2010,Al Jubah,Ma'rib,2,2,0,0,0,0,...,Unknown,YE-MA,MARIB,Ma'rib,15.416667,45.35,ALJUBAH,Al Jubah,15.129253,45.286811
7,YEM008,31/01/2010,unclear,multiple,0,0,0,0,0,0,...,Unknown,,,,,,,,,
8,YEM009,14/03/2010,Al Mahfid,Abyan,2,3,0,0,0,0,...,Unknown,YE-AB,ABYAN,Abyan,13.786202,46.141766,ALMAHFID,Al Mahfid,14.01847,46.755958
9,YEM010,15/03/2010,Mudiyah,Abyan,7,20,0,20,0,0,...,Unknown,YE-AB,ABYAN,Abyan,13.786202,46.141766,MUDIYAH,Mudiyah,13.885422,46.219647
