In [1]:
import pandas as pd

In [2]:
def name_matcher(df_ours, df_klaus, replace_pairs=None, merge_pairs=None):
    """
    Replace names and add merged areas to Klaus neighbourhood dataframe.
    Note: some values are changed in place.
    
    Input
    -----
    replace_pairs: {area_before:area_after}
    merge_pairs: {merged_area:{sub_area1, sub_area2, ...}}
    """
    
    # 1. Replace names
    if replace_pairs:
        for our_area, klaus_area in replace_pairs.items():
            df_ours['Area name'] = df_ours['Area name'].str.replace(our_area, klaus_area)
    
    # 2. Merge neighbourhood area names
    if merge_pairs:
        merged = {key:[] for key in df_klaus.columns}
        for merged_area in merge_pairs:
            print('%s (merged)' % merged_area, end=': ')
            for sub_area in merge_pairs[merged_area]:
                for i, row in df_klaus[df_klaus.me_name == sub_area].iterrows():
                    if row['neighbour_name'] not in merge_pairs[merged_area]:
                        merged['me'].append('Unknown')
                        merged['neighbour'].append(row['neighbour'])
                        merged['me_name'].append(merged_area)
                        merged['neighbour_name'].append(row['neighbour_name'])
                        merged['border'].append(row['border'])
                        print(row['neighbour_name'], end=' | ')
            print()
        df_klaus = df_klaus.append(pd.DataFrame(merged)).reset_index(drop=True)
    
    return df_ours, df_klaus

def name_match_uk(df_uk, df_klaus):
    replace_pairs = {
        'Hackney and City of London':'City of London',
        'East Suffolk':'Suffolk Coastal',
        'West Suffolk':'Mid Suffolk',
        'Cornwall and Isles of Scilly':'Cornwall',
        'Somerset West and Taunton':'West Somerset',
        'Bournemouth, Christchurch and Poole':'Bournemouth'
    }
    merge_pairs = {
        'Dorset':{'West Dorset', 'East Dorset', 'North Dorset'}
    }
    add_pairs = {
        'Isle of Wight'
    }
    return name_matcher(df_uk, df_klaus, replace_pairs, merge_pairs)

### Load data

In [4]:
df_uk = pd.read_csv('/project_data/data_asset/coronavirus-cases_latest.csv')
df_klaus = pd.read_csv('/project_data/data_asset/UK.neighbours.csv')
df_uk, df_klaus = name_match_uk(df_uk, df_klaus)
print('\n-- Unique area names --')
for atype in df_uk['Area type'].unique():
    print('%s: %d areas' % (atype, df_uk[df_uk['Area type'] == atype]['Area name'].nunique()))
print('\n-- After adding merged areas into Klaus dataframe --')
df_klaus.tail(15)

Dorset (merged): East Devon | Purbeck | Weymouth and Portland | South Somerset | Wiltshire | Purbeck | South Somerset | Bournemouth | Poole | Wiltshire | Christchurch | Purbeck | New Forest | 

-- Unique area names --
Nation: 1 areas
Region: 9 areas
Upper tier local authority: 149 areas
Lower tier local authority: 314 areas

-- After adding merged areas into Klaus dataframe --


Unnamed: 0,me,neighbour,me_name,neighbour_name,border
1914,W06000024,W06000018,Merthyr Tydfil,Caerphilly,37118.308068
1915,W06000024,W06000023,Merthyr Tydfil,Powys,23064.012463
1916,Unknown,E07000040,Dorset,East Devon,36492.081036
1917,Unknown,E07000051,Dorset,Purbeck,34327.563188
1918,Unknown,E07000053,Dorset,Weymouth and Portland,41356.623492
1919,Unknown,E07000189,Dorset,South Somerset,130135.948203
1920,Unknown,E06000054,Dorset,Wiltshire,46484.06052
1921,Unknown,E07000051,Dorset,Purbeck,22536.501324
1922,Unknown,E07000189,Dorset,South Somerset,38623.502539
1923,Unknown,E06000028,Dorset,Bournemouth,8505.790696


### Check inconsistency

In [4]:
area_type = 'Lower tier local authority'
keywords = ['City of London', 'Somerset', 'Dorset', 'Suffolk',  'Wight', 'Cornwall', 'Bounemouth']

our_set = set(df_uk[df_uk['Area type'] == area_type]['Area name'])
klaus_set = set(df_klaus.me_name)
print('\nAreas defined in our data but not in Klaus data (Ours(%d) - Klaus(%d))' % (len(our_set), len(klaus_set)))
print(our_set - klaus_set)
print('\nSearching for minimal keywords')
for key in keywords:
    print(key)
    for area in df_klaus:
        if key in area:
            print('-> %s' % area)


Areas defined in our data but not in Klaus data (Ours(314) - Klaus(386))
{'Isle of Wight'}

Searching for minimal keywords
City of London
Somerset
Dorset
Suffolk
Wight
Cornwall
Bounemouth


In [6]:
df_klaus.to_csv("/project_data/data_asset/df_uk_neighbours_processed.csv",index_label=False)

In [7]:
pd.__version__

'1.0.5'