### Dissertation Data analysis

In [None]:
# Import Libraries 
import pandas as pd
import os
import glob
import seaborn as sns
import zipfile
import re
from sentence_transformers import SentenceTransformer
import numpy as np
import torch
import datetime
import geopandas as gpd
import matplotlib.pyplot as plt
from shapely.geometry import Point
from collections import Counter
from geopy.distance import geodesic
from dateutil.parser import parse
pd.options.display.max_columns = None


def get_moved(x, col='bus_prox_class'):
    """Classify how ``x[col]`` changed relative to ``x['prev_<col>']``.

    Parameters
    ----------
    x : row-like mapping (e.g. a pandas Series) exposing ``'moved'``,
        ``col`` and ``f'prev_{col}'``.
    col : name of the compared column. For ``'prox_class'`` the meaning of
        an increase and a decrease is swapped relative to every other
        column name.

    Returns
    -------
    A 3-tuple of booleans:

    * ``(False, False, False)`` when ``x['moved']`` is falsy;
    * ``(True, False, False)`` when the previous value is missing;
    * ``(False, False, True)`` when the value neither increased nor
      decreased;
    * otherwise exactly one of the first two flags is set: an increase sets
      the first flag (the second one for ``'prox_class'``), a decrease sets
      the second flag (the first one for ``'prox_class'``).
    """
    if not x['moved']:
        return False, False, False

    previous = x[f'prev_{col}']
    if pd.isna(previous):
        return True, False, False

    current = x[col]
    inverted = col == 'prox_class'
    if previous < current:
        first_flag = not inverted
    elif previous > current:
        first_flag = inverted
    else:
        # Neither comparison held (equal values, or an incomparable NaN).
        return False, False, True
    return first_flag, not first_flag, False

def get_moved_within(x, col='oa21cd'):
    """Tell whether a moved row kept the same ``col`` value as before.

    Parameters
    ----------
    x : row-like mapping (e.g. a pandas Series) exposing ``'moved'``,
        ``col`` and ``f'prev_{col}'``.
    col : name of the column whose current and previous values are compared.

    Returns
    -------
    ``True`` only when ``x['moved']`` is truthy, the previous value is not
    missing, and it equals the current value; ``False`` otherwise.
    """
    if not x['moved']:
        return False

    previous = x[f'prev_{col}']
    if pd.isna(previous):
        return False

    return bool(previous == x[col])

def get_status_date(x, df):
    """Return a status date for row ``x``, filling it in when it is missing.

    Resolution order:

    1. ``x['status_date']`` when it is not missing;
    2. the first non-missing ``status_date`` among the rows of ``df`` that
       share ``x``'s ``rates_code`` and ``location_code``;
    3. ``x['occupation_date']`` when it is not missing;
    4. ``x['from_date']`` (returned as-is, even if missing).

    Parameters
    ----------
    x : row-like mapping (e.g. a pandas Series) with ``status_date``,
        ``rates_code``, ``location_code``, ``occupation_date`` and
        ``from_date``.
    df : DataFrame with ``rates_code``, ``location_code`` and
        ``status_date`` columns, used as the lookup table.
    """
    if not pd.isna(x['status_date']):
        return x['status_date']

    same_property = (df['rates_code'] == x['rates_code']) & (df['location_code'] == x['location_code'])
    # Ignore matching rows whose own status_date is missing: returning one of
    # those would hand back NaN and skip the fallbacks below.
    known_dates = df.loc[same_property, 'status_date'].dropna()
    if len(known_dates):
        return known_dates.iloc[0]

    if pd.isna(x['occupation_date']):
        return x['from_date']
    return x['occupation_date']

# Preprocess Business Rate Data and Clean wrong Postcodes

In [None]:
# Load every quarterly Wandsworth business-rates CSV found in the data folder.
# glob collects all *.csv files that sit directly inside `path`.
path = 'Data\\business rate data\\Wandsworth\\'
csv_files = glob.glob(os.path.join(path, "*.csv"))

# Read each file and tag its rows with a 'quarter' label: the first 7
# characters of the file name (the text after the last backslash, so
# Windows-style path separators are assumed).
wands_br_df= []
for f in csv_files:

    df = pd.read_csv(f)
    df['quarter']= f.split("\\")[-1][:7]
    wands_br_df.append(df)
# Stack the per-file frames into one frame with a fresh 0..n-1 index.
wands_br_df= pd.concat(wands_br_df, ignore_index=True)


# Load the 2021-2024 Wandsworth extract and rename its columns so they can be
# aligned with the columns of wands_br_df further down.
wands_br_2024_df = pd.read_csv("Data\\business rate data\\alswliman\\period-2021-to-2024-E09000032-wandsworth.csv")
wands_br_2024_df = wands_br_2024_df.rename(columns = {'filter_period' : 'period', 'billing_authority_name' : 'rates_authority',
                     'geocode' : 'rates_authority_id', 'uarn' : 'location_code', 'billing_reference' : 'rates_code',
                     'account_name' : 'name', 'account_start_date' : 'status_date', 'searchable_address' : 'address_street',
                     'postcode_id' : 'pc__pcs', 'category_id' : 'scat_code', 'primary_description' : 'use_category',
                     'category_subgroup' : 'sub_category', 'category_group' : 'category', 'rates_payable' : 'rates_expected',
                     'rateable_value' : 'rental_valuation', 'total_floor_area' : 'floor_area'})
wands_br_2024_df = wands_br_2024_df.drop(columns = ['geometry', 'occupation_state', 'unit_of_measure', 'series', 'epoch'])
print(wands_br_2024_df.shape)
# Keep only the first two '-'-separated parts of 'period' and reuse the
# result as the 'quarter' label.
wands_br_2024_df['period'] = wands_br_2024_df['period'].apply(lambda x: '-'.join([x.split('-')[0], x.split('-')[1]]))
wands_br_2024_df['quarter'] = wands_br_2024_df['period']
# Keep only rows whose second period component is 12.
wands_br_2024_df = wands_br_2024_df[wands_br_2024_df['period'].apply(lambda x: int(x.split('-')[1])) == 12].copy()
print(wands_br_2024_df.shape)
# Within each (rates_code, location_code) group, propagate known status dates
# forwards and then backwards over missing ones.
wands_br_2024_df['status_date'] = wands_br_2024_df.groupby(['rates_code', 'location_code'])['status_date'].ffill()
wands_br_2024_df['status_date'] = wands_br_2024_df.groupby(['rates_code', 'location_code'])['status_date'].bfill()
print(wands_br_2024_df.shape)
# Number of rows that still have no status_date.
print(wands_br_2024_df[wands_br_2024_df['status_date'].apply(pd.isna)].shape)
# Disabled row-wise alternative that uses get_status_date():
#wands_br_2024_df['status_date'] = wands_br_2024_df.apply(lambda x: get_status_date(x, wands_br_df), axis = 1)

# Earliest known status date per (rates_code, location_code, name) in the
# quarterly data; used below to fill gaps in the 2021-2024 extract.
wands_br_rates_code_df = wands_br_df[['rates_code', 'location_code', 'name', 'status_date'
                                     ]].rename(columns = {'status_date' : 'old_status_date'})
# Missing dates must stay missing: stringifying NaN would produce the literal
# text 'nan', which pd.isna() does not recognise, so it would be copied into
# status_date and block the occupation_date / from_date fallbacks below.
wands_br_rates_code_df = wands_br_rates_code_df.dropna(subset = ['old_status_date'])
# Normalise to the date part only (text before the first space).
wands_br_rates_code_df['old_status_date'] = wands_br_rates_code_df['old_status_date'].apply(
    lambda x: str(parse(x)).split(' ')[0] if isinstance(x, str) else str(x).split(' ')[0])
wands_br_rates_code_df = wands_br_rates_code_df.drop_duplicates()
wands_br_rates_code_df = wands_br_rates_code_df.groupby(['rates_code', 'location_code', 'name']).min().reset_index()
print(wands_br_rates_code_df.shape)
print(wands_br_2024_df.shape)
# Left join: every row of the 2021-2024 extract is kept; keys without a
# quarterly date get a missing old_status_date.
wands_br_2024_df = wands_br_2024_df.set_index(['rates_code', 'location_code', 'name']).join(
    wands_br_rates_code_df.set_index(['rates_code', 'location_code', 'name'])).reset_index()
print(wands_br_2024_df.shape)
# Fallback chain for a missing status_date:
# quarterly value -> occupation_date -> from_date.
wands_br_2024_df['status_date'] = wands_br_2024_df.apply(lambda x: x['old_status_date'] if pd.isna(x['status_date'])
                                                         else x['status_date'], axis = 1)
wands_br_2024_df = wands_br_2024_df.drop(columns = ['old_status_date'])
wands_br_2024_df['status_date'] = wands_br_2024_df.apply(lambda x: x['occupation_date'] if pd.isna(x['status_date'])
                                                         else x['status_date'], axis = 1)
wands_br_2024_df['status_date'] = wands_br_2024_df.apply(lambda x: x['from_date'] if pd.isna(x['status_date'])
                                                         else x['status_date'], axis = 1)
print(wands_br_2024_df.shape)
# Number of rows that still have no status_date after all fallbacks.
print(wands_br_2024_df[wands_br_2024_df['status_date'].apply(pd.isna)].shape)

# Align the 2021-2024 extract with the quarterly frame: drop columns the
# quarterly data does not have, and add the ones it lacks as all-NaN columns.
drop_cols = list(set(wands_br_2024_df.columns) - set(wands_br_df.columns))
print(drop_cols)
wands_br_2024_df = wands_br_2024_df.drop(columns = drop_cols)
ext_cols = list(set(wands_br_df.columns) - set(wands_br_2024_df.columns))
print(ext_cols)
for col in ext_cols:
    wands_br_2024_df[col] = np.nan

# NOTE(review): this second pass repeats the alignment above; after the first
# pass both frames have the same column set, so both lists should be empty.
drop_cols = list(set(wands_br_2024_df.columns) - set(wands_br_df.columns))
print(drop_cols)
wands_br_2024_df = wands_br_2024_df.drop(columns = drop_cols)
ext_cols = list(set(wands_br_df.columns) - set(wands_br_2024_df.columns))
print(ext_cols)
for col in ext_cols:
    wands_br_2024_df[col] = np.nan
# Append the extract below the quarterly rows, using the quarterly column order.
wands_br_df = pd.concat([wands_br_df, wands_br_2024_df[wands_br_df.columns]], ignore_index = True)
print(wands_br_df.shape)
print(wands_br_2024_df.shape)
print(wands_br_df.shape)
wands_br_df.head()

In [None]:
def _first_and_last_token(postcode):
    """Join the first and last space-separated tokens of *postcode* with one space."""
    tokens = postcode.split(' ')
    return ' '.join([tokens[0], tokens[-1]])


# Postcode -> (lat, long) lookup; 'pcd2' is rewritten so that only a single
# space separates its first and last tokens.
pc_lon_lat_map_df = pd.read_csv("Data/ONSPD_MAY_2025_UK_SW.csv")
pc_lon_lat_map_df = pc_lon_lat_map_df[['pcd2', 'lat', 'long']]
pc_lon_lat_map_df['pcd2'] = pc_lon_lat_map_df['pcd2'].apply(_first_and_last_token)

# Postcode -> OA/LSOA lookup, restricted to rows whose 'ladnm' is Wandsworth.
pc_lsoa_mapping_df = pd.read_csv("Data\\PCD_OA21_LSOA21\\PCD_OA21_LSOA21.csv", encoding = "ISO-8859-1")
pc_lsoa_mapping_df = pc_lsoa_mapping_df.loc[pc_lsoa_mapping_df['ladnm'].eq('Wandsworth')].copy()
pc_lsoa_mapping_df.head()

In [None]:
# Hand-made substitutions: each key postcode is replaced by the value
# postcode before joining to the postcode -> LSOA lookup.
# NOTE(review): presumably these keys are postcodes missing from the lookup
# and the values are their nearest neighbours — confirm.
closest_pc_mapping = {'SW12 0BP' : 'SW12 0BT', 'SW12 0AB' : 'SW12 0BT', 'SW12 0HY' : 'SW12 0PJ',
'SW12 0NA' : 'SW12 0BS', 'SW12 0PH' : 'SW12 0PJ', 'SW12 0LT' : 'SW12 0BT',
'SW12 0PQ' : 'SW12 0PJ', 'SW12 0PS' : 'SW12 0PL', 'SW17 0AY' : 'SW17 0AS',
'SW17 0BH' : 'SW17 0BQ', 'SW4 0PT' : 'SW4 0QE', 'SW8 3LG' : 'SW8 3DJ'}
# Postcodes not listed in the mapping are kept unchanged.
wands_br_df['closest_pc'] = wands_br_df['pc__pcs'].apply(lambda x: x if x not in closest_pc_mapping
                                                        else closest_pc_mapping[x])
# Left join on postcode to attach the OA / LSOA / MSOA codes.
wands_br_lsoa_df = wands_br_df.set_index('closest_pc').join(pc_lsoa_mapping_df.rename(
    columns = {'pcds' : 'closest_pc'}).set_index('closest_pc')[['oa21cd', 'lsoa21cd', 'msoa21cd']]).reset_index()
# 'year' is the integer before the first '-' of the quarter label.
wands_br_lsoa_df['year'] = wands_br_lsoa_df['quarter'].apply(lambda x: int(x.split('-')[0]))
wands_br_lsoa_df['address_no'] = wands_br_lsoa_df['address_no'].fillna('')
# Address line = "<address_no> <address_street>".
wands_br_lsoa_df['addressline1'] = wands_br_lsoa_df.apply(lambda x:
                                  ' '.join([x['address_no'], x['address_street']]), axis = 1)
# Merge key: lower-cased address with every character that is neither a word
# character nor whitespace removed.
wands_br_lsoa_df['addressline1_merge'] = wands_br_lsoa_df['addressline1'].fillna('').apply(
    lambda x: re.sub(r'[^\w\s]','',x.lower()))

# Fix missing names
br_cols = ['closest_pc', 'pc__pcs', 'rates_code', 'location_code', 'address_no',
           'address_street', 'use_category', 'sub_category', 'category', 'rental_valuation',
           'valuation_date', 'period', 'rates_expected', 'name', 'status', 'status_date', 'quarter']
wands_br_lsoa_df = wands_br_lsoa_df.sort_values(by = ['rates_code', 'period'])
# NOTE(review): this back-fill is not grouped by rates_code, so a missing name
# on the last row of one rates_code takes the name of the next rates_code —
# confirm this is intended.
wands_br_lsoa_df['name'] = wands_br_lsoa_df['name'].bfill()

# Add the minimum and maximum period observed for each rates_code
wands_br_lsoa_df = wands_br_lsoa_df.set_index('rates_code').join(wands_br_lsoa_df[['rates_code', 'period'
        ]].groupby('rates_code').min().rename(columns = {'period' : 'min_period'})).reset_index()
wands_br_lsoa_df = wands_br_lsoa_df.set_index('rates_code').join(wands_br_lsoa_df[['rates_code', 'period'
        ]].groupby('rates_code').max().rename(columns = {'period' : 'max_period'})).reset_index()

# Reduce business rate data to yearly: 'quarter' becomes the integer after the
# first '-', the full frame is kept as quarter_wands_br_lsoa_df, and only the
# rows where that integer is 12 stay in wands_br_lsoa_df.
wands_br_lsoa_df['quarter'] = wands_br_lsoa_df['quarter'].apply(lambda x: int(x.split('-')[1]))
quarter_wands_br_lsoa_df = wands_br_lsoa_df.copy()
wands_br_lsoa_df = wands_br_lsoa_df[(wands_br_lsoa_df['quarter'] == 12)].copy()

# Map postcode to latitude / longitude (left join on closest_pc)
wands_br_lsoa_df = wands_br_lsoa_df.set_index('closest_pc').join(pc_lon_lat_map_df.rename(columns = {
    'pcd2' : 'closest_pc'}).set_index('closest_pc')).reset_index()

# Add dissolutiondate and incorporationdate: a max_period of '2024-12' gives a
# missing dissolution date; any other max_period gives '<max_period>-01'.
wands_br_lsoa_df['dissolutiondate'] = wands_br_lsoa_df['max_period'].apply(lambda x: np.nan
                                                   if x == '2024-12' else x + '-01')
wands_br_lsoa_df['incorporationdate'] = wands_br_lsoa_df['status_date']

# Identify moved businesses: within a rates_code (ordered by year) a row is
# 'moved' when it has a previous postcode that differs from the current one.
wands_br_lsoa_df['postcode'] = wands_br_lsoa_df['closest_pc']
wands_br_lsoa_df = wands_br_lsoa_df.sort_values(by = ['rates_code', 'year'])
wands_br_lsoa_df['prev_postcode'] = wands_br_lsoa_df.groupby('rates_code')['postcode'].shift()
wands_br_lsoa_df['moved'] = wands_br_lsoa_df.apply(lambda x:
        ((x['prev_postcode'] != x['postcode']) and (not pd.isna(x['prev_postcode']))), axis = 1)

# Add company name and company id. rates_code values whose text contains 'E+'
# (scientific notation) are rewritten as plain integer strings; all other
# values are left untouched.
wands_br_lsoa_df['companyname'] = wands_br_lsoa_df['name']
wands_br_lsoa_df['rates_code'] = wands_br_lsoa_df['rates_code'].apply(lambda x: str(int(float(x))) if 'E+' in str(x) else x)
wands_br_lsoa_df['CompanyID'] = wands_br_lsoa_df['rates_code']

#########################
# CompanyIDs that appear on more than one row within a single year get a
# composite ID "<CompanyID>-<location_code>-<name>"; other rows keep their ID.
# NOTE(review): '-'.join needs CompanyID and name to be strings on these rows
# — confirm no numeric or missing values reach it.
print(wands_br_lsoa_df.shape)
t = wands_br_lsoa_df.fillna('').groupby(['year', 'CompanyID']).count().sort_values(by = 'companyname').reset_index()
ids = t[t['companyname'] > 1]['CompanyID'].unique()
wands_br_lsoa_df_tmp = wands_br_lsoa_df[wands_br_lsoa_df['CompanyID'].isin(ids)].copy()
wands_br_lsoa_df_tmp['CompanyID'] = wands_br_lsoa_df_tmp.apply(lambda x: '-'.join([x['CompanyID'],
                                                                       str(x['location_code']), x['name']]), axis = 1)
wands_br_lsoa_df = pd.concat([wands_br_lsoa_df_tmp, wands_br_lsoa_df[~wands_br_lsoa_df['CompanyID'].isin(ids)]])
print(wands_br_lsoa_df.shape)
#########################

# Add SCAT categories and the survived flag: a missing 'category' falls back
# to 'sub_category'; 'survived' is True when there is no dissolution date.
wands_br_lsoa_df['category'] = wands_br_lsoa_df.apply(lambda x: x['sub_category']
                                  if pd.isna(x['category']) else x['category'], axis = 1)
wands_br_lsoa_df['scat_sub_category'] = wands_br_lsoa_df['sub_category']
wands_br_lsoa_df['scat_category'] = wands_br_lsoa_df['category']
wands_br_lsoa_df['survived'] = wands_br_lsoa_df['dissolutiondate'].apply(lambda x: pd.isna(x))

wands_br_lsoa_df.head()

In [None]:
# For every (companyname, year) pair, count how many different CompanyIDs
# carry that name; the count ends up in the 'CompanyID' column of `s`.
per_id_name_year = wands_br_lsoa_df.groupby(['CompanyID', 'companyname', 'year']).count()
per_id_name_year = per_id_name_year[['postcode']].reset_index()
s = per_id_name_year.groupby(['companyname', 'year']).count().reset_index()

ids_per_name_year = s['CompanyID']
print(s[ids_per_name_year > 1].shape)
print(s[ids_per_name_year == 1].shape)

# A name is "unique" when it never maps to a number of IDs other than one in
# any year.
companies = s['companyname'].unique()
non_uniq_companies = s.loc[ids_per_name_year != 1, 'companyname'].unique()
uniq_companies = list(set(companies).difference(set(non_uniq_companies)))
for name_group in (uniq_companies, non_uniq_companies, companies):
    print(len(name_group))

# Keep only the rows whose company name is unique in that sense.
print(wands_br_lsoa_df.shape)
has_unique_name = wands_br_lsoa_df['companyname'].isin(uniq_companies)
wands_br_lsoa_df = wands_br_lsoa_df[has_unique_name]
print(wands_br_lsoa_df.shape)

### Processing Business Census Data 

In [None]:
## Load Wandsworth ONS mapping data
cols = ['CompanyID', 'CompanyNumber', 'CompanyName', 'CompanyStatus', 'PostCode',
       'oa11', 'lsoa11', 'msoa11', 'oa21', 'lsoa21', 'msoa21', 'PostCodeLatitude', 'PostCodeLongitude']
# Raw strings keep the Windows-style backslashes literal. The previous plain
# literals relied on invalid escape sequences (\B, \e, \W), which newer Python
# versions warn about; the resulting paths are unchanged.
# (Pass nrows=100 to read_csv for a quick trial run on a small sample.)
ons_df = pd.read_csv(r"Data\Business Cencus CDRC\entities\ONSGeography.csv.zip",
                     compression='zip', usecols=cols)
# Keep only entities whose 2021 LSOA appears in the Wandsworth postcode lookup,
# and save the subset for later reuse.
wands_ons_df = ons_df[ons_df['lsoa21'].isin(pc_lsoa_mapping_df['lsoa21cd'].unique())].copy()
wands_ons_df.to_csv(r'Data\Business Cencus CDRC\entities\Wandsowrth_ONSGeography.csv', index = False)

## Combine Business Census data with Wandsworth ONS data

# Read one census file per year (2012 through 2024), tag it with 'data_year',
# and keep only the companies present in wands_ons_df (inner join on CompanyID).
wands_year_census_df = []
for i in range(13):
    year = 2012 + i
    census_df = pd.read_csv(f"Data\\Business Cencus CDRC\\business_census{year}.csv.zip", compression='zip')
    census_df['data_year'] = year
    tmp = census_df.rename(columns = {'id' : 'CompanyID'}).set_index('CompanyID').join(
        wands_ons_df.set_index('CompanyID'), how = 'inner')
    print(tmp.shape)
    wands_year_census_df.append(tmp)
wands_year_census_df = pd.concat(wands_year_census_df).reset_index()

## Clean up the combined frame: the ONS-sourced columns get an 'ons_' prefix
## and lower-case names, and the ONS coordinates become 'lat' / 'lon'.
ons_cols = ['CompanyNumber', 'CompanyName', 'CompanyStatus', 'PostCode']
wands_year_census_df = wands_year_census_df.rename(columns = {col:f'ons_{col.lower()}' for col in ons_cols})
wands_year_census_df = wands_year_census_df.rename(columns = {'PostCodeLatitude' : 'lat',
                                                             'PostCodeLongitude' : 'lon'})
wands_year_census_df['addressline1'] = wands_year_census_df['addressline1'].fillna('')
# Save the combined frame before postcode cleaning.
wands_year_census_df.to_csv('Data\\Business Cencus CDRC\\business_census_lsoa.csv', index = False)

## Clean postcode data
wands_year_census_df = wands_year_census_df.sort_values(by = ['CompanyID', 'data_year'])
# Strip surrounding whitespace, then replace a handful of specific malformed
# or partial values with hard-coded full postcodes (exact-match replacements).
wands_year_census_df['postcode'] = wands_year_census_df['postcode'].apply(lambda x: x if pd.isna(x) else x.strip())
wands_year_census_df['postcode'] = wands_year_census_df['postcode'].apply(lambda x: 'CR4 2LU' if
                                                      x == 'CR 4 2 LU' else x)
wands_year_census_df['postcode'] = wands_year_census_df['postcode'].apply(lambda x: 'SW17 0JT' if
                                                      x == '3800-003' else x)
wands_year_census_df['postcode'] = wands_year_census_df['postcode'].apply(lambda x: 'SW18 1FX' if
                                                      x == 'SW18 1FX, UK' else x)
wands_year_census_df['postcode'] = wands_year_census_df['postcode'].apply(lambda x: 'SW18 1EA' if x == 'SW 18 1 EA'
                                                                          else x)
wands_year_census_df['postcode'] = wands_year_census_df['postcode'].apply(lambda x: 'W11 2NH' if x == 'W11'
                                                                          else x)
wands_year_census_df['postcode'] = wands_year_census_df['postcode'].apply(lambda x: 'SW18 3LG' if x == 'SW18'
                                                                          else x)
wands_year_census_df['postcode'] = wands_year_census_df['postcode'].apply(lambda x: 'SW17 8TB' if x == 'SW17'
                                                                          else x)
# Handle missing postcodes by back-filling from the next row.
# NOTE(review): the back-fill is not grouped by CompanyID, so a value can be
# taken from the following company; and the length-based lambdas below call
# len(x) without a missing-value guard, so any postcode still missing after
# the back-fill would raise TypeError — confirm none remain.
wands_year_census_df['postcode'] = wands_year_census_df['postcode'].bfill()
# 5-character values: insert a space after the first two characters.
wands_year_census_df['postcode'] = wands_year_census_df['postcode'].apply(lambda x: x if len(x) != 5
                                                     else ' '.join([x[:2], x[2:]]))
# 7-character values with no space: insert a space after the first four.
wands_year_census_df['postcode'] = wands_year_census_df['postcode'].apply(lambda x: ' '.join([x[:4], x[4:]])
                                                          if (len(x) == 7) and (' ' not in x) else x)
# 6-character values with no space: insert a space after the first three.
wands_year_census_df['postcode'] = wands_year_census_df['postcode'].apply(lambda x: ' '.join([x[:3], x[3:]])
                                                      if (len(x) == 6) and (' ' not in x) else x)
# Collapse repeated spaces: keep the first and last space-separated tokens,
# joined by a single space.
wands_year_census_df['postcode'] = wands_year_census_df['postcode'].apply(lambda x: x if pd.isna(x) else
                                     ' '.join([x.split(' ')[0], x.split(' ')[-1]]))
# 9-character values: drop the last character.
wands_year_census_df['postcode'] = wands_year_census_df['postcode'].apply(lambda x: x if len(x) != 9
                                                     else x[:8])
# Attach OA / LSOA / MSOA codes and the local-authority name via a left join
# on the cleaned postcode, then save the result.
wands_year_census_df = wands_year_census_df.set_index('postcode').join(pc_lsoa_mapping_df.rename(
    columns = {'pcds' : 'postcode'}).set_index('postcode')[['oa21cd', 'lsoa21cd', 'msoa21cd',
                                                            'ladnm']]).reset_index()
wands_year_census_df.to_csv('Data\\Business Cencus CDRC\\business_census_lsoa_pc_cleaned.csv', index = False)

In [None]:
# Reload the intermediate CSV files written by the census-loading steps.
# The first path is a raw string so its backslashes stay literal; the previous
# plain literal relied on invalid escape sequences (\B, \e, \W), which newer
# Python versions warn about. The resulting path is unchanged.
wands_ons_df = pd.read_csv(r'Data\Business Cencus CDRC\entities\Wandsowrth_ONSGeography.csv')
wands_year_census_df = pd.read_csv('Data\\Business Cencus CDRC\\business_census_lsoa_pc_cleaned.csv')
# Merge key: lower-cased address with every character that is neither a word
# character nor whitespace removed.
wands_year_census_df['addressline1_merge'] = wands_year_census_df['addressline1'].fillna('').apply(
    lambda x: re.sub(r'[^\w\s]','',x.lower()))
# Diagnostics: total rows, rows without / with an LSOA match, and the number
# of distinct CompanyID and (CompanyID, postcode) groups.
print(wands_year_census_df.shape)
print(wands_year_census_df[wands_year_census_df['lsoa21cd'].apply(pd.isna)].shape)
print(wands_year_census_df[~wands_year_census_df['lsoa21cd'].apply(pd.isna)].shape)
print(wands_year_census_df.groupby('CompanyID').count().shape)
print(wands_year_census_df.groupby(['CompanyID', 'postcode']).count().shape)
wands_year_census_df = wands_year_census_df.rename(columns = {'data_year' : 'year'})

cols = ['CompanyID', 'companynumber', 'companyname', 'addressline1', 'addressline2', 'ons_postcode',
        'oa21', 'lsoa21', 'postcode', 'oa21cd', 'lsoa21cd', 'ladnm', 'year']

# Map census industry (SIC code) to business-rate industry (SCAT categories)
sic_cat_map_df = pd.read_excel("Data/sic_cat_mapping.xlsx", sheet_name = 'sic_cat_mapping')
sic_scat_map_df = pd.read_excel("Data/sic_cat_mapping.xlsx", sheet_name = 'scat_sic_mapping')
wands_year_census_df = wands_year_census_df.sort_values(by = ['CompanyID', 'year'])
# NOTE(review): this back-fill is not grouped by CompanyID, so a missing
# siccode can be taken from the following company — confirm intended.
wands_year_census_df['siccode'] = wands_year_census_df['siccode'].bfill()
wands_year_census_df = wands_year_census_df.set_index('siccode').join(sic_cat_map_df.set_index('siccode')[[
    'sic_category']]).reset_index()
wands_year_census_df = wands_year_census_df.set_index('sic_category').join(sic_scat_map_df.set_index(
    'sic_category')).reset_index()
# Anything left unmapped is labelled 'Unclassified'.
wands_year_census_df['sic_category'] = wands_year_census_df['sic_category'].fillna('Unclassified')
wands_year_census_df['scat_category'] = wands_year_census_df['scat_category'].fillna('Unclassified')
wands_year_census_df['scat_sub_category'] = wands_year_census_df['scat_sub_category'].fillna('Unclassified')

# Identify moved businesses: within a CompanyID (ordered by year) a row is
# 'moved' when it has a previous postcode that differs from the current one.
wands_year_census_df = wands_year_census_df.sort_values(by = ['CompanyID', 'year'])
wands_year_census_df['prev_postcode'] = wands_year_census_df.groupby('CompanyID')['postcode'].shift()
wands_year_census_df['moved'] = wands_year_census_df.apply(lambda x:
        ((x['prev_postcode'] != x['postcode']) and (not pd.isna(x['prev_postcode']))), axis = 1)

# Keep rows whose local authority name is 'Wandsworth' or 'Lambeth'; rows
# with a missing 'ladnm' are dropped.
wands_year_census_df = wands_year_census_df[wands_year_census_df['ladnm'].isin(['Wandsworth', 'Lambeth'])].copy()

# Add lon / lat from the postcode lookup; the ONS-provided coordinates are
# kept under 'ons_lon' / 'ons_lat'.
wands_year_census_df = wands_year_census_df.rename(columns = {'lon' : 'ons_lon',
                                                              'lat' : 'ons_lat'})
wands_year_census_df = wands_year_census_df.set_index('postcode').join(pc_lon_lat_map_df.rename(columns = {
    'pcd2' : 'postcode'}).set_index('postcode')).reset_index()

# Add survived column: True when there is no dissolution date.
wands_year_census_df['survived'] = wands_year_census_df['dissolutiondate'].apply(lambda x: pd.isna(x))

# Add fields that only exist in the business-rates data, as empty placeholders.
wands_year_census_df['floor_area'] = None
wands_year_census_df['rental_valuation'] = None
wands_year_census_df['rates_expected'] = None

wands_year_census_df.head()

In [None]:
# Diagnostic: how many census rows have a string CompanyID, how many do not,
# and the total.
companyid_is_str = wands_year_census_df['CompanyID'].apply(lambda x: isinstance(x, str))
print(wands_year_census_df[companyid_is_str].shape)
print(wands_year_census_df[~companyid_is_str].shape)
print(wands_year_census_df.shape)

In [None]:
# Columns selected from both the business-rates frame and the census frame
# before they are joined.
final_cols = [
    # identity and address
    'CompanyID',
    'companyname',
    'addressline1',
    'postcode',
    # lifecycle
    'dissolutiondate',
    'incorporationdate',
    'survived',
    'year',
    # geography
    'oa21cd',
    'lsoa21cd',
    'msoa21cd',
    'ladnm',
    'long',
    'lat',
    # matching key, categories and relocation flags
    'addressline1_merge',
    'scat_sub_category',
    'scat_category',
    'prev_postcode',
    'moved',
    # business-rates measures
    'floor_area',
    'rental_valuation',
    'rates_expected',
]

# Match Business Rates and Business Census Datasets

In [None]:
## Get exact matches: business-rates and census rows that share the same
## postcode, normalised address line and year.

print(wands_br_lsoa_df.shape)
print(wands_year_census_df.shape)
wands_br_lsoa_df = wands_br_lsoa_df.rename(columns = {'rates_authority' : 'ladnm'})
group_cols = ['postcode', 'addressline1_merge', 'year']
# Inner join on the key; columns present on both sides get '_br' / '_census'.
exact_match_br_census_df = wands_br_lsoa_df[final_cols].set_index(group_cols
           ).join(wands_year_census_df[final_cols].set_index(group_cols
           ), lsuffix = '_br', rsuffix = '_census', how = 'inner').reset_index()
# Distinct keys that produced an exact match, flagged with exist=True.
grouped_cols_df = exact_match_br_census_df[group_cols].drop_duplicates()
grouped_cols_df['exist'] = True

print(exact_match_br_census_df.shape)
print(grouped_cols_df.shape)

# Business-rates rows whose key has no exact match ('exist' is missing after
# the left join).
no_exact_wands_br_lsoa_df = wands_br_lsoa_df.set_index(group_cols).join(grouped_cols_df.set_index(group_cols))
print(no_exact_wands_br_lsoa_df[~no_exact_wands_br_lsoa_df['exist'].apply(pd.isna)].shape)
no_exact_wands_br_lsoa_df = no_exact_wands_br_lsoa_df[no_exact_wands_br_lsoa_df['exist'].apply(pd.isna)
                            ].reset_index().drop(columns = ['exist'])
print(no_exact_wands_br_lsoa_df.shape)


# Census rows whose key has no exact match, selected the same way.
no_exact_wands_year_census_df = wands_year_census_df.set_index(group_cols).join(
                    grouped_cols_df.set_index(group_cols))
print(no_exact_wands_year_census_df[~no_exact_wands_year_census_df['exist'].apply(pd.isna)].shape)
no_exact_wands_year_census_df = no_exact_wands_year_census_df[no_exact_wands_year_census_df['exist'].apply(pd.isna)
                            ].reset_index().drop(columns = ['exist'])
print(no_exact_wands_year_census_df.shape)

### Get candidate matches based on postcode and year only

sub_group_cols = ['postcode', 'year']
print(no_exact_wands_br_lsoa_df.shape)
print(no_exact_wands_year_census_df.shape)
# Outer join: every business-rates row is paired with every census row that
# shares its postcode and year; rows without a partner on the other side are
# kept with missing '_census' or '_br' columns.
no_exact_match_br_census_df = no_exact_wands_br_lsoa_df[final_cols].set_index(sub_group_cols
           ).join(no_exact_wands_year_census_df[final_cols].set_index(sub_group_cols
           ), lsuffix = '_br', rsuffix = '_census', how = 'outer').reset_index()
print(no_exact_match_br_census_df.shape)
# Rows found only in the census data, then rows found only in business rates.
print(no_exact_match_br_census_df[no_exact_match_br_census_df['CompanyID_br'].apply(pd.isna)].shape)
print(no_exact_match_br_census_df[no_exact_match_br_census_df['CompanyID_census'].apply(pd.isna)].shape)

### Split off rows with no partner at all, and keep the paired candidates

# Census-only rows: no business-rates partner. Columns whose name contains
# '_br' are dropped and '_census' is removed from the remaining names.
no_exact_match_br_census_df_census1 = no_exact_match_br_census_df[no_exact_match_br_census_df
                            ['CompanyID_br'].apply(pd.isna)].copy()
drop_cols = [c for c in no_exact_match_br_census_df_census1.columns if '_br' in c]
no_exact_match_br_census_df_census1 = no_exact_match_br_census_df_census1.drop(columns = drop_cols)
no_exact_match_br_census_df_census1 = no_exact_match_br_census_df_census1.rename(columns = {
    c:c.replace('_census', '') for c in no_exact_match_br_census_df_census1.columns})
no_exact_match_br_census_df_census1['source'] = 'Census'

# Business-rates-only rows: no census partner. Columns whose name contains
# '_census' are dropped and '_br' is removed from the remaining names.
no_exact_match_br_census_df_br1 = no_exact_match_br_census_df[no_exact_match_br_census_df
                            ['CompanyID_census'].apply(pd.isna)].copy()
drop_cols = [c for c in no_exact_match_br_census_df_br1.columns if '_census' in c]
no_exact_match_br_census_df_br1 = no_exact_match_br_census_df_br1.drop(columns = drop_cols)
no_exact_match_br_census_df_br1 = no_exact_match_br_census_df_br1.rename(columns = {
    c:c.replace('_br', '') for c in no_exact_match_br_census_df_br1.columns})
no_exact_match_br_census_df_br1['source'] = 'BR'

# Keep only the candidate pairs that have a CompanyID on both sides.
no_exact_match_br_census_df = no_exact_match_br_census_df[
                            (~no_exact_match_br_census_df['CompanyID_br'].apply(pd.isna)) &
                            (~no_exact_match_br_census_df['CompanyID_census'].apply(pd.isna))]

print(no_exact_match_br_census_df_census1.shape)
print(no_exact_match_br_census_df_br1.shape)
print(no_exact_match_br_census_df.shape)

# Calculate Similarity Based on Deep Neural Net Model

In [None]:
# Sentence-embedding model used to score how similar two address lines are.
model = SentenceTransformer('Lajavaness/bilingual-embedding-small', trust_remote_code=True)

print(wands_br_lsoa_df.shape)
print(wands_year_census_df.shape)
print(wands_year_census_df[wands_year_census_df['ladnm'] == 'Wandsworth'].shape)
# Pair business-rates rows with Wandsworth census rows on (postcode, data_year).
# NOTE(review): this cell looks written against an earlier notebook state.
# The census-processing cell renames 'data_year' to 'year' and defines `cols`
# with 'year', so the census-side set_index on 'data_year' would appear to
# raise KeyError; wands_br_lsoa_df also already has a 'postcode' column, so
# renaming 'pc__pcs' to 'postcode' would duplicate it. Confirm before
# re-running.
pc_test = wands_br_lsoa_df.rename(columns = {'pc__pcs' : 'postcode', 'year' : 'data_year'}).set_index(['postcode',
       'data_year']).join(wands_year_census_df[wands_year_census_df['ladnm'] == 'Wandsworth'][
        cols + ['addressline1_merge']].set_index(['postcode', 'data_year']), rsuffix = '_census').reset_index()
print(pc_test.shape)
print(pc_test[pc_test['CompanyID'].apply(pd.isna)].shape)
print(pc_test[~pc_test['CompanyID'].apply(pd.isna)].shape)
# Flag pairs whose normalised address lines are identical.
pc_test['match'] = pc_test.apply(lambda x: x['addressline1_merge'] == x['addressline1_merge_census'], axis = 1)
print(pc_test[pc_test['match']].shape)
print(pc_test[~pc_test['match']].shape)
pc_test['addressline1'] = pc_test['addressline1'].fillna('')
pc_test['addressline1_census'] = pc_test['addressline1_census'].fillna('')

# Only rows from position 1,810,000 onwards are scored, de-duplicated on the
# address columns.
# NOTE(review): the hard-coded offset presumably resumes an earlier partial
# run — confirm.
pc_test_tmp = pc_test.iloc[1810000:][['postcode', 'addressline1',
      'addressline1_merge', 'addressline1_census', 'addressline1_merge_census']].drop_duplicates()

# Score the pairs in batches of `step` rows.
step = 10000
n = int(np.ceil(len(pc_test_tmp) / step))
pc_test_sim_df = []
print('Total steps is:', n)
for i in range(n):
    print(i)
    print(datetime.datetime.now())
    tmp = pc_test_tmp.iloc[i * step: (i+1) * step].copy()
    addressline1_embed = model.encode(tmp['addressline1'].tolist())
    addressline1_census_embed = model.encode(tmp['addressline1_census'].tolist())
    # model.similarity scores every row against every row; the diagonal holds
    # the score of each row's own (business-rates, census) address pair.
    similarity = model.similarity(addressline1_embed, addressline1_census_embed)
    tmp['similarity'] = torch.diagonal(similarity, dim1=-2, dim2=-1)
    pc_test_sim_df.append(tmp)
    print(tmp.shape)
    # Checkpoint the results so far every 10th batch.
    if i % 10 == 0:
        pd.concat(pc_test_sim_df, ignore_index = True).to_pickle("pc_test_sim_df_dedup.pkl")
pc_test_sim_df = pd.concat(pc_test_sim_df, ignore_index = True)
pc_test_sim_df.to_pickle("pc_test_sim_df_dedup.pkl")

In [None]:
# Load the pre-computed address similarities.
# NOTE(review): this reads 'pc_test_sim_df_dedup_final.pkl', while the scoring
# loop in this file writes 'pc_test_sim_df_dedup.pkl'; the '_final' file is
# presumably assembled outside the visible code — confirm.
pc_test_sim_df_dedup_final = pd.read_pickle('pc_test_sim_df_dedup_final.pkl')
print(pc_test_sim_df_dedup_final.shape)
print(no_exact_match_br_census_df.shape)
# Attach the address similarity to every candidate pair via a left join on
# postcode and both address lines; pairs without a score get 0.
test = no_exact_match_br_census_df.set_index(['postcode', 'addressline1_br', 'addressline1_census']
        ).join(pc_test_sim_df_dedup_final.rename(columns = {'addressline1' : 'addressline1_br'}
        ).set_index(['postcode', 'addressline1_br', 'addressline1_census'])[['similarity']]).reset_index()
test['similarity'] = test['similarity'].fillna(0)
print(test.shape)
# After fillna(0) no similarity is missing, so the first of these two prints
# always reports zero rows.
print(test[test['similarity'].apply(pd.isna)].shape)
print(test[~test['similarity'].apply(pd.isna)].shape)

# Calculate name similarities: collect the distinct (business-rates name,
# census name) pairs to score, taken from candidate pairs whose address
# similarity is at least 0.8 ...
company_names_map_df = test[test['similarity'] >= 0.8][['postcode', 'addressline1_br', 'addressline1_census',
                   'CompanyID_br', 'CompanyID_census', 'year', 'companyname_br', 'companyname_census',
       'similarity']].sort_values(by = 'similarity')[[
    'companyname_br', 'companyname_census']].drop_duplicates()
print(company_names_map_df.shape)
# ... plus the distinct name pairs of the exact address matches.
company_names_map_df = pd.concat([company_names_map_df, exact_match_br_census_df[[
    'companyname_br', 'companyname_census']].drop_duplicates()])
print(company_names_map_df.shape)


# Sentence-embedding model used to score company-name pairs (the same model
# identifier as the one used for the address lines).
model = SentenceTransformer('Lajavaness/bilingual-embedding-small', trust_remote_code=True)

# Score the name pairs in batches of `step` rows.
step = 10000
n = int(np.ceil(len(company_names_map_df) / step))
comp_names_sim_df = []
print('Total steps is:', n)
for i in range(n):
    print(i)
    print(datetime.datetime.now())
    tmp = company_names_map_df.iloc[i * step: (i+1) * step].copy()
    # Missing names are encoded as empty strings.
    companyname_br_embed = model.encode(tmp['companyname_br'].fillna('').tolist())
    companyname_census_embed = model.encode(tmp['companyname_census'].fillna('').tolist())
    # The diagonal of the all-pairs matrix holds each row's own pair score.
    similarity = model.similarity(companyname_br_embed, companyname_census_embed)
    tmp['similarity'] = torch.diagonal(similarity, dim1=-2, dim2=-1)
    comp_names_sim_df.append(tmp)
    print(tmp.shape)
    # Checkpoint the results so far every 10th batch.
    if i % 10 == 0:
        pd.concat(comp_names_sim_df, ignore_index = True).to_pickle("comp_names_sim_df.pkl")
comp_names_sim_df = pd.concat(comp_names_sim_df, ignore_index = True)
comp_names_sim_df.to_pickle("comp_names_sim_df.pkl")

# Reload the name similarities from disk and prepare them for joining.
comp_names_sim_df = pd.read_pickle('comp_names_sim_df.pkl')
comp_names_sim_df['companyname_br'] = comp_names_sim_df['companyname_br'].fillna('')
comp_names_sim_df['companyname_census'] = comp_names_sim_df['companyname_census'].fillna('')

comp_names_sim_df = comp_names_sim_df.rename(columns = {'similarity' : 'names_similarity'})
comp_names_sim_df = comp_names_sim_df.drop_duplicates(subset = ['companyname_br', 'companyname_census'])

# Attach the name similarity to every candidate pair (left join on both names).
print(no_exact_match_br_census_df.shape)
no_exact_match_br_census_df = no_exact_match_br_census_df.set_index(['companyname_br', 'companyname_census']).join(
    comp_names_sim_df.set_index(['companyname_br', 'companyname_census'])).reset_index()
print(no_exact_match_br_census_df.shape)

# For each (business-rates name, year), record the best name similarity among
# its candidate pairs.
no_exact_match_br_census_df = no_exact_match_br_census_df.set_index(['companyname_br','year']
           ).join(no_exact_match_br_census_df[['companyname_br', 'year', 'names_similarity']].groupby(
    ['companyname_br', 'year']).max().rename(columns = {'names_similarity' : 'names_max_similarity'}
                                        )).reset_index()
print(no_exact_match_br_census_df.shape)
# A pair is a name match when it is the best-scoring candidate for its
# (business-rates name, year) and scores at least 0.85. Pairs without a score
# compare as False against NaN and are therefore not matches.
no_exact_match_br_census_df['names_match'] = no_exact_match_br_census_df.apply(lambda x: True
       if (x['names_similarity'] == x['names_max_similarity']) and
       (x['names_similarity'] >= 0.85) else False, axis = 1)


# Replace the remaining missing scores with 0. The lambda above always
# returns True or False, so the names_match fillna looks like a safeguard only.
no_exact_match_br_census_df['names_similarity'] = no_exact_match_br_census_df['names_similarity'].fillna(0)
no_exact_match_br_census_df['names_max_similarity'] = no_exact_match_br_census_df['names_max_similarity'].fillna(0)
no_exact_match_br_census_df['names_match'] = no_exact_match_br_census_df['names_match'].fillna(False)

# Extract matched and unmatched businesses: a business counts as matched when
# at least one of its candidate pairs was flagged in 'names_match'.
br_business_key = ['postcode', 'year', 'companyname_br', 'CompanyID_br']
census_business_key = ['postcode', 'year', 'companyname_census', 'CompanyID_census']

br_match_agg = no_exact_match_br_census_df.groupby(br_business_key)['names_match'].any()
census_match_agg = no_exact_match_br_census_df.groupby(census_business_key)['names_match'].any()

# Join the per-business flags back; the suffixes yield the columns
# 'names_match_br' and 'names_match_census'.
no_exact_match_br_census_df = (
    no_exact_match_br_census_df
    .set_index(br_business_key)
    .join(br_match_agg, rsuffix='_br')
    .reset_index()
)
no_exact_match_br_census_df = (
    no_exact_match_br_census_df
    .set_index(census_business_key)
    .join(census_match_agg, rsuffix='_census')
    .reset_index()
)

# Merge Datasets Based on Matched and Non-Matched Scenarios

In [None]:
def _source_columns(frame, tag):
    """List the columns of *frame* containing *tag*, minus the 'comb_address_no<tag>' helper column."""
    excluded = 'comb_address_no' + tag
    return [c for c in frame.columns if tag in c and c != excluded]


shared_cols = ['postcode', 'year']

# --- Pairs matched by name: keep the BR record plus the census id/name as "_alt".
matched_pairs = no_exact_match_br_census_df[no_exact_match_br_census_df['names_match']].copy()
br_cols = (_source_columns(matched_pairs, '_br')
           + ['CompanyID_census', 'companyname_census'] + shared_cols)
no_exact_match_br_census_df_match = matched_pairs[br_cols].drop_duplicates()
no_exact_match_br_census_df_match.columns = [
    col.replace('_br', '').replace('_census', '_alt')
    for col in no_exact_match_br_census_df_match.columns
]
no_exact_match_br_census_df_match['source'] = 'Both'

# --- BR businesses with no matched census counterpart.
br_only_rows = no_exact_match_br_census_df[
    no_exact_match_br_census_df['names_match_br'].eq(False)]
br_cols = _source_columns(br_only_rows, '_br') + shared_cols
no_exact_match_br_census_df_br2 = br_only_rows[br_cols].drop_duplicates()
no_exact_match_br_census_df_br2.columns = [
    col.replace('_br', '') for col in no_exact_match_br_census_df_br2.columns
]
no_exact_match_br_census_df_br2['source'] = 'BR'

# --- Census businesses with no matched BR counterpart.
census_only_rows = no_exact_match_br_census_df[
    no_exact_match_br_census_df['names_match_census'].eq(False)]
census_cols = _source_columns(census_only_rows, '_census') + shared_cols
no_exact_match_br_census_df_census2 = census_only_rows[census_cols].drop_duplicates()
no_exact_match_br_census_df_census2.columns = [
    col.replace('_census', '') for col in no_exact_match_br_census_df_census2.columns
]
no_exact_match_br_census_df_census2['source'] = 'Census'

for part in (no_exact_match_br_census_df_match,
             no_exact_match_br_census_df_br2,
             no_exact_match_br_census_df_census2):
    print(part.shape)

In [None]:
print(exact_match_br_census_df.shape)

# Attach name similarity to the pairs that already share an address.
exact_pair_key = ['companyname_br', 'companyname_census']
exact_match_br_census_df = (
    exact_match_br_census_df
    .set_index(exact_pair_key)
    .join(comp_names_sim_df.set_index(exact_pair_key))
    .reset_index()
)
print(exact_match_br_census_df.shape)
similarity_missing = exact_match_br_census_df['names_similarity'].isna()
print(exact_match_br_census_df[similarity_missing].shape)
print(exact_match_br_census_df[~similarity_missing].shape)

# Best similarity per BR company name and year.
exact_br_year_key = ['companyname_br', 'year']
exact_best_similarity = (
    exact_match_br_census_df[exact_br_year_key + ['names_similarity']]
    .groupby(exact_br_year_key)
    .max()
    .rename(columns={'names_similarity': 'names_max_similarity'})
)
exact_match_br_census_df = (
    exact_match_br_census_df
    .set_index(exact_br_year_key)
    .join(exact_best_similarity)
    .reset_index()
)
print(exact_match_br_census_df.shape)


def _is_confident_best_match(row):
    """Return True when the pair is the BR name's best candidate and scores >= 0.85."""
    if (row['names_similarity'] == row['names_max_similarity']) and \
            (row['names_similarity'] >= 0.85):
        return True
    return False


exact_match_br_census_df['names_match'] = exact_match_br_census_df.apply(
    _is_confident_best_match, axis=1)


def _exact_source_columns(tag):
    """Columns of exact_match_br_census_df containing *tag*, minus 'comb_address_no<tag>'."""
    excluded = 'comb_address_no' + tag
    return [c for c in exact_match_br_census_df.columns if tag in c and c != excluded]


exact_shared_cols = ['postcode', 'year', 'addressline1_merge']

# --- Name-matched pairs: BR record plus the census id/name as "_alt".
br_cols = (_exact_source_columns('_br')
           + ['CompanyID_census', 'companyname_census'] + exact_shared_cols)
exact_match_br_census_df1 = exact_match_br_census_df[exact_match_br_census_df['names_match']].copy()
exact_match_br_census_df1 = exact_match_br_census_df1[br_cols].drop_duplicates()
exact_match_br_census_df1.columns = [
    col.replace('_br', '').replace('_census', '_alt')
    for col in exact_match_br_census_df1.columns
]
exact_match_br_census_df1['source'] = 'Both'
print(exact_match_br_census_df1.shape)

# --- BR side of the pairs whose names did not match.
br_cols = _exact_source_columns('_br') + exact_shared_cols
exact_match_br_census_df2 = exact_match_br_census_df[
    exact_match_br_census_df['names_match'].eq(False)]
exact_match_br_census_df2 = exact_match_br_census_df2[br_cols].drop_duplicates()
exact_match_br_census_df2.columns = [
    col.replace('_br', '') for col in exact_match_br_census_df2.columns
]
exact_match_br_census_df2['source'] = 'BR'
print(exact_match_br_census_df2.shape)

# --- Census side of the pairs whose names did not match.
census_cols = _exact_source_columns('_census') + exact_shared_cols
exact_match_br_census_df3 = exact_match_br_census_df[
    exact_match_br_census_df['names_match'].eq(False)]
exact_match_br_census_df3 = exact_match_br_census_df3[census_cols].drop_duplicates()
exact_match_br_census_df3.columns = [
    col.replace('_census', '') for col in exact_match_br_census_df3.columns
]
exact_match_br_census_df3['source'] = 'Census'
print(exact_match_br_census_df3.shape)

In [None]:
# Columns shared by every partial dataset; the "Both" parts additionally carry
# the id/name of the alternative source.
final_cols = ['CompanyID', 'companyname', 'addressline1', 'postcode', 'dissolutiondate',
              'incorporationdate', 'survived', 'year', 'oa21cd', 'lsoa21cd', 'msoa21cd', 'ladnm', 'long', 'lat',
              'addressline1_merge', 'scat_sub_category', 'scat_category', 'prev_postcode', 'moved',
              'floor_area', 'rental_valuation', 'rates_expected', 'source']
alt_cols = ['CompanyID_alt', 'companyname_alt']

final_br_census_df = pd.concat(
    [
        no_exact_match_br_census_df_census1[final_cols],
        no_exact_match_br_census_df_br1[final_cols],
        no_exact_match_br_census_df_match[final_cols + alt_cols],
        no_exact_match_br_census_df_br2[final_cols],
        no_exact_match_br_census_df_census2[final_cols],
        exact_match_br_census_df1[final_cols + alt_cols],
        exact_match_br_census_df2[final_cols],
        exact_match_br_census_df3[final_cols],
    ],
    ignore_index=True,
)
for reported in (final_br_census_df, wands_br_lsoa_df, wands_year_census_df,
                 no_exact_match_br_census_df_match, exact_match_br_census_df1):
    print(reported.shape)

print(final_br_census_df.shape)

# Flag (year, CompanyID) combinations that occur more than once.
duplicate_key = ['year', 'CompanyID']
t = (final_br_census_df.groupby(duplicate_key).count()[['postcode']]
     .sort_values(by='postcode').reset_index())
t = t[t['postcode'] > 1].copy().drop(columns='postcode')
print(t.shape)
t['duplicate'] = 'True'
tmp = (final_br_census_df.set_index(duplicate_key)
       .join(t.set_index(duplicate_key)).reset_index())

is_unique = tmp['duplicate'].isna()
is_duplicate_from_both = (~is_unique) & (tmp['source'] == 'Both')
print(tmp[is_unique].shape)
print(tmp[~is_unique].shape)
print(tmp[is_duplicate_from_both].shape)

# Keep every non-duplicated record; among duplicated ones keep only the rows
# that came from both sources.
tmp1 = tmp[is_unique].copy().drop(columns='duplicate')
tmp2 = tmp[is_duplicate_from_both].copy().drop(columns='duplicate')
final_br_census_df = pd.concat([tmp1, tmp2], ignore_index=True).sort_values(by=['CompanyID', 'year'])
print(final_br_census_df.shape)
final_br_census_df.to_pickle("final_br_census_df_v5.pkl")

# Plot Moved Business Distribution

In [None]:
# Load the combined business dataset and fold the Lambeth rows of the
# Lambeth/Southwark file into it; Southwark rows stay in their own frame.
final_br_census_df = pd.read_pickle("final_br_census_df_v5.pkl")
final_south_br_census_df = pd.read_pickle(
    "lam_south_final_br_census_df_v5.pkl")[final_br_census_df.columns]
lambeth_rows = final_south_br_census_df[final_south_br_census_df['ladnm'] == 'Lambeth']
final_br_census_df = pd.concat([final_br_census_df, lambeth_rows], ignore_index=True)
final_south_br_census_df = final_south_br_census_df[
    final_south_br_census_df['ladnm'] == 'Southwark'].copy()

# Only businesses flagged as having moved are analysed below.
moved_final_br_census_df = final_br_census_df[final_br_census_df['moved']].copy()

# Postcode -> OA/LSOA/MSOA/LAD lookup and LAD -> county/region/country lookup.
pc_lsoa_mapping_df = pd.read_csv("Data\\PCD_OA21_LSOA21\\PCD_OA21_LSOA21.csv", encoding = "ISO-8859-1")
ldanm_map_df = pd.read_csv('georef-united-kingdom-local-authority-district.csv', sep = ';')

# Hand-maintained rows appended to the LAD lookup (presumably districts that
# are absent from the georef file -- verify against the data).
unfound_ladnm_df = pd.DataFrame(
    [
        ('Barrow-in-Furness', 'Cumbria', 'England', 'North West England'),
        ('Carlisle', 'Cumbria', 'England', 'North West England'),
        ('Craven', 'North Yorkshire', 'England', 'Yorkshire and the Humber'),
        ('Eden', 'Cumbria', 'England', 'North West England'),
        ('Hambleton', 'North Yorkshire', 'England', 'Yorkshire and the Humber'),
        ('Harrogate', 'North Yorkshire', 'England', 'Yorkshire and the Humber'),
        ('Mendip', 'Somerset', 'England', 'South West England'),
        ('Pseudo (Channel Islands)', '', '', ''),
        ('Richmondshire', 'North Yorkshire', 'England', 'Yorkshire and the Humber'),
        ('Ryedale', 'North Yorkshire', 'England', 'Yorkshire and the Humber'),
        ('Scarborough', 'North Yorkshire', 'England', 'Yorkshire and the Humber'),
        ('Sedgemoor', 'Somerset', 'England', 'South West England'),
        ('Selby', 'North Yorkshire', 'England', 'Yorkshire and the Humber'),
        ('Somerset West and Taunton', 'Somerset', 'England', 'South West England'),
        ('South Lakeland', 'Cumbria', 'England', 'North West England'),
        ('South Somerset', 'Somerset', 'England', 'South West England'),
    ],
    columns=[
        'Official Name Local authority district',
        'Official Name County/Unitary district',
        'Official Name Country',
        'Official Name Region',
    ],
)
ldanm_map_df = pd.concat([ldanm_map_df, unfound_ladnm_df], ignore_index = True)

# Resolve each moved business's previous postcode to its previous
# OA / LSOA / MSOA / local-authority codes.
prev_geo_renames = {'oa21cd': 'prev_oa21cd', 'lsoa21cd': 'prev_lsoa21cd',
                    'msoa21cd': 'prev_msoa21cd', 'ladnm': 'prev_ladnm'}
prev_postcode_lookup = (
    pc_lsoa_mapping_df
    .rename(columns={'pcds': 'prev_postcode'})
    .set_index('prev_postcode')[list(prev_geo_renames)]
    .rename(columns=prev_geo_renames)
)
t = moved_final_br_census_df.set_index('prev_postcode').join(prev_postcode_lookup).reset_index()

print(t.shape)
print(t[t['prev_ladnm'].isna()].shape)
print(t[~t['prev_ladnm'].isna()].shape)

# Add country / region / county of the previous local authority.
prev_lad_lookup = (
    ldanm_map_df
    .rename(columns={'Official Name Local authority district': 'prev_ladnm'})
    .set_index('prev_ladnm')[['Official Name Country', 'Official Name Region',
                              'Official Name County/Unitary district']]
)
t = t.set_index('prev_ladnm').join(prev_lad_lookup).reset_index()

# Where did the business come from?  Series.isin builds the lookup once,
# whereas the previous per-row lambda recomputed .unique() and scanned it
# for every record.  Missing values are never members, so they stay False.
# NOTE: oas_area_wandsworth is created in a later cell of this notebook and
# must already exist when this cell runs.
vneb_oa_codes = oas_area_wandsworth['oa21cd'].unique()
t['moved_inside_VNEB'] = t['prev_oa21cd'].isin(vneb_oa_codes)
t['moved_inside_borough'] = t['prev_ladnm'].isin(['Lambeth', 'Wandsworth'])
t['moved_inside_London'] = t['Official Name Region'].isin(['London'])
print(t.shape)
print(t[t['prev_ladnm'].isna()].shape)
print(t[~t['prev_ladnm'].isna()].shape)
print(t[t['Official Name Country'].isna()].shape)
print(t[~t['Official Name Country'].isna()].shape)

t.head()

In [None]:
# Count moved businesses per year and origin-flag combination.
origin_flags = ['moved_inside_London', 'moved_inside_borough', 'moved_inside_VNEB']
test = (
    t.fillna('')
    .groupby(['year'] + origin_flags)
    .count()[['prev_postcode']]
    .reset_index()
    .rename(columns={'prev_postcode': 'Moved Business Counts'})
)

count_cols = ['year', 'Moved Business Counts']
from_vneb = test['moved_inside_VNEB']
from_borough = test['moved_inside_borough']
from_london = test['moved_inside_London']

# Label each slice of the counts by the origin of the move.
test1 = test[from_vneb][count_cols]
test1['Moved Type'] = 'From Inside VNEB'
test2 = test[(~from_vneb) & (from_borough)][count_cols]
test2['Moved Type'] = 'From Outside VNEB Inside Boroughs'
test3 = test[(~from_borough) & (from_london)][count_cols]
test3['Moved Type'] = 'From Rest London'
test4 = test[(~from_london)][count_cols]
test4['Moved Type'] = 'From Outside London'

final_test = pd.concat([test1, test2, test3, test4], ignore_index=True)
test.head()

# Load Opportunity Area and Merge with Business Data

In [None]:
# Load the 2021 Output Area boundaries and split them by borough, using a
# case-insensitive substring test on the LSOA name.
oas = "Data\\OA Boundray\\Output_Areas_2021_EW_BFE_V9_-4280877107876255952\\OA_2021_EW_BFE_V9.shp"
oas_gdf = gpd.read_file(oas)
south_oas_gdf = oas_gdf[oas_gdf['LSOA21NM'].apply(lambda x: ('southwark' in x.lower()))].copy()
# .copy() (as for south_oas_gdf above) so that adding 'area_m2' below writes
# to an independent frame rather than to a slice of the full boundary file.
oas_gdf = oas_gdf[oas_gdf['LSOA21NM'].apply(
    lambda x: ('wandsworth' in x.lower()) or ('lambeth' in x.lower()))].copy()

# Polygon area in the units of the layer's CRS (presumably metres, given the
# column name and the EPSG:27700 CRS assigned later -- confirm).
oas_gdf["area_m2"] = oas_gdf.geometry.area
south_oas_gdf["area_m2"] = south_oas_gdf.geometry.area

##############
# PTAL grid (2023).
ptal_path = "Data\\TFL Data\\OneDrive_1_7-19-2025\\PTAL data\\11.PTAL Data 2023\\PTAL_2023_Grid_100m_100m.shp"
ptal_df = gpd.read_file(ptal_path)

# Opportunity Area polygons: pick the Old Kent Road site and the
# Lambeth/Wandsworth site, then reproject both to the Output Area CRS.
oa_area = 'Data\\Opportunity areas boundray\\Opportunity_Areas3\\Opportunity_Areas.shp'
oa_area_gdf = gpd.read_file(oa_area)
is_old_kent_road = oa_area_gdf['sitename'] == 'Old Kent Road'
is_lambeth_wandsworth = oa_area_gdf['borough'] == 'Lambeth,  Wandsworth'
south_oa_area_gdf = oa_area_gdf[is_old_kent_road].copy().to_crs(oas_gdf.crs)
wand_oa_area_gdf = oa_area_gdf[is_lambeth_wandsworth].copy().to_crs(oas_gdf.crs)

################
# Output Areas that intersect the Lambeth/Wandsworth opportunity area.
oas_area_wandsworth = oas_gdf.sjoin(wand_oa_area_gdf, predicate='intersects')
oas_area_wandsworth = oas_area_wandsworth.drop(columns=['index_right'])
oas_area_wandsworth.columns = oas_area_wandsworth.columns.str.lower()
oas_area_wandsworth = oas_area_wandsworth.rename(columns={'long': 'oa_long', 'lat': 'oa_lat'})
oas_area_wandsworth = gpd.GeoDataFrame(
    oas_area_wandsworth, geometry=oas_area_wandsworth['geometry'], crs="EPSG:27700")

# Geodesic distance in miles from each OA's (oa_lat, oa_long) point to the
# two reference stations.
battersea_station = (51.479774, -0.1418745)
nine_elms = (51.4799781, -0.1285638)
for distance_col, station_latlon in (('battersea_dis', battersea_station),
                                     ('nine_elms_dis', nine_elms)):
    oas_area_wandsworth[distance_col] = oas_area_wandsworth.apply(
        lambda row, target=station_latlon:
            geodesic((row['oa_lat'], row['oa_long']), target).mi,
        axis=1)

# Distance to the nearer station and which station that is (ties -> battersea).
to_battersea = oas_area_wandsworth['battersea_dis']
to_nine_elms = oas_area_wandsworth['nine_elms_dis']
oas_area_wandsworth['min_dis'] = np.where(to_nine_elms < to_battersea, to_nine_elms, to_battersea)
oas_area_wandsworth['min_dis_station'] = np.where(
    oas_area_wandsworth['min_dis'] == oas_area_wandsworth['battersea_dis'],
    'battersea', 'nine_elms')

# Proximity bands by nearest-station distance:
#   > 1 -> prox4, > 0.5 -> prox3, > 0.25 -> prox2, otherwise prox1.
nearest = oas_area_wandsworth['min_dis']
oas_area_wandsworth['prox_class'] = np.select(
    [nearest > 1, nearest > 0.5, nearest > 0.25],
    ['prox4', 'prox3', 'prox2'],
    default='prox1')

###############
# Southwark Output Areas that intersect the Old Kent Road opportunity area.
oas_area_south = south_oas_gdf.sjoin(south_oa_area_gdf, predicate='intersects')
oas_area_south = oas_area_south.drop(columns=['index_right'])
oas_area_south.columns = oas_area_south.columns.str.lower()
oas_area_south = oas_area_south.rename(columns={'long': 'oa_long', 'lat': 'oa_lat'})
oas_area_south = gpd.GeoDataFrame(
    oas_area_south, geometry=oas_area_south['geometry'], crs="EPSG:27700")

# No station distances are computed for this area: every OA gets the outermost
# proximity band and empty distance fields.
oas_area_south['prox_class'] = 'prox4'
for empty_col in ('min_dis_station', 'min_dis'):
    oas_area_south[empty_col] = np.nan

# Create rates based on previous run business data

In [None]:
# Load the previous-run (v4) business datasets.  The v4 Wandsworth file gets
# its local-authority name set here before the Lambeth rows are appended.
final_br_census_df = pd.read_pickle("final_br_census_df_v4.pkl")
final_br_census_df['ladnm'] = 'Wandsworth'
final_south_br_census_df = pd.read_pickle(
    "lam_south_final_br_census_df_v4.pkl")[final_br_census_df.columns]
final_br_census_df = pd.concat(
    [final_br_census_df,
     final_south_br_census_df[final_south_br_census_df['ladnm'] == 'Lambeth']],
    ignore_index = True)
final_south_br_census_df = final_south_br_census_df[
    final_south_br_census_df['ladnm'] == 'Southwark'].copy()

###########################

print(final_br_census_df.shape)
print(final_south_br_census_df.shape)

# Attach opportunity-area attributes by Output Area code; businesses outside
# the opportunity area end up with a missing 'sitename'.
oa_attribute_cols = ['sitename', 'area_m2', 'hectares', 'oa_long', 'oa_lat']
final_br_census_df = final_br_census_df.set_index('oa21cd').join(
    oas_area_wandsworth.set_index('oa21cd')[oa_attribute_cols]).reset_index()
final_south_br_census_df = final_south_br_census_df.set_index('oa21cd').join(
    oas_area_south.set_index('oa21cd')[oa_attribute_cols]).reset_index()

# Keep only businesses inside an opportunity area.  .copy() makes these
# independent frames: later statements add columns to them, which would
# otherwise be chained assignment on a slice of the parent frame.
final_br_census_oa_df = final_br_census_df[final_br_census_df['sitename'].notna()].copy()
final_south_br_census_oa_df = final_south_br_census_df[
    final_south_br_census_df['sitename'].notna()].copy()

def _per_floor_area(row, amount_col):
    """Return row[amount_col] divided by the floor area, or NaN when the floor area is missing."""
    if pd.isna(row['floor_area']):
        return np.nan
    return row[amount_col] / row['floor_area']


def _tidy_sub_category(raw_label):
    """Turn a raw SCAT sub-category code into its display label."""
    label = raw_label.replace("OFFICE_DATA_CENTRES", "Office IT / Data Centres")
    label = label.replace("RETAIL_OTHER_SUPERSTORES_AND_WAREHOUSES",
                          "Retail Other Superstores/ Retail Warehouses")
    label = label.replace("LEISURE_GENERAL_HOTELS", "Leisure General Hotels (3 star)")
    label = label.replace("&", "and").replace("  ", " ").replace("_", " ")
    return label.title()


# The same enrichment is applied to both opportunity-area frames.
for oa_businesses in (final_south_br_census_oa_df, final_br_census_oa_df):
    # Expected rates and rental valuation normalised by floor area.
    for amount_col in ('rates_expected', 'rental_valuation'):
        oa_businesses[amount_col + '_per_area'] = oa_businesses.apply(
            lambda x, col=amount_col: _per_floor_area(x, col), axis=1)

for oa_businesses in (final_br_census_oa_df, final_south_br_census_oa_df):
    # Human-readable category labels.
    oa_businesses['scat_category'] = oa_businesses['scat_category'].apply(
        lambda x: x.title())
    oa_businesses['scat_sub_category'] = oa_businesses['scat_sub_category'].apply(
        _tidy_sub_category)

rate_value_cols = ['floor_area', 'rental_valuation', 'rates_expected',
                   'rates_expected_per_area', 'rental_valuation_per_area']


def _mean_rates(businesses, industry_col=None):
    """Average the rate columns per OA and year, optionally also per *industry_col*.

    Without *industry_col* the result is labelled industry == 'All'; with it,
    that column is renamed to 'industry'.
    """
    keys = ['oa21cd', 'year']
    if industry_col is None:
        means = businesses[keys + rate_value_cols].groupby(keys).mean().reset_index()
        means['industry'] = 'All'
        return means
    keys = keys + [industry_col]
    means = businesses[keys + rate_value_cols].groupby(keys).mean().reset_index()
    return means.rename(columns={industry_col: 'industry'}).copy()


# Lambeth/Wandsworth opportunity area: overall, per category, per sub-category.
rates_df = _mean_rates(final_br_census_oa_df)
rates_df_cat = _mean_rates(final_br_census_oa_df, 'scat_category')
rates_df_sub_cat = _mean_rates(final_br_census_oa_df, 'scat_sub_category')
rates_df = pd.concat([rates_df, rates_df_cat, rates_df_sub_cat], ignore_index=True)
rates_df = rates_df.sort_values(by=['oa21cd', 'year', 'industry'])

# Southwark opportunity area: same three aggregation levels.
south_rates_df = _mean_rates(final_south_br_census_oa_df)
south_rates_df_cat = _mean_rates(final_south_br_census_oa_df, 'scat_category')
south_rates_df_sub_cat = _mean_rates(final_south_br_census_oa_df, 'scat_sub_category')
south_rates_df = pd.concat(
    [south_rates_df, south_rates_df_cat, south_rates_df_sub_cat], ignore_index=True)
south_rates_df = south_rates_df.sort_values(by=['oa21cd', 'year', 'industry'])

rates_df.to_pickle("oa_rates_df.pkl")
south_rates_df.to_pickle("oa_south_rates_df.pkl")
south_rates_df

# Finalise Business Dataset for VNEB and OKR OA

In [None]:
# Load the v5 business datasets; Lambeth rows join the main frame and the
# Southwark rows stay separate.
final_br_census_df = pd.read_pickle("final_br_census_df_v5.pkl")
final_south_br_census_df = pd.read_pickle(
    "lam_south_final_br_census_df_v5.pkl")[final_br_census_df.columns]
lambeth_businesses = final_south_br_census_df[final_south_br_census_df['ladnm'] == 'Lambeth']
final_br_census_df = pd.concat([final_br_census_df, lambeth_businesses], ignore_index=True)
final_south_br_census_df = final_south_br_census_df[
    final_south_br_census_df['ladnm'] == 'Southwark'].copy()

###########################
# Manual PTAL values keyed by Output Area code (not referenced in this cell).
oa_missing_ptals = {'E00015701' : '6a', 'E00171043' : '3', 'E00171044' : '3', 'E00177504' : '6b',
        'E00177572' : '6a', 'E00177582' : '5', 'E00177587' : '5', 'E00177612' : '6a', 'E00177619' : '6a',
        'E00183114' : '3', 'E00183164' : '2', 'E00183170' : '3', 'E00183175' : '4'}


def _ptal_shares(levels):
    """Map each PTAL level in *levels* to its share, ordered from highest to lowest level string."""
    ordered = sorted(levels, reverse=True)
    tally = Counter(ordered)
    return {level: hits / len(ordered) for level, hits in tally.items()}


def _summarise_oa_ptal(oa_grid_hits):
    """Per Output Area: PTAL share dict ('PTAL_2023') and most frequent level ('PTAL2023_main')."""
    summary = oa_grid_hits.groupby('oa21cd').agg(list)[['PTAL_2023']]
    summary['PTAL_2023'] = summary['PTAL_2023'].apply(_ptal_shares)
    # On equal shares the first key wins, i.e. the highest level string.
    summary['PTAL2023_main'] = summary['PTAL_2023'].apply(
        lambda shares: max(shares, key=shares.get))
    return summary


ptal_cells = ptal_df[['PTAL_2023', 'geometry']]

# --- Lambeth/Wandsworth opportunity area: one row per (OA, intersecting PTAL cell).
oas_area_wandsworth_tmp = oas_area_wandsworth.sjoin(
    ptal_cells, predicate="intersects").drop(columns=['index_right'])
print(oas_area_wandsworth_tmp.shape)
print(oas_area_wandsworth.shape)
print(len(oas_area_wandsworth_tmp['oa21cd'].unique()))
print(len(oas_area_wandsworth['oa21cd'].unique()))

oas_area_wandsworth_tmp['PTAL_2023'] = oas_area_wandsworth_tmp['PTAL_2023'].map(str)
print(len(oas_area_wandsworth_tmp))
oas_area_wandsworth_tmp_ptal2023 = _summarise_oa_ptal(oas_area_wandsworth_tmp)

oas_area_wandsworth = oas_area_wandsworth.set_index('oa21cd').join(
    oas_area_wandsworth_tmp_ptal2023).reset_index()
print(oas_area_wandsworth.shape)

# --- Southwark opportunity area: same procedure.
oas_area_south_tmp = oas_area_south.sjoin(
    ptal_cells, predicate="intersects").drop(columns=['index_right'])
print(oas_area_south_tmp.shape)
print(oas_area_south.shape)
print(len(oas_area_south_tmp['oa21cd'].unique()))
print(len(oas_area_south['oa21cd'].unique()))

oas_area_south_tmp['PTAL_2023'] = oas_area_south_tmp['PTAL_2023'].map(str)
print(len(oas_area_south_tmp))
oas_area_south_tmp_ptal2023 = _summarise_oa_ptal(oas_area_south_tmp)

oas_area_south = oas_area_south.set_index('oa21cd').join(
    oas_area_south_tmp_ptal2023).reset_index()
print(oas_area_south.shape)

##############################
def _event_year(value):
    """Return the calendar year of a date given as a string or datetime-like, or None when missing."""
    if pd.isna(value):
        return None
    if isinstance(value, str):
        return parse(value).year
    return value.year


##############################
print(final_br_census_df.shape)
print(final_south_br_census_df.shape)

# Attach opportunity-area attributes (including main PTAL level and proximity
# band) by Output Area code.
oa_attribute_cols = ['sitename', 'area_m2', 'hectares', 'oa_long', 'oa_lat',
                     'PTAL2023_main', 'prox_class']
final_br_census_df = final_br_census_df.set_index('oa21cd').join(
    oas_area_wandsworth.set_index('oa21cd')[oa_attribute_cols]).reset_index()
final_south_br_census_df = final_south_br_census_df.set_index('oa21cd').join(
    oas_area_south.set_index('oa21cd')[oa_attribute_cols]).reset_index()

# Businesses inside an opportunity area.  .copy() gives independent frames so
# the flag columns below are not written onto a slice of the parent frame.
final_br_census_oa_df = final_br_census_df[final_br_census_df['sitename'].notna()].copy()
final_south_br_census_oa_df = final_south_br_census_df[
    final_south_br_census_df['sitename'].notna()].copy()

# A business is "new" in the year it was incorporated and "dissolved" in the
# year it was dissolved.  All four flags go through _event_year so string and
# datetime-like dates are handled the same way in both frames; a missing date
# yields False instead of raising.
for oa_businesses in (final_br_census_oa_df, final_south_br_census_oa_df):
    oa_businesses['new_business'] = oa_businesses.apply(
        lambda x: _event_year(x['incorporationdate']) == x['year'], axis = 1)
    oa_businesses['dissolved_business'] = oa_businesses.apply(
        lambda x: _event_year(x['dissolutiondate']) == x['year'], axis = 1)

print(final_br_census_df.shape)
print(final_br_census_oa_df.shape)
print(final_south_br_census_df.shape)
print(final_south_br_census_oa_df.shape)
final_br_census_oa_df.head()

In [None]:
# Postcode -> latitude/longitude lookup built from the SE, SW and CR postcode files.
pc_df1, pc_df2, pc_df3 = (
    pd.read_csv(csv_path)
    for csv_path in ("Data\\PCD_OA21_LSOA21\\SE postcodes.csv",
                     "Data\\PCD_OA21_LSOA21\\SW postcodes.csv",
                     "Data\\PCD_OA21_LSOA21\\CR postcodes.csv")
)
postcode_renames = {'Postcode': 'postcode', 'Latitude': 'pc_lat', 'Longitude': 'pc_long'}
pc_long_lat_df = pd.concat([pc_df1, pc_df2, pc_df3], ignore_index=True)
pc_long_lat_df = pc_long_lat_df.rename(columns=postcode_renames)[['postcode', 'pc_lat', 'pc_long']]

In [None]:
# Enrich the Lambeth/Wandsworth opportunity-area businesses with coordinates,
# station distances, proximity bands, PTAL level and move indicators.
# The statements are order-dependent: each "prev_*" column is a per-company
# shift and relies on the preceding sort by (CompanyID, year).
print(final_br_census_oa_df.shape)
# Add postcode-level coordinates (pc_lat / pc_long) by postcode.
final_br_census_oa_df = final_br_census_oa_df.set_index('postcode').join(pc_long_lat_df.set_index('postcode')).reset_index()
print(final_br_census_oa_df.shape)
print(final_br_census_oa_df[final_br_census_oa_df['pc_long'].apply(pd.isna)].shape)
# Fall back to the postcode coordinates where the business has none of its own.
final_br_census_oa_df['lat'] = final_br_census_oa_df.apply(lambda x: x['lat'] if not pd.isna(x['lat'])
                                                           else x['pc_lat'], axis = 1)
final_br_census_oa_df['long'] = final_br_census_oa_df.apply(lambda x: x['long'] if not pd.isna(x['long'])
                                                           else x['pc_long'], axis = 1)

####################
# Reference stations as (latitude, longitude); distances below are geodesic miles.
battersea_station = (51.479774, -0.1418745)
nine_elms = (51.4799781, -0.1285638)
final_br_census_oa_df['battersea_dis'] = final_br_census_oa_df.apply(lambda x:
                                         geodesic((x['lat'], x['long']), battersea_station).mi, axis = 1)
final_br_census_oa_df['nine_elms_dis'] = final_br_census_oa_df.apply(lambda x:
                                         geodesic((x['lat'], x['long']), nine_elms).mi, axis = 1)
final_br_census_oa_df['min_dis'] = final_br_census_oa_df.apply(lambda x: min([x['battersea_dis'],
                                                  x['nine_elms_dis']]), axis = 1)
# Proximity band from the nearest-station distance, assigned in three passes:
# > 1 -> prox4, > 0.5 -> prox3, > 0.25 -> prox2, otherwise prox1.
final_br_census_oa_df['bus_prox_class'] = final_br_census_oa_df.apply(lambda x: 'prox4'
                                  if x['min_dis'] > 1 else 'prox3', axis = 1)
final_br_census_oa_df['bus_prox_class'] = final_br_census_oa_df.apply(lambda x: x['bus_prox_class']
                                  if x['min_dis'] > 0.5 else 'prox2', axis = 1)
final_br_census_oa_df['bus_prox_class'] = final_br_census_oa_df.apply(lambda x: x['bus_prox_class']
                                  if x['min_dis'] > 0.25 else 'prox1', axis = 1)
# Previous band = the same company's band in its preceding row (by year).
final_br_census_oa_df = final_br_census_oa_df.sort_values(by = ['CompanyID', 'year'])
final_br_census_oa_df['prev_bus_prox_class'] = final_br_census_oa_df.groupby('CompanyID')['bus_prox_class'].shift()
# NOTE(review): get_moved is called with its default col='bus_prox_class'.
# Its "larger value means further" branch only fires for col == 'prox_class',
# so here a change such as 'prox1' -> 'prox4' is reported as
# prox_moved_closer.  Confirm whether that is intended.
final_br_census_oa_df['prox_moved_closer'], final_br_census_oa_df['prox_moved_further'], \
   final_br_census_oa_df['prox_moved_same'] = zip(*final_br_census_oa_df.apply(get_moved, axis = 1))

# Build point geometries from long/lat (EPSG:4326) and reproject to EPSG:27700
# before intersecting with the PTAL grid.
final_br_census_oa_df = gpd.GeoDataFrame(
    final_br_census_oa_df, geometry=gpd.points_from_xy(final_br_census_oa_df['long'],
                                               final_br_census_oa_df['lat']), crs="EPSG:4326")
final_br_census_oa_df = final_br_census_oa_df.to_crs(epsg=27700)
# NOTE(review): sjoin is called without `how`; geopandas documents the default
# as an inner join, so businesses that intersect no PTAL cell would be dropped
# here -- confirm this is acceptable.
final_br_census_oa_df = final_br_census_oa_df.sjoin(ptal_df[['PTAL_2023','geometry']].rename(
    columns = {'PTAL_2023' : 'bus_PTAL_2023'}), predicate="intersects").drop(columns = ['index_right'])
# PTAL levels are stored as strings, so get_moved compares them as strings.
final_br_census_oa_df['bus_PTAL_2023'] = final_br_census_oa_df['bus_PTAL_2023'].apply(str)
final_br_census_oa_df = final_br_census_oa_df.sort_values(by = ['CompanyID', 'year'])
final_br_census_oa_df['prev_bus_PTAL_2023'] = final_br_census_oa_df.groupby('CompanyID')['bus_PTAL_2023'].shift()
final_br_census_oa_df['PTAL_moved_closer'], final_br_census_oa_df['PTAL_moved_further'], \
   final_br_census_oa_df['PTAL_moved_same'] = zip(*final_br_census_oa_df.apply(lambda x: get_moved(x,
                                                               col = 'bus_PTAL_2023'), axis = 1))

# moved_within: the business moved but its previous row has the same Output Area.
final_br_census_oa_df['prev_oa21cd'] = final_br_census_oa_df.groupby('CompanyID')['oa21cd'].shift()
final_br_census_oa_df['moved_within'] = final_br_census_oa_df.apply(get_moved_within, axis = 1)

final_br_census_oa_df.head()

In [None]:
# Enrich the Southwark opportunity-area businesses with coordinates, PTAL
# level and move indicators.  No station distances are computed for this area.
print(final_south_br_census_oa_df.shape)
# Add postcode-level coordinates (pc_lat / pc_long) by postcode.
final_south_br_census_oa_df = final_south_br_census_oa_df.set_index('postcode').join(pc_long_lat_df.set_index('postcode')
                                                                                    ).reset_index()
print(final_south_br_census_oa_df.shape)
print(final_south_br_census_oa_df[final_south_br_census_oa_df['pc_long'].apply(pd.isna)].shape)
# Fall back to the postcode coordinates where the business has none of its own.
final_south_br_census_oa_df['lat'] = final_south_br_census_oa_df.apply(lambda x: x['lat'] if not pd.isna(x['lat'])
                                                           else x['pc_lat'], axis = 1)
final_south_br_census_oa_df['long'] = final_south_br_census_oa_df.apply(lambda x: x['long'] if not pd.isna(x['long'])
                                                           else x['pc_long'], axis = 1)

####################

# Distance fields are left empty and every business gets band 'prox4' for both
# the current and the previous row.  With equal bands, get_moved returns
# prox_moved_same for every moved business and all-False for the rest.
final_south_br_census_oa_df['battersea_dis'] = np.nan
final_south_br_census_oa_df['nine_elms_dis'] = np.nan
final_south_br_census_oa_df['min_dis'] = np.nan
final_south_br_census_oa_df['bus_prox_class'] = 'prox4'
final_south_br_census_oa_df['prev_bus_prox_class'] = 'prox4'
final_south_br_census_oa_df['prox_moved_closer'], final_south_br_census_oa_df['prox_moved_further'], \
   final_south_br_census_oa_df['prox_moved_same'] = zip(*final_south_br_census_oa_df.apply(get_moved, axis = 1))

# Build point geometries from long/lat (EPSG:4326) and reproject to EPSG:27700
# before intersecting with the PTAL grid.
final_south_br_census_oa_df = gpd.GeoDataFrame(
    final_south_br_census_oa_df, geometry=gpd.points_from_xy(final_south_br_census_oa_df['long'],
                                               final_south_br_census_oa_df['lat']), crs="EPSG:4326")
final_south_br_census_oa_df = final_south_br_census_oa_df.to_crs(epsg=27700)
# NOTE(review): sjoin is called without `how`; geopandas documents the default
# as an inner join, so businesses that intersect no PTAL cell would be dropped
# here -- confirm this is acceptable.
final_south_br_census_oa_df = final_south_br_census_oa_df.sjoin(ptal_df[['PTAL_2023','geometry']].rename(
    columns = {'PTAL_2023' : 'bus_PTAL_2023'}), predicate="intersects").drop(columns = ['index_right'])
# PTAL levels are stored as strings, so get_moved compares them as strings.
final_south_br_census_oa_df['bus_PTAL_2023'] = final_south_br_census_oa_df['bus_PTAL_2023'].apply(str)
# Previous PTAL = the same company's PTAL in its preceding row (by year).
final_south_br_census_oa_df = final_south_br_census_oa_df.sort_values(by = ['CompanyID', 'year'])
final_south_br_census_oa_df['prev_bus_PTAL_2023'] = final_south_br_census_oa_df.groupby('CompanyID')['bus_PTAL_2023'].shift()
final_south_br_census_oa_df['PTAL_moved_closer'], final_south_br_census_oa_df['PTAL_moved_further'], \
   final_south_br_census_oa_df['PTAL_moved_same'] = zip(*final_south_br_census_oa_df.apply(lambda x: get_moved(x,
                                                               col = 'bus_PTAL_2023'), axis = 1))

# moved_within: the business moved but its previous row has the same Output Area.
final_south_br_census_oa_df['prev_oa21cd'] = final_south_br_census_oa_df.groupby('CompanyID')['oa21cd'].shift()
final_south_br_census_oa_df['moved_within'] = final_south_br_census_oa_df.apply(get_moved_within, axis = 1)

final_br_census_oa_df['rates_expected_per_area'] = final_br_census_oa_df.apply(lambda x: np.nan
                       if pd.isna(x['floor_area']) else x['rates_expected'] / x['floor_area'], axis = 1)
final_br_census_oa_df['rental_valuation_per_area'] = final_br_census_oa_df.apply(lambda x: np.nan
                       if pd.isna(x['floor_area']) else x['rental_valuation'] / x['floor_area'], axis = 1)
final_south_br_census_oa_df['rates_expected_per_area'] = final_south_br_census_oa_df.apply(lambda x: np.nan
                       if pd.isna(x['floor_area']) else x['rates_expected'] / x['floor_area'], axis = 1)
final_south_br_census_oa_df['rental_valuation_per_area'] = final_south_br_census_oa_df.apply(lambda x: np.nan
                       if pd.isna(x['floor_area']) else x['rental_valuation'] / x['floor_area'], axis = 1)

final_br_census_oa_df['scat_category'] = final_br_census_oa_df['scat_category'].apply(lambda x: x.title())
final_br_census_oa_df['scat_sub_category'] = final_br_census_oa_df['scat_sub_category'].apply(lambda x: x.replace("OFFICE_DATA_CENTRES",
       "Office IT / Data Centres").replace("RETAIL_OTHER_SUPERSTORES_AND_WAREHOUSES", "Retail Other Superstores/ Retail Warehouses"
       ).replace("LEISURE_GENERAL_HOTELS", "Leisure General Hotels (3 star)").replace("&", "and").replace("  ", " ").replace(
    "_", " ").title())

final_south_br_census_oa_df['scat_category'] = final_south_br_census_oa_df['scat_category'].apply(lambda x: x.title())
final_south_br_census_oa_df['scat_sub_category'] = final_south_br_census_oa_df['scat_sub_category'].apply(lambda x: x.replace("OFFICE_DATA_CENTRES",
       "Office IT / Data Centres").replace("RETAIL_OTHER_SUPERSTORES_AND_WAREHOUSES", "Retail Other Superstores/ Retail Warehouses"
       ).replace("LEISURE_GENERAL_HOTELS", "Leisure General Hotels (3 star)").replace("&", "and").replace("  ", " ").replace(
    "_", " ").title())

final_br_census_oa_df.to_pickle("final_br_census_oa_df_v3.pkl")
final_south_br_census_oa_df.to_pickle("final_south_br_census_oa_df_v3.pkl")

final_south_br_census_oa_df.head()

# Calculate Agglomeration Distance

In [None]:
##############################
def _centroid_spread(businesses, industry):
    """Measure how far businesses sit from their OA/year centroid.

    Parameters:
        businesses: frame with 'oa21cd', 'year', 'pc_lat' and 'pc_long' columns.
        industry: label written to the 'industry' column of every output row.

    Returns:
        One row per (oa21cd, year) with the mean pc_long / pc_lat (the
        centroid), 'coords' (the list of (pc_lat, pc_long) tuples in that
        group), 'industry', 'cent_distances' (geodesic distance in metres from
        the centroid to each coordinate) and 'cent_mean_dis' (their mean).
    """
    located = businesses[['oa21cd', 'year', 'pc_long', 'pc_lat']].copy()
    located['coords'] = list(zip(located['pc_lat'], located['pc_long']))
    # 'mean' (string) is the same aggregation pandas performs for np.mean, but
    # does not trigger the callable-deprecation FutureWarning of pandas >= 2.1.
    grouped = located.groupby(['oa21cd', 'year']).agg(
        {'pc_long': 'mean', 'pc_lat': 'mean', 'coords': list}).reset_index()
    grouped['industry'] = industry
    # Built with a comprehension rather than a row-wise apply so that a slice
    # with no (oa21cd, year) groups yields an empty column instead of an error.
    grouped['cent_distances'] = [
        [geodesic((centroid_lat, centroid_long), point).m for point in points]
        for centroid_lat, centroid_long, points
        in zip(grouped['pc_lat'], grouped['pc_long'], grouped['coords'])
    ]
    grouped['cent_mean_dis'] = grouped['cent_distances'].apply(np.mean)
    return grouped


def _agglomeration_table(businesses):
    """Stack the centroid-spread table for all businesses and for every industry.

    The 'All' table comes first, followed by one table per distinct
    scat_category value and then one per distinct scat_sub_category value.
    """
    tables = [_centroid_spread(businesses, 'All')]
    for industry_col in ('scat_category', 'scat_sub_category'):
        for industry in businesses[industry_col].unique().tolist():
            in_industry = businesses[businesses[industry_col] == industry]
            tables.append(_centroid_spread(in_industry, industry))
    return pd.concat(tables, ignore_index=True)


final_agg_distance = _agglomeration_table(final_br_census_oa_df)

############################
final_south_agg_distance = _agglomeration_table(final_south_br_census_oa_df)

final_agg_distance.to_pickle("final_agg_distance_v3.pkl")
final_south_agg_distance.to_pickle("final_south_agg_distance_v3.pkl")

# OA Businesses Aggregation

In [None]:
#OA Businesses Aggregation
# Re-applied here so the cell also works on a frame whose labels have not been
# normalised yet; on already-normalised labels these calls change nothing.
final_br_census_oa_df['scat_category'] = final_br_census_oa_df['scat_category'].apply(lambda x: x.title())
final_br_census_oa_df['scat_sub_category'] = final_br_census_oa_df['scat_sub_category'].apply(lambda x: x.replace("OFFICE_DATA_CENTRES",
       "Office IT / Data Centres").replace("RETAIL_OTHER_SUPERSTORES_AND_WAREHOUSES", "Retail Other Superstores/ Retail Warehouses"
       ).replace("LEISURE_GENERAL_HOTELS", "Leisure General Hotels (3 star)").replace("&", "and").replace("  ", " ").replace(
    "_", " ").title())


def _summarise_businesses(rows):
    """Summarise one slice of business records.

    Parameters:
        rows: the business records of a single (output area, year[, industry])
            slice; may be empty.

    Returns:
        A dict whose keys, in order, are the output columns from
        'bus_count_actual' through 'floor_area_std': the slice size, the
        per-source counts, a '<flag>_rate' and '<flag>_count' for every
        boolean flag, and the mean / standard deviation of the value columns.
    """
    n_rows = len(rows)

    def source_count(label):
        # A zero count is stored as NaN so the post-processing after the loop
        # can back-fill or zero it. The test is on the count itself for every
        # slice; the per-industry rows used to test the whole-OA total
        # instead, so their Census/Both counts were never back-filled.
        matches = len(rows[rows['source'] == label])
        return matches if matches else np.nan

    moved = rows['moved']
    flag_counts = {
        'survival': len(rows[rows['survived']]),
        'new': len(rows[rows['new_business']]),
        'moved': len(rows[moved]),
        'moved_within': len(rows[moved & rows['moved_within']]),
        'moved_outside': len(rows[moved & ~rows['moved_within']]),
        'dissolved': len(rows[rows['dissolved_business']]),
    }
    for flag_col in ('PTAL_moved_closer', 'PTAL_moved_further', 'PTAL_moved_same',
                     'prox_moved_closer', 'prox_moved_further', 'prox_moved_same'):
        flag_counts[flag_col] = len(rows[rows[flag_col]])

    summary = {
        'bus_count_actual': n_rows,
        'census_bus_count': source_count('Census'),
        'br_bus_count': source_count('BR'),
        'both_bus_count': source_count('Both'),
    }
    # Rates are defined as 0 for an empty slice.
    for name, count in flag_counts.items():
        summary[f'{name}_rate'] = count / n_rows if n_rows else 0
    for name, count in flag_counts.items():
        summary[f'{name}_count'] = count
    for value_col in ('rates_expected', 'rental_valuation', 'floor_area',
                      'rates_expected_per_area', 'rental_valuation_per_area'):
        summary[f'{value_col}_mean'] = rows[value_col].mean()
    for value_col in ('rates_expected', 'rental_valuation', 'floor_area'):
        summary[f'{value_col}_std'] = rows[value_col].std()
    return summary


br_census_df = final_br_census_oa_df.copy()
industry_details = {'scat_category' : br_census_df['scat_category'].unique().tolist(),
 'scat_sub_category' : br_census_df['scat_sub_category'].unique().tolist()}

oas = br_census_df['oa21cd'].unique().tolist()
years = br_census_df['year'].unique().tolist()
oa_records = []

# One output row per OA x year for all businesses ('All') plus one per
# OA x year x industry label, including combinations that have no businesses.
for oa in oas:
    print(f"Processing oa {oa} with time is {str(datetime.datetime.now())}")
    # Slice once per OA and once per year instead of re-filtering the full
    # frame for every OA x year x industry combination.
    oa_rows = br_census_df[br_census_df['oa21cd'] == oa]
    # OA-level attributes are taken from the OA's first record.
    oa_static = {
        'area_m2': oa_rows['area_m2'].iloc[0],
        'oa_ladnm': oa_rows['ladnm'].iloc[0],
        'oa_plat2023': oa_rows['PTAL2023_main'].iloc[0],
        'oa_prox': oa_rows['prox_class'].iloc[0],
        'oa_lat': oa_rows['oa_lat'].iloc[0],
        'oa_long': oa_rows['oa_long'].iloc[0],
    }
    for year in years:
        year_rows = oa_rows[oa_rows['year'] == year]
        oa_records.append({'oa21cd': oa, 'year': year, **oa_static,
                           **_summarise_businesses(year_rows), 'industry': 'All'})
        for industry_col in industry_details:
            for industry in industry_details[industry_col]:
                industry_rows = year_rows[year_rows[industry_col] == industry]
                oa_records.append({'oa21cd': oa, 'year': year, **oa_static,
                                   **_summarise_businesses(industry_rows),
                                   'industry': industry})

group_cols = ['oa21cd', 'industry']
oa_final_df = pd.DataFrame(oa_records).sort_values(by = ['oa21cd', 'year'])
# Census and 'Both' counts missing in a year are taken from the next later year
# of the same OA/industry that has one (rows are sorted by year), else 0.
oa_final_df['census_bus_count'] = oa_final_df.groupby(group_cols)['census_bus_count'].bfill().fillna(0)
oa_final_df['br_bus_count'] = oa_final_df['br_bus_count'].fillna(0)
oa_final_df['both_bus_count'] = oa_final_df.groupby(group_cols)['both_bus_count'].bfill().fillna(0)
oa_final_df['bus_count'] = oa_final_df['census_bus_count'] + oa_final_df['br_bus_count'] \
                                + oa_final_df['both_bus_count']
oa_final_df.to_pickle("VNEB_oa_final_df_v5.pkl")
oa_final_df.head()

In [None]:
#OA Businesses Aggregation

# Re-applied here so the cell also works on a frame whose labels have not been
# normalised yet; on already-normalised labels these calls change nothing.
final_south_br_census_oa_df['scat_category'] = final_south_br_census_oa_df['scat_category'].apply(lambda x: x.title())
final_south_br_census_oa_df['scat_sub_category'] = final_south_br_census_oa_df['scat_sub_category'].apply(lambda x: x.replace("OFFICE_DATA_CENTRES",
       "Office IT / Data Centres").replace("RETAIL_OTHER_SUPERSTORES_AND_WAREHOUSES", "Retail Other Superstores/ Retail Warehouses"
       ).replace("LEISURE_GENERAL_HOTELS", "Leisure General Hotels (3 star)").replace("&", "and").replace("  ", " ").replace(
    "_", " ").title())


def _summarise_businesses(rows):
    """Summarise one slice of business records.

    Parameters:
        rows: the business records of a single (output area, year[, industry])
            slice; may be empty.

    Returns:
        A dict whose keys, in order, are the output columns from
        'bus_count_actual' through 'floor_area_std': the slice size, the
        per-source counts, a '<flag>_rate' and '<flag>_count' for every
        boolean flag, and the mean / standard deviation of the value columns.
    """
    n_rows = len(rows)

    def source_count(label):
        # A zero count is stored as NaN so the post-processing after the loop
        # can back-fill or zero it. The test is on the count itself for every
        # slice; the per-industry rows used to test the whole-OA total
        # instead, so their Census counts were never back-filled.
        matches = len(rows[rows['source'] == label])
        return matches if matches else np.nan

    moved = rows['moved']
    flag_counts = {
        'survival': len(rows[rows['survived']]),
        'new': len(rows[rows['new_business']]),
        'moved': len(rows[moved]),
        'moved_within': len(rows[moved & rows['moved_within']]),
        'moved_outside': len(rows[moved & ~rows['moved_within']]),
        'dissolved': len(rows[rows['dissolved_business']]),
    }
    for flag_col in ('PTAL_moved_closer', 'PTAL_moved_further', 'PTAL_moved_same',
                     'prox_moved_closer', 'prox_moved_further', 'prox_moved_same'):
        flag_counts[flag_col] = len(rows[rows[flag_col]])

    summary = {
        'bus_count_actual': n_rows,
        'census_bus_count': source_count('Census'),
        'br_bus_count': source_count('BR'),
        'both_bus_count': source_count('Both'),
    }
    # Rates are defined as 0 for an empty slice.
    for name, count in flag_counts.items():
        summary[f'{name}_rate'] = count / n_rows if n_rows else 0
    for name, count in flag_counts.items():
        summary[f'{name}_count'] = count
    for value_col in ('rates_expected', 'rental_valuation', 'floor_area',
                      'rates_expected_per_area', 'rental_valuation_per_area'):
        summary[f'{value_col}_mean'] = rows[value_col].mean()
    for value_col in ('rates_expected', 'rental_valuation', 'floor_area'):
        summary[f'{value_col}_std'] = rows[value_col].std()
    return summary


br_census_df = final_south_br_census_oa_df.copy()
industry_details = {'scat_category' : br_census_df['scat_category'].unique().tolist(),
 'scat_sub_category' : br_census_df['scat_sub_category'].unique().tolist()}

oas = br_census_df['oa21cd'].unique().tolist()
years = br_census_df['year'].unique().tolist()
south_oa_records = []

# One output row per OA x year for all businesses ('All') plus one per
# OA x year x industry label, including combinations that have no businesses.
for oa in oas:
    print(f"Processing oa {oa} with time is {str(datetime.datetime.now())}")
    # Slice once per OA and once per year instead of re-filtering the full
    # frame for every OA x year x industry combination.
    oa_rows = br_census_df[br_census_df['oa21cd'] == oa]
    # OA-level attributes are taken from the OA's first record.
    oa_static = {
        'area_m2': oa_rows['area_m2'].iloc[0],
        'oa_ladnm': oa_rows['ladnm'].iloc[0],
        'oa_plat2023': oa_rows['PTAL2023_main'].iloc[0],
        'oa_prox': oa_rows['prox_class'].iloc[0],
        'oa_lat': oa_rows['oa_lat'].iloc[0],
        'oa_long': oa_rows['oa_long'].iloc[0],
    }
    for year in years:
        year_rows = oa_rows[oa_rows['year'] == year]
        south_oa_records.append({'oa21cd': oa, 'year': year, **oa_static,
                                 **_summarise_businesses(year_rows), 'industry': 'All'})
        for industry_col in industry_details:
            for industry in industry_details[industry_col]:
                industry_rows = year_rows[year_rows[industry_col] == industry]
                south_oa_records.append({'oa21cd': oa, 'year': year, **oa_static,
                                         **_summarise_businesses(industry_rows),
                                         'industry': industry})

group_cols = ['oa21cd', 'industry']
south_oa_final_df = pd.DataFrame(south_oa_records).sort_values(by = ['oa21cd', 'year'])
# Census counts missing in a year are taken from the next later year of the
# same OA/industry that has one (rows are sorted by year), else 0.
south_oa_final_df['census_bus_count'] = south_oa_final_df.groupby(group_cols)['census_bus_count'].bfill().fillna(0)
south_oa_final_df['br_bus_count'] = south_oa_final_df['br_bus_count'].fillna(0)
# 'Both' counts are not back-filled for this area, but a zero count is stored
# as NaN and has to be zeroed: left as NaN it turns bus_count below into NaN.
south_oa_final_df['both_bus_count'] = south_oa_final_df['both_bus_count'].fillna(0)
south_oa_final_df['bus_count'] = south_oa_final_df['census_bus_count'] + south_oa_final_df['br_bus_count'] \
                                + south_oa_final_df['both_bus_count']
south_oa_final_df.to_pickle("Old_Kent_oa_final_df_v5.pkl")
south_oa_final_df.head()

# Prepare other Datasets

In [None]:
# Postcode -> OA21 -> LSOA21 lookup, kept only for the three boroughs below.
pc_lsoa_mapping_df = pd.read_csv("Data\\PCD_OA21_LSOA21\\PCD_OA21_LSOA21.csv",
                                 encoding="ISO-8859-1")
study_boroughs = ['Wandsworth', 'Southwark', 'Lambeth']
pc_lsoa_mapping_df = pc_lsoa_mapping_df[pc_lsoa_mapping_df['ladnm'].isin(study_boroughs)].copy()

# LSOA lookups between geography vintages; headers are lower-cased so that
# they match the column lists used for the joins.
lsoa11_to_lsoa21 = pd.read_csv("Data\\PCD_OA21_LSOA21\\LSOA_(2011)_to_LSOA_(2021).csv")
lsoa01_to_lsoa11 = pd.read_csv("Data\\PCD_OA21_LSOA21\\LSOA_(2001)_to_LSOA_(2011).csv")
lsoa11_to_lsoa21 = lsoa11_to_lsoa21.rename(columns=str.lower)
lsoa01_to_lsoa11 = lsoa01_to_lsoa11.rename(columns=str.lower)

lsoa_cols = ['lsoa11cd', 'lsoa21cd']
lsoa01_cols = ['lsoa01cd', 'lsoa11cd']


def _oa_lsoa21_pairs(oa_codes):
    """Return the distinct (oa21cd, lsoa21cd) pairs for the given OA21 codes."""
    wanted = pc_lsoa_mapping_df['oa21cd'].isin(oa_codes)
    return pc_lsoa_mapping_df.loc[wanted, ['oa21cd', 'lsoa21cd']].drop_duplicates()


def _lsoa_lookup_chain(oa_pairs):
    """Attach lsoa11cd to the OA/LSOA21 pairs, then lsoa01cd on top of that.

    Both steps are left joins, so every input pair is kept. Returns the
    (2011-level map, 2001-level map) tuple.
    """
    to_2011 = oa_pairs.set_index('lsoa21cd').join(
        lsoa11_to_lsoa21[lsoa_cols].set_index('lsoa21cd')).reset_index()
    to_2001 = to_2011.set_index('lsoa11cd').join(
        lsoa01_to_lsoa11[lsoa01_cols].set_index('lsoa11cd')).reset_index()
    return to_2011, to_2001


lsoa21 = _oa_lsoa21_pairs(oa_final_df['oa21cd'].unique())
south_lsoa21 = _oa_lsoa21_pairs(south_oa_final_df['oa21cd'].unique())

lsoa11_to_lsoa21_map, lsoa01_to_lsoa21_map = _lsoa_lookup_chain(lsoa21)
south_lsoa11_to_lsoa21_map, south_lsoa01_to_lsoa21_map = _lsoa_lookup_chain(south_lsoa21)

for lookup in (lsoa11_to_lsoa21_map, south_lsoa11_to_lsoa21_map):
    print(lookup.shape)

for lookup in (lsoa01_to_lsoa21_map, south_lsoa01_to_lsoa21_map):
    print(lookup.shape)

In [None]:
# Read the yearly MEP LSOA files for 2010-2023 and stack them, tagging each
# row with the year taken from the file name.
path = 'Data\\MEP CDRC\\MEP_LSOA\\MEP_LSOA\\'
yearly_frames = []
for year in range(2010, 2024):
    filename = f"MEP_lsoa11_{year}.csv"
    df = pd.read_csv(path + filename).assign(year=year)
    yearly_frames.append(df)

mep_full_df = pd.concat(yearly_frames, ignore_index=True).rename(columns={'lsoa11': 'lsoa11cd'})

# Columns are relabelled by position: first is the LSOA code, last is the
# year, everything in between gets an '_lsoa' suffix.
middle_cols = list(mep_full_df.columns[1:-1])
mep_full_df.columns = ['lsoa11cd', *(f'{name}_lsoa' for name in middle_cols), 'year']

In [None]:
# Left-join the MEP table onto each study-area lookup by lsoa11cd, so every
# lookup row is kept whether or not MEP has data for it.
mep_by_lsoa11 = mep_full_df.set_index('lsoa11cd')
filtered_mep_full_df = lsoa11_to_lsoa21_map.set_index('lsoa11cd').join(mep_by_lsoa11).reset_index()
filtered_south_mep_full_df = south_lsoa11_to_lsoa21_map.set_index('lsoa11cd').join(
    mep_by_lsoa11).reset_index()

for joined in (filtered_mep_full_df, filtered_south_mep_full_df):
    print(joined.shape)

sort_keys = ['year', 'lsoa11cd']
filtered_mep_full_df = filtered_mep_full_df.sort_values(by=sort_keys)
filtered_south_mep_full_df = filtered_south_mep_full_df.sort_values(by=sort_keys)

filtered_mep_full_df.head()

In [None]:
# OA 2011 -> OA 2021 lookup with lower-cased headers.
oa11_to_oa21 = pd.read_csv("Data\\PCD_OA21_LSOA21\\OA_(2011)_to_OA_(2021).csv")
oa11_to_oa21.columns = oa11_to_oa21.columns.str.lower()


def _oa_pairs_for(oa21_codes):
    """Return the (oa11cd, oa21cd) rows whose OA21 code is in oa21_codes."""
    in_area = oa11_to_oa21['oa21cd'].isin(oa21_codes)
    return oa11_to_oa21.loc[in_area, ['oa11cd', 'oa21cd']]


oa11_to_oa21_map = _oa_pairs_for(oa_final_df['oa21cd'].unique())
south_oa11_to_oa21_map = _oa_pairs_for(south_oa_final_df['oa21cd'].unique())
print(len(oa11_to_oa21_map))
print(len(south_oa11_to_oa21_map))
oa11_to_oa21_map.head()

# Generate OA Population Data

In [None]:
def _rebase_oa11_to_oa21(oa11_pop, oa_map, year):
    """Re-express OA11 population counts on OA21 codes for one year.

    oa11_pop must have columns 'oa11cd', 'total_pop' and 'pop_19_64'; oa_map
    must have 'oa11cd' and 'oa21cd'. An OA11 listed against k OA21 codes in
    oa_map gives each of them 1/k of its population (rounded), and the pieces
    are then summed per OA21.

    Returns a frame with columns oa21cd, total_pop, pop_19_64, year.
    """
    linked = oa_map.set_index('oa11cd').join(oa11_pop.set_index('oa11cd')).reset_index()
    # Number of OA21 rows each OA11 is spread over.
    n_targets = linked.groupby('oa11cd')['oa21cd'].transform('count')
    for col in ('total_pop', 'pop_19_64'):
        linked[col] = (linked[col] / n_targets).round()
    rebased = linked.groupby('oa21cd')[['total_pop', 'pop_19_64']].sum().reset_index()
    rebased['year'] = year
    return rebased


final_pop_oa_df = []
final_south_pop_oa_df = []
working_ages = list(range(19, 65))  # single-year-of-age column labels 19..64

# 2010 and 2011: one workbook with a sheet per year; rows are summed per
# (OA11CD, LAD11CD) before re-basing.
for year in (2010, 2011):
    pop_oa_df = pd.read_excel("Data\\OA Population\\sapemid2011.xls", sheet_name=f'london_{year}')
    pop_oa_df = pop_oa_df.rename(columns={'All Ages': 'total_pop'})
    pop_oa_df['pop_19_64'] = pop_oa_df[working_ages].sum(axis=1)
    # Select the numeric columns before summing so text columns are never aggregated.
    pop_oa_df = pop_oa_df.groupby(['OA11CD', 'LAD11CD'])[['total_pop', 'pop_19_64']].sum().reset_index()
    pop_oa_df.columns = [c.lower() for c in pop_oa_df.columns]

    final_south_pop_oa_df.append(_rebase_oa11_to_oa21(pop_oa_df, south_oa11_to_oa21_map, year))
    final_pop_oa_df.append(_rebase_oa11_to_oa21(pop_oa_df, oa11_to_oa21_map, year))

# 2012-2020: one workbook per year, still on OA11 codes.
for year in range(2012, 2021):
    pop_oa_df = pd.read_excel(f"Data\\OA Population\\sapemid{year}.xlsx",
                              sheet_name=f'Mid-{year} Persons', skiprows=4)
    pop_oa_df = pop_oa_df.rename(columns={'All Ages': 'total_pop'})
    pop_oa_df['pop_19_64'] = pop_oa_df[working_ages].sum(axis=1)
    pop_oa_df = pop_oa_df[['OA11CD', 'LSOA11CD', 'total_pop', 'pop_19_64']]
    pop_oa_df.columns = [c.lower() for c in pop_oa_df.columns]

    final_south_pop_oa_df.append(_rebase_oa11_to_oa21(pop_oa_df, south_oa11_to_oa21_map, year))
    final_pop_oa_df.append(_rebase_oa11_to_oa21(pop_oa_df, oa11_to_oa21_map, year))

# 2021-2022: already on OA21 codes, with separate F<age>/M<age> columns, so
# the tables only need filtering to the study areas.
sex_age_cols = [f'{sex}{age}' for sex in ('F', 'M') for age in working_ages]
for year in (2021, 2022):
    pop_oa_df = pd.read_excel(f"Data\\OA Population\\sapeoatablefinal{year}v2.xlsx",
                              sheet_name=f'Mid-{year} OA 2021', skiprows=3)
    pop_oa_df = pop_oa_df.rename(columns={'Total': 'total_pop', 'OA 2021 Code': 'oa21cd'})
    pop_oa_df['pop_19_64'] = pop_oa_df[sex_age_cols].sum(axis=1)
    pop_oa_df = pop_oa_df[['oa21cd', 'total_pop', 'pop_19_64']].assign(year=year)
    south_pop_oa_df = pop_oa_df[pop_oa_df['oa21cd'].isin(south_oa11_to_oa21_map['oa21cd'].unique())].copy()
    pop_oa_df = pop_oa_df[pop_oa_df['oa21cd'].isin(oa11_to_oa21_map['oa21cd'].unique())].copy()
    final_pop_oa_df.append(pop_oa_df)
    final_south_pop_oa_df.append(south_pop_oa_df)

# 2023 and 2024 reuse the 2022 figures. Each year must be a *new* frame:
# assigning to pop_oa_df['year'] in place would also relabel the 2022 frame
# already stored in the list (it is the same object), leaving three copies
# tagged 2024 and no rows for 2022 or 2023.
for carried_year in (2023, 2024):
    final_pop_oa_df.append(pop_oa_df.assign(year=carried_year))
    final_south_pop_oa_df.append(south_pop_oa_df.assign(year=carried_year))

final_pop_oa_df = pd.concat(final_pop_oa_df, ignore_index=True)
final_south_pop_oa_df = pd.concat(final_south_pop_oa_df, ignore_index=True)
final_pop_oa_df.to_pickle("final_pop_oa_df.pkl")
final_south_pop_oa_df.to_pickle("final_south_pop_oa_df.pkl")
final_pop_oa_df.head()

In [None]:
# Reload the cached OA population tables (main and south study areas).
final_pop_oa_df, final_south_pop_oa_df = (
    pd.read_pickle(pickle_name)
    for pickle_name in ("final_pop_oa_df.pkl", "final_south_pop_oa_df.pkl")
)

# Load House Occupants Churn

In [None]:
# Load the wide household-churn table (one chn<year> column per year) and
# left-join it onto each study-area lookup by lsoa11cd.
churn_wide = pd.read_csv('Data\\CDRC Residential Mobility Index\\hh_churn_lsoa11_2023.csv')
churn_by_lsoa11 = churn_wide.rename(columns={'area': 'lsoa11cd'}).set_index('lsoa11cd')

south_hh_churn_df_test = south_lsoa11_to_lsoa21_map.set_index('lsoa11cd').join(
    churn_by_lsoa11).reset_index()
hh_churn_df_test = lsoa11_to_lsoa21_map.set_index('lsoa11cd').join(
    churn_by_lsoa11).reset_index()


def _churn_for_year(wide, year):
    """Pull one chn<year> column out of the wide table as 'hh_chn', tagged with the year."""
    churn_col = f'chn{year}'
    one_year = wide[['lsoa11cd', 'lsoa21cd', 'oa21cd', churn_col]]
    return one_year.rename(columns={churn_col: 'hh_chn'}).assign(year=year)


# Wide -> long for 2010-2022.
churn_years = range(2010, 2023)
hh_churn_df = pd.concat([_churn_for_year(hh_churn_df_test, y) for y in churn_years],
                        ignore_index=True)
south_hh_churn_df = pd.concat([_churn_for_year(south_hh_churn_df_test, y) for y in churn_years],
                              ignore_index=True)
hh_churn_df.to_pickle("hh_churn_df.pkl")
south_hh_churn_df.to_pickle("south_hh_churn_df.pkl")
hh_churn_df

In [None]:
# Reload the cached population and household-churn tables from disk.
cached_tables = {
    name: pd.read_pickle(f"{name}.pkl")
    for name in ("final_pop_oa_df", "final_south_pop_oa_df", "hh_churn_df", "south_hh_churn_df")
}
final_pop_oa_df = cached_tables["final_pop_oa_df"]
final_south_pop_oa_df = cached_tables["final_south_pop_oa_df"]
hh_churn_df = cached_tables["hh_churn_df"]
south_hh_churn_df = cached_tables["south_hh_churn_df"]
final_pop_oa_df.head()

In [None]:
# Re-read the complete postcode lookup; this rebinds pc_lsoa_mapping_df to the
# full, unfiltered table.
lookup_csv = "Data\\PCD_OA21_LSOA21\\PCD_OA21_LSOA21.csv"
pc_lsoa_mapping_df = pd.read_csv(lookup_csv, encoding="ISO-8859-1")
pc_lsoa_mapping_df.head()

In [None]:
prop_cols = ['priceper', 'year', 'postcode', 'oa_ladnm']

# Postcode -> OA21 lookup with exactly one row per postcode.
# De-duplicate on the postcode BEFORE indexing: DataFrame.drop_duplicates
# ignores the index, so calling it on a frame whose only column is 'oa21cd'
# keeps a single postcode per OA and the inner join below would then throw
# away every sale at the OA's other postcodes.
postcode_to_oa = (
    pc_lsoa_mapping_df[['pcds', 'oa21cd']]
    .drop_duplicates(subset='pcds')
    .rename(columns={'pcds': 'postcode'})
    .set_index('postcode')
)


def _read_borough_sales(csv_path, borough):
    """Load one borough's house-price file and tag every row with the borough name."""
    sales = pd.read_csv(csv_path)
    sales['oa_ladnm'] = borough
    return sales[prop_cols].copy()


def _mean_price_per_oa_year(sales, oa_lsoa):
    """Average 'priceper' per (OA21, borough, year) from 2010 onwards.

    Sales are linked to OAs by postcode (inner join, so unmatched postcodes
    are dropped). The result is left-joined onto oa_lsoa (columns oa21cd,
    lsoa21cd), so an OA without any sale still appears once with NaN
    year/priceper.
    """
    linked = sales.set_index('postcode').join(postcode_to_oa, how='inner').reset_index()
    linked = linked[linked['year'] >= 2010]
    yearly = (
        linked.groupby(['oa21cd', 'oa_ladnm', 'year'])[['priceper']].mean()
        .reset_index()
        .sort_values(by=['year', 'oa21cd'])
    )
    return oa_lsoa.set_index('oa21cd').join(yearly.set_index('oa21cd')).reset_index()


def _fill_missing_oa_years(prices, oa_lsoa, oa_lad):
    """Add a row for every (OA, year) pair that has no price yet.

    The filler price is that year's mean of the OA-level prices already in
    `prices`; lsoa21cd comes from oa_lsoa and oa_ladnm from the first matching
    row of oa_lad. Missing pairs are found with one set lookup and appended in
    a single concat instead of filtering and re-concatenating per pair.
    Raises KeyError if an OA has no entry in oa_lad.
    """
    year_mean = prices.groupby('year')['priceper'].mean().to_dict()
    observed = set(zip(prices['oa21cd'], prices['year']))
    lsoa_of = oa_lsoa.drop_duplicates('oa21cd').set_index('oa21cd')['lsoa21cd'].to_dict()
    lad_of = (oa_lad[['oa21cd', 'oa_ladnm']].drop_duplicates('oa21cd')
              .set_index('oa21cd')['oa_ladnm'].to_dict())
    filler = [
        {'oa21cd': oa, 'oa_ladnm': lad_of[oa], 'year': year,
         'priceper': year_mean[year], 'lsoa21cd': lsoa_of[oa]}
        for year in prices['year'].dropna().unique()
        for oa in oa_lsoa['oa21cd'].unique()
        if (oa, year) not in observed
    ]
    if not filler:
        return prices
    return pd.concat([prices, pd.DataFrame(filler)], ignore_index=True)


# Main study area: Lambeth + Wandsworth.
prop_prices_df = pd.concat(
    [_read_borough_sales('Data\\House Prices\\Lambeth_link_26122024.csv', 'Lambeth'),
     _read_borough_sales('Data\\House Prices\\Wandsworth_link_26122024.csv', 'Wandsworth')],
    ignore_index=True)
print(prop_prices_df.shape)
prop_prices_df = _mean_price_per_oa_year(prop_prices_df, lsoa21)
print(len(prop_prices_df['oa21cd'].unique()))
print(prop_prices_df.shape)
prop_prices_df = _fill_missing_oa_years(prop_prices_df, lsoa21, oa_final_df)
print(prop_prices_df.shape)
print(prop_prices_df[prop_prices_df['oa21cd'].isna()].shape)

# South study area: Southwark.
south_prop_prices_df = _read_borough_sales('Data\\House Prices\\Southwark_link_26122024.csv',
                                           'Southwark')
print(south_prop_prices_df.shape)
south_prop_prices_df = _mean_price_per_oa_year(south_prop_prices_df, south_lsoa21)
south_prop_prices_df = _fill_missing_oa_years(south_prop_prices_df, south_lsoa21, south_oa_final_df)
print(len(south_prop_prices_df['oa21cd'].unique()))
print(south_prop_prices_df.shape)

# Drop the placeholder rows (NaN year) left by the left join for OAs that had
# no sales at all; those OAs are now covered by the filler rows.
prop_prices_df = prop_prices_df[prop_prices_df['year'].notna()].sort_values(by=['year', 'oa21cd'])
south_prop_prices_df = south_prop_prices_df[south_prop_prices_df['year'].notna()
                                            ].sort_values(by=['year', 'oa21cd'])
prop_prices_df.to_pickle("prop_prices_df.pkl")
south_prop_prices_df.to_pickle("south_prop_prices_df.pkl")

prop_prices_df

# Process RMD Dataset

In [None]:
def _read_rmd_flows(csv_path, new_names):
    """Read one RMD flow file, rename its columns and keep rows from 2010 onwards."""
    flows = pd.read_csv(csv_path).rename(columns=new_names)
    return flows[flows['year'] >= 2010].copy()


rmd_in_df = _read_rmd_flows(
    "Data\\RMD CDRC\\RMD_LSOA_1997_2023\\RMD_in_LSOA.csv",
    {'d_first': 'year', 'd_lsoa11': 'lsoa11cd',
     'mean_imd_change': 'imd_change_in', 'count': 'in_count'})
rmd_out_df = _read_rmd_flows(
    "Data\\RMD CDRC\\RMD_LSOA_1997_2023\\RMD_out_LSOA.csv",
    {'o_last': 'year', 'o_lsoa11': 'lsoa11cd',
     'mean_imd_change': 'imd_change_out', 'count': 'out_count'})

# Left join on (year, lsoa11cd): rows follow the in-flow table, with out-flow
# columns attached where the same key exists.
flow_key = ['year', 'lsoa11cd']
rmd_df = rmd_in_df.set_index(flow_key).join(rmd_out_df.set_index(flow_key)).reset_index()

# Restrict to each study area by left-joining onto its LSOA lookup.
rmd_by_lsoa11 = rmd_df.set_index('lsoa11cd')
filtered_rmd_df = lsoa11_to_lsoa21_map.set_index('lsoa11cd').join(rmd_by_lsoa11).reset_index()
print(filtered_rmd_df.shape)
filtered_south_rmd_df = south_lsoa11_to_lsoa21_map.set_index('lsoa11cd').join(
    rmd_by_lsoa11).reset_index()
print(filtered_south_rmd_df.shape)
filtered_rmd_df.to_pickle("filtered_rmd_df.pkl")
filtered_south_rmd_df.to_pickle("filtered_south_rmd_df.pkl")
filtered_south_rmd_df.head()

# Process Jobs Data 2015 till 2023

In [None]:
def _strip_code_prefix(header):
    """Reduce 'code : label' headers to the label, with ';' turned into ','.

    Only the segment between the first and second ':' is kept; headers
    without a ':' are returned unchanged.
    """
    if ':' not in header:
        return header
    return header.split(':')[1].strip().replace(';', ',')


jobs_df = pd.read_csv("Data\\jobs data\\jobs_stats_clean.csv")
jobs_df.columns = [_strip_code_prefix(header) for header in jobs_df.columns]
sic_scat_map_df = pd.read_excel("Data/sic_cat_mapping.xlsx", sheet_name='scat_sic_mapping')

# Rename the one cleaned header that needs its spacing and punctuation changed.
households_cleaned = 'Activities of households as employers,undifferentiated goods-and services-producing activities of households for own use'
households_target = 'Activities of households as employers; undifferentiated goods- and services-producing activities of households for own use'
jobs_df = jobs_df.rename(columns={households_cleaned: households_target})

# Keep only the part of the area field before its first ':'.
jobs_df['lsoa11cd'] = jobs_df['lsoa11cd'].map(lambda area: area.split(':')[0].strip())
jobs_df = jobs_df.sort_values(by=['lsoa11cd', 'year'])

jobs_df.head()

In [None]:
def _normalise_sic(label):
    """Lower-case a SIC label and unify ';' with ',' so headers and mapping rows compare equal."""
    return str.lower(label).replace(';', ',')


def _oa_share_of_lsoa_population(pop_df, lsoa_map):
    """Return pop_df rows with each OA's share of its LSOA21 population per year.

    pop_df needs columns oa21cd, year, total_pop; lsoa_map needs oa21cd and
    lsoa21cd. Adds 'lsoa_total_pop' (sum over the LSOA21 and year) and
    'total_pop_perc' (the OA's fraction of that sum). Prints the shape at
    each stage as a sanity check.
    """
    print(pop_df.shape)
    linked = pop_df.set_index('oa21cd')[['year', 'total_pop']].join(
        lsoa_map.set_index('oa21cd')).reset_index()
    print(linked.shape)
    lsoa_totals = linked.groupby(['lsoa21cd', 'year'])[['total_pop']].sum().rename(
        columns={'total_pop': 'lsoa_total_pop'})
    linked = linked.set_index(['lsoa21cd', 'year']).join(lsoa_totals).reset_index()
    linked['total_pop_perc'] = linked['total_pop'] / linked['lsoa_total_pop']
    print(linked.shape)
    return linked


def _apportion_jobs_to_oa(lsoa_jobs, oa_share):
    """Scale LSOA-level job counts down to OAs using the OA population share.

    The LSOA figure is kept as 'lsoa_jobs_count'; 'jobs_count' becomes the
    rounded product of that figure and 'total_pop_perc'.
    """
    scaled = lsoa_jobs.set_index(['oa21cd', 'year']).join(
        oa_share.set_index(['oa21cd', 'year'])[['total_pop_perc']]).reset_index()
    scaled = scaled.rename(columns={'jobs_count': 'lsoa_jobs_count'})
    scaled['jobs_count'] = (scaled['lsoa_jobs_count'] * scaled['total_pop_perc']).round()
    return scaled


# Build the SIC -> (sector, sub-sector) lookup once instead of re-normalising
# the whole mapping column twice for every jobs column. setdefault keeps the
# first mapping row for a label.
sic_to_scat = {}
for sic, scat, scat_sub in zip(sic_scat_map_df['sic_category'],
                               sic_scat_map_df['scat_category'],
                               sic_scat_map_df['scat_sub_category']):
    sic_to_scat.setdefault(_normalise_sic(sic), (scat, scat_sub))

# Wide -> long: one frame per SIC column, tagged with its sector / sub-sector.
final_jobs_df = []
for col in jobs_df.columns[1:-2]:
    sic_key = _normalise_sic(col)
    if sic_key not in sic_to_scat:
        raise KeyError(f"No SCAT mapping found for jobs column {col!r}")
    sector, sub_sector = sic_to_scat[sic_key]
    final_jobs_df.append(
        jobs_df[['lsoa11cd', 'year', col]]
        .rename(columns={col: 'jobs_count'})
        .assign(sub_sector=sub_sector, sector=sector))

# The 'total' column becomes the pseudo-industry 'All'.
tmp = jobs_df[['lsoa11cd', 'year', 'total']].rename(columns={'total': 'jobs_count'})
tmp['industry'] = 'All'
tmp = tmp[['lsoa11cd', 'year', 'industry', 'jobs_count']]

final_jobs_df = pd.concat(final_jobs_df, ignore_index=True)
final_jobs_df = final_jobs_df.groupby(['lsoa11cd', 'year', 'sub_sector', 'sector']).sum().reset_index()
# Sub-sector totals, sector totals and the overall total share one
# 'industry' column. Selecting 'jobs_count' before summing avoids
# aggregating the remaining text column.
jobs_by_sub_sector = (final_jobs_df.groupby(['lsoa11cd', 'year', 'sub_sector'])[['jobs_count']]
                      .sum().reset_index().rename(columns={'sub_sector': 'industry'}))
jobs_by_sector = (final_jobs_df.groupby(['lsoa11cd', 'year', 'sector'])[['jobs_count']]
                  .sum().reset_index().rename(columns={'sector': 'industry'}))
final_jobs_df = pd.concat([jobs_by_sub_sector, jobs_by_sector, tmp])
print(final_jobs_df.shape)

# Restrict to each study area via its LSOA11 -> LSOA21 -> OA21 lookup.
jobs_by_lsoa11 = final_jobs_df.set_index('lsoa11cd')
filtered_jobs_df = lsoa11_to_lsoa21_map.set_index('lsoa11cd').join(jobs_by_lsoa11).reset_index()
print(filtered_jobs_df.shape)
filtered_south_jobs_df = south_lsoa11_to_lsoa21_map.set_index('lsoa11cd').join(
    jobs_by_lsoa11).reset_index()
print(filtered_south_jobs_df.shape)

final_pop_oa_df_tmp = _oa_share_of_lsoa_population(final_pop_oa_df, lsoa11_to_lsoa21_map)
final_south_pop_oa_df_tmp = _oa_share_of_lsoa_population(final_south_pop_oa_df,
                                                         south_lsoa11_to_lsoa21_map)

filtered_jobs_df = _apportion_jobs_to_oa(filtered_jobs_df, final_pop_oa_df_tmp)
filtered_south_jobs_df = _apportion_jobs_to_oa(filtered_south_jobs_df, final_south_pop_oa_df_tmp)

filtered_jobs_df.head()

# Process Jobs Data 2010 till 2015

In [None]:
jobs_df_2010 = pd.read_csv("Data\\jobs data\\jobs_stats_clean_2010.csv")

# Headers of the form 'code : label' are cut down to the label (the segment
# between the first and second ':'), with ';' turned into ','; headers
# without a ':' stay as they are.
cleaned_headers = []
for header in jobs_df_2010.columns:
    if ':' in header:
        header = header.split(':')[1].strip().replace(';', ',')
    cleaned_headers.append(header)
jobs_df_2010.columns = cleaned_headers

sic_scat_map_df = pd.read_excel("Data/sic_cat_mapping.xlsx", sheet_name='scat_sic_mapping')

# Rename the one cleaned header that needs its spacing and punctuation changed.
jobs_df_2010 = jobs_df_2010.rename(columns={
    'Activities of households as employers,undifferentiated goods-and services-producing activities of households for own use':
    'Activities of households as employers; undifferentiated goods- and services-producing activities of households for own use',
})

# Keep only the part of the area field before its first ':'.
jobs_df_2010['lsoa01cd'] = jobs_df_2010['lsoa01cd'].map(lambda area: area.split(':')[0].strip())
jobs_df_2010 = jobs_df_2010.sort_values(by=['lsoa01cd', 'year'])
jobs_df_2010.head()

In [None]:
def _normalise_sic(label):
    """Lower-case a SIC label and unify ';' with ',' so headers and mapping rows compare equal."""
    return str.lower(label).replace(';', ',')


def _jobs_2010_on_current_geography(lsoa01_map, lsoa01_jobs):
    """Move LSOA01-level job counts onto the lsoa11cd / lsoa21cd / oa21cd lookup.

    After the join on lsoa01cd, rows are summed over
    (lsoa11cd, lsoa21cd, oa21cd, year, industry), so several LSOA01 rows that
    land on the same key collapse into one. Rows with a missing key (no jobs
    data matched) are dropped by the groupby. Prints the shape before and
    after the aggregation.
    """
    joined = lsoa01_map.set_index('lsoa01cd').join(
        lsoa01_jobs.set_index('lsoa01cd')).reset_index().drop(columns=['lsoa01cd'])
    print(joined.shape)
    summed = joined.groupby(['lsoa11cd', 'lsoa21cd', 'oa21cd', 'year', 'industry']).sum().reset_index()
    print(summed.shape)
    return summed


def _oa_share_of_lsoa_population(pop_df, lsoa_map):
    """Return pop_df rows with each OA's share of its LSOA21 population per year.

    pop_df needs columns oa21cd, year, total_pop; lsoa_map needs oa21cd and
    lsoa21cd. Adds 'lsoa_total_pop' and 'total_pop_perc'. Prints the shape at
    each stage as a sanity check.
    """
    print(pop_df.shape)
    linked = pop_df.set_index('oa21cd')[['year', 'total_pop']].join(
        lsoa_map.set_index('oa21cd')).reset_index()
    print(linked.shape)
    lsoa_totals = linked.groupby(['lsoa21cd', 'year'])[['total_pop']].sum().rename(
        columns={'total_pop': 'lsoa_total_pop'})
    linked = linked.set_index(['lsoa21cd', 'year']).join(lsoa_totals).reset_index()
    linked['total_pop_perc'] = linked['total_pop'] / linked['lsoa_total_pop']
    print(linked.shape)
    return linked


def _apportion_jobs_to_oa(lsoa_jobs, oa_share):
    """Scale LSOA-level job counts down to OAs using the OA population share.

    The LSOA figure is kept as 'lsoa_jobs_count'; 'jobs_count' becomes the
    rounded product of that figure and 'total_pop_perc'.
    """
    scaled = lsoa_jobs.set_index(['oa21cd', 'year']).join(
        oa_share.set_index(['oa21cd', 'year'])[['total_pop_perc']]).reset_index()
    scaled = scaled.rename(columns={'jobs_count': 'lsoa_jobs_count'})
    scaled['jobs_count'] = (scaled['lsoa_jobs_count'] * scaled['total_pop_perc']).round()
    return scaled


# Build the SIC -> (sector, sub-sector) lookup once instead of re-normalising
# the whole mapping column twice for every jobs column. setdefault keeps the
# first mapping row for a label.
sic_to_scat = {}
for sic, scat, scat_sub in zip(sic_scat_map_df['sic_category'],
                               sic_scat_map_df['scat_category'],
                               sic_scat_map_df['scat_sub_category']):
    sic_to_scat.setdefault(_normalise_sic(sic), (scat, scat_sub))

# Wide -> long: one frame per SIC column, tagged with its sector / sub-sector.
final_jobs_df_2010 = []
for col in jobs_df_2010.columns[1:-2]:
    sic_key = _normalise_sic(col)
    if sic_key not in sic_to_scat:
        raise KeyError(f"No SCAT mapping found for jobs column {col!r}")
    sector, sub_sector = sic_to_scat[sic_key]
    final_jobs_df_2010.append(
        jobs_df_2010[['lsoa01cd', 'year', col]]
        .rename(columns={col: 'jobs_count'})
        .assign(sub_sector=sub_sector, sector=sector))

# The 'total' column becomes the pseudo-industry 'All'.
tmp = jobs_df_2010[['lsoa01cd', 'year', 'total']].rename(columns={'total': 'jobs_count'})
tmp['industry'] = 'All'
tmp = tmp[['lsoa01cd', 'year', 'industry', 'jobs_count']]

final_jobs_df_2010 = pd.concat(final_jobs_df_2010, ignore_index=True)
final_jobs_df_2010 = final_jobs_df_2010.groupby(
    ['lsoa01cd', 'year', 'sub_sector', 'sector']).sum().reset_index()
# Selecting 'jobs_count' before summing avoids aggregating the remaining text column.
jobs_by_sub_sector = (final_jobs_df_2010.groupby(['lsoa01cd', 'year', 'sub_sector'])[['jobs_count']]
                      .sum().reset_index().rename(columns={'sub_sector': 'industry'}))
jobs_by_sector = (final_jobs_df_2010.groupby(['lsoa01cd', 'year', 'sector'])[['jobs_count']]
                  .sum().reset_index().rename(columns={'sector': 'industry'}))
final_jobs_df_2010 = pd.concat([jobs_by_sub_sector, jobs_by_sector, tmp])
print(final_jobs_df_2010.shape)

# Both study areas get the same treatment. Previously only the south frame
# was aggregated after the LSOA01 join, so the main frame could carry several
# rows per (lsoa11cd, oa21cd, year, industry) while the south frame carried one.
filtered_jobs_df_2010 = _jobs_2010_on_current_geography(lsoa01_to_lsoa21_map, final_jobs_df_2010)
filtered_south_jobs_df_2010 = _jobs_2010_on_current_geography(south_lsoa01_to_lsoa21_map,
                                                              final_jobs_df_2010)

final_pop_oa_df_tmp = _oa_share_of_lsoa_population(final_pop_oa_df, lsoa11_to_lsoa21_map)
final_south_pop_oa_df_tmp = _oa_share_of_lsoa_population(final_south_pop_oa_df,
                                                         south_lsoa11_to_lsoa21_map)

filtered_jobs_df_2010 = _apportion_jobs_to_oa(filtered_jobs_df_2010, final_pop_oa_df_tmp)
filtered_south_jobs_df_2010 = _apportion_jobs_to_oa(filtered_south_jobs_df_2010,
                                                    final_south_pop_oa_df_tmp)

filtered_jobs_df_2010.head()

In [None]:
# Splice the two job series together: the 2010-series frame first, followed by
# the rows of the later series from 2015 onwards.
# NOTE(review): if the 2010-series frame also holds 2015 rows, 2015 ends up in
# the result twice - confirm against the source files.
jobs_from_2015 = filtered_jobs_df[filtered_jobs_df['year'] >= 2015]
south_jobs_from_2015 = filtered_south_jobs_df[filtered_south_jobs_df['year'] >= 2015]
filtered_jobs_df = pd.concat([filtered_jobs_df_2010, jobs_from_2015], ignore_index=True)
filtered_south_jobs_df = pd.concat([filtered_south_jobs_df_2010, south_jobs_from_2015],
                                   ignore_index=True)

# Years are cast back to plain integers (this fails if any year is missing).
for jobs_frame in (filtered_jobs_df, filtered_south_jobs_df):
    jobs_frame['year'] = jobs_frame['year'].map(int)

filtered_jobs_df.to_pickle("filtered_jobs_df.pkl")
filtered_south_jobs_df.to_pickle("filtered_south_jobs_df.pkl")

# Process CDRC House Prices

In [None]:
# Wide LSOA21 price table (one sale/rental column pair per year), left-joined
# onto each study-area lookup by lsoa21cd.
house_prices_df = pd.read_csv("Data\\House Prices\\CDRC Data\\prices_LSOA21.csv")
prices_by_lsoa21 = house_prices_df.rename(columns={'lsoa21': 'lsoa21cd'}).set_index('lsoa21cd')
south_house_prices_df = south_lsoa11_to_lsoa21_map.set_index('lsoa21cd').join(
    prices_by_lsoa21).reset_index()
house_prices_df = lsoa11_to_lsoa21_map.set_index('lsoa21cd').join(
    prices_by_lsoa21).reset_index()
print(len(house_prices_df['oa21cd'].unique()))
print(len(south_house_prices_df['oa21cd'].unique()))


def _prices_for_year(wide, year):
    """Pull one year's sale and rental columns out of the wide table, tagged with the year."""
    sale_col = f'mean_sale_value_{year}'
    rental_col = f'mean_rental_value_{year}'
    one_year = wide[['lsoa21cd', 'oa21cd', sale_col, rental_col]]
    one_year = one_year.rename(columns={sale_col: 'mean_sale_value',
                                        rental_col: 'mean_rental_value'})
    return one_year.assign(year=year)


# Wide -> long for 2007-2023.
price_years = range(2007, 2024)
final_house_prices_df = pd.concat(
    [_prices_for_year(house_prices_df, y) for y in price_years], ignore_index=True)
final_south_house_prices_df = pd.concat(
    [_prices_for_year(south_house_prices_df, y) for y in price_years], ignore_index=True)

final_house_prices_df.to_pickle("final_house_prices_df.pkl")
final_south_house_prices_df.to_pickle("final_south_house_prices_df.pkl")

final_house_prices_df.head()