In [2]:
import pandas as pd
import numpy as np
import geopandas as gpd
import glob
import matplotlib.pyplot as plt
import seaborn as sns



In [3]:
def load_compile_epc(path_to_cleaned):
    """Read every CSV matching a glob pattern and stack them into one DataFrame.

    Parameters
    ----------
    path_to_cleaned : str
        Glob pattern of the CSV files to load, e.g.
        "./../Data/Energy_Performance_Certificate/cerificates_processed/*.csv".

    Returns
    -------
    pandas.DataFrame
        Rows of all matching files, in sorted-path order. Each file keeps its
        own row labels (the index is not reset). An empty DataFrame is
        returned when nothing matches the pattern.
    """
    # Sorting makes the row order reproducible; glob's own order is arbitrary.
    list_paths = sorted(glob.glob(path_to_cleaned))
    if not list_paths:
        return pd.DataFrame()

    # Concatenate once: growing a frame inside the loop re-copies all
    # previously loaded rows on every iteration.
    data = pd.concat([pd.read_csv(path) for path in list_paths])

    #data = gpd.GeoDataFrame(data, geometry = gpd.points_from_xy(data.LONGITUDE,data.LATITUDE), crs = "EPSG:4326")

    return data

def winsor(variable):
    """Clip a numeric variable to its 1st and 99th percentiles.

    Parameters
    ----------
    variable : array-like of numbers (e.g. a pandas Series or numpy array)

    Returns
    -------
    numpy.ndarray
        Same length as `variable`; values above the 99th percentile are set to
        it, values below the 1st percentile are set to it. Missing values stay
        missing.
    """
    # nanpercentile ignores missing values. Plain np.percentile returns NaN as
    # soon as one value is missing, which makes both comparisons below False
    # and silently disables the clipping.
    lower_percentile, upper_percentile = np.nanpercentile(variable, [1, 99])
    new_var = np.where(variable > upper_percentile, upper_percentile, variable)
    new_var = np.where(new_var < lower_percentile, lower_percentile, new_var)
    return new_var

def essen_clean(data):
    """Keep certificates lodged on/after 2013-05-01 that have an LSOA21CD.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'LODGEMENT_DATE' (parseable as '%Y-%m-%d') and 'LSOA21CD'.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame with the retained rows; 'LODGEMENT_DATE' is
        converted to datetime. The input frame is left untouched.
    """
    lodged = pd.to_datetime(data['LODGEMENT_DATE'], format = '%Y-%m-%d')
    # eliminate rows before 01/05/2013 and not geolocated rows in a single pass
    keep = ((lodged >= '2013-05-01 00:00:00') & data['LSOA21CD'].notna()).to_numpy()

    # Copy so callers can add columns to the result without touching the
    # original frame (and without SettingWithCopyWarning).
    cleaned = data.loc[keep].copy()
    # Positional assignment: safe even when row labels are duplicated.
    cleaned['LODGEMENT_DATE'] = lodged.to_numpy()[keep]

    return cleaned



In [4]:
# Table merged onto each EPC extract by UPRN; it supplies the LSOA21CD column.
# 'Unnamed: 0' is presumably a saved row index - it is dropped either way.
lookup = pd.read_csv('./../../Volumes/Extreme_SSD/WORK/MAV/UPRN/uprn_lsoa_lookup.csv')
lookup = lookup.drop(columns = ['Unnamed: 0'])

# ENERGY

In [None]:
ener = load_compile_epc('./../../Volumes/Extreme_SSD/WORK/EPC/english_EPC/cleaned_energy/*')
ener.head()

In [None]:
ener = ener.merge(lookup, how = 'left', on = 'UPRN')
ener.info()

In [None]:
ener = essen_clean(ener)

In [None]:
# adjust the energy related variables
ener = ener.loc[:,['CURRENT_ENERGY_RATING', 'CURRENT_ENERGY_EFFICIENCY','ENERGY_CONSUMPTION_CURRENT', 'ENERGY_CONSUMPTION_POTENTIAL', 'MAINS_GAS_FLAG','LSOA21CD']]
ener['count'] = 1
dict_ener = {'A':1,'B':2,'C':3,'D':4,'E':5,'F':6,'G':7,'INVALID!': np.nan}
ener['CURRENT_ENERGY_RATING_numbered'] = ener['CURRENT_ENERGY_RATING'].map(dict_ener)

In [None]:
ener.head()

In [None]:
# Mains-gas subset. The explicit copy makes `ener_g` an independent frame, so the
# later assignment to ener_g['ENERGY_CONSUMPTION_CURRENT'] does not write into a
# filtered slice of `ener` (pandas' SettingWithCopyWarning).
ener_g = ener[ener['MAINS_GAS_FLAG'] == 'Y'].copy()

In [None]:
# Cap consumption at the 1st/99th percentiles; each frame uses its own percentiles.
for frame in (ener, ener_g):
    frame['ENERGY_CONSUMPTION_CURRENT'] = winsor(frame['ENERGY_CONSUMPTION_CURRENT'])

In [None]:
# aggregate full data
agg = round(ener.groupby(['LSOA21CD']
                  ).aggregate({'CURRENT_ENERGY_EFFICIENCY':['median','mean'],
                              'CURRENT_ENERGY_RATING_numbered':['median','mean'],
                              'ENERGY_CONSUMPTION_CURRENT':['median','mean'],
                              'ENERGY_CONSUMPTION_POTENTIAL':['median','mean'],
                              'count':['count']}).reset_index(),2)
agg.columns = [c[0] + "_" + c[1] for c in agg.columns]
agg = agg.rename(columns = {'LSOA21CD_':'LSOA21CD'})
# aggregate the gas only data
agg_g = round(ener_g.groupby(['LSOA21CD']
                  ).aggregate({'CURRENT_ENERGY_EFFICIENCY':['median','mean'],
                              'CURRENT_ENERGY_RATING_numbered':['median','mean'],
                              'ENERGY_CONSUMPTION_CURRENT':['median','mean'],
                              'ENERGY_CONSUMPTION_POTENTIAL':['median','mean'],
                              'count':['count']}).reset_index(),2)
agg_g.columns = [c[0] + "_" + c[1] for c in agg_g.columns]
agg_g = agg_g.rename(columns = {'LSOA21CD_':'LSOA21CD'})

In [None]:
agg.to_csv('./../../Volumes/Extreme_SSD/WORK/EPC/english_EPC/LSOA_energy.csv')
agg_g.to_csv('./../../Volumes/Extreme_SSD/WORK/EPC/english_EPC/LSOA_GAS_energy.csv')

# HEAT

In [None]:
heat = load_compile_epc('./../../Volumes/Extreme_SSD/WORK/EPC/english_EPC/cleaned_heat/*')
heat.head()

In [None]:
heat = heat.merge(lookup, how = 'left', on = 'UPRN')
heat.info()

In [None]:
heat = essen_clean(heat)

In [None]:
heat['count'] = 1

def return_binary_heat(source_var, str_name):
    """Flag the entries of a string Series that contain a literal substring.

    Parameters
    ----------
    source_var : pandas.Series of strings (may contain missing values)
    str_name : str
        Substring to look for (case-sensitive, not a regular expression).

    Returns
    -------
    numpy.ndarray of int
        1 where `str_name` occurs anywhere in the entry, otherwise 0;
        missing entries give 0.
    """
    # A match at the very start of the text counts too: the previous
    # `str.find(...) > 0` test dropped matches found at position 0.
    found = source_var.str.contains(str_name, regex=False, na=False)
    return found.astype(int).to_numpy()

# Example call, kept disabled. Both lines are commented out: the second line used
# to be live code, which made the whole cell fail to parse.
#heat['electric'] = return_binary_heat(source_var = heat['MAINHEAT_DESCRIPTION'],
#                                          str_name =  'electric')


## mainheating

In [None]:
# Keywords that flag each main-heating category (some entries look like Welsh
# equivalents - confirm). Missing descriptions match here because of na=True.
main_heat_keywords = {
    'MAIN_electric': ['electric', 'trydan'],
    'MAIN_solid_fuel': ['wood', 'coal', 'LPG',
                        ' oil', 'anthracite', 'smokeless',
                        'bio', 'B30K', 'solid fuel', 'coal', 'wood',
                        'olew', 'choed', 'LNG'],
}
for flag_column, keywords in main_heat_keywords.items():
    # One case-insensitive regex alternation per category.
    pattern = '|'.join(keywords)
    heat[flag_column] = heat['MAINHEAT_DESCRIPTION'].str.contains(pattern, case=False, na=True).astype(int)

In [None]:
heat['MAIN_other'] = np.where(((heat['MAIN_electric'] == 0) | (heat['MAIN_solid_fuel'] == 0)), 1,0)
heat['MAIN_electric_other'] = np.where((heat['MAIN_electric'] & heat['MAIN_other']), 1, 0)
heat['MAIN_electric_solid'] = np.where((heat['MAIN_electric'] & heat['MAIN_solid_fuel']), 1, 0)
heat['MAIN_solid_other'] = np.where((heat['MAIN_solid_fuel'] & heat['MAIN_other']), 1, 0)

In [None]:
for i in [ 'MAIN_electric','MAIN_solid_fuel',
       'MAIN_other', 'MAIN_electric_other', 'MAIN_electric_solid',
       'MAIN_solid_other']: 
       heat[i] = np.where(heat['MAINHEAT_DESCRIPTION'].isna(), np.nan, heat[i] )

## secondheating

In [None]:
# Keywords that flag each secondary-heating category (some entries look like Welsh
# equivalents - confirm). Missing descriptions match here because of na=True.
second_heat_keywords = {
    'SECOND_electric': ['electric', 'trydan'],
    'SECOND_solid_fuel': ['wood', 'coal', 'LPG',
                          ' oil', 'anthracite', 'smokeless',
                          'bio', 'B30K', 'solid fuel', 'coal', 'wood',
                          'olew', 'choed', 'LNG'],
}
for flag_column, keywords in second_heat_keywords.items():
    # One case-insensitive regex alternation per category.
    pattern = '|'.join(keywords)
    heat[flag_column] = heat['SECONDHEAT_DESCRIPTION'].str.contains(pattern, case=False, na=True).astype(int)

In [None]:
heat['SECOND_other'] = np.where(((heat['SECOND_electric'] == 0) | (heat['SECOND_solid_fuel'] == 0)), 1,0)
heat['SECOND_electric_other'] = np.where((heat['SECOND_electric'] & heat['SECOND_other']), 1, 0)
heat['SECOND_electric_solid'] = np.where((heat['SECOND_electric'] & heat['SECOND_solid_fuel']), 1, 0)
heat['SECOND_solid_other'] = np.where((heat['SECOND_solid_fuel'] & heat['SECOND_other']), 1, 0)

In [None]:
for i in [ 'SECOND_electric','SECOND_solid_fuel',
       'SECOND_other', 'SECOND_electric_other', 'SECOND_electric_solid',
       'SECOND_solid_other']: 
       heat[i] = np.where(heat['SECONDHEAT_DESCRIPTION'].isna(), np.nan, heat[i] )

## other heat variables

In [None]:
heat['NUMBER_OPEN_FIREPLACES_binary'] = np.where(heat['NUMBER_OPEN_FIREPLACES'] >=1, 1, 0)

In [None]:
heat_g = heat[heat['MAINS_GAS_FLAG'] == 'Y']

## aggregate

In [None]:
agg = round(heat.groupby(['LSOA21CD']
                  ).aggregate({'NUMBER_OPEN_FIREPLACES':['sum','median','mean'],
                              'NUMBER_OPEN_FIREPLACES_binary':['sum'],
                              'MAIN_electric':['sum'],
                                'MAIN_solid_fuel':['sum'],
                                'MAIN_other':['sum'], 
                                'MAIN_electric_other':['sum'], 
                                'MAIN_electric_solid':['sum'],
                                'MAIN_solid_other':['sum'], 
                                'SECOND_electric':['sum'], 
                                'SECOND_solid_fuel':['sum'],
                                'SECOND_other':['sum'], 
                                'SECOND_electric_other':['sum'], 
                                'SECOND_electric_solid':['sum'],
                                'SECOND_solid_other':['sum'],
                              'count':['count']}).reset_index(),2)
agg.columns = [c[0] + "_" + c[1] for c in agg.columns]
agg = agg.rename(columns = {'LSOA21CD_':'LSOA21CD'})

agg_g = round(heat_g.groupby(['LSOA21CD']
                  ).aggregate({'NUMBER_OPEN_FIREPLACES':['sum','median','mean'],
                              'NUMBER_OPEN_FIREPLACES_binary':['sum'],
                              'MAIN_electric':['sum'],
                                'MAIN_solid_fuel':['sum'],
                                'MAIN_other':['sum'], 
                                'MAIN_electric_other':['sum'], 
                                'MAIN_electric_solid':['sum'],
                                'MAIN_solid_other':['sum'], 
                                'SECOND_electric':['sum'], 
                                'SECOND_solid_fuel':['sum'],
                                'SECOND_other':['sum'], 
                                'SECOND_electric_other':['sum'], 
                                'SECOND_electric_solid':['sum'],
                                'SECOND_solid_other':['sum'],
                              'count':['count']}).reset_index(),2)
agg_g.columns = [c[0] + "_" + c[1] for c in agg_g.columns]
agg_g = agg_g.rename(columns = {'LSOA21CD_':'LSOA21CD'})

In [None]:
agg.to_csv('./../../Volumes/Extreme_SSD/WORK/EPC/english_EPC/LSOA_heat.csv')
agg_g.to_csv('./../../Volumes/Extreme_SSD/WORK/EPC/english_EPC/LSOA_GAS_heat.csv')

In [None]:
del heat, heat_g, agg, agg_g

# HOUSE

In [5]:
# Stack every cleaned house extract into one frame and preview it.
house = load_compile_epc(
    './../../Volumes/Extreme_SSD/WORK/EPC/english_EPC/cleaned_house/*'
)
house.head()

Unnamed: 0,PROPERTY_TYPE,BUILT_FORM,LODGEMENT_DATE,TOTAL_FLOOR_AREA,MAINS_GAS_FLAG,CONSTRUCTION_AGE_BAND,TENURE,UPRN,LATITUDE,LONGITUDE
0,House,Detached,2023-05-05,193.0,N,England and Wales: before 1900,Owner-occupied,1009311000.0,50.497252,-3.752589
1,House,End-Terrace,2023-04-24,264.0,N,England and Wales: 1900-1929,Owner-occupied,1009311000.0,50.374284,-3.98903
2,House,Detached,2011-11-26,201.5,N,England and Wales: 1996-2002,rental (private),1009311000.0,50.386777,-3.666289
3,House,Detached,2020-05-29,450.0,N,England and Wales: before 1900,owner-occupied,1009311000.0,50.404028,-4.019857
4,House,Mid-Terrace,2019-06-18,78.0,N,England and Wales: 2007 onwards,owner-occupied,1009311000.0,50.361612,-3.666855


In [None]:
#house['CONSTRUCTION_AGE_BAND'].value_counts().to_csv('./../../Volumes/Extreme_SSD/WORK/EPC/english_EPC/E_CONSTRUCTION_AGE_BAND.csv')

In [6]:
# Left join: every certificate is kept; UPRNs absent from `lookup` get no LSOA21CD.
house = pd.merge(house, lookup, how = 'left', on = 'UPRN')
house.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18515781 entries, 0 to 18515780
Data columns (total 11 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   PROPERTY_TYPE          object 
 1   BUILT_FORM             object 
 2   LODGEMENT_DATE         object 
 3   TOTAL_FLOOR_AREA       float64
 4   MAINS_GAS_FLAG         object 
 5   CONSTRUCTION_AGE_BAND  object 
 6   TENURE                 object 
 7   UPRN                   float64
 8   LATITUDE               float64
 9   LONGITUDE              float64
 10  LSOA21CD               object 
dtypes: float64(4), object(7)
memory usage: 1.5+ GB


In [7]:
house = essen_clean(house)

In [8]:
age_lookup = pd.read_csv('./../../Volumes/Extreme_SSD/WORK/EPC/age_lookup.csv')
age_dict = age_lookup.set_index('CONSTRUCTION_AGE_BAND')['Category2'].to_dict()

In [9]:
house['CONSTRUCTION_AGE_BAND'].map(age_dict).unique()

array(['before_1929', '2003_onwards', '1930-1949', nan, '1976-2002',
       '1950-1975'], dtype=object)

In [10]:
house['CONSTRUCTION_AGE_BAND'] = house['CONSTRUCTION_AGE_BAND'].map(age_dict)
house['count'] = 1
house = house.reset_index()

In [11]:
house['BUILT_FORM'] = np.where(house['BUILT_FORM'] == 'NO DATA!', np.nan,house['BUILT_FORM'] )
house['BUILT_FORM'] = np.where(house['BUILT_FORM'] =='Enclosed Mid-Terrace', np.nan,house['BUILT_FORM'] )
house['BUILT_FORM'] = np.where(house['BUILT_FORM'] == 'Enclosed End-Terrace', np.nan,house['BUILT_FORM'] )

In [12]:
dict_ten = {'Owner-occupied':'Owner-occupied', 
'rental (private)':'Rented (private)', 
'owner-occupied':'Owner-occupied',
'Rented (private)':'Rented (private)', 
'unknown': np.nan, 
'Rented (social)':'Rented (social)',
'rental (social)':'Rented (social)',
'Not defined - use in the case of a new dwelling for which the intended tenure in not known. It is not to be used for an existing dwelling':np.nan,
'NO DATA!': np.nan}

house['TENURE'] = house['TENURE'].map(dict_ten)

In [13]:
gas = {'N':0,'Y':1}
house['MAINS_GAS_FLAG'] = house['MAINS_GAS_FLAG'].map(gas)

In [15]:
house_g = house[house['MAINS_GAS_FLAG'] == 1]

In [16]:
# Property types counted per LSOA, with the name of each output column.
property_type_names = {
    'Bungalow': 'Bungalow_sum', 'Flat': 'Flat_sum',
    'House': 'House_sum', 'Maisonette': 'Maisonette_sum',
    'Park home': 'Park_home_sum',
}


def _property_type_counts(frame):
    """Count certificates of each property type per LSOA21CD."""
    # One row per certificate, one column per property type (1 where it applies).
    one_hot = frame.loc[:, ['PROPERTY_TYPE', 'UPRN', 'LSOA21CD', 'count', 'index']].pivot(
        index = ['index', 'LSOA21CD'], columns = 'PROPERTY_TYPE', values = 'count'
    ).reset_index()
    wanted = ['LSOA21CD'] + list(property_type_names)
    per_lsoa = one_hot.loc[:, wanted].groupby('LSOA21CD').sum().reset_index()
    return per_lsoa.rename(columns = property_type_names)


PROP_TYPE = _property_type_counts(house)
PROP_TYPE_g = _property_type_counts(house_g)

In [17]:
# Built forms counted per LSOA, with the name of each output column.
built_form_names = {
    'Detached': 'Detached_sum', 'End-Terrace': 'End-Terrace_sum',
    'Mid-Terrace': 'Mid-Terrace_sum', 'Semi-Detached': 'Semi-Detached_sum',
}


def _built_form_counts(frame):
    """Count certificates of each built form per LSOA21CD."""
    # One row per certificate, one column per built form (1 where it applies).
    one_hot = frame.loc[:, ['BUILT_FORM', 'UPRN', 'LSOA21CD', 'count', 'index']].pivot(
        index = ['index', 'LSOA21CD'], columns = 'BUILT_FORM', values = 'count'
    ).reset_index()
    wanted = ['LSOA21CD'] + list(built_form_names)
    per_lsoa = one_hot.loc[:, wanted].groupby('LSOA21CD').sum().reset_index()
    return per_lsoa.rename(columns = built_form_names)


BUILD = _built_form_counts(house)
BUILD_g = _built_form_counts(house_g)

In [18]:
# Construction-age categories counted per LSOA, with the name of each output column.
age_band_names = {
    '1976-2002': '1976-2002_sum', '1930-1949': '1930-1949_sum',
    'before_1929': 'before_1929_sum', '1950-1975': '1950-1975_sum',
    '2003_onwards': '2003_onwards_sum',
}


def _age_band_counts(frame):
    """Count certificates in each construction-age category per LSOA21CD."""
    # One row per certificate, one column per age category (1 where it applies).
    one_hot = frame.loc[:, ['CONSTRUCTION_AGE_BAND', 'UPRN', 'LSOA21CD', 'count', 'index']].pivot(
        index = ['index', 'LSOA21CD'], columns = 'CONSTRUCTION_AGE_BAND', values = 'count'
    ).reset_index()
    wanted = ['LSOA21CD'] + list(age_band_names)
    per_lsoa = one_hot.loc[:, wanted].groupby('LSOA21CD').sum().reset_index()
    return per_lsoa.rename(columns = age_band_names)


AGE = _age_band_counts(house)

AGE_g = _age_band_counts(house_g)

In [19]:
# Tenure categories counted per LSOA, with the name of each output column.
tenure_names = {
    'Owner-occupied': 'Owner-occupied_sum',
    'Rented (private)': 'Rented (private)_sum',
    'Rented (social)': 'Rented (social)_sum',
}


def _tenure_counts(frame):
    """Count certificates of each tenure category per LSOA21CD."""
    # One row per certificate, one column per tenure (1 where it applies).
    one_hot = frame.loc[:, ['TENURE', 'UPRN', 'LSOA21CD', 'count', 'index']].pivot(
        index = ['index', 'LSOA21CD'], columns = 'TENURE', values = 'count'
    ).reset_index()
    wanted = ['LSOA21CD'] + list(tenure_names)
    per_lsoa = one_hot.loc[:, wanted].groupby('LSOA21CD').sum().reset_index()
    return per_lsoa.rename(columns = tenure_names)


TENURE = _tenure_counts(house)
TENURE_g = _tenure_counts(house_g)

In [20]:
agg = round(house.groupby(['LSOA21CD']
                  ).aggregate({'TOTAL_FLOOR_AREA':['median','mean'],
                               'MAINS_GAS_FLAG':['sum'],
                              'count':['count']}).reset_index(),2)
agg.columns = [c[0] + "_" + c[1] for c in agg.columns]
agg = agg.rename(columns = {'LSOA21CD_':'LSOA21CD'})

agg = agg.merge(TENURE, on = 'LSOA21CD'
          ).merge(BUILD, on = 'LSOA21CD'
                  ).merge(PROP_TYPE, on = 'LSOA21CD'
                          ).merge(AGE, on = 'LSOA21CD')

agg_g = round(house_g.groupby(['LSOA21CD']
                  ).aggregate({'TOTAL_FLOOR_AREA':['median','mean'],
                               'MAINS_GAS_FLAG':['sum'],
                              'count':['count']}).reset_index(),2)
agg_g.columns = [c[0] + "_" + c[1] for c in agg_g.columns]
agg_g = agg_g.rename(columns = {'LSOA21CD_':'LSOA21CD'})

agg_g = agg_g.merge(TENURE_g, on = 'LSOA21CD'
          ).merge(BUILD_g, on = 'LSOA21CD'
                  ).merge(PROP_TYPE_g, on = 'LSOA21CD'
                          ).merge(AGE_g, on = 'LSOA21CD')

In [21]:
agg.to_csv('./../../Volumes/Extreme_SSD/WORK/EPC/english_EPC/LSOA_house.csv')
agg_g.to_csv('./../../Volumes/Extreme_SSD/WORK/EPC/english_EPC/LSOA_GAS_house.csv')

In [None]:
del house, house_g, agg, agg_g

# LOSS

In [None]:
loss = load_compile_epc('./../../Volumes/Extreme_SSD/WORK/EPC/english_EPC/cleaned_loss/*')
loss.head()

In [None]:
loss = loss.merge(lookup, how = 'left', on = 'UPRN')
loss.info()

In [None]:
loss = essen_clean(loss)
loss['count'] = 1

In [None]:
# Roof indicators from keyword matches in ROOF_DESCRIPTION (case-insensitive).
# na=False: a missing description is NOT a match. With na=True every certificate
# without a roof description was flagged as both insulated and thatched, which
# inflated the per-LSOA sums (nothing resets those rows afterwards).
# NOTE(review): 'insulat' would also match wording such as "no insulation" if the
# descriptions contain it - confirm against the actual ROOF_DESCRIPTION values.
keywords = ['insulat', 'inswleid']
# Create a regex pattern for the keywords
pattern = '|'.join(keywords)
# Use str.contains to create the binary column
loss['Roof_insulati'] = loss['ROOF_DESCRIPTION'].str.contains(pattern, case=False, na=False).astype(int)

keywords = ['thatch']
# Create a regex pattern for the keywords
pattern = '|'.join(keywords)
# Use str.contains to create the binary column
loss['Roof_thatch'] = loss['ROOF_DESCRIPTION'].str.contains(pattern, case=False, na=False).astype(int)

In [None]:
loss['PHOTO_SUPPLY_bi'] = np.where(loss['PHOTO_SUPPLY'] > 0 , 1 , 0)

In [None]:
loss_g = loss[loss['MAINS_GAS_FLAG'] == 'Y']

In [None]:
agg = round(loss.groupby(['LSOA21CD']
                  ).aggregate({'MULTI_GLAZE_PROPORTION':['median','mean'],
                               'EXTENSION_COUNT':['median','mean'],
                               'PHOTO_SUPPLY': ['median','mean'],
                               'PHOTO_SUPPLY_bi':['sum'],
                               'Roof_insulati':['sum'],
                               'Roof_thatch':['sum'],
                              'count':['count']}).reset_index(),2)
agg.columns = [c[0] + "_" + c[1] for c in agg.columns]
agg = agg.rename(columns = {'LSOA21CD_':'LSOA21CD'})

agg_g = round(loss_g.groupby(['LSOA21CD']
                  ).aggregate({'MULTI_GLAZE_PROPORTION':['median','mean'],
                               'EXTENSION_COUNT':['median','mean'],
                               'PHOTO_SUPPLY': ['median','mean'],
                               'PHOTO_SUPPLY_bi':['sum'],
                               'Roof_insulati':['sum'],
                               'Roof_thatch':['sum'],
                              'count':['count']}).reset_index(),2)
agg_g.columns = [c[0] + "_" + c[1] for c in agg_g.columns]
agg_g = agg_g.rename(columns = {'LSOA21CD_':'LSOA21CD'})

In [None]:
agg.info()

In [None]:
agg_g.info()

In [None]:
agg.to_csv('./../../Volumes/Extreme_SSD/WORK/EPC/english_EPC/LSOA_loss.csv')
agg_g.to_csv('./../../Volumes/Extreme_SSD/WORK/EPC/english_EPC/LSOA_GAS_loss.csv')

In [None]:
del loss, loss_g, agg, agg_g

# OTHER

In [None]:
other = load_compile_epc('./../../Volumes/Extreme_SSD/WORK/EPC/english_EPC/cleaned_other/*')
other.head()

In [None]:
other = other.merge(lookup, how = 'left', on = 'UPRN')
other = essen_clean(other)
other['count'] = 1

In [None]:
other['PROPERTY_TYPE'].value_counts()

In [None]:
other['TOP_FLOOR'] = np.where(other['FLOOR_LEVEL']=='top floor',1,0)

In [None]:
other['TOP_FLOOR'].value_counts()

In [None]:
other['TOP_FLOOR_FLAT'] = np.where((other['TOP_FLOOR'] == 1) & (other['PROPERTY_TYPE'].isin(['Flat','Maisonette'])), 1,0)

In [None]:
dict_tr = {'marketed sale' :'other', 'ECO assessment' :'G_deal', 'rental' :'other', 'rental (private)' :'other',
       'new dwelling' :'other', 'FiT application' :'G_deal', 'none of the above' :'other',
       'rental (social)' :'other', 'assessment for green deal' :'G_deal',
       'Stock condition survey' :'other', 'non marketed sale' :'other',
       'not sale or rental' :'other', 'RHI application' :'G_deal', 'following green deal' :'G_deal',
       'Stock Condition Survey' :'other', 'unknown' :'other',
       'rental (private) - this is for backwards compatibility only and should not be used' :'other',
       'not recorded - this is for backwards compatibility only and should not be used' :'other',
       'rental (social) - this is for backwards compatibility only and should not be used':'other'}

other['TRAN_TYPE'] =  other['TRANSACTION_TYPE'].map(dict_tr)
other = other.reset_index()

In [None]:
other_g = other[other['MAINS_GAS_FLAG'] == 'Y']

In [None]:
# Transaction groups counted per LSOA, with the name of each output column.
tran_type_names = {'other': 'TRANS_TYPE_other_sum', 'G_deal': 'TRANS_TYPE_govdeal_sum'}


def _tran_type_counts(frame):
    """Count certificates in each transaction group per LSOA21CD."""
    # One row per certificate, one column per transaction group (1 where it applies).
    one_hot = frame.loc[:, ['TRAN_TYPE', 'UPRN', 'LSOA21CD', 'count', 'index']].pivot(
        index = ['index', 'LSOA21CD'], columns = 'TRAN_TYPE', values = 'count'
    ).reset_index()
    wanted = ['LSOA21CD'] + list(tran_type_names)
    per_lsoa = one_hot.loc[:, wanted].groupby('LSOA21CD').sum().reset_index()
    return per_lsoa.rename(columns = tran_type_names)


TRAN_TYPE = _tran_type_counts(other)

TRAN_TYPE_g = _tran_type_counts(other_g)

In [None]:
agg = round(other.groupby(['LSOA21CD']
                  ).aggregate({'TOP_FLOOR_FLAT':'sum',
                              'count':['count']}).reset_index(),2)
agg.columns = [c[0] + "_" + c[1] for c in agg.columns]
agg = agg.rename(columns = {'LSOA21CD_':'LSOA21CD'})

agg_g = round(other_g.groupby(['LSOA21CD']
                  ).aggregate({'TOP_FLOOR_FLAT':'sum',
                              'count':['count']}).reset_index(),2)
agg_g.columns = [c[0] + "_" + c[1] for c in agg_g.columns]
agg_g = agg_g.rename(columns = {'LSOA21CD_':'LSOA21CD'})

In [None]:
agg = agg.merge(TRAN_TYPE, on = 'LSOA21CD')
agg_g = agg_g.merge(TRAN_TYPE_g, on = 'LSOA21CD')

In [None]:
agg_g.info()

In [None]:
agg_g[agg_g['count_count'] >5]

In [None]:
agg.to_csv('./../../Volumes/Extreme_SSD/WORK/EPC/english_EPC/LSOA_other.csv')
agg_g.to_csv('./../../Volumes/Extreme_SSD/WORK/EPC/english_EPC/LSOA_GAS_other.csv')