In [94]:
from etl.extract import ProjectZero
from etl.utilities import Utilities
from scipy.stats import zscore
import pandas as pd

design_data = ProjectZero().get_data()['hz_model'].copy()
training_data = Utilities().get_training_data().copy()

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [95]:
training_data.head(3)

Unnamed: 0,building_typology,building_gfa,primary_gfa,secondary_typology,secondary_gfa,year_built,occupancy,num_buildings,electricity_demmand
0,office,169416,164754.0,bank_branch,4662.0,1909,95,1,1920103.6
1,school,94380,94380.0,school,94380.0,1963,100,1,180640.0
3,hotel,50000,50000.0,hotel,50000.0,1994,100,1,579335.2


In [96]:
design_data.head()

Unnamed: 0,ID,Typology,Area,Volume,Plot,Building,Envelope,Elevation,Colour
0,37108380-bb59-4614-9e39-2de7f9036322,Retail,400,1400,3,Tower 03,280.0,0,#FF5442
1,a3b644fb-21ee-468c-b31f-fae4b5662842,Retail,400,1400,3,Podium 03,280.0,0,#FF5442
2,0ea705aa-80aa-4619-9d9a-b86fd3f0c322,Retail,400,1400,2,Tower 02,280.0,0,#FF5442
3,36d9da83-6389-45b1-8b66-5300aae9d977,Retail,400,1400,1,Tower 01,280.0,0,#FF5442
4,56a2686a-5346-4f86-8b84-d9f7de354eaf,Community,400,1400,1,Podium 01,280.0,0,#96D200


In [97]:
# drop irrelevant columns
design_data.drop(columns=['ID','Volume','Elevation','Colour', 'Envelope'], inplace=True)

In [98]:
design_data.Building.unique()

array(['Tower 03', 'Podium 03', 'Tower 02', 'Tower 01', 'Podium 01',
       'Podium 02'], dtype=object)

In [99]:
# Rank areas within each building type
design_data['Area_Rank'] = design_data.groupby('Building')['Area'].rank(ascending=False)

# Sort by building and area rank
design_data = design_data.sort_values(by=['Building', 'Area_Rank'])

# Rank areas within each building and typology
design_data['Area_Rank'] = design_data.groupby(['Building', 'Typology'])['Area'].rank(ascending=False)

# Sort by building, typology, and area rank
design_data = design_data.sort_values(by=['Building', 'Typology', 'Area_Rank'])

# Set index before groupby to preserve all rows
design_data.set_index(['Plot', 'Building', 'Typology', 'Area_Rank'], inplace=True)

# Group by index and calculate the sum
result_df = design_data.groupby(level=[0, 1, 2, 3]).sum()

# Reset index to bring back the original DataFrame structure
result_df.reset_index(inplace=True)
result_df

Unnamed: 0,Plot,Building,Typology,Area_Rank,Area
0,1,Podium 01,Community,1.0,400
1,1,Tower 01,Office,3.0,2000
2,1,Tower 01,Retail,1.0,400
3,2,Podium 02,Education,1.0,400
4,2,Tower 02,Residential,2.0,1200
5,2,Tower 02,Retail,1.0,400
6,3,Podium 03,Retail,1.0,400
7,3,Tower 03,Hotel,1.5,800
8,3,Tower 03,Retail,1.0,400


In [100]:
# Find the index of the maximum 'Area_Rank' within each group
max_rank_index = result_df.groupby(['Plot', 'Building'])['Area_Rank'].idxmax()

# Create a new DataFrame with the highest ranked Typology and Area for each building
primary_asset_df = result_df.loc[max_rank_index, ['Plot', 'Building', 'Typology', 'Area_Rank', 'Area']].reset_index(drop=True)

# Rename the columns for clarity
primary_asset_df.rename(columns={'Typology': 'Primary_Asset', 'Area': 'Primary_Asset_Area'}, inplace=True)

# Merge the primary_asset_df back to the original DataFrame without automatic renaming of 'Area_Rank'
result_df = pd.merge(result_df, primary_asset_df, left_on=['Plot', 'Building', 'Area_Rank'], right_on=['Plot', 'Building', 'Area_Rank'], how='left')

# Exclude the rows corresponding to the maximum rank
result_df_excluded_max = result_df[~result_df.index.isin(max_rank_index)]

# Find the index of the second maximum 'Area_Rank' within each group in the remaining rows
second_max_rank_index = result_df_excluded_max.groupby(['Plot', 'Building'])['Area_Rank'].idxmax()

# Create a new DataFrame with the second highest ranked Typology and Area for each building
second_primary_asset_df = result_df_excluded_max.loc[second_max_rank_index, ['Plot', 'Building', 'Typology', 'Area_Rank', 'Area']].reset_index(drop=True)

# Rename the columns for clarity
second_primary_asset_df.rename(columns={'Typology': 'Second_Asset', 'Area': 'Second_Asset_Area'}, inplace=True)

# Merge the second_primary_asset_df back to the original DataFrame without automatic renaming of 'Area_Rank'
result_df = pd.merge(result_df, second_primary_asset_df, left_on=['Plot', 'Building', 'Area_Rank'], right_on=['Plot', 'Building', 'Area_Rank'], how='left')
result_df


Unnamed: 0,Plot,Building,Typology,Area_Rank,Area,Primary_Asset,Primary_Asset_Area,Second_Asset,Second_Asset_Area
0,1,Podium 01,Community,1.0,400,Community,400.0,,
1,1,Tower 01,Office,3.0,2000,Office,2000.0,,
2,1,Tower 01,Retail,1.0,400,,,Retail,400.0
3,2,Podium 02,Education,1.0,400,Education,400.0,,
4,2,Tower 02,Residential,2.0,1200,Residential,1200.0,,
5,2,Tower 02,Retail,1.0,400,,,Retail,400.0
6,3,Podium 03,Retail,1.0,400,Retail,400.0,,
7,3,Tower 03,Hotel,1.5,800,Hotel,800.0,,
8,3,Tower 03,Retail,1.0,400,,,Retail,400.0


In [107]:
result_df.groupby(['Plot', 'Building']).sum().reset_index(drop=False)

  result_df.groupby(['Plot', 'Building']).sum().reset_index(drop=False)


Unnamed: 0,Plot,Building,Area_Rank,Area,Primary_Asset_Area,Second_Asset_Area
0,1,Podium 01,1.0,400,400.0,0.0
1,1,Tower 01,4.0,2400,2000.0,400.0
2,2,Podium 02,1.0,400,400.0,0.0
3,2,Tower 02,3.0,1600,1200.0,400.0
4,3,Podium 03,1.0,400,400.0,0.0
5,3,Tower 03,2.5,1200,800.0,400.0
