# Clean up input files

The input files in data/input_data are from a spreadsheet that RMI gave us. These are not in an ideal format for Pandas modeling so we'll clean them up here.

Files cleaned up:
    
- Scenario 1 & 2 (Jan/June) target GHGI tables
- Initial baseline calculations from RMI
- Buildings data

In [2]:
import pandas as pd
import numpy as np

## Scenarios 1 & 2

Converting these wide tables to long table format.

Each table row will consist of:

- year
- building type
- sq ft (range)
- target GHGI
- max sq ft
- min sq ft

Replace zeros before GHGHI targets set in with NaNs. 

In [9]:
scen_1_raw = pd.read_csv('../data/input_data/input_scenarios_scenario_1_jan.csv', header=None)

In [10]:
scen_1_raw[:30]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,123,124,125,126,127,128,129,130,131,132
0,Year,College/University,College/University,College/University,College/University,College/University,College/University,Entertainment/Public Assembly,Entertainment/Public Assembly,Entertainment/Public Assembly,...,Worship Facility,Worship Facility,Worship Facility,Worship Facility,,,,,,
1,Building sq ft,>220K Buildings,>90-220K Buildings,>50-90K Buildings,>30-50K Buildings,>20-30K Buildings,<20k buildings,>220K Buildings,>90-220K Buildings,>50-90K Buildings,...,>50-90K Buildings,>30-50K Buildings,>20-30K Buildings,<20k buildings,>220K Buildings,>90-220K Buildings,>50-90K Buildings,>30-50K Buildings,>20-30K Buildings,<20k buildings
2,Building sq ft category,A,B,C,D,E,F,A,B,C,...,C,D,E,F,A,B,C,D,E,F
3,2027,2.83,2.83,0,0,0,0,1.25,1.25,0,...,0,0,0,0,0,0,0,0,0,0
4,2028,2.83,2.83,2.83,0,0,0,1.25,1.25,1.25,...,1.19,0,0,0,0,0,0,0,0,0
5,2029,2.83,2.83,2.83,2.83,0,0,1.25,1.25,1.25,...,1.19,1.19,0,0,0,0,0,0,0,0
6,2030,2.83,2.83,2.83,2.83,2.83,0,1.25,1.25,1.25,...,1.19,1.19,1.19,0,0,0,0,0,0,0
7,2031,2,2.83,2.83,2.83,2.83,0,0.89,1.25,1.25,...,1.19,1.19,1.19,0,0,0,0,0,0,0
8,2032,2,2,2.83,2.83,2.83,0,0.89,0.89,1.25,...,1.19,1.19,1.19,0,0,0,0,0,0,0
9,2033,2,2,2,2.83,2.83,0,0.89,0.89,0.89,...,0.84,1.19,1.19,0,0,0,0,0,0,0


In [22]:
scen_1_raw[0][3]

'2027'

In [32]:
def enlongate_scenarios(raw_df):
    col_names = ['year', 'building_type', 'sq_ft', 'sq_ft_classification', 'ghgi']
    long_df = pd.DataFrame(columns=col_names)
    for row_index in range(3, len(raw_df)):
        for col_index in range(1, len(raw_df.columns)):
            year = raw_df[0][row_index]
            building_type = raw_df[col_index][0]
            sq_ft = raw_df[col_index][1]
            sq_ft_classification = raw_df[col_index][2]
            ghgi = raw_df[col_index][row_index]
            long_df.loc[len(long_df)] = [year, building_type, sq_ft, sq_ft_classification, ghgi]
    return long_df

In [34]:
scen_1_formatted = enlongate_scenarios(scen_1_raw)

In [36]:
scen_1_formatted[:20]

Unnamed: 0,year,building_type,sq_ft,sq_ft_classification,ghgi
0,2027,College/University,>220K Buildings,A,2.83
1,2027,College/University,>90-220K Buildings,B,2.83
2,2027,College/University,>50-90K Buildings,C,0.0
3,2027,College/University,>30-50K Buildings,D,0.0
4,2027,College/University,>20-30K Buildings,E,0.0
5,2027,College/University,<20k buildings,F,0.0
6,2027,Entertainment/Public Assembly,>220K Buildings,A,1.25
7,2027,Entertainment/Public Assembly,>90-220K Buildings,B,1.25
8,2027,Entertainment/Public Assembly,>50-90K Buildings,C,0.0
9,2027,Entertainment/Public Assembly,>30-50K Buildings,D,0.0


In [81]:
building_size_params = {
    '>220K Buildings': {
        'min': 220000,
        'max': 1000000 #no buildings over a million
    },
        '>90-220K Buildings': {
        'min': 90000,
        'max': 220000
    },
        '>50-90K Buildings': {
        'min': 50000,
        'max': 90000
    },
        '>30-50K Buildings': {
        'min': 30000,
        'max': 50000
    },
        '>20-30K Buildings': {
        'min': 20000,
        'max': 50000
    },
        '<20k buildings': {
        'min': 0,
        'max': 20000
    },
}

In [82]:
def replace_building_size_params(df):
    # todo: fix this to use series mapping properly
    min_vals = [building_size_params[x]['min'] for x in df['sq_ft']]
    max_vals = [building_size_params[x]['max'] for x in df['sq_ft']]
    df['max_size'] = max_vals
    df['min_size'] = min_vals
    return df

In [85]:
scen_1_formatted = replace_building_size_params(scen_1_formatted)

In [86]:
scen_1_formatted.head()

Unnamed: 0,year,building_type,sq_ft,sq_ft_classification,ghgi,max_size,min_size
0,2027,College/University,>220K Buildings,A,2.83,1000000,220000
1,2027,College/University,>90-220K Buildings,B,2.83,220000,90000
2,2027,College/University,>50-90K Buildings,C,0.0,90000,50000
3,2027,College/University,>30-50K Buildings,D,0.0,50000,30000
4,2027,College/University,>20-30K Buildings,E,0.0,50000,20000


In [94]:
def reset_datatypes_in_ghgi_targets(df):
    types = {
        'year': 'int64',
        'building_type': 'str',
        'sq_ft': 'str',
        'sq_ft_classification': 'str',
        'ghgi': 'float64',
        'max_size': 'int64',
        'min_size': 'int64'
    }
    
    return df.astype(types)

In [95]:
scen_1_formatted = reset_datatypes_in_ghgi_targets(scen_1_formatted)

In [125]:
# original dataset was ghgi = 0 if no compliance requirement yet
# this is confusing, switch to NaN
# also NaN if it's less than 20k sq ft, which isn't covered by the plan

def convert_zero_to_nan_before_compliance_deadline(df):
    first_compliance_yr = 2041
    df['ghgi'][(df['year'] < first_compliance_yr) & (df['ghgi'] == 0)] = pd.NA
    df['ghgi'][df['max_size'] == 20000] = pd.NA
    return df

In [126]:
scen_1_formatted = convert_zero_to_nan_before_compliance_deadline(scen_1_formatted)

scen_1_formatted.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ghgi'][(df['year'] < first_compliance_yr) & (df['ghgi'] == 0)] = pd.NA
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ghgi'][df['max_size'] == 20000] = pd.NA


Unnamed: 0,year,building_type,sq_ft,sq_ft_classification,ghgi,max_size,min_size
0,2027,College/University,>220K Buildings,A,2.83,1000000,220000
1,2027,College/University,>90-220K Buildings,B,2.83,220000,90000
2,2027,College/University,>50-90K Buildings,C,,90000,50000
3,2027,College/University,>30-50K Buildings,D,,50000,30000
4,2027,College/University,>20-30K Buildings,E,,50000,20000


In [127]:
scen_1_formatted[:20]

Unnamed: 0,year,building_type,sq_ft,sq_ft_classification,ghgi,max_size,min_size
0,2027,College/University,>220K Buildings,A,2.83,1000000,220000
1,2027,College/University,>90-220K Buildings,B,2.83,220000,90000
2,2027,College/University,>50-90K Buildings,C,,90000,50000
3,2027,College/University,>30-50K Buildings,D,,50000,30000
4,2027,College/University,>20-30K Buildings,E,,50000,20000
5,2027,College/University,<20k buildings,F,,20000,0
6,2027,Entertainment/Public Assembly,>220K Buildings,A,1.25,1000000,220000
7,2027,Entertainment/Public Assembly,>90-220K Buildings,B,1.25,220000,90000
8,2027,Entertainment/Public Assembly,>50-90K Buildings,C,,90000,50000
9,2027,Entertainment/Public Assembly,>30-50K Buildings,D,,50000,30000


In [111]:
# todo: the <20k buildings have all targets listed at 0

def reformat_scenario_csv(csv_name):
    # takes csv name w path
    df = pd.read_csv(csv_name, header=None)
    df = enlongate_scenarios(df)
    df = replace_building_size_params(df)
    df = reset_datatypes_in_ghgi_targets(df)
    df = convert_zero_to_nan_before_compliance_deadline(df)
    return df

In [128]:
s1 = reformat_scenario_csv('../data/input_data/input_scenarios_scenario_1_jan.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ghgi'][(df['year'] < first_compliance_yr) & (df['ghgi'] == 0)] = pd.NA
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ghgi'][df['max_size'] == 20000] = pd.NA


In [129]:
s1[:20]

Unnamed: 0,year,building_type,sq_ft,sq_ft_classification,ghgi,max_size,min_size
0,2027,College/University,>220K Buildings,A,2.83,1000000,220000
1,2027,College/University,>90-220K Buildings,B,2.83,220000,90000
2,2027,College/University,>50-90K Buildings,C,,90000,50000
3,2027,College/University,>30-50K Buildings,D,,50000,30000
4,2027,College/University,>20-30K Buildings,E,,50000,20000
5,2027,College/University,<20k buildings,F,,20000,0
6,2027,Entertainment/Public Assembly,>220K Buildings,A,1.25,1000000,220000
7,2027,Entertainment/Public Assembly,>90-220K Buildings,B,1.25,220000,90000
8,2027,Entertainment/Public Assembly,>50-90K Buildings,C,,90000,50000
9,2027,Entertainment/Public Assembly,>30-50K Buildings,D,,50000,30000


In [130]:
s1.to_csv('../data/input_data/scen_1_reformatted.csv')

In [131]:
s2 = reformat_scenario_csv('../data/input_data/input_scenarios_scenario_2_june.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ghgi'][(df['year'] < first_compliance_yr) & (df['ghgi'] == 0)] = pd.NA
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ghgi'][df['max_size'] == 20000] = pd.NA


In [132]:
s2['ghgi'][s2['year'] < 2041].unique()

array([ nan, 2.69, 1.18, 2.23, 4.68, 2.06, 0.95, 6.3 , 0.77, 0.81, 2.48,
       3.22, 0.98, 1.16, 5.73, 1.03, 0.31, 2.11, 1.36, 3.42, 1.2 , 0.89,
       1.57, 0.69, 1.3 , 2.73, 0.56, 3.68, 0.45, 0.47, 1.45, 1.88, 0.57,
       0.68, 3.34, 0.6 , 0.18, 1.23, 0.79, 2.  , 0.7 , 0.63])

In [133]:
s2.to_csv('../data/input_data/scen_2_reformatted.csv')

## Look at calculations in analysis tab

Double-check that the calculated baseline emissions looks correct as a dataframe, using the numbers calculated by RMI.

TODO: generate these numbers in Pandas model so that we can change it easily.

In [134]:
baseline_df = pd.read_csv('../data/input_data/RMI-BEPS_Calculator-Analysis-All-Benchmark-Data(2019).csv')

In [135]:
baseline_df.head()

Unnamed: 0,OSEBuildingID,BuildingName,BuildingType,Type of Bulding,PropertyGFATotal,PropertyGFABuilding(s),PropertyGFAParking,Total sqft,% sqft 1st,% sqft 2nd,...,ThirdLargestPropertyUseType,ThirdLargestPropertyUseType OSE,ThirdLargestPropertyUseTypeGFA,ThirdLargestPropertyUseTypeGFA Analysis,Electricity(kBtu),Steam(kBtu),NaturalGas(kBtu),TotalGHGEmissions,GHGEmissionsIntensity,Total GFA
0,1,MAYFLOWER PARK HOTEL,NonResidential,NonResidential,88434,88434,0,88434,100%,0%,...,,,0,0,3871996,2159078,1320791,208.8,2.4,88434
1,2,PARAMOUNT HOTEL,NonResidential,NonResidential,103566,88502,15064,88502,95%,0%,...,Restaurant,Restaurant,4622,4622,2946902,0,5059502,286.9,3.2,103566
2,3,WESTIN HOTEL (Parent Building),NonResidential,NonResidential,956110,759392,196718,756493,100%,0%,...,Swimming Pool,Recreation,0,0,44075841,22601024,1426400,1549.0,2.0,895128
3,5,HOTEL MAX,NonResidential,NonResidential,61320,61320,0,61320,100%,0%,...,,,0,0,2201145,2104444,2013415,232.4,3.8,61320
4,8,WARWICK SEATTLE HOTEL,NonResidential,NonResidential,175580,113580,62000,123445,100%,0%,...,Swimming Pool,Recreation,0,0,5444815,0,10430292,587.5,5.2,191454


In [136]:
baseline_df.columns

Index(['OSEBuildingID', 'BuildingName', 'BuildingType', 'Type of Bulding',
       'PropertyGFATotal', 'PropertyGFABuilding(s)', 'PropertyGFAParking',
       'Total sqft', '% sqft 1st', '% sqft 2nd', '% sqft 3rd',
       'LargestPropertyUseType', 'LargestPropertyUseType OSE',
       'LargestPropertyUseTypeGFA', 'LargestPropertyUseTypeGFA Analysis',
       'SecondLargestPropertyUseType', 'SecondLargestPropertyUseType OSE',
       'SecondLargestPropertyUseTypeGFA',
       'SecondLargestPropertyUseTypeGFA Analysis',
       'ThirdLargestPropertyUseType', 'ThirdLargestPropertyUseType OSE',
       'ThirdLargestPropertyUseTypeGFA',
       'ThirdLargestPropertyUseTypeGFA Analysis', 'Electricity(kBtu)',
       'Steam(kBtu)', 'NaturalGas(kBtu)', 'TotalGHGEmissions',
       'GHGEmissionsIntensity', 'Total  GFA'],
      dtype='object')

In [138]:
baseline_df.columns = ['OSEBuildingID', 'BuildingName', 'BuildingType', 'Type_of_Bulding',
       'PropertyGFATotal', 'PropertyGFABuilding(s)', 'PropertyGFAParking',
       'Total_sqft', 'percent_sqft_1st', 'percent_sqft_2nd', 'percent_sqft_3rd',
       'LargestPropertyUseType', 'LargestPropertyUseType OSE',
       'LargestPropertyUseTypeGFA', 'LargestPropertyUseTypeGFA Analysis',
       'SecondLargestPropertyUseType', 'SecondLargestPropertyUseType OSE',
       'SecondLargestPropertyUseTypeGFA',
       'SecondLargestPropertyUseTypeGFA Analysis',
       'ThirdLargestPropertyUseType', 'ThirdLargestPropertyUseType OSE',
       'ThirdLargestPropertyUseTypeGFA',
       'ThirdLargestPropertyUseTypeGFA Analysis', 'Electricity(kBtu)',
       'Steam(kBtu)', 'NaturalGas(kBtu)', 'TotalGHGEmissions',
       'GHGEmissionsIntensity', 'Total_GFA']

In [11]:
baseline_df.to_csv('../data/input_data/All-Benchmark-Data(2019)_reformatted.csv')

NameError: name 'baseline_df' is not defined

In [12]:
# didn't change Multifamily Housing2 to Multifamily Housing in the property type columns

reformatted_building_df = pd.read_csv('../data/input_data/All-Benchmark-Data(2019)_reformatted.csv')

In [6]:
reformatted_building_df['LargestPropertyUseType OSE'].unique()

array(['Hotel', 'Fire/Police Station', 'Entertainment/Public Assembly',
       'Multifamily Housing2', 'Services', 'Recreation', 'Other',
       'K-12 School', 'College/University', 'Office',
       'Self-Storage Facility', 'Retail Store', 'Senior Living Community',
       'Supermarket/Grocery Store', 'Hospital',
       'Residence Hall/Dormitory', 'Non-Refrigerated Warehouse', nan,
       'Worship Facility', 'Laboratory', 'Restaurant',
       'Refrigerated Warehouse'], dtype=object)

In [8]:
reformatted_building_df['SecondLargestPropertyUseType OSE'].unique()

array([nan, 'Retail Store', 'Office', 'Restaurant', 'K-12 School',
       'Laboratory', 'Non-Refrigerated Warehouse', 'Other', 'Services',
       'Entertainment/Public Assembly', 'Residence Hall/Dormitory',
       'Self-Storage Facility', 'College/University',
       'Supermarket/Grocery Store', 'Recreation', 'Hotel',
       'Multifamily Housing2', 'Refrigerated Warehouse', 'Hospital',
       'Worship Facility', 'Senior Living Community',
       'Fire/Police Station'], dtype=object)

In [17]:
multifamily_map = { 'Multifamily Housing2': 'Multifamily Housing' }

reformatted_building_df['LargestPropertyUseType OSE'] = reformatted_building_df['LargestPropertyUseType OSE'].replace(multifamily_map)

In [18]:
reformatted_building_df['LargestPropertyUseType OSE'].unique()

array(['Hotel', 'Fire/Police Station', 'Entertainment/Public Assembly',
       'Multifamily Housing', 'Services', 'Recreation', 'Other',
       'K-12 School', 'College/University', 'Office',
       'Self-Storage Facility', 'Retail Store', 'Senior Living Community',
       'Supermarket/Grocery Store', 'Hospital',
       'Residence Hall/Dormitory', 'Non-Refrigerated Warehouse', nan,
       'Worship Facility', 'Laboratory', 'Restaurant',
       'Refrigerated Warehouse'], dtype=object)

In [19]:
reformatted_building_df['SecondLargestPropertyUseType OSE'] = reformatted_building_df['SecondLargestPropertyUseType OSE'].replace(multifamily_map)

In [20]:
reformatted_building_df['SecondLargestPropertyUseType OSE'].unique()

array([nan, 'Retail Store', 'Office', 'Restaurant', 'K-12 School',
       'Laboratory', 'Non-Refrigerated Warehouse', 'Other', 'Services',
       'Entertainment/Public Assembly', 'Residence Hall/Dormitory',
       'Self-Storage Facility', 'College/University',
       'Supermarket/Grocery Store', 'Recreation', 'Hotel',
       'Multifamily Housing', 'Refrigerated Warehouse', 'Hospital',
       'Worship Facility', 'Senior Living Community',
       'Fire/Police Station'], dtype=object)

In [21]:
reformatted_building_df['ThirdLargestPropertyUseType OSE'] = reformatted_building_df['ThirdLargestPropertyUseType OSE'].replace(multifamily_map)

reformatted_building_df['ThirdLargestPropertyUseType OSE'].unique()

array([nan, 'Restaurant', 'Recreation', 'Office', 'Hotel',
       'Entertainment/Public Assembly', 'Non-Refrigerated Warehouse',
       'Retail Store', 'Other', 'Services', 'K-12 School',
       'Supermarket/Grocery Store', 'Multifamily Housing', 'Laboratory',
       'Self-Storage Facility', 'Senior Living Community', 'Hospital',
       'Refrigerated Warehouse', 'Worship Facility', 'College/University'],
      dtype=object)

In [22]:
reformatted_building_df.to_csv('../data/input_data/All-Benchmark-Data(2019)_reformatted_8_3.csv')

In [23]:
# wrong file was reformatted
cleaned_building_df = pd.read_csv('../data/input_data/building_data_recleaned.csv')

In [26]:
multifamily_map = { 'Multifamily Housing2': 'Multifamily Housing' }
for column in ['LargestPropertyUseType OSE', 'SecondLargestPropertyUseType OSE', 'ThirdLargestPropertyUseType OSE']:
    cleaned_building_df[column] = cleaned_building_df[column].replace(multifamily_map)
    print(cleaned_building_df[column].unique())

['Hotel' 'Fire/Police Station' 'Entertainment/Public Assembly'
 'Multifamily Housing' 'Services' 'Recreation' 'Other' 'K-12 School'
 'College/University' 'Office' 'Self-Storage Facility' 'Retail Store'
 'Senior Living Community' 'Supermarket/Grocery Store' 'Hospital'
 'Residence Hall/Dormitory' 'Non-Refrigerated Warehouse' nan
 'Worship Facility' 'Laboratory' 'Restaurant' 'Refrigerated Warehouse']
[nan 'Retail Store' 'Office' 'Restaurant' 'K-12 School' 'Laboratory'
 'Non-Refrigerated Warehouse' 'Other' 'Services'
 'Entertainment/Public Assembly' 'Residence Hall/Dormitory'
 'Self-Storage Facility' 'College/University' 'Supermarket/Grocery Store'
 'Recreation' 'Hotel' 'Multifamily Housing' 'Refrigerated Warehouse'
 'Hospital' 'Worship Facility' 'Senior Living Community'
 'Fire/Police Station']
[nan 'Restaurant' 'Recreation' 'Office' 'Hotel'
 'Entertainment/Public Assembly' 'Non-Refrigerated Warehouse'
 'Retail Store' 'Other' 'Services' 'K-12 School'
 'Supermarket/Grocery Store' 'Multifam

In [27]:
cleaned_building_df.to_csv('../data/input_data/building_data_recleaned_8_3.csv')

## Energy emissions factors

Look at predicted emissions factors.

RMI's asssumptions: 

- Electricity--SCL gets to 100% carbon free by 2045 due to state law
- Gas--PSE is unable to decarbonize gas sector
- Steam--District heating is electrified, assuming linear emission factor trend

In [140]:
emissions = pd.read_csv('../data/input_data/energy_emissions.csv')

In [141]:
emissions.head()

Unnamed: 0,Year,Electricity emission factor (kgCO2e/kBtu),Steam emission factor (kgCO2e/kBtu),Gas emission factor (kgCO2e/kBtu)
0,2027,0.0055,0.083,0.053
1,2028,0.0055,0.083,0.053
2,2029,0.0055,0.083,0.053
3,2030,0.0026,0.083,0.053
4,2031,0.0026,0.083,0.053


In [11]:
jan_scen = pd.read_csv('../data/input_data/scen_1_reformatted.csv')
jan_scen['building_type'].unique()

array(['College/University', 'Entertainment/Public Assembly',
       'Fire/Police Station', 'Hospital', 'Hotel', 'K-12 School',
       'Laboratory', 'Multifamily Housing', 'Non-Refrigerated Warehouse',
       'Office', 'Other', 'Recreation', 'Refrigerated Warehouse',
       'Residence Hall/Dormitory', 'Restaurant', 'Retail Store',
       'Self-Storage Facility', 'Senior Living Community', 'Services',
       'Supermarket/Grocery Store', 'Worship Facility', nan], dtype=object)

In [3]:
june_scen = pd.read_csv('../data/input_data/scen_2_reformatted.csv')

In [15]:
june_scen['building_type'].unique()

array(['College/University', 'Entertainment/Public Assembly',
       'Fire/Police Station', 'Hospital', 'Hotel', 'K-12 School',
       'Laboratory', 'Multifamily Housing2', 'Multifamily Housing3',
       'Multifamily Housing4', 'Multifamily Housing5',
       'Multifamily Housing6', 'Multifamily Housing7',
       'Non-Refrigerated Warehouse', 'Office', 'Other', 'Recreation',
       'Refrigerated Warehouse', 'Residence Hall/Dormitory', 'Restaurant',
       'Retail Store', 'Self-Storage Facility', 'Senior Living Community',
       'Services', 'Supermarket/Grocery Store', 'Worship Facility', nan],
      dtype=object)

In [29]:
june_scen = june_scen.replace(to_replace='Multifamily Housing[0-9]',value='Multifamily Housing',regex=True)

In [30]:
june_scen['building_type'].unique()

array(['College/University', 'Entertainment/Public Assembly',
       'Fire/Police Station', 'Hospital', 'Hotel', 'K-12 School',
       'Laboratory', 'Multifamily Housing', 'Non-Refrigerated Warehouse',
       'Office', 'Other', 'Recreation', 'Refrigerated Warehouse',
       'Residence Hall/Dormitory', 'Restaurant', 'Retail Store',
       'Self-Storage Facility', 'Senior Living Community', 'Services',
       'Supermarket/Grocery Store', 'Worship Facility', nan], dtype=object)

In [31]:
june_scen.to_csv('../data/input_data/scen_2_reformatted_8_3.csv')