# Seattle Building Data Pre-processing
## Team 5 - Connor, John, Libby, & Natalie

This file contains the code used to preprocess and clean our selected dataset. The final step will be outputting a cleaned and processed dataset as a new .csv file.

#### Setup

In [1]:
#Makes paths work if you just clone or pull the repo
import os
os.chdir('../')
os.getcwd()

'c:\\Users\\jfsal\\Documents\\DATA422'

In [2]:
#Install Required Libraries (Only needs to run once)
%pip install -q pandas numpy 

Note: you may need to restart the kernel to use updated packages.


In [6]:
#import libraries
import pandas as pd

#### Import Data

In [7]:
#Paths
PATH_TO_DATASET = "Data/Seattle Building Energy Benchmarking/2022_Building_Energy_Benchmarking_20240906.csv"

In [8]:
#Import Dataset
building_DF = pd.read_csv(PATH_TO_DATASET) #raw data

print(building_DF.columns)

Index(['OSEBuildingID', 'DataYear', 'BuildingName', 'BuildingType',
       'TaxParcelIdentificationNumber', 'Address', 'City', 'State', 'ZipCode',
       'Latitude', 'Longitude', 'Neighborhood', 'CouncilDistrictCode',
       'YearBuilt', 'NumberofFloors', 'NumberofBuildings', 'PropertyGFATotal',
       'PropertyGFABuilding(s)', 'PropertyGFAParking', 'ENERGYSTARScore',
       'SiteEUIWN(kBtu/sf)', 'SiteEUI(kBtu/sf)', 'SiteEnergyUse(kBtu)',
       'SiteEnergyUseWN(kBtu)', 'SourceEUIWN(kBtu/sf)', 'SourceEUI(kBtu/sf)',
       'EPAPropertyType', 'LargestPropertyUseType',
       'LargestPropertyUseTypeGFA', 'SecondLargestPropertyUseType',
       'SecondLargestPropertyUseTypeGFA', 'ThirdLargestPropertyUseType',
       'ThirdLargestPropertyUseTypeGFA', 'Electricity(kWh)', 'SteamUse(kBtu)',
       'NaturalGas(therms)', 'ComplianceStatus', 'ComplianceIssue',
       'Electricity(kBtu)', 'NaturalGas(kBtu)', 'TotalGHGEmissions',
       'GHGEmissionsIntensity'],
      dtype='object')


#### Data Cleaning

In [9]:
#Non-compliant Buildings
compliant_buildings = building_DF[building_DF['ComplianceStatus'] == 'Compliant']
print(building_DF.shape[0] - compliant_buildings.shape[0], 'Dropped Rows')

#uncomment to save change (im lazy will delete later)
#building_DF = compliant_buildings

547 Dropped Rows


In [10]:
#Drop Columns That We Decided Not To Use
Dropped_Columns = ['TaxParcelIdentificationNumber', 'City', 'State', 'CouncilDistrictCode', 'PropertyGFABuilding(s)', 
                   'PropertyGFAParking', 'SiteEUIWN(kBtu/sf)', 'SiteEnergyUse(kBtu)', 'SiteEnergyUseWN(kBtu)', 
                   'SourceEUIWN(kBtu/sf)', 'LargestPropertyUseType', 'LargestPropertyUseTypeGFA', 
                   'SecondLargestPropertyUseType', 'SecondLargestPropertyUseTypeGFA', 'ThirdLargestPropertyUseType',
                   'ThirdLargestPropertyUseTypeGFA', 'Electricity(kWh)', 'NaturalGas(therms)', 'TotalGHGEmissions']
#Drop Listed Columns
df_after_drop = building_DF.drop(columns=Dropped_Columns)

In [11]:
# Column wise Null counts
column_nulls = df_after_drop.isnull().sum()
sorted_column_nulls = column_nulls[column_nulls > 0].sort_values(ascending=False)
column_dtypes = df_after_drop.dtypes
nulls_and_dtypes = pd.DataFrame({
    'Null Count': sorted_column_nulls,
    'Data Type': column_dtypes[sorted_column_nulls.index]
})

print(nulls_and_dtypes)

                       Null Count Data Type
SteamUse(kBtu)               3517   float64
NaturalGas(kBtu)             1669   float64
ENERGYSTARScore              1174   float64
SourceEUI(kBtu/sf)            458   float64
SiteEUI(kBtu/sf)              458   float64
EPAPropertyType               234    object
GHGEmissionsIntensity         209   float64
Electricity(kBtu)             208   float64
Neighborhood                    1    object


In [12]:
def handle_null_values(b_df):
    '''This function takes the building dataframe and returns a new dataframe with the null values handled'''
    b_df['SteamUse(kBtu)'] = b_df['SteamUse(kBtu)'].fillna(0.0) #Buildings that don't have steam have zero steam use
    b_df['NaturalGas(kBtu)'] = b_df['NaturalGas(kBtu)'].fillna(0.0) #Buildings that don't have natural gas use zero natural gas
    b_df['ENERGYSTARScore'] = b_df['ENERGYSTARScore'] #not sure what to do with this yet
    b_df['SourceEUI'] = b_df['SourceEUI'].fillna(b_df['SourceEUI'].mean()) #Fill with mean
    b_df['SiteEUI'] = b_df['SiteEUI'].fillna(b_df['SiteEUI'].mean()) #Fill with mean
    b_df['EPAPropertyType'] = b_df['EPAPropertyType'].fillna('Other') #Fill with 'Other'
    b_df = b_df['GHGEmissionsIntensity'].fillna(b_df['EmissionsIntensity'].mean()) #drop Rows with Nulls
    b_df['Electricity(kBtu)'] = b_df['Electricity(kBtu)'].fillna(b_df['Electricity(kBtu)'].mean()) #leaving as zero electricity use for now but needs to be fixed
    b_df = b_df.dropna(subset=['Neighborhood']) #drop the single row with a null, goodbye :(
        
    return b_df

df_nulls_cleaned = handle_null_values(df_after_drop)

df_nulls_cleaned.to_csv('Data/Building_Data_Cleaned.csv', index=False)

KeyError: 'SourceEUI'

#### Processing Steps

In [None]:
clean_data = pd.read_csv('Data/Building_Data_Cleaned.csv')
#encoding
from sklearn.preprocessing import StandardScaler, OneHotEncoder

cat_cols = []
num_cols = []


#### Export Processed Dataset

In [None]:
print(building_DF)
building_DF.to_csv('Data/Building_Data_PP.csv', index=False)