---

### Load Data

In [402]:
%load_ext autoreload
%autoreload 2

# import modules
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plot

from etl.extract import ProjectZero

# import projectzero data
from etl.extract import ProjectZero
data = ProjectZero().get_data()

# df_model instance
df_nyc = data['ext_nyc']
df_seattle = data['ext_seattle'].copy()
pd.options.display.float_format = '{:.2f}'.format


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [403]:
# NYC columns
for i in list(df_nyc.columns):
    print(i)

Unnamed: 0
Order
Property Id
Property Name
Parent Property Id
Parent Property Name
BBL - 10 digits
NYC Borough, Block and Lot (BBL) self-reported
NYC Building Identification Number (BIN)
Address 1 (self-reported)
Address 2 (self-reported)
Postal Code
Street Number
Street Name
Borough
DOF Gross Floor Area (ft²)
Self-Reported Gross Floor Area (ft²)
Primary Property Type - Self Selected
List of All Property Use Types at Property
Largest Property Use Type
Largest Property Use Type - Gross Floor Area (ft²)
2nd Largest Property Use Type
2nd Largest Property Use - Gross Floor Area (ft²)
3rd Largest Property Use Type
3rd Largest Property Use Type - Gross Floor Area (ft²)
Year Built
Number of Buildings
Occupancy
Metered Areas (Energy)
Metered Areas  (Water)
ENERGY STAR Score
Source EUI (kBtu/ft²)
Weather Normalized Source EUI (kBtu/ft²)
Site EUI (kBtu/ft²)
Weather Normalized Site EUI (kBtu/ft²)
Weather Normalized Site Electricity Intensity (kWh/ft²)
Weather Normalized Site Natural Gas Intensity

In [404]:
# Seattle Columns
for i in list(df_seattle.columns):
    print(i)

OSEBuildingID
DataYear
BuildingType
PrimaryPropertyType
PropertyName
TaxParcelIdentificationNumber
Location
CouncilDistrictCode
Neighborhood
YearBuilt
NumberofBuildings
NumberofFloors
PropertyGFATotal
PropertyGFAParking
PropertyGFABuilding(s)
ListOfAllPropertyUseTypes
LargestPropertyUseType
LargestPropertyUseTypeGFA
SecondLargestPropertyUseType
SecondLargestPropertyUseTypeGFA
ThirdLargestPropertyUseType
ThirdLargestPropertyUseTypeGFA
YearsENERGYSTARCertified
ENERGYSTARScore
SiteEUI(kBtu/sf)
SiteEUIWN(kBtu/sf)
SourceEUI(kBtu/sf)
SourceEUIWN(kBtu/sf)
SiteEnergyUse(kBtu)
SiteEnergyUseWN(kBtu)
SteamUse(kBtu)
Electricity(kWh)
Electricity(kBtu)
NaturalGas(therms)
NaturalGas(kBtu)
OtherFuelUse(kBtu)
GHGEmissions(MetricTonsCO2e)
GHGEmissionsIntensity(kgCO2e/ft2)
DefaultData
Comment
ComplianceStatus
Outlier
2010 Census Tracts
Seattle Police Department Micro Community Policing Plan Areas
City Council Districts
SPD Beats
Zip Codes


In [405]:
seattle_features = [
    'PrimaryPropertyType',
    'NumberofFloors',
    'PropertyGFABuilding(s)',
    'PropertyGFAParking',
    'YearBuilt',
    'BuildingType',
    'Electricity(kWh)']

df_seattle = df_seattle[seattle_features]

nyc_features = [
    'Primary Property Type - Self Selected',
    'Self-Reported Gross Floor Area (ft²)',
    'Year Built',
    'Electricity Use - Grid Purchase (kWh)',
    'Occupancy',
    'Number of Buildings'
]

df_nyc = df_nyc[nyc_features]

# rename nyc columns to match seattle
df_nyc_renamed = df_nyc.rename(columns={
    'Primary Property Type - Self Selected': 'PrimaryPropertyType' ,
    'Self-Reported Gross Floor Area (ft²)': 'PropertyGFABuilding(s)',
    'Year Built': 'YearBuilt',
    'Electricity Use - Grid Purchase (kWh)': 'Electricity(kWh)',
    'Occupancy': 'Occupancy' 
})

# Concatenate the two DataFrames
merged_df = pd.concat([df_nyc_renamed, df_seattle])

merged_df.isna().sum()
cities_df = merged_df

df_nyc_renamed.dropna(subset=['Electricity(kWh)'], inplace=True)
df_nyc_renamed['Number of Buildings'].unique()

array([  1,  11,   3,   4,   6,   8,   2,   0,  23,   9,   5,  10,  52,
        35,  22,  45, 107,  13,  68,  83,  98,  19,  32,  12,  26,  14,
        43, 126,  33,   7,  15,  31,  56, 161,  16,  17, 101,  41, 111,
        37,  25,  18,  29,  24, 102, 150,  62, 155,  30,  21,  49,  60],
      dtype=int64)

In [406]:
df_nyc_renamed['PrimaryPropertyType'].unique()
    

array(['Office', 'K-12 School', 'Hotel', 'Worship Facility',
       'Multifamily Housing', 'Distribution Center',
       'Refrigerated Warehouse', 'Manufacturing/Industrial Plant',
       'Hospital (General Medical & Surgical)', 'Other',
       'Other - Education', 'Museum',
       'Other - Entertainment/Public Assembly', 'Retail Store',
       'College/University', 'Mixed Use Property', 'Food Service',
       'Non-Refrigerated Warehouse', 'Residence Hall/Dormitory',
       'Laboratory', 'Medical Office',
       'Urgent Care/Clinic/Other Outpatient',
       'Ambulatory Surgical Center', 'Other - Mall',
       'Senior Care Community', 'Pre-school/Daycare',
       'Social/Meeting Hall', 'Performing Arts',
       'Other - Lodging/Residential', 'Other - Specialty Hospital',
       'Self-Storage Facility', 'Financial Office', 'Strip Mall',
       'Prison/Incarceration', 'Fitness Center/Health Club/Gym',
       'Parking', 'Bank Branch', 'Wholesale Club/Supercenter',
       'Data Center', 'Ot

In [407]:
# Encode 'PrimaryPropertyType' using one-hot encoding
encoded_df = pd.get_dummies(df_nyc_renamed, columns=['PrimaryPropertyType'], prefix='BuildingType')

# Calculate the correlation matrix
correlation_matrix = encoded_df.corr()

# Extract correlations related to 'Electricity(kWh)'
correlation_with_electricity = correlation_matrix['Electricity(kWh)']

# Create a DataFrame with correlations
correlation_df = pd.DataFrame({'BuildingType': correlation_with_electricity.index, 'Correlation': correlation_with_electricity.values})

# Sort correlations in descending order
sorted_correlation_df = correlation_df.sort_values(by='Correlation', ascending=False)

# Display correlations
sorted_correlation_df = sorted_correlation_df[(sorted_correlation_df['Correlation'] > 0.01) | (sorted_correlation_df['Correlation'] < -0.01)]

# Remove the row with 'Electricity(kWh)' as it's perfectly correlated (value 1)
sorted_correlation_df = sorted_correlation_df[sorted_correlation_df['BuildingType'] != 'Electricity(kWh)']

# Remove the row with 'PropertyGFABuilding(s)' as it's also included
sorted_correlation_df = sorted_correlation_df[sorted_correlation_df['BuildingType'] != 'PropertyGFABuilding(s)']
sorted_correlation_df


'''NEED TO TELL IT TO EXTRACT ONLY FROM BUILDING TYPE NOT ALL OTHER FEATURES'''

Unnamed: 0,BuildingType,Correlation
70,BuildingType_Wastewater Treatment Plant,0.25
35,BuildingType_Office,0.23
21,BuildingType_Hospital (General Medical & Surgi...,0.17
4,Number of Buildings,0.12
13,BuildingType_Data Center,0.11
16,BuildingType_Financial Office,0.1
45,BuildingType_Other - Specialty Hospital,0.09
1,YearBuilt,0.09
25,BuildingType_Laboratory,0.08
9,BuildingType_College/University,0.04


In [408]:
# saving buildings with high correlation in a list
high_correlation = list(sorted_correlation_df['BuildingType'])
high_correlation = [building.replace('BuildingType_','') for building in high_correlation]


# filter to show only those with high correlations
df_nyc_renamed = df_nyc_renamed[df_nyc_renamed.PrimaryPropertyType.isin(high_correlation)] 
df_nyc_renamed

Unnamed: 0,PrimaryPropertyType,PropertyGFABuilding(s),YearBuilt,Electricity(kWh),Occupancy,Number of Buildings
0,Office,169416,1909,1920103.60,95,1
1,K-12 School,94380,1963,180640.00,100,1
2,Hotel,125000,1999,2354605.30,85,1
3,Hotel,50000,1994,579335.20,100,1
4,Hotel,50000,2012,299809.90,0,1
...,...,...,...,...,...,...
34349,Self-Storage Facility,69050,2008,123163.60,80,1
34350,Multifamily Housing,32800,1931,92768.00,100,1
34351,Multifamily Housing,32350,1925,81381.00,100,1
34352,Multifamily Housing,33800,1926,184548.00,100,1


### Encoding and Scaling

In [409]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Instantiate the OneHotEncoder
ohe = OneHotEncoder(sparse_output= False) 

# Fit encoder
ohe.fit(df_nyc_renamed[['PrimaryPropertyType']])

# Transform the current "Street" column
df_nyc_renamed[ohe.get_feature_names_out()] = ohe.transform(df_nyc_renamed[['PrimaryPropertyType']])

# Drop the column "Street" which has been encoded
df_nyc_renamed.drop(columns = ['PrimaryPropertyType'], inplace = True)

df_nyc_renamed.head(3)

Unnamed: 0,PropertyGFABuilding(s),YearBuilt,Electricity(kWh),Occupancy,Number of Buildings,PrimaryPropertyType_Ambulatory Surgical Center,PrimaryPropertyType_College/University,PrimaryPropertyType_Courthouse,PrimaryPropertyType_Data Center,PrimaryPropertyType_Enclosed Mall,...,PrimaryPropertyType_Residence Hall/Dormitory,PrimaryPropertyType_Retail Store,PrimaryPropertyType_Self-Storage Facility,PrimaryPropertyType_Senior Care Community,PrimaryPropertyType_Stadium (Open),PrimaryPropertyType_Supermarket/Grocery Store,PrimaryPropertyType_Transportation Terminal/Station,PrimaryPropertyType_Wastewater Treatment Plant,PrimaryPropertyType_Wholesale Club/Supercenter,PrimaryPropertyType_Worship Facility
0,169416,1909,1920103.6,95,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,94380,1963,180640.0,100,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,125000,1999,2354605.3,85,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [410]:
from sklearn.preprocessing import OrdinalEncoder

# finding string features to encode
nan_features = [i for i in df_nyc_renamed.columns if df_nyc_renamed[i].dtype == 'object']

# Instantiate Ordinal Encoder
ordinal_encoder = OrdinalEncoder()

# Fit to features
df_encoded = df_nyc_renamed.copy()
ordinal_encoder.fit(df_encoded[nan_features])

# Transforming categories into ordered numbers
df_encoded[nan_features] = ordinal_encoder.transform(df_encoded[nan_features])
df_encoded.head(3)



Unnamed: 0,PropertyGFABuilding(s),YearBuilt,Electricity(kWh),Occupancy,Number of Buildings,PrimaryPropertyType_Ambulatory Surgical Center,PrimaryPropertyType_College/University,PrimaryPropertyType_Courthouse,PrimaryPropertyType_Data Center,PrimaryPropertyType_Enclosed Mall,...,PrimaryPropertyType_Residence Hall/Dormitory,PrimaryPropertyType_Retail Store,PrimaryPropertyType_Self-Storage Facility,PrimaryPropertyType_Senior Care Community,PrimaryPropertyType_Stadium (Open),PrimaryPropertyType_Supermarket/Grocery Store,PrimaryPropertyType_Transportation Terminal/Station,PrimaryPropertyType_Wastewater Treatment Plant,PrimaryPropertyType_Wholesale Club/Supercenter,PrimaryPropertyType_Worship Facility
0,169416,1909,1920103.6,95,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,94380,1963,180640.0,100,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,125000,1999,2354605.3,85,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [411]:
from sklearn.preprocessing import StandardScaler

# drop property type columns
# df_seattle.drop(columns='PrimaryPropertyType', inplace=True)

# instantiate scaler
s_scaler = StandardScaler()

# instantiate scaled df before transform
df_scaled = df_encoded.copy().dropna()

# define numerical features 
numerical_features = list(df_scaled.columns)
numerical_features.remove('Electricity(kWh)') # remove target before scaling

# fit scaler
s_scaler.fit(df_scaled[numerical_features])

# apply tranform to features
df_scaled[numerical_features] = s_scaler.transform(df_scaled[numerical_features])
df_scaled.head(3)

Unnamed: 0,PropertyGFABuilding(s),YearBuilt,Electricity(kWh),Occupancy,Number of Buildings,PrimaryPropertyType_Ambulatory Surgical Center,PrimaryPropertyType_College/University,PrimaryPropertyType_Courthouse,PrimaryPropertyType_Data Center,PrimaryPropertyType_Enclosed Mall,...,PrimaryPropertyType_Residence Hall/Dormitory,PrimaryPropertyType_Retail Store,PrimaryPropertyType_Self-Storage Facility,PrimaryPropertyType_Senior Care Community,PrimaryPropertyType_Stadium (Open),PrimaryPropertyType_Supermarket/Grocery Store,PrimaryPropertyType_Transportation Terminal/Station,PrimaryPropertyType_Wastewater Treatment Plant,PrimaryPropertyType_Wholesale Club/Supercenter,PrimaryPropertyType_Worship Facility
0,0.26,-1.12,1920103.6,-0.44,-0.07,-0.01,-0.16,-0.03,-0.02,-0.02,...,-0.13,-0.09,-0.09,-0.08,-0.01,-0.06,-0.01,-0.02,-0.02,-0.07
1,-0.11,0.53,180640.0,0.18,-0.07,-0.01,-0.16,-0.03,-0.02,-0.02,...,-0.13,-0.09,-0.09,-0.08,-0.01,-0.06,-0.01,-0.02,-0.02,-0.07
2,0.04,1.63,2354605.3,-1.68,-0.07,-0.01,-0.16,-0.03,-0.02,-0.02,...,-0.13,-0.09,-0.09,-0.08,-0.01,-0.06,-0.01,-0.02,-0.02,-0.07


### Training Model

In [412]:
X = df_scaled[numerical_features]
y = df_scaled['Electricity(kWh)']

from sklearn.linear_model import Ridge, Lasso, LinearRegression

linreg = LinearRegression().fit(X, y)
ridge = Ridge(alpha=0.2).fit(X, y)
lasso = Lasso(alpha=0.2).fit(X, y)

coefs = pd.DataFrame({
    "coef_linreg": pd.Series(linreg.coef_, index = X.columns),
    "coef_ridge": pd.Series(ridge.coef_, index = X.columns),
    "coef_lasso": pd.Series(lasso.coef_, index= X.columns)})\

coefs\
    .applymap(lambda x: int(x))\
    .style.applymap(lambda x: 'color: red' if x == 0 else 'color: white')

coefs.sort_values('coef_linreg', ascending=False, inplace=True)
coefs.head(50)

  model = cd_fast.enet_coordinate_descent(


Unnamed: 0,coef_linreg,coef_ridge,coef_lasso
PrimaryPropertyType_Multifamily Housing,1.4913041835111056e+17,-264566.18,-359060.33
PrimaryPropertyType_Office,9.372012637906558e+16,267076.8,207690.53
PrimaryPropertyType_K-12 School,7.167190184686455e+16,-144273.34,-189687.93
PrimaryPropertyType_College/University,5.112524026328812e+16,40799.65,8404.36
PrimaryPropertyType_Hotel,4.172644103353565e+16,37651.06,11211.33
PrimaryPropertyType_Residence Hall/Dormitory,4.139276445508688e+16,-75270.89,-101499.12
PrimaryPropertyType_Non-Refrigerated Warehouse,4.054574239885763e+16,-91388.01,-117079.48
PrimaryPropertyType_Other,3.1970074219919444e+16,56556.64,36299.28
PrimaryPropertyType_Retail Store,3.062128383816902e+16,34582.75,15180.19
PrimaryPropertyType_Self-Storage Facility,3.039044765162981e+16,-57244.32,-76500.65


In [413]:
# import model type
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

# Instantiate model 
sgd_regressor = SGDRegressor(random_state=3, alpha=0.1)

# define x and y 
new_features = list(df_scaled.columns)
new_features.remove('Electricity(kWh)')

X = df_scaled[new_features]
y = df_scaled['Electricity(kWh)']

# train model 

scores = cross_val_score(sgd_regressor, X, y, cv=3)
r2_score = pd.DataFrame({'score': scores.mean()}, index=[0])
r2_score

Unnamed: 0,score
0,-1725921.72
