---

### Load Data

In [26]:
%load_ext autoreload
%autoreload 2

# import modules
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plot

from etl.extract import ProjectZero

# import projectzero data
from etl.extract import ProjectZero
data = ProjectZero().get_data()

# df_model instance
df_nyc = data['ext_nyc']
df_seattle = data['ext_seattle'].copy()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [27]:
# NYC columns
for i in list(df_nyc.columns):
    print(i)

Unnamed: 0
Order
Property Id
Property Name
Parent Property Id
Parent Property Name
BBL - 10 digits
NYC Borough, Block and Lot (BBL) self-reported
NYC Building Identification Number (BIN)
Address 1 (self-reported)
Address 2 (self-reported)
Postal Code
Street Number
Street Name
Borough
DOF Gross Floor Area (ft²)
Self-Reported Gross Floor Area (ft²)
Primary Property Type - Self Selected
List of All Property Use Types at Property
Largest Property Use Type
Largest Property Use Type - Gross Floor Area (ft²)
2nd Largest Property Use Type
2nd Largest Property Use - Gross Floor Area (ft²)
3rd Largest Property Use Type
3rd Largest Property Use Type - Gross Floor Area (ft²)
Year Built
Number of Buildings
Occupancy
Metered Areas (Energy)
Metered Areas  (Water)
ENERGY STAR Score
Source EUI (kBtu/ft²)
Weather Normalized Source EUI (kBtu/ft²)
Site EUI (kBtu/ft²)
Weather Normalized Site EUI (kBtu/ft²)
Weather Normalized Site Electricity Intensity (kWh/ft²)
Weather Normalized Site Natural Gas Intensity

In [28]:
# Seattle Columns
for i in list(df_seattle.columns):
    print(i)

OSEBuildingID
DataYear
BuildingType
PrimaryPropertyType
PropertyName
TaxParcelIdentificationNumber
Location
CouncilDistrictCode
Neighborhood
YearBuilt
NumberofBuildings
NumberofFloors
PropertyGFATotal
PropertyGFAParking
PropertyGFABuilding(s)
ListOfAllPropertyUseTypes
LargestPropertyUseType
LargestPropertyUseTypeGFA
SecondLargestPropertyUseType
SecondLargestPropertyUseTypeGFA
ThirdLargestPropertyUseType
ThirdLargestPropertyUseTypeGFA
YearsENERGYSTARCertified
ENERGYSTARScore
SiteEUI(kBtu/sf)
SiteEUIWN(kBtu/sf)
SourceEUI(kBtu/sf)
SourceEUIWN(kBtu/sf)
SiteEnergyUse(kBtu)
SiteEnergyUseWN(kBtu)
SteamUse(kBtu)
Electricity(kWh)
Electricity(kBtu)
NaturalGas(therms)
NaturalGas(kBtu)
OtherFuelUse(kBtu)
GHGEmissions(MetricTonsCO2e)
GHGEmissionsIntensity(kgCO2e/ft2)
DefaultData
Comment
ComplianceStatus
Outlier
2010 Census Tracts
Seattle Police Department Micro Community Policing Plan Areas
City Council Districts
SPD Beats
Zip Codes


In [29]:
seattle_features = [
    'PrimaryPropertyType',
    'NumberofFloors',
    'PropertyGFABuilding(s)',
    'PropertyGFAParking',
    'YearBuilt',
    'BuildingType',
    'Electricity(kWh)']

df_seattle = df_seattle[seattle_features]

nyc_features = [
    'Primary Property Type - Self Selected',
    'Self-Reported Gross Floor Area (ft²)',
    'Year Built',
    'Electricity Use - Grid Purchase (kWh)',
    'Occupancy']

df_nyc = df_nyc[nyc_features]

# rename nyc columns to match seattle
df_nyc_renamed = df_nyc.rename(columns={
    'Primary Property Type - Self Selected': 'PrimaryPropertyType' ,
    'Self-Reported Gross Floor Area (ft²)': 'PropertyGFABuilding(s)',
    'Year Built': 'YearBuilt',
    'Electricity Use - Grid Purchase (kWh)': 'Electricity(kWh)',
    'Occupancy': 'Occupancy' 
})

# Concatenate the two DataFrames
merged_df = pd.concat([df_nyc_renamed, df_seattle])

merged_df.isna().sum()
cities_df = merged_df

df_nyc_renamed.dropna(subset=['Electricity(kWh)'], inplace=True)
df_nyc_renamed.isna().sum()

PrimaryPropertyType       0
PropertyGFABuilding(s)    0
YearBuilt                 0
Electricity(kWh)          0
Occupancy                 0
dtype: int64

In [30]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Instantiate the OneHotEncoder
ohe = OneHotEncoder(sparse_output= False) 

# Fit encoder
ohe.fit(df_nyc_renamed[['PrimaryPropertyType']])

# Transform the current "Street" column
df_nyc_renamed[ohe.get_feature_names_out()] = ohe.transform(df_nyc_renamed[['PrimaryPropertyType']])

# Drop the column "Street" which has been encoded
df_nyc_renamed.drop(columns = ['PrimaryPropertyType'], inplace = True)

df_nyc_renamed.head(3)

Unnamed: 0,PropertyGFABuilding(s),YearBuilt,Electricity(kWh),Occupancy,PrimaryPropertyType_Adult Education,PrimaryPropertyType_Ambulatory Surgical Center,PrimaryPropertyType_Automobile Dealership,PrimaryPropertyType_Bank Branch,PrimaryPropertyType_College/University,PrimaryPropertyType_Convenience Store without Gas Station,...,PrimaryPropertyType_Stadium (Open),PrimaryPropertyType_Strip Mall,PrimaryPropertyType_Supermarket/Grocery Store,PrimaryPropertyType_Transportation Terminal/Station,PrimaryPropertyType_Urgent Care/Clinic/Other Outpatient,PrimaryPropertyType_Veterinary Office,PrimaryPropertyType_Wastewater Treatment Plant,PrimaryPropertyType_Wholesale Club/Supercenter,PrimaryPropertyType_Worship Facility,PrimaryPropertyType_Zoo
0,169416,1909,1920103.6,95,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,94380,1963,180640.0,100,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,125000,1999,2354605.3,85,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
from sklearn.preprocessing import OrdinalEncoder

# finding string features to encode
nan_features = [i for i in df_nyc_renamed.columns if df_nyc_renamed[i].dtype == 'object']

# Instantiate Ordinal Encoder
ordinal_encoder = OrdinalEncoder()

# Fit to features
df_encoded = df_nyc_renamed.copy()
ordinal_encoder.fit(df_encoded[nan_features])

# Transforming categories into ordered numbers
df_encoded[nan_features] = ordinal_encoder.transform(df_encoded[nan_features])
df_encoded.head(3)



Unnamed: 0,PropertyGFABuilding(s),YearBuilt,Electricity(kWh),Occupancy,PrimaryPropertyType_Adult Education,PrimaryPropertyType_Ambulatory Surgical Center,PrimaryPropertyType_Automobile Dealership,PrimaryPropertyType_Bank Branch,PrimaryPropertyType_College/University,PrimaryPropertyType_Convenience Store without Gas Station,...,PrimaryPropertyType_Stadium (Open),PrimaryPropertyType_Strip Mall,PrimaryPropertyType_Supermarket/Grocery Store,PrimaryPropertyType_Transportation Terminal/Station,PrimaryPropertyType_Urgent Care/Clinic/Other Outpatient,PrimaryPropertyType_Veterinary Office,PrimaryPropertyType_Wastewater Treatment Plant,PrimaryPropertyType_Wholesale Club/Supercenter,PrimaryPropertyType_Worship Facility,PrimaryPropertyType_Zoo
0,169416,1909,1920103.6,95,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,94380,1963,180640.0,100,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,125000,1999,2354605.3,85,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
from sklearn.preprocessing import StandardScaler

# drop property type columns
# df_seattle.drop(columns='PrimaryPropertyType', inplace=True)

# instantiate scaler
s_scaler = StandardScaler()

# instantiate scaled df before transform
df_scaled = df_encoded.copy().dropna()

# define numerical features 
numerical_features = list(df_scaled.columns)
numerical_features.remove('Electricity(kWh)') # remove target before scaling

# fit scaler
s_scaler.fit(df_scaled[numerical_features])

# apply tranform to features
df_scaled[numerical_features] = s_scaler.transform(df_scaled[numerical_features])
df_scaled.head(3)

Unnamed: 0,PropertyGFABuilding(s),YearBuilt,Electricity(kWh),Occupancy,PrimaryPropertyType_Adult Education,PrimaryPropertyType_Ambulatory Surgical Center,PrimaryPropertyType_Automobile Dealership,PrimaryPropertyType_Bank Branch,PrimaryPropertyType_College/University,PrimaryPropertyType_Convenience Store without Gas Station,...,PrimaryPropertyType_Stadium (Open),PrimaryPropertyType_Strip Mall,PrimaryPropertyType_Supermarket/Grocery Store,PrimaryPropertyType_Transportation Terminal/Station,PrimaryPropertyType_Urgent Care/Clinic/Other Outpatient,PrimaryPropertyType_Veterinary Office,PrimaryPropertyType_Wastewater Treatment Plant,PrimaryPropertyType_Wholesale Club/Supercenter,PrimaryPropertyType_Worship Facility,PrimaryPropertyType_Zoo
0,0.276242,-1.128175,1920103.6,-0.403578,-0.020295,-0.009748,-0.023209,-0.021061,-0.155768,-0.009748,...,-0.007959,-0.031348,-0.059126,-0.013786,-0.040221,-0.007959,-0.021801,-0.018668,-0.069319,-0.009748
1,-0.106512,0.518612,180640.0,0.177171,-0.020295,-0.009748,-0.023209,-0.021061,-0.155768,-0.009748,...,-0.007959,-0.031348,-0.059126,-0.013786,-0.040221,-0.007959,-0.021801,-0.018668,-0.069319,-0.009748
2,0.049679,1.616469,2354605.3,-1.565078,-0.020295,-0.009748,-0.023209,-0.021061,-0.155768,-0.009748,...,-0.007959,-0.031348,-0.059126,-0.013786,-0.040221,-0.007959,-0.021801,-0.018668,-0.069319,-0.009748


In [33]:
# import model type
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

# Instantiate model 
sgd_regressor = SGDRegressor(random_state=3, alpha=0.1)

# define x and y 
new_features = list(df_scaled.columns)
new_features.remove('Electricity(kWh)')

X = df_scaled[new_features]
y = df_scaled['Electricity(kWh)']

# train model 

scores = cross_val_score(sgd_regressor, X, y, cv=3)
r2_score = pd.DataFrame({'score': scores.mean()}, index=[0])
r2_score

Unnamed: 0,score
0,-95207790.0
