In [1]:
import csv
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model,ensemble, tree, model_selection

import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.externals.six import StringIO
from sklearn.ensemble import RandomForestRegressor

In [2]:
## Load in the merged SF EUI/SF Property Info Map Data
datafile = "merged_SF_EUI_SF_Info_Map_df.csv"
df=pd.read_csv(datafile)

# Delete year built >2017

df = df.loc[df['year_built_cl']<=2017,:].reset_index()
all_vals_y_data = df.dropna(subset = ['2015 Weather Normalized Site EUI (kBtu/ft2)']).copy().reset_index

In [3]:

all_vals_y_data = df.dropna(subset = ['2015 Weather Normalized Site EUI (kBtu/ft2)']).copy()
all_vals_y_data.drop(['Unnamed: 0', 'Unnamed: 0.1', 'Energy Audit Status_clean.1', 'index'], axis=1, inplace = True)
all_vals_y_data.dropna(thresh= (len(all_vals_y_data) - 200), axis=1, inplace = True)

In [4]:
def choose_building_type(n):
    building_type = 'Other'
    if n in ['Urgent Care/Clinic/Other Outpatient', 'Hospital (General Medical & Surgical)', 'Veterinary Office', 'Laboratory']:
        building_type = 'Healthcare or Laboratory'
    if n in ['Restaurant', 'Food Sales','Supermarket/Grocery Store', 'Food Service', 'Other - Restaurant/Bar']:
        building_type = 'Food Service'
    if n in ['Office', 'Financial Office', 'Medical Office', 'Bank Branch']:
        building_type = 'Office'
    if n in ['Worship Facility', 'Social/Meeting Hall', 'Other - Recreation', 'Senior Care Community', 'Other - Entertainment/Public Assembly', 'Museum']:   
        building_type = 'Community Facility'
    if n in ['Hotel']:
        building_type = 'Hospitality'
    if n in ['College/University', 'Other - Education','Adult Education', 'K-12 School', 'Pre-school/Daycare', 'Vocational School']:
        building_type = 'Education'
    if n in ['Mixed Use Property', 'Other - Mall','Enclosed Mall','Repair Services (Vehicle, Shoe, Locksmith, etc.)','Automobile Dealership','Other - Services', 'Retail Store', 'Personal Services (Health/Beauty, Dry Cleaning, etc.)', 'Strip Mall']:
        building_type = 'Mixed Use or Retail'
    if n in ['Parking']:
        building_type = 'Parking'
    if n in ['Manufacturing/Industrial Plant', '']:
        building_type = 'Manufacturing'
    if n in ['Fitness Center/Health Club/Gym']:
        building_type = 'Fitness'
    if n in ['Movie Theater', 'Performing Arts', 'Bar/Nightclub']:
        building_type = 'Entertainment Venue'
    if n in ['Self-Storage Facility', 'Non-Refrigerated Warehouse', 'Refrigerated Warehouse','Wholesale Club/Supercenter', 'Distribution Center']  :
        building_type = 'Warehouse Type'
    if n in ['Data Center']:
        building_type = 'Data Center'    
    return building_type    
    

In [5]:
all_vals_y_data['Grouped_Building_Type'] = all_vals_y_data['Property Type - Self Selected_clean'].apply(choose_building_type)

In [6]:

x_complete_all_vals_y_data = all_vals_y_data.dropna().copy()
#need to make new datetime column for energy audit due date
x_complete_all_vals_y_data['Datetime Energy Audit Due'] = pd.to_datetime(x_complete_all_vals_y_data['Datetime Energy Audit Due'], format = "%Y-%m-%d")
x_complete_all_vals_y_data['Datetime Energy Audit Due'] = pd.to_numeric(x_complete_all_vals_y_data['Datetime Energy Audit Due'])
x_complete_all_vals_y_data = pd.get_dummies(x_complete_all_vals_y_data, columns=['Property Type - Self Selected_clean'])
x_complete_all_vals_y_data = pd.get_dummies(x_complete_all_vals_y_data, columns=['Energy Audit Status_clean'])
x_complete_all_vals_y_data = pd.get_dummies(x_complete_all_vals_y_data, columns=['Benchmark 2015 Status_clean'])
x_complete_all_vals_y_data = pd.get_dummies(x_complete_all_vals_y_data, columns = ['Grouped_Building_Type'])

In [7]:
x_complete_all_vals_y_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 811 entries, 0 to 1439
Data columns (total 83 columns):
parcel_index                                                                                 811 non-null object
Postal Code                                                                                  811 non-null int64
Full_Address_clean                                                                           811 non-null object
Building Name_clean                                                                          811 non-null object
Datetime Energy Audit Due                                                                    811 non-null int64
2015 Weather Normalized Site EUI (kBtu/ft2)                                                  811 non-null float64
address_cl                                                                                   811 non-null object
land_value_cl                                                                                811 non-nul

In [8]:
#Separate our features from our target for model

X=x_complete_all_vals_y_data.drop(['2015 Weather Normalized Site EUI (kBtu/ft2)', 'parcel_index', 'Full_Address_clean', 'Building Name_clean', 'address_cl'], axis = 1)

y=x_complete_all_vals_y_data['2015 Weather Normalized Site EUI (kBtu/ft2)']

In [9]:
X,holdoutX,y,holdouty = model_selection.train_test_split(X,y,test_size = .1, random_state=42)


In [10]:

models = {}
parameters = {}

models['linear_model'] = linear_model.LinearRegression()
models['ridge_model'] = linear_model.Ridge()
models['lasso_model'] = linear_model.Lasso(alpha=.5)
models['robust_regression'] = linear_model.SGDRegressor(loss='huber',n_iter=20)
models['eps_insensitive'] = linear_model.SGDRegressor(loss='epsilon_insensitive',n_iter=20)


models['cart'] = tree.DecisionTreeRegressor(max_depth=7)
models['extratrees'] = tree.ExtraTreeRegressor(max_depth=7)
models['randomForest'] = ensemble.RandomForestRegressor()
models['adaboostedTrees'] = ensemble.AdaBoostRegressor()
models['gradboostedTrees'] = ensemble.GradientBoostingRegressor()


for name,model in models.items():
    scores = model_selection.cross_val_score(model, X, y, n_jobs=1)
    print('Model: '+name)
    print("Score: " + str(np.mean(scores)))
    print()

Model: linear_model
Score: -0.0226044163086

Model: ridge_model
Score: 0.311807434888

Model: lasso_model
Score: 0.256958683729

Model: robust_regression
Score: -2.23242995967e+60

Model: eps_insensitive
Score: -2.70337307506e+62

Model: cart
Score: -0.451933492529

Model: extratrees
Score: -0.217041621211

Model: randomForest
Score: 0.0756431243302

Model: adaboostedTrees
Score: -0.00646716583959

Model: gradboostedTrees
Score: -0.188728972461

