In [1]:
import pandas as pd
import numpy as np
import os, warnings

warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

In [2]:
# preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, FunctionTransformer

# pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# model
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.ensemble import RandomForestRegressor


# evaluation
from sklearn.model_selection import train_test_split, KFold, GridSearchCV

# metric
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
df = pd.read_csv('../data/clean_data.csv')

In [4]:
df.isna().sum()[df.isna().sum() > 0]

numberof_floors       6
energystar_score    799
dtype: int64

In [5]:
def data_selection(data:pd.DataFrame, target:str, skip_features:list, stratify:str=None, test_size:float=.2, random_state:int=0):
    
    X = data.drop(columns = [target] + skip_features)
    y = data[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    
    return X_train, X_test, y_train, y_test

In [6]:
def model_selection(data:pd.DataFrame, target:str, skip_features:list, preprocessing_pipeline, models_dict:dict, preprocessing_param_grid:dict, metric:str, cv_n_splits:int=5, stratify:str=None, test_size:float=.2, random_state:int=0):
    assert metric in ['rmse', 'r2']
    
    X_train, X_test, y_train, y_test = data_selection(df, target, skip_features, stratify, test_size, random_state)
    
    cv = KFold(n_splits=cv_n_splits, shuffle = True, random_state=0)
    
    for key, values in models_dict.items():
        
        model = values.get('model')
        model_param_grid=values.get('param_grid')
        param_grid = dict(list(preprocessing_param_grid.items()) + list(model_param_grid.items()))
        pipe=Pipeline((('preprocessing', preprocessing), ('model', model)))
        
        if metric == 'rmse':
            search = GridSearchCV(pipe, param_grid, cv=cv, scoring='neg_root_mean_squared_error')
            search.fit(X_train, y_train)
            y_pred=search.best_estimator_.predict(X_test)
            score=mean_squared_error(y_test, y_pred, squared=False)
            
        elif metric == 'r2':
            search = GridSearchCV(pipe, param_grid, cv=cv, scoring='r2')
            search.fit(X_train, y_train)
            y_pred=search.best_estimator_.predict(X_test)
            score=r2_score(y_test, y_pred)
            
        print(f"""
{100*'_'}
Model: {model}
Score: {score}
Parameters: {search.best_params_}
""")
    

In [7]:
target='ghg_emissions', 
skip_features=['site_energy_use_wn', 'energystar_score']

### GHG emissions model

In [9]:
num_features = df.columns[~df.columns.str.contains('cat_')]
cat_features = df.columns[df.columns.str.contains('cat_')]
num_features = num_features[~np.isin(num_features, [target]+skip_features)].tolist()
cat_features = cat_features[~np.isin(cat_features, [target]+skip_features)].tolist()

num_preprocessing = Pipeline((
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)), 
    ('normalizer', 'passthrough'),
    ('scaler', 'passthrough')
), verbose = False)

preprocessing = ColumnTransformer([('num_preprocessing', num_preprocessing, num_features)])
pipe=Pipeline((('preprocessing', preprocessing), ('model', LinearRegression())))

In [10]:
preprocessing_param_grid = {
    'preprocessing__remainder': ['drop', 'passthrough'],
    'preprocessing__num_preprocessing__normalizer': ['passthrough', FunctionTransformer(lambda x: np.log(np.abs(x)+1))],
    'preprocessing__num_preprocessing__scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()]
}

In [11]:
models_dict = {
    'linear_regression': {
        'model': LinearRegression(),
        'param_grid': {},
    },
    'SGD': {
        'model': SGDRegressor(),
        'param_grid': {},
    }
}

In [12]:
model_selection(
    df, 
    target='ghg_emissions', 
    skip_features=['site_energy_use_wn', 'energystar_score'],
    preprocessing_pipeline=preprocessing,
    models_dict=models_dict,
    preprocessing_param_grid=preprocessing_param_grid,
    metric='rmse'
)


____________________________________________________________________________________________________
Model: LinearRegression()
Score: 405.05715836754786
Parameters: {'preprocessing__num_preprocessing__normalizer': FunctionTransformer(func=<function <lambda> at 0x11ca13c20>), 'preprocessing__num_preprocessing__scaler': RobustScaler(), 'preprocessing__remainder': 'drop'}


____________________________________________________________________________________________________
Model: SGDRegressor()
Score: 249.5548395529121
Parameters: {'preprocessing__num_preprocessing__normalizer': 'passthrough', 'preprocessing__num_preprocessing__scaler': MinMaxScaler(), 'preprocessing__remainder': 'passthrough'}



In [13]:
model_selection(
    df, 
    target='ghg_emissions', 
    skip_features=['site_energy_use_wn', 'energystar_score'],
    preprocessing_pipeline=preprocessing,
    models_dict=models_dict,
    preprocessing_param_grid=preprocessing_param_grid,
    metric='r2'
)


____________________________________________________________________________________________________
Model: LinearRegression()
Score: -1.2311532689011777
Parameters: {'preprocessing__num_preprocessing__normalizer': FunctionTransformer(func=<function <lambda> at 0x11ca13c20>), 'preprocessing__num_preprocessing__scaler': RobustScaler(), 'preprocessing__remainder': 'drop'}


____________________________________________________________________________________________________
Model: SGDRegressor()
Score: -0.17498788725299264
Parameters: {'preprocessing__num_preprocessing__normalizer': FunctionTransformer(func=<function <lambda> at 0x11ca13c20>), 'preprocessing__num_preprocessing__scaler': MinMaxScaler(), 'preprocessing__remainder': 'passthrough'}

