In [29]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeCV
import copy

In [6]:
# Build the train and test feature tables (Pandas DataFrames) and the training target (NumPy array).
# test.csv has no 'site_eui' column because that is the value we are trying to predict.
def get_data(train_df, test_df):
    """Turn the raw train/test frames into numeric model inputs.

    Object-dtype columns of ``train_df`` are ordinal-encoded (the encoder is
    fitted on the training frame only and then applied to the test frame),
    remaining missing values are replaced with 0, and the ``site_eui`` target
    and ``id`` columns are split off.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training rows; must contain ``site_eui`` and ``id``.
    test_df : pandas.DataFrame
        Test rows; must contain ``id`` and every object-dtype column that
        ``train_df`` has.

    Returns
    -------
    X_train : pandas.DataFrame
        Training features (``train_df`` without ``site_eui`` and ``id``).
    X_test : pandas.DataFrame
        Test features without ``id``. When the test frame contains every
        training feature it is restricted to exactly those columns, in the
        same order as ``X_train``.
    y_train : numpy.ndarray
        ``site_eui`` as a column vector of shape ``(n_rows, 1)``.
    """
    is_object = train_df.dtypes == 'object'
    object_cols = list(is_object[is_object].index)

    # Work on copies so the caller's frames are never modified.
    label_train_df = train_df.copy()
    label_test_df = test_df.copy()

    # The encoder rejects an empty column selection, so skip it entirely when
    # the training frame has no object columns.
    if object_cols:
        # Categories that occur only in the test frame would make a default
        # OrdinalEncoder raise in transform(); map them to the sentinel -1.
        ordinal_encoder = OrdinalEncoder(
            handle_unknown='use_encoded_value', unknown_value=-1
        )
        label_train_df[object_cols] = ordinal_encoder.fit_transform(train_df[object_cols])
        label_test_df[object_cols] = ordinal_encoder.transform(test_df[object_cols])

    # Missing values are imputed with a constant 0 in both frames.
    label_train_df = label_train_df.fillna(0)
    label_test_df = label_test_df.fillna(0)

    y_train = np.array(label_train_df['site_eui']).reshape(-1, 1)
    X_train = label_train_df.drop(columns=['site_eui', 'id'])

    X_test = label_test_df.drop(columns=['id'])
    # When train_df is a column subset, keep X_test aligned with X_train so a
    # model fitted on X_train can be applied to X_test directly. If the test
    # frame lacks some training feature, X_test is returned unrestricted.
    if X_train.columns.difference(X_test.columns).empty:
        X_test = X_test[X_train.columns]

    return X_train, X_test, y_train

In [7]:
# Get feature names
# Only the header row is needed, so parse zero data rows instead of the whole
# file, and keep the result out of `train_df` so that name is not left bound
# to a frame without 'site_eui' and 'id'.
header_df = pd.read_csv('train.csv', nrows=0)
feature_names = header_df.columns.drop(['site_eui', 'id'])

print(feature_names)
print()
print("Number of features =", len(feature_names))

Index(['Year_Factor', 'State_Factor', 'building_class', 'facility_type',
       'floor_area', 'year_built', 'energy_star_rating', 'ELEVATION',
       'january_min_temp', 'january_avg_temp', 'january_max_temp',
       'february_min_temp', 'february_avg_temp', 'february_max_temp',
       'march_min_temp', 'march_avg_temp', 'march_max_temp', 'april_min_temp',
       'april_avg_temp', 'april_max_temp', 'may_min_temp', 'may_avg_temp',
       'may_max_temp', 'june_min_temp', 'june_avg_temp', 'june_max_temp',
       'july_min_temp', 'july_avg_temp', 'july_max_temp', 'august_min_temp',
       'august_avg_temp', 'august_max_temp', 'september_min_temp',
       'september_avg_temp', 'september_max_temp', 'october_min_temp',
       'october_avg_temp', 'october_max_temp', 'november_min_temp',
       'november_avg_temp', 'november_max_temp', 'december_min_temp',
       'december_avg_temp', 'december_max_temp', 'cooling_degree_days',
       'heating_degree_days', 'precipitation_inches', 'snowfall_inc

In [8]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

X_train, X_test, y_train = get_data(train_df, test_df)

# Perform forward and backward selection
# SequentialFeatureSelector clones and refits the estimator for every
# candidate feature set, so the estimator is passed unfitted; fitting it on
# all features first would be wasted work.
ridge = RidgeCV(alphas=np.logspace(-6, 6, num=5))

# The search is expensive (many cross-validated fits per step); n_jobs=-1
# spreads the candidate evaluations over all available cores.
sfs_forward = SequentialFeatureSelector(
    ridge, n_features_to_select=30, direction="forward", n_jobs=-1
).fit(X_train, y_train)

sfs_backward = SequentialFeatureSelector(
    ridge, n_features_to_select=30, direction="backward", n_jobs=-1
).fit(X_train, y_train)

# Read the names from the matrix the selectors were fitted on, so the boolean
# support mask is guaranteed to line up with the columns.
print(
    "Features selected by forward sequential selection: "
    f"{X_train.columns[sfs_forward.get_support()]}"
)

print(
    "Features selected by backward sequential selection: "
    f"{X_train.columns[sfs_backward.get_support()]}"
)

KeyboardInterrupt: 

In [30]:
# Hard-coded 30-feature subsets for the forward and backward searches.
# NOTE(review): presumably copied from an earlier completed run of the
# sequential selection — confirm before relying on them.
fore_features = (
    # building / site descriptors
    ['State_Factor', 'building_class', 'facility_type', 'energy_star_rating', 'ELEVATION']
    # monthly temperature statistics
    + [
        'january_min_temp', 'january_avg_temp',
        'february_min_temp', 'february_max_temp',
        'march_avg_temp', 'march_max_temp',
        'april_min_temp',
        'may_avg_temp',
        'june_avg_temp',
        'july_max_temp',
        'august_avg_temp',
        'september_min_temp',
        'october_avg_temp',
        'november_min_temp',
        'december_min_temp',
    ]
    # yearly climate aggregates
    + [
        'heating_degree_days', 'snowfall_inches', 'avg_temp',
        'days_below_0F', 'days_above_100F', 'days_above_110F',
        'direction_max_wind_speed', 'direction_peak_wind_speed',
        'max_wind_speed', 'days_with_fog',
    ]
)

back_features = (
    # building descriptors
    ['facility_type', 'floor_area', 'energy_star_rating']
    # monthly temperature statistics
    + [
        'january_min_temp',
        'february_min_temp', 'february_avg_temp', 'february_max_temp',
        'march_max_temp',
        'april_max_temp',
        'may_avg_temp', 'may_max_temp',
        'june_max_temp',
        'july_min_temp', 'july_avg_temp',
        'august_max_temp',
        'september_avg_temp',
        'october_min_temp', 'october_max_temp',
        'november_avg_temp', 'november_max_temp',
        'december_avg_temp',
    ]
    # yearly climate aggregates
    + [
        'cooling_degree_days', 'precipitation_inches', 'snowfall_inches',
        'snowdepth_inches',
        'days_below_30F', 'days_below_20F', 'days_below_10F',
        'days_above_80F', 'days_above_90F',
    ]
)

# Show both subsets, separated by one blank line.
print("Forward selection:", fore_features, end="\n\n")
print("Backward selection:", back_features)

Forward selection: ['State_Factor', 'building_class', 'facility_type', 'energy_star_rating', 'ELEVATION', 'january_min_temp', 'january_avg_temp', 'february_min_temp', 'february_max_temp', 'march_avg_temp', 'march_max_temp', 'april_min_temp', 'may_avg_temp', 'june_avg_temp', 'july_max_temp', 'august_avg_temp', 'september_min_temp', 'october_avg_temp', 'november_min_temp', 'december_min_temp', 'heating_degree_days', 'snowfall_inches', 'avg_temp', 'days_below_0F', 'days_above_100F', 'days_above_110F', 'direction_max_wind_speed', 'direction_peak_wind_speed', 'max_wind_speed', 'days_with_fog']

Backward selection: ['facility_type', 'floor_area', 'energy_star_rating', 'january_min_temp', 'february_min_temp', 'february_avg_temp', 'february_max_temp', 'march_max_temp', 'april_max_temp', 'may_avg_temp', 'may_max_temp', 'june_max_temp', 'july_min_temp', 'july_avg_temp', 'august_max_temp', 'september_avg_temp', 'october_min_temp', 'october_max_temp', 'november_avg_temp', 'november_max_temp', 'dec

In [35]:
### Linear regression on full dataset
# Re-read both CSVs so this cell starts from the raw data.
train_df, test_df = pd.read_csv('train.csv'), pd.read_csv('test.csv')
X_train, X_test, y_train = get_data(train_df, test_df)

# Ridge regression whose penalty is chosen by RidgeCV from five
# log-spaced candidates between 1e-6 and 1e6.
ridge_full = RidgeCV(alphas=np.logspace(-6, 6, num=5))
ridge_full.fit(X_train, y_train)

# The score is computed on the same rows the model was fitted on
# (in-sample), not on held-out data.
full_score = ridge_full.score(X_train, y_train)
print("Score for regression on full dataset =", full_score)

Score for regression on full dataset = 0.14589887209177332


In [36]:
### Linear regression on forward features
# Forward-selected predictors plus the two bookkeeping columns get_data needs.
# List concatenation already builds a new list, so fore_features is untouched.
cols = fore_features + ['site_eui', 'id']

# train_df still holds the raw 'site_eui' column, so selecting it is enough;
# nothing is written back from a y_train left over from an earlier cell.
train_df_fore = train_df[cols]
# Give the test frame the same feature subset so X_test matches X_train.
test_df_fore = test_df[fore_features + ['id']]
X_train, X_test, y_train = get_data(train_df_fore, test_df_fore)

ridge_fore = RidgeCV(alphas=np.logspace(-6, 6, num=5)).fit(X_train, y_train)
# In-sample score: evaluated on the rows the model was fitted on.
fore_score = ridge_fore.score(X_train, y_train)
print("Score for regression on forward selection dataset =", fore_score)

Score for regression on forward selection dataset = 0.1387670274270152


In [37]:
### Linear regression on backward features
# Backward-selected predictors plus the two bookkeeping columns get_data needs.
# List concatenation already builds a new list, so back_features is untouched.
cols = back_features + ['site_eui', 'id']

# train_df still holds the raw 'site_eui' column, so selecting it is enough;
# nothing is written back from a y_train left over from an earlier cell.
train_df_back = train_df[cols]
# Give the test frame the same feature subset so X_test matches X_train.
test_df_back = test_df[back_features + ['id']]
X_train, X_test, y_train = get_data(train_df_back, test_df_back)

ridge_back = RidgeCV(alphas=np.logspace(-6, 6, num=5)).fit(X_train, y_train)
# In-sample score: evaluated on the rows the model was fitted on.
back_score = ridge_back.score(X_train, y_train)
print("Score for regression on backward selection dataset =", back_score)

Score for regression on backward selection dataset = 0.12796388810042425
