# Energy Consumption Prediction By Appliances

In [45]:
#Import Required Libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from time import time
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import RFE
from sklearn.neural_network import MLPRegressor
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.cross_validation import cross_val_score

In [46]:
df_trn = pd.read_csv('training.csv')

In [47]:
df_test = pd.read_csv('testing.csv')

In [48]:
def weekstatus(x):
    return 0 if x == 'Weekday' else 1

def featureengineering(df):
    # Converting datatype of Date column to date time
    df['date'] = pd.to_datetime(df['date'])
    # Converting values of Day of week as Monday:0 , Tuesday:1 ...
    df['Day_of_week'] = df['date'].dt.weekday
    # Converting values of weekstatus as Weekday:0 and Weekend:1
    df['WeekStatus'] = df['WeekStatus'].apply(lambda x:weekstatus(x))
    # Assigning Features and Target to X and Y
    X = df.drop(['date','Appliances','rv1','rv2','WeekStatus', 'Day_of_week','T6', 'T9'],axis =1)
    Y = df['Appliances']
    return X,Y

In [49]:
def datapreprocessing(df_trn,df_test):
    
    # Remove correlated features T6 and T9
    train = df_trn.drop(['date','rv1','rv2','WeekStatus', 'Day_of_week','T6', 'T9'], axis=1)
    test = df_test.drop(['date','rv1','rv2','WeekStatus', 'Day_of_week','T6', 'T9'], axis=1)
    
    # Scales the data to zero mean and unit variance
    standard_scaler = StandardScaler()
    
    # Create dummy dataframes to hold the scaled train and test data
    train_scaled = pd.DataFrame(columns=train.columns, index=train.index)
    test_scaled = pd.DataFrame(columns=test.columns, index=test.index)

    # Store the scaled data in new dataframes
    train_scaled[train_scaled.columns] = standard_scaler.fit_transform(train)
    test_scaled[test_scaled.columns] = standard_scaler.fit_transform(test)
    
    # Prepare training and testing data
    X_trn = train_scaled.drop("Appliances", axis=1)
    y_trn = train_scaled["Appliances"]

    X_test = test_scaled.drop("Appliances", axis=1)
    y_test = test_scaled["Appliances"]
    
    return X_trn, y_trn, X_test, y_test

In [50]:
X_scaled_trn, y_scaled_trn, X_scaled_test, y_scaled_test = datapreprocessing(df_trn,df_test)

In [51]:
def model_Implementation(X_trn,y_trn,X_test,y_test):
    
    models = [ExtraTreesRegressor(random_state=42)]

    TestModels = pd.DataFrame()
    tmp = {}

    for model in models:
        # get model name
        m = str(model)
        tmp['Model'] = m[:m.index('(')]
        # fit model on training dataset

        start = time()
        model.fit(X_trn, y_trn)
        end = time()

        #Predictions and Validation for Testing and Training Set
        predictions = model.predict(X_test)
        predictions_trn = model.predict(X_trn)
        #R2 score
        tmp['R2_Test'] = round(r2_score(y_test,predictions),3)
        tmp['R2_Train'] = round(r2_score(y_trn,predictions_trn),3)
        #Mean Absolute Error(MAE)
        tmp['MAE_Test']= round(mean_absolute_error(y_test,predictions),3)
        tmp['MAE_Train']= round(mean_absolute_error(y_trn,predictions_trn),3)
        #Mean Squared Error(MSE)
        tmp['MSE_Test']= round(mean_squared_error(y_test,predictions),3)
        tmp['MSE_Train']= round(mean_squared_error(y_trn,predictions_trn),3)
        #Root Mean Squared Error (RMSE)
        tmp['RMSE_Test'] = round(np.sqrt(mean_squared_error(y_test,predictions)),3)
        tmp['RMSE_Train'] = round(np.sqrt(mean_squared_error(y_trn,predictions_trn)),3)
        #Mean Absolute Percentage Error
        tmp['MAPE_Test'] =  round(np.mean(np.abs((y_test - predictions) / y_test)) * 100,3)
        tmp['MAPE_Train'] =  round(np.mean(np.abs((y_trn - predictions_trn) / y_trn)) * 100,3)
        #Training and Testing Scores
        tmp['Training Score(%)'] = round(model.score(X_trn, y_trn) * 100,3)
        tmp['Testing Score(%)'] = round(model.score(X_test, y_test) * 100,3)
        # Training Time
        tmp['Training Time'] = round(end-start,3)
        # write obtained data
        TestModels = TestModels.append([tmp])

    TestModels.set_index('Model', inplace=True)
    #print('Iteration'+str(i))
    TestModels
    return TestModels

In [52]:
results_scaled = model_Implementation(X_scaled_trn, y_scaled_trn, X_scaled_test, y_scaled_test)
results_scaled

Unnamed: 0_level_0,MAE_Test,MAE_Train,MAPE_Test,MAPE_Train,MSE_Test,MSE_Train,R2_Test,R2_Train,RMSE_Test,RMSE_Train,Testing Score(%),Training Score(%),Training Time
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
ExtraTreesRegressor,0.307,0.0,124.967,0.0,0.451,0.0,0.549,1.0,0.671,0.0,54.934,100.0,1.426


<h4>Hyperparameter Tuning</h4>

<h4>Hyper Parameters selection for Extra Trees Regressor</h4>

We will try adjusting the following set of hyperparameters:

- n_estimators = number of trees in the foreset
- max_features = max number of features considered for splitting a node
- max_depth = max number of levels in each decision tree

1 - Randomized Search

The most important arguments in RandomizedSearchCV are n_iter, which controls the number of different combinations to try, and cv which is the number of folds to use for cross validation (we use 20 and 5 respectively). More iterations will cover a wider search space and more cv folds reduces the chances of overfitting, but raising each will increase the run time. Machine learning is a field of trade-offs, and performance vs time is one of the most fundamental.

In [53]:
%%time
from sklearn.model_selection import RandomizedSearchCV

# Initialize the model based on best performance from above, We got ExtraTreesRegressor 
sel_model = ExtraTreesRegressor(random_state=42)

# Define the parameter subset

param_grid = {
    "n_estimators": [10, 50, 100, 200, 250, 300, 500, 800],
    "max_features": ["auto", "sqrt", "log2"],
    "max_depth": [None, 10, 50, 100, 200, 500]
}

# Use Randomized search to try 20 subsets from parameter space with 5-fold cross validation
random_search = RandomizedSearchCV(sel_model, param_grid, n_iter=20, scoring="r2", cv=5, n_jobs=-1, verbose=2, random_state=42)
random_search.fit(X_scaled_trn, y_scaled_trn)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  4.4min finished


Wall time: 4min 33s


In [54]:
# Best Parameters for the model from Randomized Search CV
print(random_search.best_params_)

{'n_estimators': 200, 'max_features': 'log2', 'max_depth': None}


In [55]:
# Creating a function to measure performance
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    rmse = np.sqrt(mean_squared_error(test_labels,predictions))
    r2 = model.score(test_features, test_labels)
    print('Model Performance')
    print('R2 Test: {:0.3f}'.format(r2))
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('RMSE Test: {:0.4f}'.format(rmse))    
    return r2

In [56]:
sel_model.get_params

<bound method BaseEstimator.get_params of ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_decrease=0.0, min_impurity_split=None,
          min_samples_leaf=1, min_samples_split=2,
          min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
          oob_score=False, random_state=42, verbose=0, warm_start=False)>

In [57]:
base_model = ExtraTreesRegressor(random_state = 42)
base_model.fit(X_scaled_trn, y_scaled_trn)
base_accuracy = evaluate(base_model, X_scaled_test, y_scaled_test)

Model Performance
R2 Test: 0.549
Average Error: 0.3067 degrees.
RMSE Test: 0.6713


In [58]:
%%time
best_model = random_search.best_estimator_
best_accuracy = evaluate(best_model, X_scaled_test, y_scaled_test)

Model Performance
R2 Test: 0.598
Average Error: 0.2847 degrees.
RMSE Test: 0.6341
Wall time: 798 ms


In [59]:
print('Improvement of {:0.2f}%.'.format( 100 * (best_accuracy - base_accuracy) / base_accuracy))

Improvement of 8.84%.


Random search allowed us to narrow down the range for each hyperparameter. Now that we know where to concentrate our search, we can explicitly specify every combination of settings to try. We do this with GridSearchCV, a method that, instead of sampling randomly from a distribution, evaluates all combinations we define. To use Grid Search, we make another grid based on the best values provided by random search:

In [65]:
from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search 
param_grid = {
    "n_estimators": [150,200,250,300,350],
    "max_features": ["log2"],
    "max_depth": [None,5,10,15]
}

# Create a based model
sel_model = ExtraTreesRegressor(random_state=42)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = sel_model, param_grid = param_grid, 
                          cv = 3, n_jobs = 2, verbose = 2)
# Fit the grid search to the data
grid_search.fit(X_scaled_trn, y_scaled_trn)
grid_search.best_params_

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:  1.5min
[Parallel(n_jobs=2)]: Done  60 out of  60 | elapsed:  2.2min finished


{'max_depth': 5, 'max_features': 'log2', 'n_estimators': 150}

In [61]:
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_scaled_test, y_scaled_test)

Model Performance
R2 Test: 0.107
Average Error: 0.5220 degrees.
RMSE Test: 0.9450


In [62]:
print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))

Improvement of -80.51%.


In [63]:
from sklearn.model_selection import learning_curve

In [64]:
learning_curve()

TypeError: learning_curve() missing 3 required positional arguments: 'estimator', 'X', and 'y'