<a href="https://colab.research.google.com/github/kartik-mohan/Time_Series_Analysis_of_Household_Power_Consumption/blob/main/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing libraries

In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels

In [2]:
from sklearn.model_selection import train_test_split                              # to split the data into two parts
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics                                                       # for the check the error and accuracy of the model
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error,mean_absolute_percentage_error

In [3]:
## for Deep-learing:
import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dropout

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Data Preprocessing

In [5]:
# importing dataset
df = pd.read_csv('/content/drive/MyDrive/SML_project/household_power_consumption.txt', sep=";", parse_dates={'dt' : ['Date', 'Time']}, infer_datetime_format=True, low_memory=False, na_values=['nan','?'], index_col='dt')
df.head()

Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006-12-16 17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
2006-12-16 17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2006-12-16 17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
2006-12-16 17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
2006-12-16 17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0


In [6]:
df = df.fillna(method='ffill')
df.isnull().sum()

Global_active_power      0
Global_reactive_power    0
Voltage                  0
Global_intensity         0
Sub_metering_1           0
Sub_metering_2           0
Sub_metering_3           0
dtype: int64

In [7]:
df['energy_consumed'] = (df['Global_active_power'] * 1000 / 60) - (df['Sub_metering_1'] + df['Sub_metering_2'] + df['Sub_metering_3'])

In [8]:
data_resampled = df.resample('SM').sum()
data_resampled.head()

Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,energy_consumed
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2006-12-15,38332.01,2739.412,4965281.53,161961.8,27536.0,48403.0,156485.0,406442.833333
2006-12-31,34634.82,3090.402,5195859.08,146674.4,19277.0,36935.0,144067.0,376968.0
2007-01-15,35512.854,2797.424,5558097.23,150307.0,37156.0,36736.0,176994.0,340994.9
2007-01-31,34341.684,2543.752,5194277.02,144951.6,29936.0,40371.0,172991.0,329063.4
2007-02-15,23996.272,2110.592,4500232.82,101306.6,17648.0,29843.0,110396.0,242050.866667


Converting time series into supervised machine learning problem

In [9]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
	n_vars = 1 if type(data) is list else data.shape[1]
	dff = pd.DataFrame(data)
	cols, names = list(), list()
	# input sequence (t-n, ... t-1)
	for i in range(n_in, 0, -1):
		cols.append(dff.shift(i))
		names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
	# forecast sequence (t, t+1, ... t+n)
	for i in range(0, n_out):
		cols.append(dff.shift(-i))
		if i == 0:
			names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
		else:
			names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
	# put it all together
	agg = pd.concat(cols, axis=1)
	agg.columns = names
	# drop rows with NaN values
	if dropnan:
		agg.dropna(inplace=True)
	return agg

#### Feature Scaling:

In [10]:
#Minmax standard scaling technique:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(data_resampled.values)

# frame as supervised learning
data_reframed = series_to_supervised(scaled, 1, 1)

In [11]:
data_reframed.head()

Unnamed: 0,var1(t-1),var2(t-1),var3(t-1),var4(t-1),var5(t-1),var6(t-1),var7(t-1),var8(t-1),var1(t),var2(t),var3(t),var4(t),var5(t),var6(t),var7(t),var8(t)
1,0.950534,0.352012,0.554706,0.963389,0.626331,0.939737,0.586722,1.0,0.84742,0.462026,0.705259,0.859667,0.438472,0.680222,0.531019,0.919942
2,0.84742,0.462026,0.705259,0.859667,0.438472,0.680222,0.531019,0.919942,0.871909,0.370195,0.941778,0.884313,0.845146,0.675718,0.67872,0.822234
3,0.871909,0.370195,0.941778,0.884313,0.845146,0.675718,0.67872,0.822234,0.839245,0.290684,0.704226,0.847978,0.680921,0.757977,0.660763,0.789826
4,0.839245,0.290684,0.704226,0.847978,0.680921,0.757977,0.660763,0.789826,0.550714,0.154914,0.251057,0.551854,0.401419,0.519733,0.37998,0.553488
5,0.550714,0.154914,0.251057,0.551854,0.401419,0.519733,0.37998,0.553488,0.567277,0.228134,0.707229,0.570262,0.607997,0.940032,0.388135,0.50032


In [12]:
# drop columns we don't want to predict
data_reframed.drop(data_reframed.columns[[9,10,11,12,13,14,15]], axis=1, inplace=True)
print(data_reframed.head())

   var1(t-1)  var2(t-1)  var3(t-1)  ...  var7(t-1)  var8(t-1)   var1(t)
1   0.950534   0.352012   0.554706  ...   0.586722   1.000000  0.847420
2   0.847420   0.462026   0.705259  ...   0.531019   0.919942  0.871909
3   0.871909   0.370195   0.941778  ...   0.678720   0.822234  0.839245
4   0.839245   0.290684   0.704226  ...   0.660763   0.789826  0.550714
5   0.550714   0.154914   0.251057  ...   0.379980   0.553488  0.567277

[5 rows x 9 columns]


### Splitting dataset to train and test:

In [13]:
data_reframed.shape

(94, 9)

In [14]:
def split():
  values = data_reframed.values

  num_trains = 94-18
  #train data
  train = values[:num_trains, :]
  #test data
  test = values[num_trains:, :]

  # split into train and test
  train_X, train_y = train[:, :-1], train[:, -1]
  test_X, test_y = test[:, :-1], test[:, -1]

  # reshape input to be 3D [samples, timesteps, features]
  train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
  test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))

  return train_X, train_y, test_X, test_y

  Modeling

In [15]:
models = {}
for unit in [30,50,100] :  
  train_X, train_y, test_X, test_y = split()
  model = Sequential()
  model.add(LSTM(unit, input_shape=(train_X.shape[1], train_X.shape[2])))
  model.add(Dropout(0.2))
  model.add(Dense(1))
  model.compile(loss='mean_squared_error', optimizer='adam')

  # fit network
  history = model.fit(train_X, train_y, epochs=100, batch_size=10, validation_data=(test_X, test_y), verbose=2, shuffle=False)

  # make a prediction
  yhat = model.predict(test_X)
  test_X = test_X.reshape((test_X.shape[0], 8))

  # invert scaling for forecast
  inv_yhat = np.concatenate((yhat, test_X[:, -7:]), axis=1)
  inv_yhat = scaler.inverse_transform(inv_yhat)
  inv_yhat = inv_yhat[:,0]

  # invert scaling for actual
  test_y = test_y.reshape((len(test_y), 1))
  inv_y = np.concatenate((test_y, test_X[:, -7:]), axis=1)
  inv_y = scaler.inverse_transform(inv_y)
  inv_y = inv_y[:,0]
  
  # calculate RMSE
  rmse = np.sqrt(mean_squared_error(inv_y, inv_yhat))
  mse = mean_squared_error(inv_y, inv_yhat)
  mae = mean_absolute_error(inv_y, inv_yhat)
  r2 = r2_score(inv_y, inv_yhat)
  mape = mean_absolute_percentage_error(inv_y, inv_yhat)

  models[unit] = [rmse, mse, mae, r2, mape]

Epoch 1/100
8/8 - 3s - loss: 0.3208 - val_loss: 0.1806 - 3s/epoch - 361ms/step
Epoch 2/100
8/8 - 0s - loss: 0.2406 - val_loss: 0.1260 - 39ms/epoch - 5ms/step
Epoch 3/100
8/8 - 0s - loss: 0.1736 - val_loss: 0.0838 - 39ms/epoch - 5ms/step
Epoch 4/100
8/8 - 0s - loss: 0.1211 - val_loss: 0.0526 - 35ms/epoch - 4ms/step
Epoch 5/100
8/8 - 0s - loss: 0.0733 - val_loss: 0.0319 - 39ms/epoch - 5ms/step
Epoch 6/100
8/8 - 0s - loss: 0.0459 - val_loss: 0.0208 - 49ms/epoch - 6ms/step
Epoch 7/100
8/8 - 0s - loss: 0.0346 - val_loss: 0.0169 - 40ms/epoch - 5ms/step
Epoch 8/100
8/8 - 0s - loss: 0.0296 - val_loss: 0.0171 - 40ms/epoch - 5ms/step
Epoch 9/100
8/8 - 0s - loss: 0.0278 - val_loss: 0.0184 - 47ms/epoch - 6ms/step
Epoch 10/100
8/8 - 0s - loss: 0.0268 - val_loss: 0.0192 - 51ms/epoch - 6ms/step
Epoch 11/100
8/8 - 0s - loss: 0.0298 - val_loss: 0.0189 - 52ms/epoch - 7ms/step
Epoch 12/100
8/8 - 0s - loss: 0.0284 - val_loss: 0.0188 - 47ms/epoch - 6ms/step
Epoch 13/100
8/8 - 0s - loss: 0.0281 - val_loss: 

In [16]:
models_df = pd.DataFrame.from_dict(models, orient='index', columns=['RMSE', 'MSE', 'MAE', 'R_sq', 'MAPE'])
models_df

Unnamed: 0,RMSE,MSE,MAE,R_sq,MAPE
30,4155.051531,17264450.0,3356.115623,0.173938,0.19474
50,4158.991754,17297210.0,3353.187823,0.172371,0.198293
100,4068.715886,16554450.0,3255.219831,0.20791,0.189771
