In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import glob
import re
from tqdm import tqdm

import plotly.graph_objects as go

# proprocessing
from sklearn import preprocessing
import keras
from keras.models import Sequential
from keras.layers.core import Dense, Dropout,Activation 
from keras.layers import LSTM
from keras.models import load_model
import matplotlib.pyplot as plt
import h5py
import datetime
import tensorflow as tf

In [None]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

In [None]:
def downsample_dataframe(df, downsampling_frequency, fill_method='mean'):
  """
  Downsamples a DataFrame and fills missing values.

  Args:
    df: The DataFrame to downsample.
    downsampling_frequency: The frequency to downsample the DataFrame to.
    fill_method: The method to use to fill missing values.

  Returns:
    The downsampled DataFrame.
  """

  # Convert the Timestamp column to a datetime object.
  df['Timestamp'] = pd.to_datetime(df['Timestamp'])

  # Set the index of the DataFrame to the Timestamp column.
  df = df.set_index('Timestamp')

  # Fill the missing values.
  if fill_method == 'mean':
    df = df.fillna(df.mean())
  elif fill_method == 'median':
    df = df.fillna(df.median())
  else:
    raise ValueError('Invalid fill_method: {}'.format(fill_method))

  # Downsample the DataFrame.
  df = df.resample(downsampling_frequency).mean()
  return df
  # Check if any missing values were created after downsampling.
  if df.isna().any():
    df = df.fillna(df.mean())

  return df





In [None]:
path_lists = glob.glob('GreenD_reduced_version_03/'+'*.csv')
sorted_file_paths = sorted(path_lists)
def extract_digits(string):
    # Extract digits from the string using regular expression
    digits = re.findall(r'\d+', string)
    return int(digits[0]) if digits else 0

sorted_file_paths = sorted(path_lists, key=extract_digits)
print(sorted_file_paths)

conct_list = []
for path in tqdm(sorted_file_paths ,desc='processing'):
    
    data = pd.read_csv(path)
    data['Timestamp'] = pd.to_datetime(data['Timestamp'],utc=True,unit='s')#format='%Y-%m-%d %H-%M-%S')
    data['Timestamp'] = data['Timestamp'].dt.strftime('%Y-%m-%d %H:%M:%S')
    conct_list.append(data)
!mkdir 'Combined-Dataset-version-03'
df = pd.concat(conct_list)


In [None]:


# Downsample the DataFrame to 1-hour frequency and fill missing values using mean imputation.
df_1_hour = downsample_dataframe(df, '1H', fill_method='mean')
df_1_hour.reset_index('Timestamp',inplace=True)
df_1_hour.fillna(method='ffill',inplace=True)



In [None]:
df_1_hour['smooth_Summe'] = df_1_hour['Summe'].diff(1)

In [None]:
df_1_hour.dropna(inplace=True)

In [None]:
# convert an array of values into a dataset matrix
def create_dataset(dataset, look_back=15):
    """_summary_

    Args:
        dataset (_type_): _description_
        look_back (int, optional): _description_. Defaults to 15.

    Returns:
        _type_: _description_
    """
    dataX, dataY = [], []
    for i in range(len(dataset)-look_back-1):
        a = dataset[i:(i+look_back), 0]
        dataX.append(a)
        dataY.append(dataset[i + look_back, 0])
        # print('Value of a : {}'.format(a))
        # print('Value of y : {}'.format(dataset[i + look_back, 0]))
    return np.array(dataX), np.array(dataY)

In [None]:
data_df = df_1_hour

In [None]:
plt.plot(data_df['smooth_Summe'])
plt.show()

In [None]:
data_df['date'] = data_df.index
data_df.head()

In [None]:
data_df['date'] = pd.to_datetime(data_df['date'])
data_df

In [None]:
import numpy as np
import math
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing

def create_dataset(data, look_back=20):
    X, Y = [], []
    for i in range(len(data) - look_back):
        X.append(data[i:(i + look_back)])
        Y.append(data[i + look_back])
    return np.array(X), np.array(Y)

data_df = df_1_hour # Your data dataframe

min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
dataset = min_max_scaler.fit_transform(data_df['smooth_Summe'].values.reshape(-1, 1))

# split into train, validation, and test sets
train_size = int(len(dataset) * 0.7)
val_size = int(len(dataset) * 0.2)
test_size = len(dataset) - train_size - val_size

train_data = dataset[:train_size]
val_data = dataset[train_size:train_size+val_size]
test_data = dataset[train_size+val_size:]

# create train, validation, and test datasets
look_back = 20
x_train, y_train = create_dataset(train_data, look_back)
x_val, y_val = create_dataset(val_data, look_back)
x_test, y_test = create_dataset(test_data, look_back)

# reshape the input data
x_train = np.reshape(x_train, (x_train.shape[0], 1, x_train.shape[1]))
x_val = np.reshape(x_val, (x_val.shape[0], 1, x_val.shape[1]))
x_test = np.reshape(x_test, (x_test.shape[0], 1, x_test.shape[1]))

# print the sizes of the datasets
print('Training Data Size:', train_data.shape)
print('Validation Data Size:', val_data.shape)
print('Testing Data Size:', test_data.shape)


In [None]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping
look_back = 20

# Check if GPU is available
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
    print("GPU is available.")
    print("List of physical GPUs:")
    for device in physical_devices:
        print(device)
else:
    print("GPU is not available. Using CPU instead.")
# Create the LSTM model
model = Sequential()

model.add(LSTM(20, input_shape=(1, look_back), return_sequences=True))
model.add(LSTM(5 ,return_sequences=True))
# model.add(LSTM(30,return_sequences=True))
# model.add(LSTM(20,return_sequences=True))

model.add(Dense(1))

# Compile the model
model.compile(loss='mean_absolute_error', optimizer='adam')

# Define early stopping callback
early_stopping = EarlyStopping(patience=5, restore_best_weights=True)

# Train the model with early stopping
history = model.fit(x_train, y_train,validation_data=(x_val, y_val), epochs=100, batch_size=32, verbose=2, callbacks=[early_stopping])

# Print model summary
print(model.summary())


In [None]:
import matplotlib.pyplot as plt

# Get the training and validation loss values from the history object
train_loss = history.history['loss']
val_loss = history.history['val_loss']
 
# Plot the training loss and validation loss
plt.plot(train_loss, label='Training Loss')
plt.plot(val_loss, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.show()


In [None]:
from sklearn.metrics import mean_absolute_error
 
Error_dataframe = {
    'Actual': [],
    'Predicted':[],
    'Pro_ratio' : []
}

prop_ratio_list = []
actual = []
predicted = []
for i in tqdm(range(x_test.shape[0])):
    test_x_i = np.reshape(x_test[i], (x_test[i].shape[0], 1, x_test[i].shape[1]))
    actual = min_max_scaler.inverse_transform(y_test[i].reshape(-1,1))
    actual =  actual.flatten().tolist()[0]
    predicted = model.predict(test_x_i)
    predicted =  min_max_scaler.inverse_transform(predicted.reshape(-1,1))
    predicted =  predicted.flatten().tolist()[0]
    prop_ratio  = predicted/actual
    
    Error_dataframe['Actual'].append(actual)
    Error_dataframe['Predicted'].append(predicted)
    Error_dataframe['Pro_ratio'].append(prop_ratio)
    
    
    

In [56]:
from sklearn.metrics import mean_absolute_percentage_error , mean_absolute_error
Error_df = pd.DataFrame.from_dict(Error_dataframe)
print('Mean Absolute Percentage Error : ',mean_absolute_percentage_error(Error_df['Actual'] , Error_df['Predicted'] ))
print('Mean Absolute Error : ',mean_absolute_error(Error_df['Actual'] , Error_df['Predicted'] ))

Mean Absolute Percentage Error :  3.5361172431687056
Mean Absolute Error :  42.617810194557904


In [57]:
Error_df

Unnamed: 0,Actual,Predicted,Pro_ratio
0,-3.498898,-12.164487,3.476663
1,6.711766,-10.897676,-1.623667
2,107.423122,-0.819710,-0.007631
3,151.600472,-5.461486,-0.036026
4,-44.143485,-6.405290,0.145102
...,...,...,...
1115,56.156192,0.590933,0.010523
1116,-28.170767,-3.394106,0.120483
1117,-6.231361,0.991581,-0.159128
1118,12.508439,0.742243,0.059339


In [58]:
Error_df.to_csv('Error.csv',index=False)

In [59]:
model_name = 'RNN'
# Create a subplot with two rows and one column
fig = go.Figure()

fig.add_trace(
go.Scatter(
    x=Error_df.index,
    y=Error_df['Predicted'],
    name='Predicted',
    mode='lines+markers'
))

# Add a trace for actual values
fig.add_trace(
    go.Scatter(
        x=Error_df.index,
        y=Error_df['Actual'],
        name='Actual Values',
        mode='lines+markers'
    )
)


# Update xaxis properties
fig.update_xaxes(title_text='Time')

# Update yaxis properties
fig.update_yaxes(title_text='Summe')

# Update title and height
fig.update_layout(
    title=f'Forecasting using ',
    height=500,
    width=1000
)

# Save the plot as an HTML file
fig.show()
!mkdir 'Visualization_models_results'
fig.write_html(f'Visualization_models_results/forecasting_using_{model_name}'+'.html')


mkdir: cannot create directory ‘Visualization_models_results’: File exists
