## Importing the libraries

In [760]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer

from sklearn.pipeline import Pipeline, make_pipeline

# Mount Google Drive at /content/drive; the dataset is later read from
# /content/drive/MyDrive. The google.colab package is only importable inside a
# Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [761]:
import tensorflow
from tensorflow import keras
from tensorflow.keras import Sequential, Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.layers import Input, Embedding, Concatenate, Flatten
from tensorflow.keras import regularizers
from tensorflow.keras.preprocessing.text  import one_hot

In [762]:
# Pin the NumPy and TensorFlow global seeds so repeated runs give the same results
from numpy.random import seed
RANDOM_SEED = 0
seed(RANDOM_SEED)
tensorflow.random.set_seed(RANDOM_SEED)

## Importing the dataset

In [763]:
# Read the Paper 13 dataset from the mounted Drive and preview the first rows.
dataset_csv = '/content/drive/MyDrive/Biogas RMS project/Datasets/Paper 13/paper13.csv'
df = pd.read_csv(dataset_csv)
df.head()

Unnamed: 0,Biomass type,Reactor/feeding,VS (%),pH,OLR (g VS/l.d),HRT (d),T (°C),Reactor Volume (m³),Cumulated biogas volume (L/(g VS))
0,0,0,10.0,7.62,0.627,19.2,55,0.05,0.0668
1,0,2,15.3,8.0,3.1702,47.0,37,0.0473,0.6765
2,0,0,4.78,7.25,1.24,15.0,37,0.045,0.8227
3,0,0,4.78,7.25,1.76,15.0,37,0.045,0.6219
4,0,2,6.36,7.3,3.2,25.0,35,0.04,0.5755


## Splitting the data into X and y sets

In [764]:
from sklearn.model_selection import train_test_split
# Target: cumulated biogas volume. Every other column of df becomes a predictor.
# NOTE(review): the preview above lists an 'Unnamed: 0' column; if that is a real
# column (a saved row index) it stays in X as a feature — confirm this is intended.
y = df['Cumulated biogas volume (L/(g VS))']
X = df.drop(columns=[y.name])

## Function to pre-process the inputs


### Variation 1: For the first version of the ANN model, gives sequences for cat values

In [765]:
# scaler = StandardScaler()
# scaler.fit(X.drop(columns=['Biomass type', 'Reactor/feeding']))
# def pre_process1(x):
#   # Splitting into categorical and numerical data
#   x_cat = x[['Biomass type', 'Reactor/feeding']]
#   x_num = x.drop(columns=['Biomass type', 'Reactor/feeding'])

#   ### Treating Categorical values ###
#   # 1. Replace the numbers in the columns with unique numbers
#   x_cat['Biomass type'].replace(-1, 2, inplace= True) # -1,0,1 -> 2,0,1
#   x_cat['Reactor/feeding'].replace([0,2,-1,1,-3], [3,4,5,6,7], inplace=True)

#   # 2. Convert the value list into sequences of categorical values
#   x_cat_encoded = x_cat.to_numpy()

#   ## Scaling the numerical values
#   scaler = StandardScaler()
#   scaler.fit(x_num)
#   x_num = scaler.transform(x_num)
  
#   return x_cat_encoded, x_num 

### Variation 2: For the second version of the ANN model, gives 3 arrays as output

In [766]:
# Learn the standardisation statistics once, from the numerical columns of X
# (everything except the two categorical columns).
# NOTE(review): X is the full dataset, so rows that later serve as CV test folds
# also contribute to these statistics — consider fitting per training fold.
scaler = StandardScaler().fit(X.drop(columns=['Biomass type', 'Reactor/feeding']))

def pre_process2(x, fitted_scaler=None):
  """Split a feature frame into the three inputs fed to the ANN.

  The input frame is left untouched: the categorical columns are re-coded on
  new Series instead of being replaced in place.

  Args:
    x: DataFrame containing the 'Biomass type' and 'Reactor/feeding' columns
      plus the numerical feature columns.
    fitted_scaler: optional object exposing ``transform``; it is applied to the
      numerical columns. When omitted, the module-level ``scaler`` is used.

  Returns:
    Tuple ``(x_biomassType, x_reactorFeed, x_num)``: the two re-coded
    categorical Series and the output of ``transform`` on the remaining columns.
  """
  active_scaler = scaler if fitted_scaler is None else fitted_scaler

  ### Treating Categorical values ###
  # Re-code the negative labels so every category id is non-negative
  # (0..2 and 0..4), matching the Embedding input_dim values of the model.
  x_biomassType = x['Biomass type'].replace(-1, 2) # -1,0,1 -> 2,0,1
  x_reactorFeed = x['Reactor/feeding'].replace([-3, -1], [3, 4]) # -3,-1,0,1,2 -> 3,4,0,1,2

  ## Scaling the numerical values
  x_num = active_scaler.transform(x.drop(columns=['Biomass type', 'Reactor/feeding']))

  return x_biomassType, x_reactorFeed, x_num

## Building the ANN


### Version 1: Treating the categorical variables as sequences of size 2

In [767]:
# def build_model1(sequence_size, num_numerical):
#   # Define the embedding size
#   embedding_size = 8

#   #### Building the model ####
#   # 1. Define the input layers
#   input_cat = Input(shape = (sequence_size,))
#   input_num = Input(shape = (num_numerical,))

#   # 2. Define the embedding layer
#   embedding_layer = Embedding(input_dim = 8, 
#                               output_dim = embedding_size,
#                               input_length = sequence_size)(input_cat)
#   # 3. Define the flatten layer
#   flatten_layer = Flatten()(embedding_layer)

#   # 4. Concatenate the layers 
#   concatenate = Concatenate()([flatten_layer, input_num])
#   hidden_layer1 = Dense(20, activation='relu', )(concatenate)
#   hidden_layer2 = Dense(20, activation='relu',)(hidden_layer1)

#   # 5. Final output layer
#   output_layer = Dense(1, activation='linear')(hidden_layer2)

#   model = Model(inputs=[input_cat, input_num], outputs = [output_layer])
#   # model.summary()

#   return model

### Version 2: Having 2 embedding layers for the 2 categorical variables.

In [768]:
# def build_model2(num_numerical):
#   # Define the embedding size
#   embedding_sizeBT = 3 # input is of size 3
#   embedding_sizeRF = 5 # input is of size 5

#   #### Building the model ####
#   # 1. Define the input layers
#   input_catBT = Input(shape = (1,))
#   input_catRF = Input(shape = (1,))
#   input_num = Input(shape = (num_numerical,))

#   # 2. Define the embedding layers
#   embedding_layerBT = Embedding(input_dim = 3, 
#                               output_dim = embedding_sizeBT)(input_catBT)
#   embedding_layerRF = Embedding(input_dim = 5, 
#                               output_dim = embedding_sizeRF)(input_catRF)
#   # 3. Define the flatten layers
#   flatten_layerBT = Flatten()(embedding_layerBT)
#   flatten_layerRF = Flatten()(embedding_layerRF)

#   # 4. Concatenate the layer 
#   concatenate = Concatenate()([flatten_layerBT, flatten_layerRF, input_num])

#   hidden_layer1 = Dense(20, activation='relu',
#                         kernel_regularizer=regularizers.L1(1e-4),
#                         )(concatenate)
#   dropout1 = Dropout(0.2)(hidden_layer1)

#   hidden_layer2 = Dense(20, activation='relu',)(dropout1)

#   # 5. Final output layer
#   output_layer = Dense(1, activation='linear')(hidden_layer2)

#   model = Model(inputs=[input_catBT, input_catRF, input_num], outputs = [output_layer])
#   # model.summary()
#   return model


### Version 3: Having 2 embedding layers and separate dense layers for the categorical values

In [769]:
# def build_model3(num_numerical):
#   # Define the embedding size
#   embedding_sizeBT = 2 # input is of size 3
#   embedding_sizeRF = 2 # input is of size 5

#   #### Building the model ####
#   # 1. Define the input layers
#   input_catBT = Input(shape = (1,))
#   input_catRF = Input(shape = (1,))
#   input_num = Input(shape = (num_numerical,))

#   # 2. Define the embedding layers
#   embedding_layerBT = Embedding(input_dim = 3, 
#                               output_dim = embedding_sizeBT)(input_catBT)
#   embedding_layerRF = Embedding(input_dim = 5, 
#                               output_dim = embedding_sizeRF)(input_catRF)
#   # 3. Define the flatten layers
#   flatten_layerBT = Flatten()(embedding_layerBT)
#   flatten_layerRF = Flatten()(embedding_layerRF)

#   concatenate1 = Concatenate()([flatten_layerBT, flatten_layerRF,])
#   hidden_cat =  Dense(20, activation='relu',
#                       kernel_regularizer=regularizers.L1(1e-4), 
#                       bias_regularizer=regularizers.L1(1e-4)
#                       )(concatenate1)

#   # 4. Concatenate the layer 
#   concatenate2 = Concatenate()([hidden_cat, input_num])

#   hidden_layer2 = Dense(20, activation='relu',)(concatenate2)
#   dropout2 = Dropout(0)(hidden_layer2)

#   # 5. Final output layer
#   output_layer = Dense(1, activation='linear')(dropout2)

#   model = Model(inputs=[input_catBT, input_catRF, input_num], outputs = [output_layer])
#   # model.summary()

#   return model

### Version 4:   
* 2 separate embedding layers
* 1 dense layer for numerical values which is concatenated with the flattened embedding layer
* 1 dense layer for concatenated input from previous layer

In [770]:
# def build_model4(num_numerical):
#   # Define the embedding size
#   embedding_sizeBT = 1 # input is of size 3
#   embedding_sizeRF = 1 # input is of size 5

#   #### Building the model ####
#   # 1. Define the input layers
#   input_catBT = Input(shape = (1,))
#   input_catRF = Input(shape = (1,))
#   input_num = Input(shape = (num_numerical,))

#   # 2. Define the embedding layers
#   embedding_layerBT = Embedding(input_dim = 3, 
#                               output_dim = embedding_sizeBT)(input_catBT)
#   embedding_layerRF = Embedding(input_dim = 5, 
#                               output_dim = embedding_sizeRF)(input_catRF)
#   # 3. Define the flatten layers
#   flatten_layerBT = Flatten()(embedding_layerBT)
#   flatten_layerRF = Flatten()(embedding_layerRF)

#   dense_num =  Dense(20, activation='relu',
#                      kernel_regularizer=regularizers.L1(1e-4), 
#                     bias_regularizer=regularizers.L1(1e-4))(input_num)
#   dropout1 = Dropout(0)(dense_num)
#   # 4. Concatenate the layer 
#   concatenate1 = Concatenate()([flatten_layerBT, flatten_layerRF, dropout1])

#   hidden_layer1 = Dense(20, activation='relu',)(concatenate1)
#   dropout2 = Dropout(0)(hidden_layer1)

#   # 5. Final output layer
#   output_layer = Dense(1, activation='linear')(dropout2)

#   model = Model(inputs=[input_catBT, input_catRF, input_num], outputs = [output_layer])
#   # model.summary()

#   return model

### Version 5:   
* 2 separate embedding layers
* 1 dense layer for concatenated categorical values
* 1 dense layer for numerical values 
* concatenate the 2 dense layers
* 1 dense layer for concatenated input from previous layer

In [771]:
def build_model5(num_numerical, num_biomass_types=3, num_reactor_feeds=5,
                 embedding_dim=1, hidden_units=20, l1_penalty=1e-4,
                 dropout_rate=0):
  """Build the two-branch ANN (version 5); the model is returned uncompiled.

  One branch embeds the two categorical inputs and passes them through a dense
  layer; the other passes the numerical inputs through an L1-regularised dense
  layer. Both branches are concatenated, followed by one more dense layer and a
  single linear output unit.

  Args:
    num_numerical: number of numerical feature columns.
    num_biomass_types: input_dim of the 'Biomass type' embedding; ids must lie
      in [0, num_biomass_types).
    num_reactor_feeds: input_dim of the 'Reactor/feeding' embedding; ids must
      lie in [0, num_reactor_feeds).
    embedding_dim: output size of each embedding.
    hidden_units: width of each of the three hidden dense layers.
    l1_penalty: L1 factor on the kernel and bias of the numerical dense layer.
    dropout_rate: rate shared by the three Dropout layers (0 disables dropout).

  Returns:
    A Keras Model taking [biomass_type, reactor_feed, numerical] inputs.
  """
  # 1. Input layers: one id per categorical variable plus the numerical vector
  input_catBT = Input(shape=(1,))
  input_catRF = Input(shape=(1,))
  input_num = Input(shape=(num_numerical,))

  # 2. One embedding per categorical variable
  embedded_bt = Embedding(input_dim=num_biomass_types,
                          output_dim=embedding_dim)(input_catBT)
  embedded_rf = Embedding(input_dim=num_reactor_feeds,
                          output_dim=embedding_dim)(input_catRF)

  # 3. Flatten each embedding so it can be concatenated
  flat_bt = Flatten()(embedded_bt)
  flat_rf = Flatten()(embedded_rf)

  # Categorical branch
  cat_branch = Concatenate()([flat_bt, flat_rf])
  cat_branch = Dense(hidden_units, activation='relu')(cat_branch)
  cat_branch = Dropout(dropout_rate)(cat_branch)

  # Numerical branch (the only regularised layer)
  num_branch = Dense(hidden_units, activation='relu',
                     kernel_regularizer=regularizers.L1(l1_penalty),
                     bias_regularizer=regularizers.L1(l1_penalty)
                     )(input_num)
  num_branch = Dropout(dropout_rate)(num_branch)

  # 4. Merge both branches and add the shared hidden layer
  merged = Concatenate()([cat_branch, num_branch])
  hidden = Dense(hidden_units, activation='relu')(merged)
  hidden = Dropout(dropout_rate)(hidden)

  # 5. Final output layer
  output_layer = Dense(1, activation='linear')(hidden)

  return Model(inputs=[input_catBT, input_catRF, input_num], outputs=[output_layer])

### Early Stopping
This is used to detect overfitting and stop training before the maximum number of epochs is reached. It is passed as a callback to `model.fit`.

In [772]:
# Early stopping on the validation loss: training halts after 80 epochs without
# improvement, and the weights of the last epoch are kept (no roll-back to the
# best epoch).
callback = EarlyStopping(
    # what is watched, and how an improvement is judged
    monitor="val_loss",
    mode="auto",
    min_delta=0,
    baseline=None,
    # how long to wait, and what to do when stopping
    patience=80,
    restore_best_weights=False,
    # print a message when training is stopped early
    verbose=1,
)

## Using K-Fold Cross Validation



In [773]:
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold

# 10-fold cross validation: a fresh model is built, trained and scored per fold.
kf = KFold(n_splits=10, shuffle= True, random_state = 1)

test_result = []   # R^2 on the held-out fold, one entry per fold
train_result = []  # R^2 on the training fold, one entry per fold
history = []       # Keras History object of each fit
model = None

for i, (train_index, test_index) in enumerate(kf.split(X)):

  print("Fold:", i+1)
  # KFold.split yields positional row numbers, so rows are selected by position;
  # this stays correct even if X/y do not carry a default 0..n-1 index.
  X_train, X_test = X.iloc[train_index], X.iloc[test_index]
  y_train, y_test = y.iloc[train_index], y.iloc[test_index]

  # Split each fold into the two categorical inputs and the scaled numerical input
  X_trainBT, X_trainRF, X_train_num = pre_process2(X_train)
  X_testBT, X_testRF, X_test_num = pre_process2(X_test)

  ########## Build the model first #############
  model = build_model5(X_train_num.shape[1])

  ####### Compile and fit the model ########
  model.compile(loss='mse', optimizer='Adam')

  # NOTE(review): the held-out fold is also the early-stopping validation set,
  # so the stopping epoch is tuned on the data the test R^2 is computed from;
  # the reported test scores are therefore optimistic.
  history_curr = model.fit([X_trainBT, X_trainRF, X_train_num], y_train,
                           validation_data=([X_testBT, X_testRF, X_test_num], y_test),
                           epochs=1000, callbacks=[callback], shuffle=False, verbose=0)
  history.append(history_curr)

  ######## Predicting Results #########
  y_pred = model.predict([X_testBT, X_testRF, X_test_num])
  test_result.append(r2_score(y_test, y_pred))

  y_pred_train = model.predict([X_trainBT, X_trainRF, X_train_num])
  train_result.append(r2_score(y_train, y_pred_train))

Fold: 1
Epoch 259: early stopping
Fold: 2
Epoch 293: early stopping
Fold: 3
Epoch 266: early stopping
Fold: 4
Epoch 493: early stopping
Fold: 5
Epoch 425: early stopping
Fold: 6
Epoch 143: early stopping
Fold: 7
Epoch 192: early stopping
Fold: 8
Epoch 271: early stopping
Fold: 9
Epoch 261: early stopping
Fold: 10
Epoch 144: early stopping


## Plotting graphs of the training history

In [774]:
# import matplotlib.pyplot as plt

# fig, ax = plt.subplots(nrows=1, ncols=10,figsize=(200,20) )
# idx = 0
# for h in history:
#   ax[idx].plot(h.history['loss'], label='train_loss')
#   ax[idx].plot(h.history['val_loss'], label='val_loss')
#   ax[idx].legend()
#   idx+=1
  

## Results on the test set

In [775]:
# Per-fold R^2 on the held-out folds, each rounded to 3 decimals and followed by ", "
test = "".join(str(round(fold_r2, 3)) + ", " for fold_r2 in test_result)

# Print the per-fold scores, then their mean as the CV score
print(test)
print("cv_score= ", round(np.mean(test_result), 4))

0.22, -0.025, 0.323, 0.936, 0.85, -0.067, 0.577, 0.616, 0.788, 0.509, 
cv_score=  0.4729


## Results on training set

In [776]:
# Per-fold R^2 on the training folds, each rounded to 3 decimals and followed by ", "
train = "".join(str(round(fold_r2, 3)) + ", " for fold_r2 in train_result)

# Print the per-fold scores, then their mean as the CV score
print(train)
print("cv_score= ", round(np.mean(train_result), 4))

0.93, 0.951, 0.931, 0.966, 0.922, 0.871, 0.888, 0.923, 0.902, 0.869, 
cv_score=  0.9153


## Printing the weights and biases

In [777]:
# model.layers[0].get_weights()