## Importing the libraries

In [195]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer

from sklearn.pipeline import Pipeline, make_pipeline

# Mount Google Drive at /content/drive (Colab only); the dataset is read from
# a path under this mount point further down.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [196]:
import tensorflow
from tensorflow import keras
from tensorflow.keras import Sequential, Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.layers import Input, Embedding, Concatenate, Flatten
from tensorflow.keras import regularizers
from tensorflow.keras.preprocessing.text  import one_hot

In [197]:
# Seed both NumPy's and TensorFlow's global random generators with the same
# value so that repeated runs of the notebook are comparable.
from numpy.random import seed

RANDOM_SEED = 0
seed(RANDOM_SEED)
tensorflow.random.set_seed(RANDOM_SEED)

## Importing the dataset

In [198]:
# Read the Paper 13 CSV from the mounted Drive and preview its first rows.
DATA_PATH = '/content/drive/MyDrive/Biogas RMS project/Datasets/Paper 13/paper13.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Biomass type,Reactor/feeding,VS (%),pH,OLR (g VS/l.d),HRT (d),T (°C),Reactor Volume (m³),Cumulated biogas volume (L/(g VS))
0,0,0,10.0,7.62,0.627,19.2,55,0.05,0.0668
1,0,2,15.3,8.0,3.1702,47.0,37,0.0473,0.6765
2,0,0,4.78,7.25,1.24,15.0,37,0.045,0.8227
3,0,0,4.78,7.25,1.76,15.0,37,0.045,0.6219
4,0,2,6.36,7.3,3.2,25.0,35,0.04,0.5755


## Splitting the data into X and y sets

In [199]:
from sklearn.model_selection import train_test_split

# The target is the cumulated biogas volume; every other column of df is kept
# as a feature.
# NOTE(review): the df.head() preview shows an 'Unnamed: 0' column whose values
# mirror the row number, and it stays in X here. Confirm it is meant to be a
# feature; dropping it would shift the positional column slices used later.
target_column = 'Cumulated biogas volume (L/(g VS))'
X = df.drop(columns=[target_column])
y = df[target_column]

## Pipeline information

(Since this data does not have missing values, no imputation is required.)

The following transformers will be applied in the pipeline.

Step 1: OneHotEncoding the categorical columns. 

Step 2: Scaling using StandardScaler





In [200]:
# 1. One Hot Encoding
# One-hot encode the two categorical columns (the first level of each is
# dropped) and pass every other column through untouched.
ohe_columns = ['Biomass type', 'Reactor/feeding']
trf0 = ColumnTransformer(
    transformers=[('ohe0', OneHotEncoder(drop='first'), ohe_columns)],
    remainder='passthrough',
)

In [201]:
# 2. Scaling
#  StandardScaler is applied to positions 6 through 11 (six columns) of the
#  array this transformer receives; every other position passes through as is.
#  NOTE(review): which features sit at those positions depends on how many
#  one-hot columns the previous step emits -- confirm the slice still matches
#  the intended numerical columns whenever the categories change.
scaled_positions = slice(6, 12)
trf2 = ColumnTransformer(
    [('scale', StandardScaler(), scaled_positions)],
    remainder='passthrough',
)

## Creating and fitting the pipeline

In [202]:
# Chain the two column transformers: one-hot encoding first, then scaling.
pipe = Pipeline([
    ('ohe0', trf0),
    ('scaling', trf2)
])

# Display Pipeline: render estimators as diagrams when they are shown
from sklearn import set_config
set_config(display='diagram')

# Show the steps involved in the pipeline. A bare expression in the middle of
# a cell is discarded, so it is printed explicitly.
print(pipe.named_steps)

# Fit the pipeline and keep the transformed features (fit() alone returns the
# pipeline object itself, not the transformed data).
# NOTE(review): the pipeline is fitted on every row of X, so the scaler
# statistics also include rows that are later used as held-out folds in the
# K-fold cell -- consider fitting on each training fold instead.
X_trans = pipe.fit_transform(X)

# Visualize: the last expression of the cell is displayed as a diagram
pipe

## Function to pre-process the inputs


### Returns the scaled one-hot-encoded categorical values and the scaled numerical values as 2 separate arrays

In [203]:
def pre_process(x, scale=0.2):
  """Run x through the fitted global `pipe` and split the result in two.

  Args:
    x: Feature rows in the format accepted by `pipe.transform`.
    scale: Multiplier applied to the categorical block, so the 1s of the
      one-hot encoding become `scale`. Defaults to 0.2.

  Returns:
    A tuple `(x_cat, x_num)`:
      x_cat: columns 6 up to (not including) 15 of the transformed array,
        multiplied by `scale`. Fewer columns are returned when the array is
        narrower than 15 columns.
      x_num: columns 0-5 of the transformed array, unchanged.

  NOTE(review): the split positions are hard-coded; they presumably match the
  six scaled columns followed by the passed-through columns -- confirm
  against the pipeline definition if the feature set changes.
  """
  x_trans = pipe.transform(x)

  # Numerical values: the first six columns of the pipeline output
  x_num = x_trans[:, 0:6]

  # Categorical values: the remaining columns, shrunk from 1 to `scale`
  x_cat = x_trans[:, 6:15] * scale

  return x_cat, x_num

## Building the ANN


### Version 1: Basic Sequential ANN

In [204]:
# def build_model1():
#   model = Sequential()
#   model.add(Dense(20, activation='relu', 
#     # kernel_regularizer=regularizers.L1(1e-4), 
#     bias_regularizer=regularizers.L1(1e-4),
#     ))
#   model.add(Dropout(0.2))
#   model.add(Dense(20, activation='relu', ))
#   # model.add(Dropout(0.1))
#   model.add(Dense(1, activation='linear'))

#   # model.summary()
#   return model

### Version 2: 
* 1 Dense layer for categorical values
* 1 Combined dense layer 

In [205]:
def build_model2(num_numerical, num_categorical=6):
  """Build the two-input functional ANN (uncompiled).

  The categorical input goes through its own dense layer first; its output is
  concatenated with the raw numerical input and fed to a shared dense layer,
  followed by a single linear output unit.

  Args:
    num_numerical: Number of columns in the numerical input.
    num_categorical: Number of columns in the categorical input. Defaults to
      6, the width that was previously hard-coded.

  Returns:
    A Keras `Model` taking `[categorical, numerical]` inputs (in that order)
    and producing one output value per row.
  """
  #### Building the model ####
  # 1. Define the input layers
  input_cat = Input(shape=(num_categorical,))
  input_num = Input(shape=(num_numerical,))

  # L1 penalty shared by the kernel and bias of both hidden layers
  l1_strength = 1e-4

  # 2. First dense layer (categorical branch only)
  hidden_cat = Dense(20, activation='relu',
                     kernel_regularizer=regularizers.L1(l1_strength),
                     bias_regularizer=regularizers.L1(l1_strength)
                     )(input_cat)

  # Dropout rate 0 leaves the activations untouched; kept as a tuning knob
  dropout1 = Dropout(0)(hidden_cat)

  # 3. Concatenate the numerical input with the categorical branch
  concatenate1 = Concatenate()([input_num, dropout1])

  hidden_layer2 = Dense(20, activation='relu',
                        kernel_regularizer=regularizers.L1(l1_strength),
                        bias_regularizer=regularizers.L1(l1_strength)
                        )(concatenate1)
  dropout2 = Dropout(0.2)(hidden_layer2)

  # 4. Final output layer
  output_layer = Dense(1, activation='linear')(dropout2)

  model = Model(inputs=[input_cat, input_num], outputs=[output_layer])
  # model.summary()

  return model

### Early Stopping
This is used to detect overfitting and stop training early. It is passed as a callback when the model is fitted.

In [206]:
# Early-stopping rule shared by every fit call below.
callback = EarlyStopping(
    monitor="val_loss",          # quantity checked at the end of each epoch
    mode="auto",                 # direction of improvement inferred from the name
    min_delta=0,                 # no minimum change is required to count as better
    baseline=None,               # no baseline value to beat
    patience=80,                 # epochs without improvement tolerated before stopping
    restore_best_weights=False,  # keep the weights of the last epoch run
    verbose=1,                   # report the epoch at which training stopped
)

## Using K-Fold Cross Validation



In [207]:
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold

# 10-fold cross validation; the fixed random_state keeps the shuffled folds
# identical between runs.
kf = KFold(n_splits=10, shuffle= True, random_state = 1)

test_result = []   # R2 on the held-out fold, one entry per fold
train_result = []  # R2 on the training rows, one entry per fold
history = []       # object returned by model.fit, one entry per fold
model = None

for i, (train_index, test_index) in enumerate(kf.split(X)):

  print("Fold:", i+1)
  # kf.split yields positional row numbers, so rows are selected by position
  # (iloc). Matching them against index labels is only correct while the
  # frame still carries the default 0..n-1 index.
  X_train, X_test = X.iloc[train_index], X.iloc[test_index]
  y_train, y_test = y.iloc[train_index], y.iloc[test_index]

  # ################### Variation 1: Sequential ANN ##########################
  # # Preprocess the values
  # X_train_cat, X_train_num = pre_process(X_train)
  # X_train_scaled = np.hstack(( X_train_num, X_train_cat))

  # X_test_cat, X_test_num = pre_process(X_test)
  # X_test_scaled = np.hstack(( X_test_num, X_test_cat,))

  # ########## Build the model #############
  # model = None
  # model = build_model1()

  # ####### Compile and fit the model ########
  # model.compile(loss='mse', optimizer='Adam')

  # history_curr = model.fit(X_train_scaled, y_train, epochs=600, shuffle=False, verbose=0);
  # history.append(history_curr)
 
  # ######## Predicting Results #########
  # y_pred = model.predict(X_test_scaled)
  # test_result.append(r2_score(y_test, y_pred))

  # y_pred_train = model.predict(X_train_scaled)
  # train_result.append(r2_score(y_train, y_pred_train))


  #################### Variation 2: Functional ANN ##########################
  # Run the split through the pipeline
  X_train_cat, X_train_num = pre_process(X_train)

  X_test_cat, X_test_num = pre_process(X_test)

  ########## Build the model first #############
  # Drop the reference to the previous fold's model before building a new one
  model = None
  model = build_model2(X_train_num.shape[1])

  ####### Compile and fit the model ########
  model.compile(loss='mse', optimizer='Adam')

  # NOTE(review): the held-out fold is also the validation set watched by the
  # early-stopping callback, so the test scores below are not independent of
  # the stopping decision -- consider a separate validation split.
  history_curr = model.fit([X_train_cat, X_train_num], y_train,
                           validation_data=([X_test_cat, X_test_num], y_test),
                           epochs=1000, callbacks=[callback], shuffle=False, verbose=0)
  history.append(history_curr)

  ######## Predicting Results #########
  y_pred = model.predict([X_test_cat, X_test_num])
  test_result.append(r2_score(y_test, y_pred))

  y_pred_train = model.predict([X_train_cat, X_train_num])
  train_result.append(r2_score(y_train, y_pred_train))

Fold: 1
Epoch 722: early stopping
Fold: 2
Epoch 151: early stopping
Fold: 3
Epoch 273: early stopping
Fold: 4
Epoch 715: early stopping
Fold: 5
Fold: 6
Epoch 101: early stopping
Fold: 7
Epoch 549: early stopping
Fold: 8
Epoch 494: early stopping
Fold: 9
Epoch 378: early stopping
Fold: 10
Epoch 421: early stopping


## Plotting graphs of the training history

In [208]:
# import matplotlib.pyplot as plt

# fig, ax = plt.subplots(nrows=1, ncols=10,figsize=(200,20) )
# idx = 0
# for h in history:
#   ax[idx].plot(h.history['loss'], label='train_loss')
#   ax[idx].plot(h.history['val_loss'], label='val_loss')
#   ax[idx].legend()
#   idx+=1
  

## Results on the test set

In [209]:
# Render every fold's test R2 (rounded to 3 decimals) followed by ", "
test = "".join(str(round(res, 3)) + ", " for res in test_result)

# Print the per-fold scores, then their mean as the CV score
print(test)
print("cv_score= ", round(np.mean(test_result), 4))

0.391, -0.229, 0.484, 0.786, 0.876, -0.107, 0.777, 0.721, 0.465, 0.587, 
cv_score=  0.475


## Results on training set

In [210]:
# Render every fold's training R2 (rounded to 3 decimals) followed by ", "
train = "".join(str(round(res, 3)) + ", " for res in train_result)

# Print the per-fold training scores, then their mean across folds
print(train)
print("cv_score= ", round(np.mean(train_result), 4))

0.839, 0.607, 0.739, 0.807, 0.815, 0.547, 0.797, 0.792, 0.791, 0.798, 
cv_score=  0.7531


## Printing the weights and biases

In [211]:
# model.layers[0].get_weights()