## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline, make_pipeline

# Mount Google Drive at /content/drive (google.colab is only available on Colab)
# so that files stored on Drive can be read through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import tensorflow
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras import regularizers

In [3]:
# Seed NumPy's and TensorFlow's global random generators with the same fixed
# value, aiming for reproducible results between runs.
from numpy.random import seed
random_seed = 0
seed(random_seed)
tensorflow.random.set_seed(random_seed)

## Importing the dataset

In [4]:
# Read the dataset from the mounted Drive folder and preview the first rows.
csv_path = '/content/drive/MyDrive/Biogas RMS project/Datasets/Paper 13/paper13.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Biomass type,Reactor/feeding,VS (%),pH,OLR (g VS/l.d),HRT (d),T (°C),Reactor Volume (m³),Cumulated biogas volume (L/(g VS))
0,0,0,10.0,7.62,0.627,19.2,55,0.05,0.0668
1,0,2,15.3,8.0,3.1702,47.0,37,0.0473,0.6765
2,0,0,4.78,7.25,1.24,15.0,37,0.045,0.8227
3,0,0,4.78,7.25,1.76,15.0,37,0.045,0.6219
4,0,2,6.36,7.3,3.2,25.0,35,0.04,0.5755


## Separating the features (X) from the target (y)

In [5]:
from sklearn.model_selection import train_test_split

# The last column is the regression target; every other column is a feature.
target_col = 'Cumulated biogas volume (L/(g VS))'
y = df[target_col]
X = df.drop(columns=[target_col])

## Pipeline information

(Since this data does not have missing values, no imputation is required.)

The following transformers will be applied in the pipeline.

Step 1: OneHotEncoding the categorical columns. 

Step 2: Scaling using StandardScaler





In [6]:
# 1. One Hot Encoding
# One-hot encode the two categorical columns (dropping the first category of
# each); all remaining columns are passed through unchanged.
categorical_cols = ['Biomass type', 'Reactor/feeding']
trf0 = ColumnTransformer(
    transformers=[('ohe0', OneHotEncoder(drop='first'), categorical_cols)],
    remainder='passthrough',
)

In [7]:
# 2. Scaling
# Standardize the columns at positions 6 through 11 of this transformer's input
# (a positional slice, not a single column); every other column is passed
# through unchanged. ColumnTransformer emits the scaled columns first and
# appends the passthrough columns after them.
# NOTE(review): the positions are hard-coded and therefore depend on how many
# one-hot columns the preceding encoding step produces -- confirm against the data.
scaled_positions = slice(6, 12)
trf2 = ColumnTransformer(
    transformers=[('scale', StandardScaler(), scaled_positions)],
    remainder='passthrough',
)

## Creating the pipeline

In [8]:
# Chain the two column transformers: one-hot encoding first, then scaling.
pipeline_steps = [('ohe0', trf0), ('scaling', trf2)]
pipe = Pipeline(steps=pipeline_steps)

# Ask scikit-learn to render estimators as diagrams in rich output.
from sklearn import set_config
set_config(display='diagram')

# Show the steps involved in the pipeline (the last expression is displayed).
pipe.named_steps


{'ohe0': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe0', OneHotEncoder(drop='first'),
                                  ['Biomass type', 'Reactor/feeding'])]),
 'scaling': ColumnTransformer(remainder='passthrough',
                   transformers=[('scale', StandardScaler(),
                                  slice(6, 12, None))])}

## Transforming data using pipeline
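This step only fits the pipeline on the data; nothing is transformed here (the transformation is applied to each fold inside the cross-validation loop below).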

In [9]:
# Fit the preprocessing pipeline on the full feature frame. Nothing is
# transformed in this cell: Pipeline.fit returns the fitted pipeline itself, so
# X_trans ends up referring to `pipe`, not to transformed data.
# NOTE(review): fitting on all rows happens before the K-fold split, so the
# held-out rows of every fold contribute to the encoder/scaler fit.
pipe.fit(X)
X_trans = pipe

## Building the ANN

Changes to the architecture need to be made here.

In [10]:
def build_model():
  """Return a fresh, uncompiled feed-forward regression network.

  Architecture: two hidden Dense layers of 20 ReLU units each, followed by a
  single linear output unit. The first hidden layer carries an L1 penalty of
  1e-4 on both its kernel and its bias. No input shape is declared here.

  Dropout(0.1) after each hidden layer was tried and is currently disabled.
  """
  first_hidden = Dense(
      20,
      activation='relu',
      kernel_regularizer=regularizers.L1(1e-4),
      bias_regularizer=regularizers.L1(1e-4),
  )
  second_hidden = Dense(20, activation='relu')
  output_layer = Dense(1, activation='linear')

  return Sequential([first_hidden, second_hidden, output_layer])

## Using K-Fold Cross Validation



In [11]:
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold

# 10-fold cross-validation; shuffling with a fixed random_state keeps the fold
# assignment the same between runs.
kf = KFold(n_splits=10, shuffle= True, random_state = 1)

test_result = []   # R^2 on the held-out fold, one entry per fold
train_result = []  # R^2 on the training rows, one entry per fold
history = []       # object returned by model.fit, one entry per fold

# NOTE(review): `pipe` is only used via transform() below, so it must already
# be fitted. It was fitted on the full X in an earlier cell, which means the
# encoder/scaler have seen each fold's held-out rows (preprocessing leakage).
# Refitting per fold would need care: a training fold may miss a category, and
# the scaling step selects columns by fixed position.
for i, (train_index, test_index) in enumerate(kf.split(X)):
  print("Fold:", i+1)

  # KFold.split yields positional row indices, so select rows by position
  # (.iloc) instead of matching them against index labels; the two only agree
  # when the index happens to be the default 0..n-1 range.
  X_train, X_test = X.iloc[train_index], X.iloc[test_index]
  y_train, y_test = y.iloc[train_index], y.iloc[test_index]

  ## Run the split through the (already fitted) pipeline
  X_train_trans = pipe.transform(X_train)
  X_test_trans = pipe.transform(X_test)

  ######### Build the model first #############
  # A brand-new model per fold, so no weights carry over between folds.
  model = build_model()

  ######## Compile and fit the model ########
  model.compile(loss='mse', optimizer='Adam')

  # No validation data is passed, so only the training loss is recorded.
  history_curr = model.fit(X_train_trans, y_train, epochs=250, shuffle=False, verbose=0)
  history.append(history_curr)

  ######## Predicting Results #########
  # verbose=0 keeps per-batch progress output out of the fold log.
  y_pred = model.predict(X_test_trans, verbose=0)
  test_result.append(r2_score(y_test, y_pred))

  y_pred_train = model.predict(X_train_trans, verbose=0)
  train_result.append(r2_score(y_train, y_pred_train))


Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Fold: 7
Fold: 8
Fold: 9
Fold: 10


## Plotting graphs of the training history

In [12]:
# NOTE: this plotting code is disabled. As written it would fail on
# h.history['val_loss']: model.fit in the K-fold cell is called without any
# validation data, so only 'loss' is recorded in each history object.
# import matplotlib.pyplot as plt

# fig, ax = plt.subplots(nrows=1, ncols=10,figsize=(200,20) )
# idx = 0
# for h in history:
#   ax[idx].plot(h.history['loss'], label='train_loss')
#   ax[idx].plot(h.history['val_loss'], label='val_loss')
#   ax[idx].legend()
#   idx+=1
  

## Results on the test set

In [13]:
# Put the per-fold test R^2 scores (3 decimals) on one line, each followed by ", ".
test = "".join(str(round(fold_r2, 3)) + ", " for fold_r2 in test_result)

# Print the per-fold scores, then the CV score (mean over the folds).
print(test)
mean_test_r2 = np.mean(test_result)
print("cv_score= ", round(mean_test_r2, 4))

0.35, 0.576, 0.271, 0.804, 0.919, 0.198, 0.779, 0.924, 0.877, 0.112, 
cv_score=  0.5808


## Results on training set

In [14]:
# Per-fold training R^2, one value per line (4 decimals).
for fold_r2 in train_result:
  print(round(fold_r2, 4))

# Print the CV score (mean of the per-fold training R^2).
mean_train_r2 = np.mean(train_result)
print("cv_score= ", round(mean_train_r2, 4))

0.95
0.9371
0.9688
0.963
0.8576
0.9268
0.9482
0.9335
0.9324
0.9139
cv_score=  0.9331
