## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer


from sklearn.svm import SVR

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import r2_score

# Mount Google Drive (Colab only) so that files under /content/drive/MyDrive,
# including the dataset CSV read in the next section, are accessible.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Importing the dataset

In [None]:
# Read the Paper 13 CSV from the mounted Drive and preview the first rows.
dataset_path = '/content/drive/MyDrive/Biogas RMS project/Datasets/Paper 13/paper13.csv'
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,Biomass type,Reactor/feeding,VS (%),pH,OLR (g VS/l.d),HRT (d),T (°C),Reactor Volume (m³),Cumulated biogas volume (L/(g VS))
0,0,0,10.0,7.62,0.627,19.2,55,0.05,0.0668
1,0,2,15.3,8.0,3.1702,47.0,37,0.0473,0.6765
2,0,0,4.78,7.25,1.24,15.0,37,0.045,0.8227
3,0,0,4.78,7.25,1.76,15.0,37,0.045,0.6219
4,0,2,6.36,7.3,3.2,25.0,35,0.04,0.5755


## Dropping columns 
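Since EDA points out that 
1. Biomass Type   
2. Reactor Volume   
are not strongly correlated with the output, they can be dropped.  
(OLR is also only weakly correlated with the output, but it is kept here because dropping it lowered both scores; see "Important Results" below.)  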

In [None]:
# Remove the two features that the EDA found to be weakly correlated with the
# target; `df` is rebound to the reduced frame and previewed.
dropped_features = ['Biomass type', 'Reactor Volume (m³)']
df = df.drop(columns=dropped_features)
df.head()

Unnamed: 0,Reactor/feeding,VS (%),pH,OLR (g VS/l.d),HRT (d),T (°C),Cumulated biogas volume (L/(g VS))
0,0,10.0,7.62,0.627,19.2,55,0.0668
1,2,15.3,8.0,3.1702,47.0,37,0.6765
2,0,4.78,7.25,1.24,15.0,37,0.8227
3,0,4.78,7.25,1.76,15.0,37,0.6219
4,2,6.36,7.3,3.2,25.0,35,0.5755


## Outlier Detection and Removal
### 1. Temperature

In [None]:
# NOTE(review): this cell is intentionally disabled. The "Outlier removal" notes at
# the end of the notebook report that removing temperature outliers made both the
# r2 score and the CV score significantly worse.
# If it is re-enabled, be aware that the filter below uses a hard-coded 50 instead
# of the computed `upper_limit`.
# percentile25 = df['T (°C)'].quantile(0.25)
# percentile75 = df['T (°C)'].quantile(0.75)

# iqr = percentile75 - percentile25
# upper_limit = percentile75 + 1.5 * iqr
# lower_limit = percentile25 - 1.5 * iqr

# print(upper_limit)
# print(lower_limit)

# # Removing outliers
# df = df[df['T (°C)'] <= 50]

# df.shape

40.0
32.0


### 2. VS%

In [None]:
# IQR fences (1.5 * IQR beyond the quartiles) for the 'VS (%)' column.
vs_values = df['VS (%)']
percentile25, percentile75 = vs_values.quantile(0.25), vs_values.quantile(0.75)
iqr = percentile75 - percentile25
lower_limit = percentile25 - 1.5 * iqr
upper_limit = percentile75 + 1.5 * iqr

print(upper_limit)
print(lower_limit)

# Removing outliers: only rows above the upper fence are discarded
# (the lower fence is printed but not applied).
df = df.loc[vs_values <= upper_limit]

df.shape

25.784999999999997
-6.7349999999999985


(106, 7)

### 3. HRT


In [None]:
# NOTE(review): disabled diagnostic cell. The check near the end compares
# 'VS (%)' (not 'HRT (d)') against the HRT `upper_limit`. With the recorded limits
# (VS% already filtered to <= 25.78 in the previous section, HRT upper limit 47.5)
# that comparison is empty regardless of the HRT values, so the conclusion in the
# last comment is not actually verified by this cell.
# TODO: re-check with df['HRT (d)'] > upper_limit.
# percentile25 = df['HRT (d)'].quantile(0.25)
# percentile75 = df['HRT (d)'].quantile(0.75)

# iqr = percentile75 - percentile25
# upper_limit = percentile75 + 1.5 * iqr
# lower_limit = percentile25 - 1.5 * iqr

# print(upper_limit)
# print(lower_limit)

# # Removing outliers
# new_df = df[df['VS (%)'] > upper_limit]

# new_df.shape
# These outliers were removed as part of the VS% outliers

47.5
3.5


(0, 7)

### 4. OLR

In [None]:
# Percentile-based outlier removal for the OLR column: the 1st and 99th
# percentiles are reported, and rows above the 99th percentile are discarded.
olr_values = df['OLR (g VS/l.d)']
lower_limit, upper_limit = olr_values.quantile(0.01), olr_values.quantile(0.99)

print(upper_limit)
print(lower_limit)

# Removing outliers (upper tail only; the lower percentile is printed but not applied)
df = df.loc[olr_values <= upper_limit]

df.shape

16.002800000000008
0.2


(104, 7)

## Splitting the data into Test and training data

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the features, then hold out 20% of the rows
# for testing with a fixed seed so the split is reproducible.
target_col = 'Cumulated biogas volume (L/(g VS))'
y = df[target_col]
X = df.drop(columns=[target_col])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=3
)

# Pipeline information

(Since this data does not have missing values, no imputation is required.)

The following transformers will be applied in the pipeline.

Step 1: OneHotEncoding the categorical columns. 

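Step 2: Scaling the temperature column using StandardScaler

Step 3: Applying a power transform (Yeo-Johnson, the PowerTransformer default) to the remaining numeric columns

Step 4: Training the SVR model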

(We could have added feature selection before training as well)

Note: After OneHot Encoding, the number of columns becomes:  
4 (for reactor/feeding: 5 categories, with the first one dropped by `drop='first'`)  
\+ 5 (numeric columns passed through)

= 9




In [None]:
# 1. One Hot Encoding
# The category levels are pinned to those present in the full feature frame `X`.
# With the default categories='auto', a cross-validation fold (or the training
# split) that happens to miss a level would emit fewer one-hot columns. That would
# shift the positional slices used by the later scaling/transform steps, and the
# encoder would raise on the unseen level at predict time.
# When every level is present in the fitted data, the output is unchanged.
reactor_levels = sorted(X['Reactor/feeding'].unique())
trf0 = ColumnTransformer(transformers = [
    ('ohe0', OneHotEncoder(categories=[reactor_levels], drop='first'), ['Reactor/feeding'])
], remainder='passthrough')

In [None]:
# 2. Scaling
# Only the temperature column is standardised; with the current column set it
# sits at position 8 of the one-hot step's output. All other columns are passed
# through unchanged.
# (Earlier column configurations used slice(9,10), slice(10,11) and slice(12,13).)
temperature_cols = slice(8, 9)
trf2 = ColumnTransformer(
    transformers=[('scale', StandardScaler(), temperature_cols)],
    remainder='passthrough',
)

In [None]:
# 3. Transformation
# The previous ColumnTransformer reorders the columns (its transformed column comes
# first, passthrough columns follow), so the positions used here differ from the
# ones in the scaling step.
# (Earlier column configurations used slice(6,10), slice(6,11), slice(9,14) and slice(9,13).)
power_cols = slice(5, 9)
trf3 = ColumnTransformer(
    transformers=[('transform', PowerTransformer(), power_cols)],
    remainder='passthrough',
)

In [None]:
# 4. Training
# Support vector regressor with an RBF kernel; every other hyper-parameter is
# left at its library default here.
trf4 = SVR(kernel = 'rbf')

## Creating the pipeline

In [None]:
# Chain the preprocessing stages and the regressor. The step names matter: they
# are the prefixes of the hyper-parameter keys (e.g. 'svr__C') used in the grid search.
pipeline_steps = [
    ('ohe0', trf0),
    ('scaling', trf2),
    ('transform', trf3),
    ('svr', trf4),
]
pipe = Pipeline(steps=pipeline_steps)

## Training and predicting using the pipeline

Here we are using default parameters

In [None]:
# Display Pipeline

from sklearn import set_config
set_config(display='diagram')
# Show the steps involved in the pipeline.
# A notebook cell only renders its last expression automatically, so a bare
# expression in the middle of the cell is silently discarded; the pipeline is
# therefore displayed explicitly (`display` is provided by the Jupyter/IPython kernel).
display(pipe)

# train and predict
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
# R² on the held-out test split; as the last expression it becomes the cell output.
r2_score(y_test, y_pred)

# Visualize 
# X_trans = pipe.fit_transform(X_train)
# X_pd = pd.DataFrame(X_trans)
# X_pd.head()

0.84714352449327

## GridSearch using the pipeline

In [None]:
# gridsearchcv
from sklearn.model_selection import GridSearchCV

# Hyper-parameters that apply to every kernel.
shared_grid = {
    'svr__shrinking': [False, True],
    'svr__gamma': ['scale', 'auto'],
    'svr__C': [1, 10, 100],
    'svr__tol': [1e-3, 1e-2],
}

# `degree` is only read by the 'poly' kernel, so it is searched only there.
# Crossing it with 'rbf'/'sigmoid' merely repeats identical fits (144 candidates
# instead of 96) and makes best_params_ report a meaningless degree when a
# non-polynomial kernel wins. The set of distinct models searched is unchanged.
params = [
    {**shared_grid, 'svr__kernel': ['rbf', 'sigmoid']},
    {**shared_grid, 'svr__kernel': ['poly'], 'svr__degree': [2, 3]},
]

# 10-fold cross-validated search, scored by R², fitted on the full filtered
# dataset (X, y) rather than on the training split only.
grid = GridSearchCV(pipe, params, cv=10, scoring='r2')
grid.fit(X, y)


# GridSearchCV results


In [None]:
# Mean cross-validated R² of the best parameter combination found by the search.
grid.best_score_

0.46941320930892677

In [None]:
# Hyper-parameter combination that achieved the best mean cross-validated score.
grid.best_params_

{'svr__C': 1,
 'svr__degree': 2,
 'svr__gamma': 'auto',
 'svr__kernel': 'rbf',
 'svr__shrinking': False,
 'svr__tol': 0.01}

# Important Results: 
### a. Biomass type, OLR and reactor volume have been dropped

1. The cross validation score increased from 0.25 to 0.32 after applying the Yeo-Johnson transform.
2. The r2 score on train-test split came out to around 0.63 (random_state = 3). It may increase based on the randomness of the split.

### b. Biomass type and reactor volume have been dropped

1. The cross validation score increased from 0.32 to 0.4571 after applying the Yeo-Johnson transform.
2. The r2 score on train-test split came out to around 0.82 (random_state = 3). It may increase based on the randomness of the split.

### c. Biomass type has been dropped

1. The cross validation score decreased from 0.457 to 0.4371 after applying the Yeo-Johnson transform.
2. The r2 score on train-test split came out to around 0.82 (random_state = 3). It may increase based on the randomness of the split.

### d. Reactor volume has been dropped

1. The cross validation score increased from 0.4371 to 0.4521 after applying the Yeo-Johnson transform.
2. The r2 score on train-test split came out to around 0.78999 (random_state = 3). It may increase based on the randomness of the split.

### e. Nothing dropped

1. The cross validation score decreased from 0.4371 to 0.436 after applying the Yeo-Johnson transform.
2. The r2 score on train-test split came out to around 0.7955 (random_state = 3). It may increase based on the randomness of the split.

## BEST Results (without outlier removal):
Biomass Type and Reactor Volume dropped.   
CV_score = 0.4571  
r2_score = 0.817

## Outlier removal

1. Temperature: The results were significantly worse.  
  r2_score = 0.35  
  cv_Score = 0.33  

2. VS% : The CV score improved.  
  r2_score = 0.446  (random_state = 3)  
  cv_score = 0.4895  

3. OLR and VS%:  
  r2_score = 0.8471  
  cv_score = 0.4694



## OLD Results

### 1. SVR with RBF kernel (SCALED data)
Optimal C value = 10  
Shrinking = false  
Tolerance value = 1e-2  
epsilon = 0.09  

BEST result = 0.552

### 2. SVR with Poly kernel, degree = 2

SVR with Poly kernel, Optimum
degree = 2  
C = 1.25  
coef0 = 0.1  
tol = 0.1  
shrinking = False 

MOST Optimum result: 0.53679
