# Random Forest Regression
See the official scikit-learn documentation for `RandomForestRegressor` [here](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html).

## Importing the libraries

In [122]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import RandomForestRegressor

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold

from google.colab import drive
# Mount Google Drive in the Colab runtime; the dataset is read from a path under /content/drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Importing the dataset

In [148]:
# Location of the reactor-1 measurements on the mounted Drive.
csv_path = '/content/drive/MyDrive/Biogas RMS project/Datasets/Lab dataset/reactor1_biogas.csv'

# Only these columns are read; "day" becomes the row index.
columns_to_load = ["day", "TS_actual", "VS_actual", "VS/TS", "reactor1"]

# Load the selected columns and order the rows by day.
df = pd.read_csv(csv_path, usecols=columns_to_load, index_col="day").sort_index()
df.head()

Unnamed: 0_level_0,TS_actual,VS_actual,VS/TS,reactor1
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,369.593,338.3879,0.915569,0
1,357.2494,326.3529,0.913516,100
2,344.9058,314.3179,0.911315,120
3,332.5622,302.2829,0.908951,100
4,320.2186,290.2479,0.906405,100


## Standard scaling

In [149]:
# Separate the target column from the features, then standardise the features.
target_column = 'reactor1'

X = df.drop(columns=[target_column])
y = df[target_column]

# The scaler is fitted on every row of X and then applied to those same rows.
scaler = StandardScaler()
scaler.fit(X)
X_trans = scaler.transform(X)

## Splitting the data into training and test sets

In [150]:
# Ordered hold-out split: the first `train_fraction` of the rows train the
# model and the remaining rows are kept for testing.
train_fraction = 0.8
train_size = int(len(X) * train_fraction)

# Slice by position so the split does not depend on the index labels.
X_train_raw, X_test_raw = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

# Fit the scaler on the training rows only. Fitting it on the full dataset
# would let test-set statistics (mean / variance) leak into preprocessing.
split_scaler = StandardScaler().fit(X_train_raw)
X_train = split_scaler.transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)

## Training the regressor

In [151]:
def build_model():
  """Return a new, unfitted RandomForestRegressor with the notebook's fixed settings.

  A fresh estimator is created on every call, so each caller trains its own
  model. `random_state=0` is passed so repeated runs use the same seed.
  """
  return RandomForestRegressor(
      n_estimators=500,
      max_depth=32,
      max_features='sqrt',
      criterion='squared_error',
      random_state=0,
  )

In [147]:
# Train a fresh forest on the training split and report R^2 on the held-out rows.
model = build_model().fit(X_train, y_train)
y_pred = model.predict(X_test)

holdout_r2 = r2_score(y_test, y_pred)
print(holdout_r2)

0.018513442498096055


## Using K-Fold Cross Validation



In [128]:
# 10-fold cross-validation with shuffling; the fixed seed keeps the folds
# identical between runs. (KFold and r2_score come from the imports cell.)
kf = KFold(n_splits=10, shuffle=True, random_state=0)

test_result = []   # R^2 on the held-out fold, one entry per fold
train_result = []  # R^2 on the rows the fold's model was trained on
history = []       # never filled in this cell; kept so the name stays defined

for fold_number, (train_index, test_index) in enumerate(kf.split(X), start=1):
  print("Fold:", fold_number)

  # kf.split yields *positional* row numbers, so rows must be selected with
  # .iloc. Matching them against the index labels only works when the index
  # happens to be exactly 0..n-1 and otherwise silently drops or mixes rows.
  X_train = X.iloc[train_index]
  X_test = X.iloc[test_index]

  y_train = y.iloc[train_index]
  y_test = y.iloc[test_index]

  # Fit the scaler on this fold's training rows only, so no statistics from
  # the held-out fold leak into the preprocessing.
  fold_scaler = StandardScaler().fit(X_train)
  X_train_trans = fold_scaler.transform(X_train)
  X_test_trans = fold_scaler.transform(X_test)

  # A fresh, unfitted model per fold: nothing learned carries over.
  model = build_model()
  model.fit(X_train_trans, y_train)

  # Score on the held-out fold ...
  y_pred = model.predict(X_test_trans)
  test_result.append(r2_score(y_test, y_pred))

  # ... and on the training rows, to compare train and test fit.
  y_pred_train = model.predict(X_train_trans)
  train_result.append(r2_score(y_train, y_pred_train))


Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Fold: 7
Fold: 8
Fold: 9
Fold: 10


## Results on the test set

In [129]:
# Per-fold test R^2, rounded to 3 decimals, on one comma-separated line.
test = "".join(str(round(fold_r2, 3)) + ", " for fold_r2 in test_result)
print(test)

# Mean test R^2 across all folds.
mean_test_r2 = np.mean(test_result)
print("cv_score= ", round(mean_test_r2, 4))

0.725, 0.636, 0.475, 0.276, 0.614, -0.733, 0.139, 0.796, 0.701, 0.702, 
cv_score=  0.4331


## Results on training set

In [130]:
# Per-fold R^2 on the training rows, one value per line, rounded to 4 decimals.
for fold_r2 in train_result:
  print(round(fold_r2, 4))

# Mean training R^2 across all folds.
mean_train_r2 = np.mean(train_result)
print("cv_score= ", round(mean_train_r2, 4))

0.9207
0.9271
0.9293
0.9298
0.922
0.9576
0.9377
0.9208
0.9268
0.9249
cv_score=  0.9297
