# Intro

Regression model for a Combined Cycle Power Plant (CCPP). In this notebook, various regression models such as polynomial regression, SVR, Decision Tree Regression, and Random Forest Regression will be built to predict the net hourly electrical energy output (PE) of the plant. The dataset comes from the UCI Machine Learning Repository and is available <a href="https://archive.ics.uci.edu/ml/datasets/Combined+Cycle+Power+Plant">here</a>.

# Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the CCPP CSV file into a DataFrame.
DATA_PATH = "./dataset/CCPP.csv"
dataset = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first rows to check the column names and value formats.
dataset.head()

Unnamed: 0,AT,V,AP,RH,PE
0,14.96,41.76,1024.07,73.17,463.26
1,25.18,62.96,1020.04,59.08,444.37
2,5.11,39.4,1012.16,92.14,488.56
3,20.86,57.32,1010.24,76.64,446.48
4,10.82,37.5,1009.23,96.62,473.9


In [4]:
# Summarize column dtypes and non-null counts to spot missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9568 entries, 0 to 9567
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AT      9568 non-null   float64
 1   V       9568 non-null   float64
 2   AP      9568 non-null   float64
 3   RH      9568 non-null   float64
 4   PE      9568 non-null   float64
dtypes: float64(5)
memory usage: 373.9 KB


In [5]:
# Descriptive statistics (count, mean, std, quartiles, min/max) per column.
dataset.describe()

Unnamed: 0,AT,V,AP,RH,PE
count,9568.0,9568.0,9568.0,9568.0,9568.0
mean,19.651231,54.305804,1013.259078,73.308978,454.365009
std,7.452473,12.707893,5.938784,14.600269,17.066995
min,1.81,25.36,992.89,25.56,420.26
25%,13.51,41.74,1009.1,63.3275,439.75
50%,20.345,52.08,1012.94,74.975,451.55
75%,25.72,66.54,1017.26,84.83,468.43
max,37.11,81.56,1033.3,100.16,495.76


In [6]:
# Every column except the last is a feature; the last column is the target.
feature_frame = dataset.iloc[:, :-1]
target_series = dataset.iloc[:, -1]

x = feature_frame.values
y = target_series.values

In [7]:
# Inspect the feature matrix (one row per observation).
x

array([[  14.96,   41.76, 1024.07,   73.17],
       [  25.18,   62.96, 1020.04,   59.08],
       [   5.11,   39.4 , 1012.16,   92.14],
       ...,
       [  31.32,   74.33, 1012.92,   36.48],
       [  24.48,   69.45, 1013.86,   62.39],
       [  21.6 ,   62.52, 1017.23,   67.87]])

In [8]:
# Inspect the target values.
y

array([463.26, 444.37, 488.56, ..., 429.57, 435.74, 453.28])

In [9]:
# Turn the target into a single-column 2-D array, the layout the scaler
# applied to it later expects.
y = y.reshape(-1, 1)

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# The fixed random_state keeps the split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

## Feature Scaling

In [12]:
from sklearn.preprocessing import StandardScaler

In [13]:
# One scaler for the features and one for the target, so the target scaling
# can be inverted on its own after prediction.
sc_x, sc_y = StandardScaler(), StandardScaler()

In [14]:
# Fit each scaler on the training split only, then standardize that split.
# NOTE: this cell overwrites x_train / y_train and refits the scalers, so
# re-running it on its own would refit them on already-scaled data.
x_train = sc_x.fit(x_train).transform(x_train)
y_train = sc_y.fit(y_train).transform(y_train)

In [15]:
# Inspect the standardized training features.
x_train

array([[-1.13572795, -0.88685592,  0.67357894,  0.52070558],
       [-0.80630243, -0.00971567,  0.45145467,  0.14531044],
       [ 1.77128416,  1.84743445,  0.24279248, -1.88374143],
       ...,
       [-0.38409993, -1.24886277,  0.84522042,  0.13092486],
       [-0.9232821 , -1.04155299,  1.54693117,  0.8830852 ],
       [ 1.70136528,  1.05824381, -1.20438076, -2.42285818]])

In [16]:
# Inspect the standardized training target (a single-column array).
y_train

array([[ 1.15069786],
       [ 0.79540777],
       [-1.30936356],
       ...,
       [ 0.27595724],
       [ 0.49346982],
       [-1.53508417]])

# Model

In [17]:
from sklearn.svm import SVR

In [18]:
# Support vector regressor with an RBF kernel, trained on the scaled data.
regressor = SVR(kernel='rbf')
# SVR expects a 1-D target of shape (n_samples,). y_train is a column vector
# after scaling, so flatten it to avoid scikit-learn's DataConversionWarning.
regressor.fit(x_train, y_train.ravel())

  return f(**kwargs)


SVR()

## Prediction

In [19]:
# Scale the test features with the scaler fitted on the training split, then
# predict in the standardized target space.
pred_temp = regressor.predict(sc_x.transform(x_test))
# StandardScaler.inverse_transform requires a 2-D array in current
# scikit-learn releases, while predict() returns a 1-D array. Pass the
# predictions as a single column and flatten the result so y_pred keeps its
# 1-D shape.
y_pred = sc_y.inverse_transform(pred_temp.reshape(-1, 1)).ravel()

In [20]:
# Print array values with two decimal places from here on.
np.set_printoptions(precision=2)

# Single-column versions of the predictions and the true values, so they can
# be placed side by side.
y_pred_reshape = y_pred.reshape(-1, 1)
y_test_reshape = y_test.reshape(-1, 1)

In [21]:
# Predicted values (left column) next to the actual values (right column).
np.concatenate((y_pred_reshape, y_test_reshape), axis=1)

array([[434.05, 431.23],
       [457.94, 460.01],
       [461.03, 461.14],
       ...,
       [470.6 , 473.26],
       [439.42, 438.  ],
       [460.92, 463.28]])

# Evaluating the model $(R^2)$

## Normal $R^2$

In [22]:
from sklearn.metrics import r2_score

In [23]:
# Coefficient of determination on the held-out test set.
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [24]:
# Display the test-set R^2 score.
r2

0.9480784049986258

## Adjusted $R^2$

In [25]:
# Adjusted R^2 penalizes R^2 for the number of predictors:
#   1 - (1 - R^2) * (n - 1) / (n - p - 1)
# with n = number of test samples and p = number of features.
n, p = x_test.shape

unexplained = 1 - r2
1 - unexplained * (n - 1) / (n - p - 1)

0.9479696117141808

## Mean Squared Error

In [26]:
from sklearn.metrics import mean_squared_error

In [27]:
# Average squared prediction error on the test set.
mean_squared_error(y_true=y_test, y_pred=y_pred)

15.186434937782039

## Root Mean Square Error

In [28]:
# Square root of the MSE, which puts the error back in the target's units.
test_mse = mean_squared_error(y_test, y_pred)
np.sqrt(test_mse)

3.896977667087924