In [43]:
# Importing necessary libraries and modules for data manipulation, machine learning models, and metrics
from sklearn.model_selection import train_test_split
import pandas as pd  # Pandas library for data manipulation and analysis
import numpy as np  # Numpy library for numerical computations
from sklearn.svm import SVC  # Support Vector Classification from Scikit-learn SVM module
from sklearn.linear_model import LogisticRegression  # Logistic Regression model from Scikit-learn
from sklearn.model_selection import train_test_split  # Function to split data into train and test sets

In [44]:
# Load the raw data
# Read the Excel workbook 'CCPP.xlsx' from the current working directory into a DataFrame.
# Later cells take the first four columns of this frame as model inputs and the fifth
# column as the value to predict, so the column order of the workbook matters.
cc_df = pd.read_excel(io='CCPP.xlsx')

In [45]:
# Build the feature matrix and the target vector
# Both are pulled out of cc_df by column position and converted to NumPy arrays with .values.
# NOTE(review): selection is positional, so it silently depends on the column order of the
# source file -- presumably four input columns followed by the output column; confirm.

# Features: every row, columns 0..3 -> 2-D array with four columns.
# Target:   every row, column 4     -> 1-D array (a single column selected by integer position).
X, y = cc_df.iloc[:, 0:4].values, cc_df.iloc[:, 4].values

In [46]:
# Hold out part of the data for evaluation
# 30% of the rows go to the test set, the remaining 70% to the training set.
# random_state is fixed so the same rows are selected on every run.
# NOTE(review): the cells visible after this one scale and fit on X / y, not on
# X_train / y_train -- confirm whether the split is meant to be used downstream.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [47]:
# Create the scalers
from sklearn.preprocessing import StandardScaler

# Two independent StandardScaler instances: sc_X is used for the feature matrix and
# sc_y for the target, so each keeps its own fitted statistics.
sc_X, sc_y = StandardScaler(), StandardScaler()

In [48]:
# The StandardScaler in Python, provided by the scikit-learn library, is a preprocessing utility that is used to standardize
# the features of your dataset. Standardization refers to the process of scaling your data so that it has a mean of 0 and a standard
# deviation of 1. This is achieved by subtracting the mean value of each feature from the dataset and then dividing the feature values
# by their respective standard deviations. Standardizing the features is important because it ensures that each feature contributes
# equally to the analysis, which is particularly useful for machine learning models that are sensitive to the scale of the input data,
# such as Support Vector Machines (SVMs) and k-nearest neighbors (KNN). By using StandardScaler, you can improve the convergence of
# stochastic gradient descent algorithms and the overall performance of your models.

In [49]:
# Standardize the feature matrix
# sc_X must already exist (it is created as a StandardScaler in an earlier cell).
# Scale-sensitive estimators such as SVR benefit from inputs with comparable ranges.

# Step 1 - fit: learn the per-column statistics of X (mean and standard deviation
#          for a StandardScaler).
sc_X.fit(X)

# Step 2 - transform: apply those statistics, producing the standardized matrix.
# The result overwrites X, so the unscaled values are no longer held in this variable;
# use sc_X.inverse_transform(...) or re-run the data-preparation cell to recover them.
X = sc_X.transform(X)

# NOTE(review): the scaler is fitted on the complete X, after the train/test split was
# made, and X_train / X_test themselves are left unscaled. If the held-out rows are used
# for evaluation later, fit on X_train only and transform X_test with the same scaler to
# avoid leaking test-set statistics into training.


In [50]:
# Standardize the target variable (zero mean, unit variance) with sc_y.

# Make sure y is a NumPy array so it can be reshaped.
y = np.array(y)

# StandardScaler works on 2-D input shaped (n_samples, n_features) and computes its
# statistics column by column. The target is ONE feature observed n times, so it must be
# a single COLUMN: shape (n_samples, 1).
# A single-row layout, (1, n_samples), would present every observation as its own feature
# with exactly one sample; each column's mean would equal its only value and the scaled
# output would be all zeros, leaving the model nothing to learn.
y = np.reshape(y, (-1, 1))  # -1 lets NumPy infer the number of rows from len(y).

# Fit sc_y on the target (learn its mean and standard deviation) and apply the scaling.
# sc_y keeps the fitted statistics, so predictions can be mapped back to the original
# units with sc_y.inverse_transform(...).
# NOTE: re-running this cell re-fits sc_y on the already-scaled y and discards the
# original statistics; re-run the data-preparation cell first if the cell must be repeated.
y = sc_y.fit_transform(y)

# Estimators expect a 1-D target, so collapse the (n_samples, 1) column back to (n_samples,).
y = y.flatten()

In [51]:
# Train a Support Vector Regression model
from sklearn.svm import SVR

# SVR is the regression counterpart of the support vector machine: it predicts a
# continuous value. The radial basis function (RBF) kernel lets it capture non-linear
# relationships between the inputs and the target.
# X and y are expected to have been scaled by the preceding cells; SVR is sensitive to
# the scale of its inputs.
# NOTE(review): the model is trained on the complete X / y, not on the X_train / y_train
# produced by the earlier split -- confirm this is intended before evaluating on X_test.
# fit() returns the estimator itself, so construction and training can be chained.
regressor = SVR(kernel='rbf').fit(X, y)

# Show the fitted estimator as the cell's output.
regressor