<a href="https://colab.research.google.com/github/hasan-rakibul/AI-cybersec/blob/main/lab_4/lab_4_practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Linear regression

## Supplementary reading material
- Introduction to linear regression: https://www.statology.org/linear-regression/

In [None]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [None]:
# Load the California housing dataset.
# The returned object exposes the feature matrix as `data.data`, the regression
# target as `data.target`, and the feature column names as `data.feature_names`.
data = fetch_california_housing()

In [None]:
# Inspect the loaded dataset object (as the last expression of a notebook cell, its value is displayed)
data

In [None]:
# Hold out 20% of the samples for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Create an (untrained) linear regression model with default settings;
# it is fitted on the training data in the next step
model = LinearRegression()

In [None]:
# Fit the model on the training split; this learns the intercept and the per-feature coefficients
model.fit(X_train, y_train)

In [None]:
# Show what the fitted model learned: the bias term first, then the per-feature weights
for label, value in (
    ("Intercept:", model.intercept_),
    ("Coefficients:", model.coef_),
):
    print(label, value)

In [None]:
# Make predictions on the testing data
# (`y_pred` is compared against the held-out `y_test` in the evaluation below)
y_pred = model.predict(X_test)

Mean Squared Error: https://www.ml-science.com/mean-squared-error

In [None]:
# Evaluate the model with the mean squared error between the true and the
# predicted test targets
mse = mean_squared_error(y_test, y_pred)
# print() already separates its arguments with a space, so the label carries no
# trailing one (otherwise the output shows a double space)
print("Mean Squared Error:", mse)

In [None]:
# Scatter the true and the predicted targets against the first feature column,
# drawing both series on the current axes so they can be compared directly
chosen_X = X_test[:, 0]
ax = plt.gca()
ax.scatter(chosen_X, y_test, label="Actual")
ax.scatter(chosen_X, y_pred, label="Predicted")
ax.set_xlabel("X[0]")
ax.set_ylabel("y")
ax.legend()

In [None]:
# For each of the first few features, plot y = intercept + coef[i] * x[i].
# Each line uses a single feature on its own, so every one is a straight line.
# Only a few features are drawn: their value ranges differ a lot, so putting
# all of them on one shared x-axis would make the plot hard to read.
n_features_shown = 4
for i in range(min(n_features_shown, len(data.feature_names))):
    y_pred_eqn = model.intercept_ + model.coef_[i] * X_test[:, i]
    plt.plot(X_test[:, i], y_pred_eqn, label="Equation " + str(i))
# The labels set above only become visible once a legend is drawn
plt.legend()

# Support vector regression with GridSearchCV

## Supplementary reading materials
- Support Vector Machine - scikit learn: https://scikit-learn.org/stable/modules/svm.html
- Hyperparameters of SVM: https://www.geeksforgeeks.org/introduction-to-support-vector-machines-svm/
- K-fold cross-validation: https://www.statology.org/k-fold-cross-validation/

In [None]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, root_mean_squared_error

In [None]:
# Reload the dataset and separate the feature matrix from the target
california = fetch_california_housing()
X, y = california.data, california.target

# Same 80/20 split and seed as used elsewhere in this notebook
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Base estimator plus the candidate settings: two kernels crossed with two values of `C`
# NOTE(review): the features are passed to SVR unscaled in this cell — consider
# standardising them first; confirm against the scikit-learn SVM guidance
model = SVR()
params = dict(kernel=['rbf', 'sigmoid'], C=[0.1, 1])

# 5-fold cross-validated search over every combination, scored by negated MSE
grid = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=3,
)

In [None]:
# Run the grid search on the training split: each hyperparameter combination is
# cross-validated, and `verbose=3` makes the search log its progress
grid.fit(X_train, y_train)

In [None]:
# Report the hyperparameter combination selected by the grid search
# (print() already inserts a space between its arguments, so the label has no trailing one)
print("Best hyperparameters:", grid.best_params_)

In [None]:
# Predict on the held-out test split with the fitted grid search
y_pred = grid.predict(X_test)

# Compute the four regression metrics in a fixed order: MSE, RMSE, MAE, R^2
mse, rmse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (
        mean_squared_error,
        root_mean_squared_error,
        mean_absolute_error,
        r2_score,
    )
)

# Print one "<label> <value>" line per metric
for name, score in (("MSE:", mse), ("RMSE:", rmse), ("MAE:", mae), ("R^2:", r2)):
    print(name, score)

The performance is not good with the above set of hyperparameters. Let's fix the kernel to `rbf` with `C=1` and tune `gamma` instead...

In [None]:
# With the kernel fixed to RBF and C=1, search over `gamma` only
model = SVR(kernel='rbf', C=1)
# Candidates listed in ascending order so the search log is easy to follow
params = {'gamma': [0.001, 0.005, 0.01, 0.1]}
# 5-fold cross-validated search, scored by negated MSE
grid = GridSearchCV(
    estimator=model, param_grid=params, cv=5,
    scoring='neg_mean_squared_error', verbose=3
)

grid.fit(X_train, y_train)

# print() already inserts a space between its arguments, so the label has no trailing one
print("Best hyperparameters:", grid.best_params_)

# Evaluate the tuned search on the held-out test split
y_pred = grid.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("MSE:", mse)
print("RMSE:", rmse)
print("MAE:", mae)
print("R^2:", r2)