____
### Notas para grupo:
Este código precisa de um ficheiro "dataset_reduced.csv" resultante da Feature Reduction da section1.ipynb!

Ver:
session4_regression_diabetes, session7_regression_diabetes e session8_qsar_examples
____

# Section 3: Supervised Learning
# 1. Dataset Splitting

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the pre-processed .csv file reduced by PCA
data = pd.read_csv("dataset_reduced.csv")
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,Y
0,-18.979618,-29.544092,-27.379373,26.890704,4.597509,-8.527948,-5.580079,-0.935693,-3.146765,-0.78573,...,3.648693,8.10066,-18.405698,10.772002,21.613432,9.60722,2.14875,-8.224677,-4.025757,7.69353
1,-18.060074,-29.511869,-27.561491,6.914117,4.45488,-16.256615,-5.18927,-14.303613,5.009351,0.124255,...,2.296746,-1.173271,-4.761234,-4.239881,15.279556,16.716698,1.232846,-0.798956,-4.019682,7.778053
2,-14.852828,-29.598422,-27.259799,29.207043,4.700565,-13.460366,-5.593262,10.941608,-10.088961,-0.000634,...,1.08667,-5.902036,-0.141581,-6.449239,15.141442,16.841146,1.028849,-10.111914,-4.086348,-1.198505
3,-17.399206,-29.478216,-27.583716,1.609595,4.401889,-2.826664,-6.02997,-8.969595,2.325586,0.041684,...,1.500826,-1.728483,-5.257342,-3.038552,16.140339,15.780399,1.398832,-2.589509,-4.014831,2.595684
4,100.928373,-29.722237,-27.613487,1.958608,4.503092,-0.188637,-6.238846,-3.589759,-1.220275,-0.108217,...,1.827501,-2.511339,-3.766501,-2.098171,16.711619,15.528714,1.245762,-0.365768,-4.022382,-5.139971


In [3]:
# Verifying Dataset
print(f"Dataset shape: {data.shape[0]} rows x {data.shape[1]} columns.")
print(f"Dataset has NaNs?: {data.isnull().values.any()}")
print(f"\nDescriptive Statistics of target column 'Y':\n{data['Y'].describe()}")

Dataset shape: 21760 rows x 100 columns.
Dataset has NaNs?: False

Descriptive Statistics of target column 'Y':
count    21760.000000
mean         4.469692
std         15.233728
min        -36.459230
25%         -5.206973
50%          4.122120
75%         13.840190
max         45.956396
Name: Y, dtype: float64


In [4]:
# Dataset splitting (random_state=42 for reproducible results)
from sklearn.model_selection import train_test_split

X = data.drop('Y', axis=1)
y = data['Y'] # Target column (y_true)

# Split the dataset into training set (80%) and test set (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Checking the shape of the resulting sets
print(f'Training Set \t Shape of X_train: {X_train.shape} \n\t\t Shape of y_train: {y_train.shape} \n')
print(f'Test Set \t Shape of X_test: {X_test.shape} \n\t\t Shape of y_test: {y_test.shape}')

Training Set 	 Shape of X_train: (17408, 99) 
		 Shape of y_train: (17408,) 

Test Set 	 Shape of X_test: (4352, 99) 
		 Shape of y_test: (4352,)


# 2. Regression Models

## Feature Scaling
Ensures features are scaled appropriately.

In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Train and Test multiple models

In [6]:
# Train and Test multiple models
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Define models
models = [
    Ridge(random_state=42, alpha=1.0),                                                  
    RandomForestRegressor(random_state=42, n_estimators=100, max_depth=10, n_jobs=-1),  # Reduced n_estimators and tree depth helps reduce wait time. n_jobs=-1 ensure all CPU cores are used.
    KNeighborsRegressor(n_neighbors=10),                                                # Less n_neighbors leads to reduced wait time.
    SVR(kernel='rbf', C=1.0, epsilon=0.1)
]

# Initialize metrics list of each model for comparison
metrics_summary = []

# Iterate over models and calculate metrics
for model in models:
    # Indicate the current model being trained
    print(f"Training and Testing model: {model.__class__.__name__}...")

    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_test_pred = model.predict(X_test)
    
    # Calculate metrics
    test_mse = mean_squared_error(y_test, y_test_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    
    # Print metrics for the model
    print(f"Model: {model.__class__.__name__}")
    print(f"R2 Score: {test_r2:.4f}")
    print(f"MSE: {test_mse:.4f}")
    print(f"MAE: {test_mae:.4f}")
    print('-' * 30)

    # Store metrics and model name
    metrics_summary.append({
        'model': model.__class__.__name__,
        'r2_score': test_r2,
        'mse': test_mse,
        'mae': test_mae
    })

Training and Testing model: Ridge...
Model: Ridge
R2 Score: 0.2689
MSE: 162.7323
MAE: 9.8734
------------------------------
Training and Testing model: RandomForestRegressor...
Model: RandomForestRegressor
R2 Score: 0.3638
MSE: 141.5941
MAE: 9.1099
------------------------------
Training and Testing model: KNeighborsRegressor...
Model: KNeighborsRegressor
R2 Score: 0.2868
MSE: 158.7358
MAE: 9.7066
------------------------------
Training and Testing model: SVR...
Model: SVR
R2 Score: 0.3286
MSE: 149.4378
MAE: 9.3427
------------------------------


In [7]:
# Determine the best model based on R2 Score (higher is better)
best_model_r2 = max(metrics_summary, key=lambda x: x['r2_score'])

# Optionally, determine the best model based on MSE or MAE (lower is better)
best_model_mse = min(metrics_summary, key=lambda x: x['mse'])
best_model_mae = min(metrics_summary, key=lambda x: x['mae'])

# Print the best models
print(f"Model with best R2 score: {best_model_r2['model']}\nR2 Score: {best_model_r2['r2_score']:.4f}\n")
print(f"Model with best MSE: {best_model_mse['model']}\nMSE: {best_model_mse['mse']:.4f}\n")
print(f"Model with best MAE: {best_model_mae['model']}\nMAE: {best_model_mae['mae']:.4f}")

Model with best R2 score: RandomForestRegressor
R2 Score: 0.3638

Model with best MSE: RandomForestRegressor
MSE: 141.5941

Model with best MAE: RandomForestRegressor
MAE: 9.1099


## 3. Cross-Validation

In [8]:
from sklearn.model_selection import cross_val_score

# Initialize lists to store results
r2_scores = []
mse_scores = []

# Iterate through each model
for model in models:
    # R2 scores
    r2 = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5, scoring='r2')
    r2_scores.append(r2.mean())
    print(f"Model: {model.__class__.__name__}, R2 values: {r2}, Mean R2: {r2.mean()}")

    # MSE scores
    mse = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5, scoring='neg_mean_squared_error')
    mse_scores.append(-mse.mean())  # Negate to get positive MSE
    print(f"Model: {model.__class__.__name__}, Negative MSE values: {mse}, Mean MSE: {-mse.mean()}")

# Summary of results
print(f"\nSummary of R2 scores: {r2_scores}")
print(f"Summary of MSE: {mse_scores}")

Model: Ridge, R2 values: [0.26766732 0.27259834 0.25769801 0.28889647 0.26831178], Mean R2: 0.2710343860540162
Model: Ridge, Negative MSE values: [-170.50566765 -168.49895935 -172.30414435 -166.20581697 -176.8195474 ], Mean MSE: 170.86682714325048
Model: RandomForestRegressor, R2 values: [0.36414476 0.3581759  0.34336626 0.35911918 0.35547168], Mean R2: 0.356055558336827
Model: RandomForestRegressor, Negative MSE values: [-148.04326776 -148.67534473 -152.41871289 -149.79270332 -155.75651328], Mean MSE: 150.93730839525116
Model: KNeighborsRegressor, R2 values: [0.284564   0.29046238 0.27467568 0.2743139  0.28077001], Mean R2: 0.2809571948424432
Model: KNeighborsRegressor, Negative MSE values: [-166.57169052 -164.36084347 -168.36326419 -169.61419337 -173.80889722], Mean MSE: 168.54377775329286
Model: SVR, R2 values: [0.31502942 0.31122598 0.29924214 0.32001121 0.30608552], Mean R2: 0.31031885153970457
Model: SVR, Negative MSE values: [-159.47856776 -159.55106114 -162.66086455 -158.933387

## 4. Hyperparameter Tuning

In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Define parameter grid
param_grid = {
    'n_estimators': [10, 50, 100], 
    'max_depth': [None, 10, 20, 30], 
    'min_samples_split': [2, 5, 10]
    }

grid_search = GridSearchCV(RandomForestRegressor(random_state=42, n_jobs=-1), param_grid, cv=3, scoring='neg_mean_squared_error')
grid_search.fit(X_test, y_test)

best_model = grid_search.best_estimator_

best_model.fit(X_train, y_train)
y_pred_rf = best_model.predict(X_test)

# Print the Best Parameters
print("Best Parameters:", grid_search.best_params_)

# Print Best Metrics
print(f'MSE: {mean_squared_error(y_test, y_pred_rf):.2f}')
print(f'MAE: {mean_absolute_error(y_test, y_pred_rf):.2f}')
print(f'R2 Score: {r2_score(y_test, y_pred_rf):.2f}')


Best Parameters: {'max_depth': 20, 'min_samples_split': 10, 'n_estimators': 100}
MSE: 133.62
MAE: 8.78
R2 Score: 0.40
