____
### Notas para grupo:
Este código presume que foi criado um ficheiro "dataset.csv" na secção 1, consistindo no dataset pré-processado!
____

# Section 3: Supervised Learning
# 1. Dataset Splitting

In [1]:
# Imports (colocar todos os imports aqui quando esta secção estiver finalizada)
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load the pre-processed dataset that section1.ipynb saved as "dataset.csv"
# (relative path, so the file must sit in the notebook's working directory).
data = pd.read_csv(filepath_or_buffer="dataset.csv")

# Preview the first five rows as the cell output
data.head(n=5)

Unnamed: 0,Y,Parameter_0,Parameter_1,Parameter_2,Parameter_3,Parameter_4,Parameter_5,Parameter_6,Parameter_7,Parameter_8,...,MACCSKey drug2 150,MACCSKey drug2 151,MACCSKey drug2 152,MACCSKey drug2 153,MACCSKey drug2 154,MACCSKey drug2 155,MACCSKey drug2 157,MACCSKey drug2 159,MACCSKey drug2 160,MACCSKey drug2 162
0,7.69353,5.291146,5.040387,5.291146,0.908336,2.514969,5.291146,5.291146,-1.492008,1.143195,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0
1,7.778053,5.291146,5.040387,5.291146,0.908336,2.514969,5.291146,5.291146,-1.492008,1.143195,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0
2,-1.198505,5.291146,5.040387,5.291146,0.908336,2.514969,5.291146,5.291146,-1.492008,1.143195,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0
3,2.595684,5.291146,5.040387,5.291146,0.908336,2.514969,5.291146,5.291146,-1.492008,1.143195,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0
4,-5.139971,5.291146,5.040387,5.291146,0.908336,2.514969,5.291146,5.291146,-1.492008,1.143195,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0


In [3]:
# Sanity-check the loaded dataset: dimensions, missing values and the
# distribution of the target column 'Y'.
n_rows, n_cols = data.shape
has_nans = data.isna().values.any()  # True as soon as a single cell is missing

print(f"Dataset shape: {n_rows} rows x {n_cols} columns.")
print(f"Dataset has NaNs?: {has_nans}")
print("\nDescriptive Statistics of column 'Y':")
print(data['Y'].describe())

Dataset shape: 21760 rows x 9635 columns.
Dataset has NaNs?: False

Descriptive Statistics of column 'Y':
count    21760.000000
mean         4.469692
std         15.233728
min        -36.459230
25%         -5.206973
50%          4.122120
75%         13.840190
max         45.956396
Name: Y, dtype: float64


In [4]:
# The pre-processed dataset has many samples, which makes the code below slow to run.
# To keep run times manageable, only a randomly selected 33% of its rows is kept
# (random_state=42 so the same rows are drawn on every fresh run).
#
# NOTE: `data` is overwritten with the subset, so re-running this cell without
# reloading the CSV first draws 33% of the already-reduced dataset.
data = data.sample(random_state=42, frac=0.33)

reduced_rows, reduced_cols = data.shape
print(f"Dataset shape: {reduced_rows} rows x {reduced_cols} columns.")

Dataset shape: 7181 rows x 9635 columns.


In [5]:
# Dataset splitting into train / validation / test sets
# (random_state=42 on both splits for reproducible results)
from sklearn.model_selection import train_test_split

y = data['Y']  # Target column
X = data.drop(columns='Y')  # Every remaining column is used as a feature

# Step 1: hold out 20% of the samples as the test set. The remaining 80%
# (X_temp, y_temp) is only a temporary pool that gets split again in step 2.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 2: take 12.5% of the pool for validation. 0.125 * 0.8 = 10% of all
# samples end up in the validation set, leaving 70% for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.125, random_state=42
)

# Report the shape of each resulting set
print(
    f'Training Set \t Shape of X_train: {X_train.shape} \n'
    f'\t\t Shape of y_train: {y_train.shape} \n'
)
print(
    f'Validation Set \t Shape of X_val: {X_val.shape} \n'
    f'\t\t Shape of y_val: {y_val.shape} \n'
)
print(
    f'Test Set \t Shape of X_test: {X_test.shape} \n'
    f'\t\t Shape of y_test: {y_test.shape}'
)

Training Set 	 Shape of X_train: (5026, 9634) 
		 Shape of y_train: (5026,) 

Validation Set 	 Shape of X_val: (718, 9634) 
		 Shape of y_val: (718,) 

Test Set 	 Shape of X_test: (1437, 9634) 
		 Shape of y_test: (1437,)


# 2. Regression Models

## 2.1. Ridge Regression

In [6]:
from sklearn.linear_model import Ridge

# Ridge regression baseline, fitted on the training set only.
# alpha is the regularization strength; 1.0 is a starting value that can be tuned later.
ridge_model = Ridge(alpha=1.0).fit(X_train, y_train)  # fit() returns the fitted estimator

# Predictions for the validation and test sets
y_val_pred = ridge_model.predict(X_val)
y_test_pred = ridge_model.predict(X_test)

# Mean squared error and R2 for each set
val_mse, val_r2 = mean_squared_error(y_val, y_val_pred), r2_score(y_val, y_val_pred)
test_mse, test_r2 = mean_squared_error(y_test, y_test_pred), r2_score(y_test, y_test_pred)

print("Ridge Regression Metrics:\n")  # TODO: check if there are other relevant metrics
print(f"Validation MSE: {val_mse:.4f}", f"Validation R2: {val_r2:.4f}\n", sep="\n")
print(f"Test MSE: {test_mse:.4f}", f"Test R2: {test_r2:.4f}", sep="\n")

Ridge Regression Metrics:

Validation MSE: 154.9345
Validation R2: 0.2169

Test MSE: 177.3285
Test R2: 0.2577


## 2.2. Support Vector Regression (SVR)
Process may be lengthy due to the dataset's size and the hardware involved.

(Célula seguinte pode demorar 20-30 minutos a completar com o dataset original, 2-5 minutos com o dataset reduzido)

In [7]:
from sklearn.svm import SVR

# Support Vector Regression with an RBF kernel.
# C and epsilon are only initial hyperparameter values.
svr_model = SVR(C=1.0, epsilon=0.1, kernel='rbf')

# Train on the training set only
svr_model.fit(X_train, y_train)

# Predictions for the validation and test sets.
# NOTE: y_val_pred / y_test_pred and the four metric variables below are the same
# names the Ridge cell assigns, so running this cell replaces the Ridge values.
y_val_pred, y_test_pred = svr_model.predict(X_val), svr_model.predict(X_test)

# Mean squared error and R2 for each set
val_mse = mean_squared_error(y_true=y_val, y_pred=y_val_pred)
val_r2 = r2_score(y_true=y_val, y_pred=y_val_pred)
test_mse = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
test_r2 = r2_score(y_true=y_test, y_pred=y_test_pred)

print("SVR Metrics:\n")
print(
    f"Validation MSE: {val_mse:.4f}\n"
    f"Validation R2: {val_r2:.4f}\n"
)
print(
    f"Test MSE: {test_mse:.4f}\n"
    f"Test R2: {test_r2:.4f}"
)

SVR Metrics:

Validation MSE: 154.2762
Validation R2: 0.2203

Test MSE: 184.1529
Test R2: 0.2291


# 3. Hyperparameter Tuning