# Imports

In [23]:
import pandas   as pd
import numpy    as np

from sklearn            import metrics          as mt
from sklearn            import linear_model     as lm
from sklearn            import model_selection  as ms

# Load Dataset

In [24]:
# Training split: feature matrix and target, read from CSV.
x_train, y_train = (
    pd.read_csv('../../dataset/reg/X_training.csv'),
    pd.read_csv('../../dataset/reg/y_training.csv'),
)

# Test split: feature matrix and target, read from CSV.
x_test, y_test = (
    pd.read_csv('../../dataset/reg/X_test.csv'),
    pd.read_csv('../../dataset/reg/y_test.csv'),
)

# Validation split: feature matrix and target, read from CSV.
# NOTE(review): the target file is named 'y_val.csv' while the features use
# 'X_validation.csv' -- confirm the naming against the dataset folder.
x_val, y_val = (
    pd.read_csv('../../dataset/reg/X_validation.csv'),
    pd.read_csv('../../dataset/reg/y_val.csv'),
)

In [25]:
# Data preparation
#
# Flatten the target frames into 1-D arrays before they are passed to the
# estimator and the metric functions.
# np.ravel accepts both DataFrames and ndarrays, so this cell can be re-run
# safely; the previous `.values.ravel()` form raised AttributeError on a
# second run because the names had already been rebound to ndarrays.
# NOTE(review): assumes each target CSV holds a single column -- confirm.
y_train = np.ravel(y_train)
y_val = np.ravel(y_val)
# Flattened as well so all three targets share the same 1-D layout.
y_test = np.ravel(y_test)

In [26]:
# Display the first rows of the training feature frame.
x_train.head()

Unnamed: 0,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence
0,0.205673,0.0921,0.72,0.802,0.0,0.090909,0.694,0.431778,1,0.0582,0.103876,0.8,0.723
1,-0.240409,0.737,0.483,0.412,0.0,0.636364,0.116,-0.262732,1,0.0402,1.711532,0.6,0.247
2,-0.12577,0.274,0.671,0.565,6.5e-05,1.0,0.37,0.013612,0,0.16,1.009176,0.8,0.561
3,-0.580967,0.00234,0.704,0.529,0.874,1.0,0.37,-0.266382,0,0.0416,0.666173,0.8,0.507
4,-0.688566,0.000414,0.354,0.91,0.205,0.090909,0.456,0.422914,1,0.043,1.18263,0.8,0.362


# Model Training - Ridge (Training Data)

## Definindo os melhores parâmetros do Ridge

In [27]:
# Hyperparameter grid explored by the grid search.
# NOTE(review): with Ridge's default solver on dense data, `max_iter` may be
# ignored (it applies to the iterative solvers), which would make the four
# values below equivalent -- confirm before relying on the selected max_iter.
# The key is kept because later cells read best_params_['max_iter'].
param_grid = {
    'alpha': np.logspace(-4, 4, 9),  # 9 log-spaced values from 10^-4 to 10^4
    'max_iter': [1000, 5000, 10000, 20000]
}

# Configure the grid search: 5-fold cross-validation, scored by negative MSE
# (scikit-learn maximizes scores, hence the sign), using all available cores.
grid_search = ms.GridSearchCV(estimator=lm.Ridge(),
                              param_grid=param_grid,
                              cv=5,
                              scoring='neg_mean_squared_error',
                              n_jobs=-1)

# Run the search on the training split.
grid_search.fit(x_train, y_train)

# Report the selected parameters; the score is negated back into a plain MSE.
print(f"Melhores parâmetros: {grid_search.best_params_}")
print(f"Melhor erro quadrático médio (MSE): {-grid_search.best_score_}")

# Score the refitted best estimator on the test split.
# For a regressor, `.score` returns the coefficient of determination (R2),
# not an accuracy, so the message is labelled accordingly.
best_ridge = grid_search.best_estimator_
test_score = best_ridge.score(x_test, y_test)
print(f"R2 do modelo no conjunto de teste: {test_score}")

Melhores parâmetros: {'alpha': np.float64(10.0), 'max_iter': 1000}
Melhor erro quadrático médio (MSE): 456.9566570083158
Acurácia do modelo no conjunto de teste: 0.05217878734577219


## Model Training

In [28]:
# Define the model with the hyperparameters selected by the grid search.
l2 = lm.Ridge(alpha=grid_search.best_params_['alpha'],
              max_iter=grid_search.best_params_['max_iter'])

# Fit on the training split and predict on that same split.
l2.fit(x_train, y_train)
yhat_train = l2.predict(x_train)

# Performance on the training split.
r2_train = mt.r2_score(y_train, yhat_train)
print(f'R2 Score: {r2_train:.3f}')

mse_train = mt.mean_squared_error(y_train, yhat_train)
print(f'MSE: {mse_train:.3f}')

rmse_train = np.sqrt(mse_train)
print(f'RMSE: {rmse_train:.3f}')

mae_train = mt.mean_absolute_error(y_train, yhat_train)
print(f'MAE: {mae_train:.3f}')

# scikit-learn returns MAPE as a fraction (1.0 means 100%), not a percentage,
# so the value is printed without a '%' suffix to avoid understating it.
mape_train = mt.mean_absolute_percentage_error(y_train, yhat_train)
print(f'MAPE: {mape_train:.2f}')



R2 Score: 0.046
MSE: 456.020
RMSE: 21.355
MAE: 16.999
MAPE: 8.66%


# Model Training - Ridge (Validation Data)

In [29]:
# Define the model with the hyperparameters selected by the grid search.
l2 = lm.Ridge(alpha=grid_search.best_params_['alpha'],
              max_iter=grid_search.best_params_['max_iter'])

# Fit on the training split, then predict on the validation split.
l2.fit(x_train, y_train)
yhat_val = l2.predict(x_val)

# Performance on the validation split.
r2_val = mt.r2_score(y_val, yhat_val)
print(f'R2 Score: {r2_val:.3f}')

mse_val = mt.mean_squared_error(y_val, yhat_val)
print(f'MSE: {mse_val:.3f}')

rmse_val = np.sqrt(mse_val)
print(f'RMSE: {rmse_val:.3f}')

mae_val = mt.mean_absolute_error(y_val, yhat_val)
print(f'MAE: {mae_val:.3f}')

# scikit-learn returns MAPE as a fraction (1.0 means 100%), not a percentage,
# so the value is printed without a '%' suffix to avoid understating it.
mape_val = mt.mean_absolute_percentage_error(y_val, yhat_val)
print(f'MAPE: {mape_val:.2f}')

R2 Score: 0.040
MSE: 458.441
RMSE: 21.411
MAE: 17.038
MAPE: 8.68%


# Model Training - Ridge (Test Data)

In [30]:
# Define the model with the hyperparameters selected by the grid search.
l2 = lm.Ridge(alpha=grid_search.best_params_['alpha'],
              max_iter=grid_search.best_params_['max_iter'])

# Fit on training + validation data, then predict on the test split.
# The feature frames are stacked with pd.concat (not np.concatenate) so the
# column names are kept: fitting on a bare ndarray and then predicting on the
# x_test DataFrame makes scikit-learn warn about mismatched feature names.
l2.fit(pd.concat([x_train, x_val], ignore_index=True),
       np.concatenate((y_train, y_val)))
yhat_test = l2.predict(x_test)

# Performance on the test split.
r2_test = mt.r2_score(y_test, yhat_test)
print(f'R2 Score: {r2_test:.3f}')

mse_test = mt.mean_squared_error(y_test, yhat_test)
print(f'MSE: {mse_test:.3f}')

rmse_test = np.sqrt(mse_test)
print(f'RMSE: {rmse_test:.3f}')

mae_test = mt.mean_absolute_error(y_test, yhat_test)
print(f'MAE: {mae_test:.3f}')

# scikit-learn returns MAPE as a fraction (1.0 means 100%), not a percentage,
# so the value is printed without a '%' suffix to avoid understating it.
mape_test = mt.mean_absolute_percentage_error(y_test, yhat_test)
print(f'MAPE: {mape_test:.2f}')

R2 Score: 0.051
MSE: 462.006
RMSE: 21.494
MAE: 17.142
MAPE: 8.54%




# Save Results

In [31]:
# Labels of the numeric columns, in the order they are written to each row.
metric_labels = ("R-Squared", "MSE", "RMSE", "MAE", "MAPE")

# One row per split: the algorithm name followed by the metrics rounded to
# three decimals.
train_metrics = {"Algorithm": "Linear Regression - Ridge"}
train_metrics.update(
    (label, np.round(value, 3))
    for label, value in zip(
        metric_labels,
        (r2_train, mse_train, rmse_train, mae_train, mape_train),
    )
)

validation_metrics = {"Algorithm": "Linear Regression - Ridge"}
validation_metrics.update(
    (label, np.round(value, 3))
    for label, value in zip(
        metric_labels,
        (r2_val, mse_val, rmse_val, mae_val, mape_val),
    )
)

test_metrics = {"Algorithm": "Linear Regression - Ridge"}
test_metrics.update(
    (label, np.round(value, 3))
    for label, value in zip(
        metric_labels,
        (r2_test, mse_test, rmse_test, mae_test, mape_test),
    )
)

# Append each row, without a header line, to its per-split CSV.
# Because the files are opened in append mode, re-running this cell adds
# another row to each file.
for csv_path, metrics_row in (
    ("./reg_train_metrics.csv", train_metrics),
    ("./reg_validation_metrics.csv", validation_metrics),
    ("./reg_test_metrics.csv", test_metrics),
):
    pd.DataFrame(metrics_row, index=[0]).to_csv(csv_path, mode="a", header=False)