In [None]:

# Notebook-wide matplotlib defaults: larger figures and larger legend/axis-label fonts
from matplotlib import rcParams
rcParams.update({
    'figure.figsize': (10, 6),
    'legend.fontsize': 16,
    'axes.labelsize': 16,
})

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import pearsonr

from sklearn import datasets
from sklearn import linear_model
from sklearn import neighbors
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet, HuberRegressor, RANSACRegressor
from sklearn.metrics import mean_squared_error, explained_variance_score, r2_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline as ImbPipeline


In [None]:

# Read about the dataset here https://samyzaf.com/ML/song_year/song_year.html

# Column names: the target ('year') followed by the 90 feature columns 't1'..'t90'.
# They are generated programmatically so that no name can pick up a stray space or typo.
features = ['year'] + [f't{i}' for i in range(1, 91)]

# NOTE(review): absolute, machine-specific path -- consider making it relative to a
# configurable data directory so the notebook runs on other machines.
DATA_PATH = '/Users/ignaciopastorebenaim/Documents/MGRCV/TPs/ML/Lab1/data/YearPredictionMSD.csv'

# header=0 treats the first row of the file as a header and replaces it with `features`.
# NOTE(review): if the CSV has no header row this silently drops the first sample
# (header=None would be needed instead) -- confirm against the file.
data = pd.read_csv(DATA_PATH, header=0, names=features)

# First look at the dataset
data.info()  # info() prints its own summary and returns None, so it is not wrapped in print()
print(data.describe())
print(data.head())

# Total number of missing values across the entire dataset
total_missing_values = data.isna().sum().sum()
print(f"Total number of missing values in the dataset: {total_missing_values}")


In [None]:

# Convert to a numpy array.
# np.asarray accepts both a DataFrame and an ndarray, so this cell can be re-run safely:
# the previous `data.to_numpy()` raised AttributeError on a second run because `data`
# had already been rebound to an ndarray.
data = np.asarray(data)

# Column 0 holds the label (year); the remaining columns are the features
X = data[:, 1:]
y = data[:, 0]

# Split the data in training, validation, and test sets:
# first hold out 10% as the test set, then 10% of the remainder as the validation set.
# Both splits use a fixed random_state so they are reproducible.
X_train_and_val, X_test, y_train_and_val, y_test = train_test_split(
    X, y, test_size=0.1, random_state=5
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_and_val, y_train_and_val, test_size=0.1, random_state=7
)

print('# training samples: ', X_train.shape[0])
print('# validation samples: ', X_val.shape[0])
print('# test samples: ', X_test.shape[0])


In [None]:

# Compute the Pearson correlation between each feature in X_train and y_train
correlations = []
for i in range(X_train.shape[1]):
    corr, _ = pearsonr(X_train[:, i], y_train)
    correlations.append(corr)
correlations = np.array(correlations)

# Plot the correlations
plt.bar(range(len(correlations)), correlations)
plt.xlabel('Feature Index')
plt.ylabel('Correlation with Output')
plt.title('Correlation of Each Feature with the Output')
plt.show()

# Compute the feature-vs-feature correlation matrix (rowvar=False: columns are the variables)
corr_matrix = np.corrcoef(X_train, rowvar=False)

# Plot the heatmap
plt.imshow(corr_matrix, cmap='coolwarm', interpolation='none')
plt.colorbar()
plt.title('Correlation Matrix Heatmap')
plt.show()

# Keep only the features whose absolute correlation with the output reaches the threshold
threshold = 0.1
filtered_indices = np.where(np.abs(correlations) >= threshold)[0]
filtered_correlations = correlations[filtered_indices]

print("Filtered correlations: ", filtered_correlations)
print("Filtered indexes: ", filtered_indices)

# Scatter plot of every selected feature against the output.
# The grid is sized from the number of selected features: the previous fixed 2x5 grid
# raised ValueError as soon as more than 10 features passed the threshold.
n_selected = len(filtered_indices)
if n_selected == 0:
    print("No feature reaches the correlation threshold; nothing to plot.")
else:
    n_cols = 5
    n_rows = int(np.ceil(n_selected / n_cols))
    # 4 inches of height per row keeps the original 16x8 size when there are two rows
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 4 * n_rows), squeeze=False)
    flat_axes = axes.ravel()
    for ax, feature_index in zip(flat_axes, filtered_indices):
        ax.scatter(X_train[:, feature_index], y_train)
        ax.set_title(f'Feature {feature_index} (r = {correlations[feature_index]:.2f})')
        ax.set_xlabel('Feature value')
        ax.set_ylabel('Output')
    # Hide the unused panels of the last row
    for ax in flat_axes[n_selected:]:
        ax.set_visible(False)
    fig.tight_layout()
    plt.show()


In [None]:

# Define the pipeline: oversample -> standardize -> polynomial expansion -> PCA -> regressor.
#
# RandomOverSampler is a sampler (it implements fit_resample, not transform), so it cannot
# be a step of sklearn.pipeline.Pipeline, which raises TypeError for intermediate steps
# without `transform`. imblearn's Pipeline accepts samplers and applies them only while
# fitting, so during cross-validation the held-out folds are never resampled.
#
# NOTE(review): RandomOverSampler is designed for classification targets; here it would
# treat every distinct value of y as a class and replicate samples accordingly -- confirm
# that this is the intended behaviour for a regression target.
pipeline = ImbPipeline(steps=[
    ('oversampler', RandomOverSampler(random_state=42)),
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures()),  # Polynomial feature expansion (degree set by the search)
    ('pca', PCA()),  # Dimensionality reduction (n_components set by the search)
    ('model', Ridge())  # Placeholder; replaced by the 'model' entry of each parameter grid
])

# Define the parameter grid for multiple models and degrees of polynomial features.
# NOTE(review): the number of polynomial features grows quickly with the degree; degree 3
# on all input features may not fit in memory -- consider restricting it.
param_grid = [
    {
        'poly__degree': [1, 2, 3],
        # None keeps all components (PCA then only rotates the data);
        # a float in (0, 1) keeps enough components to explain that fraction of the variance
        'pca__n_components': [None, 0.90, 0.95, 0.99],
        'model': [Ridge()],
        'model__alpha': [0.1, 1.0, 10.0]  # Ridge regularization strength
    },
    {
        'poly__degree': [1, 2, 3],
        'pca__n_components': [None, 0.90, 0.95, 0.99],
        'model': [Lasso()],
        'model__alpha': [0.1, 1.0, 10.0]  # Lasso regularization strength
    },
    # Additional model configurations...
]

# Define the scoring metrics (error metrics are negated so that higher is always better)
scoring_metrics = {
    'MSE': 'neg_mean_squared_error',       # Mean Squared Error
    'MAE': 'neg_mean_absolute_error',      # Mean Absolute Error
    'R2': 'r2',                            # R² Score
    'MedAE': 'neg_median_absolute_error',  # Median Absolute Error
}


In [None]:

# Perform randomized search: 50 parameter settings are sampled from param_grid, each one
# is evaluated with 5-fold cross-validation on every metric in scoring_metrics, and the
# best setting according to 'MAE' is refit on the whole training set.
random_search = RandomizedSearchCV(
    pipeline,
    param_grid,
    n_iter=50,
    cv=5,
    scoring=scoring_metrics,
    refit='MAE',
    verbose=3,
    return_train_score=True,
    n_jobs=-1,
    random_state=42,  # fixed seed so the same candidates are sampled on every run
)

# Fit the model
random_search.fit(X_train, y_train)

# Best model and hyperparameters
print("Best model:", random_search.best_estimator_)
print("Best hyperparameters:", random_search.best_params_)
# best_score_ is the mean cross-validated 'neg_mean_absolute_error'; negate it to report the MAE
print("Best cross-validated MAE:", -random_search.best_score_)
