In [1]:
import pandas as pd
import csv
from tqdm import tqdm
from sklearn.model_selection import KFold
from utils import dataframe_to_tensor_dataset, train_model
import os

In [9]:
# Importing the submodule also binds the top-level `sklearn` name used on the next line.
import sklearn.model_selection
# Show the installed scikit-learn version as the cell output.
sklearn.__version__

'1.3.2'

In [None]:
# Read the training CSV into the DataFrame that the K-fold search splits into folds
train_set_numerical = pd.read_csv(filepath_or_buffer="./Data/training.csv")

In [None]:
# Method for normalising the training split, used in K-fold validation
def normalise_training_split(dataframe, numerical_features=None):
    """Z-score normalise the numerical features of a training split.

    Args:
        dataframe: pandas DataFrame holding the training split. It is copied,
            so the caller's frame is never modified.
        numerical_features: optional list of column names to normalise.
            Defaults to the eight numerical columns this notebook uses.

    Returns:
        A tuple ``(normalised_dataframe, means, stds)`` where ``means`` and
        ``stds`` are dicts keyed by column name holding the statistics that
        were used, so the same transform can be applied to a validation split.
        For a feature whose standard deviation is zero or NaN (constant
        column, or fewer than two rows) the stored scale is 1.0, so the
        feature is only centred instead of becoming NaN/inf.

    Raises:
        KeyError: if one of the requested columns is missing.
    """
    if numerical_features is None:
        numerical_features = ['Total Followers', 'Danceability', 'Energy',
                              'Loudness', 'Speechiness', 'Acousticness',
                              'Instrumentalness', 'Valence']

    dataframe = dataframe.copy()
    means = {}
    stds = {}

    for column in numerical_features:
        # Statistics come from the training split only
        mean = dataframe[column].mean()
        std = dataframe[column].std()

        # `not std > 0` is true for both 0 and NaN: fall back to a unit scale
        # so a constant feature does not turn into NaN/inf.
        if not std > 0:
            std = 1.0

        means[column] = mean
        stds[column] = std

        # Replace the whole column rather than writing through `.loc[:, column]`:
        # the result is float, and setting floats into an integer column in
        # place is a deprecated (and later rejected) silent upcast in pandas.
        dataframe[column] = (dataframe[column] - mean) / std

    return dataframe, means, stds

In [None]:
# Method for normalising the validation split, used in K-fold validation
def normalise_validation_split(dataframe, means, stds, numerical_features=None):
    """Z-score normalise a validation split with training-split statistics.

    Args:
        dataframe: pandas DataFrame holding the validation split. It is
            copied, so the caller's frame is never modified.
        means: dict mapping column name to the mean to subtract.
        stds: dict mapping column name to the standard deviation to divide by.
        numerical_features: optional list of column names to normalise.
            Defaults to the eight numerical columns this notebook uses.

    Returns:
        The normalised copy of ``dataframe``. A feature whose supplied
        standard deviation is zero or NaN is only centred (scale 1.0) rather
        than being turned into inf/NaN.

    Raises:
        KeyError: if a requested column is missing from the frame or from
            ``means`` / ``stds``.
    """
    if numerical_features is None:
        numerical_features = ['Total Followers', 'Danceability', 'Energy',
                              'Loudness', 'Speechiness', 'Acousticness',
                              'Instrumentalness', 'Valence']

    dataframe = dataframe.copy()

    for column in numerical_features:
        mean = means[column]
        std = stds[column]

        # `not std > 0` is true for both 0 and NaN: use a unit scale instead
        # of dividing by an unusable standard deviation.
        if not std > 0:
            std = 1.0

        # Replace the whole column rather than writing through `.loc[:, column]`:
        # the result is float, and setting floats into an integer column in
        # place is a deprecated (and later rejected) silent upcast in pandas.
        dataframe[column] = (dataframe[column] - mean) / std

    return dataframe


In [None]:
# Method that performs the hyperparameter grid search for a model, using K-fold cross validation
def model_hyperparameter_search(model_dictionary, batch_sizes=None, epochs=None,
                                hidden_units=None, max_epochs=150, splits=10,
                                random_state=10, training_data=None):
    """Grid-search batch size and hidden units with K-fold cross validation.

    For every (batch size, hidden units) pair a model is trained on each fold
    for ``max_epochs`` epochs. The per-fold training histories are then
    averaged at every epoch listed in ``epochs`` and one CSV row is written per
    (batch size, epoch, hidden units) combination.

    Args:
        model_dictionary: dict with the keys ``"output"`` (results CSV path),
            ``"audio_features_id"``, ``"num_audio_features"`` and
            ``"followers_id"`` (forwarded to ``dataframe_to_tensor_dataset``).
        batch_sizes: batch sizes to try. Defaults to 32, 64, 128, 256.
        epochs: 1-based epochs at which the averaged metrics are recorded.
            Defaults to every 10th epoch up to ``max_epochs``.
        hidden_units: hidden-unit counts to try. Defaults to 32 ... 512.
        max_epochs: number of epochs each fold is trained for.
        splits: number of K-fold splits.
        random_state: seed for the shuffled K-fold split.
        training_data: DataFrame to split into folds. Defaults to the
            notebook-level ``train_set_numerical``.

    Raises:
        ValueError: if an entry of ``epochs`` is outside ``1..max_epochs``.

    Note:
        The previous version overwrote the grids with the leftover debug
        values ``epochs = [1]`` and ``hidden_units = [3]``; pass those
        explicitly for a quick smoke test.
    """
    output_path = model_dictionary["output"]
    # Obtain the model feature extraction information
    audio_features_id = model_dictionary["audio_features_id"]
    num_audio_features = model_dictionary["num_audio_features"]
    followers_id = model_dictionary["followers_id"]

    # Define the grid search hyperparameter lists
    if batch_sizes is None:
        batch_sizes = [2**i for i in range(5, 9)]
    if epochs is None:
        epochs = list(range(10, max_epochs + 1, 10)) or [max_epochs]
    if hidden_units is None:
        hidden_units = [2**i for i in range(5, 10)]

    # Epochs are 1-based indices into the training history; epoch 0 would
    # silently read the last epoch and anything above max_epochs cannot exist.
    invalid_epochs = [epoch for epoch in epochs if not 1 <= epoch <= max_epochs]
    if invalid_epochs:
        raise ValueError(
            f"epochs must lie between 1 and max_epochs={max_epochs}, got {invalid_epochs}"
        )

    data = train_set_numerical if training_data is None else training_data

    # Make sure the results directory exists before opening the file
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # With shuffle=True and an integer random_state every call to split()
    # yields the same folds, so one KFold object serves the whole grid.
    k_fold = KFold(n_splits=splits, shuffle=True, random_state=random_state)

    # History keys, in the same order as the metric columns of the header
    metric_keys = ["loss", "val_loss", "mean_absolute_error", "val_mean_absolute_error"]

    # Write the obtained results in a file
    with open(output_path, 'w', encoding='UTF8', newline='') as f:

        writer = csv.writer(f)
        header = ["batch_size", "epochs", "hidden_units", "train_loss", "val_loss", "train_mae", "val_mae"]
        writer.writerow(header)

        # Iterate over all batch sizes
        for batch_size in tqdm(batch_sizes):

            # Iterate over all hidden units
            for hidden_unit in tqdm(hidden_units):

                training_histories = []

                # Iterate over the training and validation folds
                for training_indices, validation_indices in tqdm(k_fold.split(data), total=splits):

                    # Construct the training and validation sets
                    k_training = data.iloc[training_indices, :]
                    k_validation = data.iloc[validation_indices, :]

                    # Normalise both sets with statistics from the training fold only
                    k_training_norm, mean, std = normalise_training_split(k_training)
                    k_validation_norm = normalise_validation_split(k_validation, mean, std)

                    # Obtain Tensorflow training and validation datasets
                    # NOTE(review): the positional 11, 7, 7 are kept exactly as before;
                    # their meaning is defined by utils.dataframe_to_tensor_dataset - confirm.
                    train_dataset = dataframe_to_tensor_dataset(k_training_norm, 11, 7, 7, batch_size, audio_features_id = audio_features_id, num_audio_features=num_audio_features, followers_id = followers_id)
                    valid_dataset = dataframe_to_tensor_dataset(k_validation_norm, 11, 7, 7, batch_size, audio_features_id = audio_features_id, num_audio_features=num_audio_features, followers_id = followers_id)

                    # Obtain model training history (second return value is not needed here)
                    training_history, _ = train_model(hidden_unit, max_epochs, train_dataset, valid_dataset)
                    training_histories.append(training_history)

                fold_count = len(training_histories)

                # Store the K-fold average of every metric at each requested epoch
                # NOTE(review): assumes each history holds max_epochs entries per key
                # (i.e. train_model does not stop early) - confirm in utils.
                for epoch in epochs:
                    averages = [
                        sum(history[key][epoch - 1] for history in training_histories) / fold_count
                        for key in metric_keys
                    ]
                    formatted_metrics = [f'{metric:.3f}' for metric in averages]
                    writer.writerow([batch_size, epoch, hidden_unit] + formatted_metrics)

                # Persist finished grid points so an interrupted search keeps its results
                f.flush()



In [None]:
# Search configurations for the 4 model types: each gives the results CSV path plus
# the feature-extraction settings that the search forwards to dataframe_to_tensor_dataset
model_1 = dict(
    output=os.path.join("Data", "Hyperparameter_Search", "hyperparameters_points.csv"),
    audio_features_id=None,
    num_audio_features=None,
    followers_id=None,
)

model_2 = dict(
    output=os.path.join("Data", "Hyperparameter_Search", "hyperparameters_points_audio.csv"),
    audio_features_id=3,
    num_audio_features=7,
    followers_id=None,
)

model_3 = dict(
    output=os.path.join("Data", "Hyperparameter_Search", "hyperparameters_points_followers.csv"),
    audio_features_id=None,
    num_audio_features=None,
    followers_id=2,
)

model_4 = dict(
    output=os.path.join("Data", "Hyperparameter_Search", "hyperparameters_points_audio_followers.csv"),
    audio_features_id=3,
    num_audio_features=7,
    followers_id=2,
)


In [None]:
# Run the K-fold grid search once per model configuration; each configuration
# names its own output file, so the four result sets are saved separately
for model in (model_1, model_2, model_3, model_4):
    model_hyperparameter_search(model)