# Part 1: Training Model

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, root_mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

## Step 1.0 - Hold out Test Set (Create Validation Set)

In [2]:
# Names of the two regression target columns. The split helper uses this list to
# build y (df[targets]) and to drop the same columns from the feature matrix X.
targets = ["arousal", "valence"]

In [3]:
# Load the two preprocessed datasets, one dataframe per dataset.
# The paths are relative, so they resolve against the notebook's working directory.
df_iad = pd.read_csv("../data/preprocessed/IADSED_preprocessed.csv")
df_emo = pd.read_csv("../data/preprocessed/EmoSounds_preprocessed.csv")

In [4]:
"""
  Load target features on y and drop target features on X.
  Shuffle the data and split 80-20 for train-test to prevent ordering bias.
  Split the training part again 80-20 to get the validation set. So 64-16-20 (train-validation-test) overall.
"""
def train_test_valid_split(df, targets, test_size=0.20, val_size=0.20, random_state=101):
  """
  Split a dataframe into train, test and validation features and targets.

  The test set is held out first; the validation set is then taken from the
  rows that remain for training. `val_size` is therefore a fraction of the
  non-test remainder, not of the whole dataframe: the defaults (0.20 and 0.20)
  produce a 64-16-20 train-validation-test split, while `val_size=0.25`
  produces 60-20-20.

  Args:
    df: dataframe holding both the feature columns and the target columns.
    targets: list of target column names; they form y and are dropped from X.
    test_size: fraction of all rows held out as the test set.
    val_size: fraction of the remaining (non-test) rows used for validation.
    random_state: seed passed to both splits so the result is reproducible.

  Returns:
    X_train, X_test, X_val, y_train, y_test, y_val
    (note that the test parts come before the validation parts).
  """
  y = df[targets]
  X = df.drop(targets, axis=1)

  # First split: shuffle and hold out the test set.
  X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state, shuffle=True
  )
  # Second split: carve the validation set out of what is left for training.
  X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=val_size, random_state=random_state
  )

  return X_train, X_test, X_val, y_train, y_test, y_val

Perform the splits for train, test, and validation sets.

In [5]:
# IADSED: hold out the test split, then take a validation split from the remaining
# training rows. The helper returns the test parts before the validation parts.
X_train_iad, X_test_iad, X_val_iad, y_train_iad, y_test_iad, y_val_iad = train_test_valid_split(df_iad, targets)

In [6]:
# EmoSounds: same hold-out procedure as for IADSED, using the same helper and targets.
X_train_emo, X_test_emo, X_val_emo, y_train_emo, y_test_emo, y_val_emo = train_test_valid_split(df_emo, targets)

In [7]:
"""
  Creates random forest regression instance with default hyperparameters.
  If regressor_cv is provided, then use the best hyperparameters from grid search results.
  If asked to do grid search and parameter grid is provided, then we will perform grid search
  to find the best parameters. After grid search is complete, print the best parameters.
  Otherwise, train the model with our training data.
"""
def RFRegressor(X_train, y_train, grid_search=False, param_grid=None, regressor_cv=None, ds_name="NA", cv=5, scoring=None):
  """
  Build and fit a random forest regressor, optionally through a grid search.

  Args:
    X_train: training features.
    y_train: training targets.
    grid_search: when truthy and `param_grid` is given, wrap the forest in
      GridSearchCV and fit the search instead of a single model.
    param_grid: hyperparameter grid for GridSearchCV. If it is None the grid
      search is skipped and a single forest is fitted, even if
      `grid_search` is set.
    regressor_cv: an already fitted search object; when given, its
      `best_params_` are used to configure the forest.
    ds_name: dataset label used only in the printed "Best Parameters" line.
    cv: number of cross-validation folds used by the grid search.
    scoring: scoring passed to GridSearchCV; None keeps GridSearchCV's default,
      which is the estimator's own `score` method.

  Returns:
    The fitted GridSearchCV object when a grid search was run, otherwise the
    fitted RandomForestRegressor.
  """
  # `is None` instead of `== None`: identity is the correct test and cannot be
  # hijacked by an object that overrides equality.
  if regressor_cv is None:
    regressor = RandomForestRegressor(random_state=101)
  else:
    regressor = RandomForestRegressor(random_state=101, **regressor_cv.best_params_)

  if grid_search and param_grid is not None:
    regressor = GridSearchCV(regressor, param_grid, cv=cv, scoring=scoring, n_jobs=-1)
    regressor.fit(X_train, y_train)
    print(f"{ds_name} - Best Parameters:", regressor.best_params_)
  else:
    regressor.fit(X_train, y_train)

  return regressor

In [8]:
"""
  Evaluates/predicts with the test or validation set.
  Returns MSE and RMSE metrics.
"""
def RFPredictions(regressor, X_test, y_test, ds_type, ds_name):
  """
  Predict on an evaluation set, print MSE and RMSE, and return both.

  Args:
    regressor: fitted model exposing `predict`.
    X_test: features of the evaluation set (test or validation).
    y_test: true targets of the evaluation set.
    ds_type: label for the kind of set (e.g. "Validation", "Test"); printing only.
    ds_name: dataset label; printing only.

  Returns:
    (mse, rmse) as computed by scikit-learn's metric functions. When this call
    is the last expression of a notebook cell the tuple is echoed as the cell
    output; end the line with `;` to suppress that.
  """
  predictions = regressor.predict(X_test)

  mse = mean_squared_error(y_test, predictions)
  print(f'{ds_name} - {ds_type} MSE: {mse}')

  # Kept as a separate metric call rather than sqrt(mse): with several targets
  # scikit-learn averages the per-target RMSEs, which need not equal the
  # square root of the averaged MSE.
  rmse = root_mean_squared_error(y_test, predictions)
  print(f'{ds_name} - {ds_type} RMSE: {rmse}')

  # The description of this helper promises the metrics as a return value.
  return mse, rmse

Train the random forest model on the datasets.

In [9]:
# Baseline models: no grid search and no tuned parameters are passed, so each call
# fits a RandomForestRegressor with default hyperparameters on the training split only.
regressor_iad = RFRegressor(X_train=X_train_iad, y_train=y_train_iad)

regressor_emo = RFRegressor(X_train=X_train_emo, y_train=y_train_emo)

Predict on the validation set first and then on the test set, so that the two sets of metrics can be compared.

In [10]:
# Baseline IADSED model: print validation metrics first, then test metrics.
RFPredictions(regressor=regressor_iad, X_test=X_val_iad, y_test=y_val_iad, ds_type="Validation", ds_name="IADSED")

RFPredictions(regressor=regressor_iad, X_test=X_test_iad, y_test=y_test_iad, ds_type="Test", ds_name="IADSED")

IADSED - Validation MSE: 0.5595654537157757
IADSED - Validation RMSE: 0.748040973085724
IADSED - Test MSE: 0.513702426702898
IADSED - Test RMSE: 0.7167256752717164


In [11]:
# Baseline EmoSounds model: print validation metrics first, then test metrics.
RFPredictions(regressor=regressor_emo, X_test=X_val_emo, y_test=y_val_emo, ds_type="Validation", ds_name="EmoSounds")

RFPredictions(regressor=regressor_emo, X_test=X_test_emo, y_test=y_test_emo, ds_type="Test", ds_name="EmoSounds")

EmoSounds - Validation MSE: 0.08113709054585573
EmoSounds - Validation RMSE: 0.27956768451420133
EmoSounds - Test MSE: 0.0938988959614451
EmoSounds - Test RMSE: 0.29697874937225854


## Steps 1.1 and 1.2 - 5-Fold Cross-Validation & Hyperparameter Tuning

In [12]:
"""
  Merges training and validation data, since we are now performing K-Fold CV.
"""
def concat_train_valid(X_train, y_train, X_val, y_val):
  """
  Append the validation rows to the training rows.

  Features and targets are concatenated separately, training part first,
  with the original row labels kept as they are. The inputs are not modified.

  Returns:
    (features, targets) covering training plus validation rows.
  """
  merged_features, merged_targets = (
    pd.concat([train_part, val_part])
    for train_part, val_part in ((X_train, X_val), (y_train, y_val))
  )
  return merged_features, merged_targets

Merge the training and validation data for both datasets.

In [13]:
# NOTE: this cell rebinds X_train_* / y_train_* to the merged train+validation data,
# so it is not idempotent: running it a second time appends the validation rows again.
# Re-run the split cells first if this cell has to be executed again.
X_train_iad, y_train_iad = concat_train_valid(X_train_iad, y_train_iad, X_val_iad, y_val_iad)

X_train_emo, y_train_emo = concat_train_valid(X_train_emo, y_train_emo, X_val_emo, y_val_emo)

Parameter Grid Setup

In [14]:
# Hyperparameter grid handed to GridSearchCV through RFRegressor.
# 2 * 3 * 2 * 3 * 3 * 1 = 108 combinations; RFRegressor runs the search with cv=5,
# so each combination is fitted once per fold.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True]
}

Grid Search to find the best hyperparameters

In [15]:
# Run the grid search on the merged train+validation data. With grid_search=True and a
# param_grid, RFRegressor returns the fitted GridSearchCV object (not a plain forest)
# and prints the best parameters, labelled with ds_name.
regressor_cv_iad = RFRegressor(X_train=X_train_iad, y_train=y_train_iad, grid_search=True, param_grid=param_grid, ds_name="IADSED")

regressor_cv_emo = RFRegressor(X_train=X_train_emo, y_train=y_train_emo, grid_search=True, param_grid=param_grid, ds_name="EmoSounds")

IADSED - Best Parameters: {'bootstrap': True, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
EmoSounds - Best Parameters: {'bootstrap': True, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}


## Step 1.3 - Retrain model with best parameters.

In [16]:
# Fit a fresh forest on the merged training data, configured with best_params_ taken
# from the corresponding grid search object.
# NOTE(review): the search is built without a refit argument, so it presumably already
# holds an equivalent refitted model as best_estimator_ - confirm before relying on it.
regressor_best_iad = RFRegressor(X_train_iad, y_train_iad, regressor_cv=regressor_cv_iad)

regressor_best_emo = RFRegressor(X_train_emo, y_train_emo, regressor_cv=regressor_cv_emo)

## Step 1.4 - Evaluate again with test set.

In [17]:
# Evaluate the tuned models on the held-out test split of each dataset.
RFPredictions(regressor=regressor_best_iad, X_test=X_test_iad, y_test=y_test_iad, ds_type="Test", ds_name="IADSED")

RFPredictions(regressor=regressor_best_emo, X_test=X_test_emo, y_test=y_test_emo, ds_type="Test", ds_name="EmoSounds")

IADSED - Test MSE: 0.48074278472647786
IADSED - Test RMSE: 0.692786085097712
EmoSounds - Test MSE: 0.08620884847330877
EmoSounds - Test RMSE: 0.28514156492700865


# Part 2: Training with Feature Selection

In [18]:
# Reload the same preprocessed CSV files as in Part 1 into separate dataframes,
# so the feature-selection experiment starts from its own copies of the data.
df_iad_fs = pd.read_csv("../data/preprocessed/IADSED_preprocessed.csv")
df_emo_fs = pd.read_csv("../data/preprocessed/EmoSounds_preprocessed.csv")

## Step 2.0 - Hold out Test Set (Create Validation Set)

In [19]:
# Same split helper and targets as in Part 1, applied to the reloaded dataframes.
# NOTE(review): same files and a fixed random_state inside the helper, so these splits
# are expected to contain the same rows as the Part 1 splits - confirm if that matters.
X_train_iad_fs, X_test_iad_fs, X_val_iad_fs, y_train_iad_fs, y_test_iad_fs, y_val_iad_fs = train_test_valid_split(df_iad_fs, targets)

X_train_emo_fs, X_test_emo_fs, X_val_emo_fs, y_train_emo_fs, y_test_emo_fs, y_val_emo_fs = train_test_valid_split(df_emo_fs, targets)

Train the models on all features first, to obtain the feature importances that drive the feature selection.

In [20]:
# Fit default forests on all features of the training split; their
# feature_importances_ are what select_top_k ranks in the next step.
regressor_iad_fs = RFRegressor(X_train=X_train_iad_fs, y_train=y_train_iad_fs)

regressor_emo_fs = RFRegressor(X_train=X_train_emo_fs, y_train=y_train_emo_fs)

In [21]:
"""
    Perform Feature Selection via feature importance functionality from Random Forest Regressor.
    It will get the top k (10 by default) features of the model and return the training, test, and validation datasets with only those said features selected.
"""
def select_top_k(regressor, X_train, X_test, X_val, top_k=10):
    """
    Restrict the three feature frames to the model's most important features.

    The columns of X_train are paired with regressor.feature_importances_,
    ranked from most to least important, and the first top_k names are kept.

    Returns:
        (X_train, X_test, X_val), each reduced to the selected columns,
        ordered by decreasing importance.
    """
    ranking = (
        pd.DataFrame({'Feature': X_train.columns, 'Importance': regressor.feature_importances_})
        .sort_values(by='Importance', ascending=False)
    )
    keep = ranking['Feature'].iloc[:top_k].values

    # Apply the same column selection to every split.
    return tuple(frame[keep] for frame in (X_train, X_test, X_val))

In [22]:
# Keep the 20 most important features in every split (top_k=20 overrides the helper's
# default of 10). The ranking comes from the models fitted on all features.
# NOTE: run this before the retraining cell below. That cell rebinds regressor_iad_fs /
# regressor_emo_fs to models fitted on the reduced frames, whose importances are indexed
# by the selected columns rather than by all columns of X_train_*_fs.
X_train_iad_fs_top, X_test_iad_fs_top, X_val_iad_fs_top = select_top_k(regressor=regressor_iad_fs, X_train=X_train_iad_fs, X_test=X_test_iad_fs, X_val=X_val_iad_fs, top_k=20)

X_train_emo_fs_top, X_test_emo_fs_top, X_val_emo_fs_top = select_top_k(regressor=regressor_emo_fs, X_train=X_train_emo_fs, X_test=X_test_emo_fs, X_val=X_val_emo_fs, top_k=20)

Once feature selection is complete, retrain the model and apply the hold-out method by evaluating on both the validation and test sets, as in Part 1. From this point onward the steps are essentially identical to Part 1.

In [23]:
# Retrain default forests on the selected features only.
# NOTE: this rebinds regressor_iad_fs / regressor_emo_fs, replacing the all-feature
# models that were used for the feature ranking above.
regressor_iad_fs = RFRegressor(X_train=X_train_iad_fs_top, y_train=y_train_iad_fs)

regressor_emo_fs = RFRegressor(X_train=X_train_emo_fs_top, y_train=y_train_emo_fs)

In [24]:
# IADSED model trained on the selected features: validation metrics first, then test
# metrics. The evaluation frames are the *_top versions, i.e. the same selected columns.
RFPredictions(regressor=regressor_iad_fs, X_test=X_val_iad_fs_top, y_test=y_val_iad_fs, ds_type="Validation", ds_name="IADSED")

RFPredictions(regressor=regressor_iad_fs, X_test=X_test_iad_fs_top, y_test=y_test_iad_fs, ds_type="Test", ds_name="IADSED")

IADSED - Validation MSE: 0.5401567818819137
IADSED - Validation RMSE: 0.7348872724129891
IADSED - Test MSE: 0.5053294759149147
IADSED - Test RMSE: 0.7108419640155816


In [25]:
# EmoSounds model trained on the selected features: validation metrics first, then test
# metrics. The evaluation frames are the *_top versions, i.e. the same selected columns.
RFPredictions(regressor=regressor_emo_fs, X_test=X_val_emo_fs_top, y_test=y_val_emo_fs, ds_type="Validation", ds_name="EmoSounds")

RFPredictions(regressor=regressor_emo_fs, X_test=X_test_emo_fs_top, y_test=y_test_emo_fs, ds_type="Test", ds_name="EmoSounds")

EmoSounds - Validation MSE: 0.08192361711711127
EmoSounds - Validation RMSE: 0.2806524448980177
EmoSounds - Test MSE: 0.09462748819406189
EmoSounds - Test RMSE: 0.29887213137671287


## Steps 2.1 and 2.2 - 5-Fold Cross-Validation & Hyperparameter Tuning

In [26]:
# Merge training and validation rows of the reduced-feature data for cross-validation.
# NOTE: the cell rebinds X_train_*_fs_top / y_train_*_fs, so it is not idempotent:
# running it a second time appends the validation rows again.
X_train_iad_fs_top, y_train_iad_fs = concat_train_valid(X_train_iad_fs_top, y_train_iad_fs, X_val_iad_fs_top, y_val_iad_fs)

X_train_emo_fs_top, y_train_emo_fs = concat_train_valid(X_train_emo_fs_top, y_train_emo_fs, X_val_emo_fs_top, y_val_emo_fs)

In [27]:
# Grid search on the merged reduced-feature data, reusing the param_grid defined in Part 1.
# Each call returns the fitted GridSearchCV object and prints the best parameters.
regressor_cv_iad_fs = RFRegressor(X_train=X_train_iad_fs_top, y_train=y_train_iad_fs, grid_search=True, param_grid=param_grid, ds_name="IADSED")

regressor_cv_emo_fs = RFRegressor(X_train=X_train_emo_fs_top, y_train=y_train_emo_fs, grid_search=True, param_grid=param_grid, ds_name="EmoSounds")

IADSED - Best Parameters: {'bootstrap': True, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
EmoSounds - Best Parameters: {'bootstrap': True, 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


## Step 2.3 - Retrain model with best parameters.

In [28]:
# Fit fresh forests on the merged reduced-feature data, configured with best_params_
# from the corresponding feature-selection grid search.
regressor_best_iad_fs = RFRegressor(X_train_iad_fs_top, y_train_iad_fs, regressor_cv=regressor_cv_iad_fs)

regressor_best_emo_fs = RFRegressor(X_train_emo_fs_top, y_train_emo_fs, regressor_cv=regressor_cv_emo_fs)

## Step 2.4 - Evaluate again with test set.

In [29]:
# Evaluate the tuned reduced-feature models on the held-out test split,
# using the test frames restricted to the selected columns.
RFPredictions(regressor=regressor_best_iad_fs, X_test=X_test_iad_fs_top, y_test=y_test_iad_fs, ds_type="Test", ds_name="IADSED")

RFPredictions(regressor=regressor_best_emo_fs, X_test=X_test_emo_fs_top, y_test=y_test_emo_fs, ds_type="Test", ds_name="EmoSounds")

IADSED - Test MSE: 0.44421410556698604
IADSED - Test RMSE: 0.6664364927442703
EmoSounds - Test MSE: 0.08444926873633317
EmoSounds - Test RMSE: 0.28218871496745496
