In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, root_mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

Dataset Setup

In [2]:
"""
  Load the dataset. One-Hot encodes all nominal categorical features
  and label encodes all ordinal categorical features.
  Also removes any unwanted cols. (ie: text features like description or name)
"""
def load_process_datasets(ds, cat_cols, label_cols, remove_cols):
  """Read a CSV and return a model-ready DataFrame.

  Args:
    ds: path (or anything `pd.read_csv` accepts) of the dataset to load.
    cat_cols: nominal columns to one-hot encode; the first level of each
      is dropped (`drop_first=True`).
    label_cols: columns replaced in place by integer codes, each with its
      own freshly fitted `LabelEncoder`.
    remove_cols: columns dropped after encoding (e.g. free-text fields).

  Returns:
    The encoded DataFrame without `remove_cols`.
  """
  raw = pd.read_csv(ds)
  encoded = pd.get_dummies(raw, columns=cat_cols, drop_first=True)

  # NOTE(review): LabelEncoder numbers classes in sorted order; confirm that
  # order matches the intended ordinal ranking of these columns.
  for name in label_cols:
    encoded[name] = LabelEncoder().fit_transform(encoded[name])

  return encoded.drop(columns=remove_cols)

Set the names of the target features, nominal categorical features, ordinal categorical features, and unwanted features.

In [3]:
# Regression target columns for each dataset; these are split off as y.
iad_targets = [
    "arousal",
    "valence",
    "dominance",
]
emo_targets = [
    "arousal",
    "valence",
]

In [4]:
# Column roles passed to load_process_datasets, grouped per dataset:
#   *_cat_cols    -> one-hot encoded
#   *_label_cols  -> integer label encoded
#   *_remove_cols -> dropped
iad_cat_cols, iad_label_cols, iad_remove_cols = (
    ["category", "BE_Classification"],
    ["source"],
    ["description", "fname"],
)
emo_cat_cols, emo_label_cols, emo_remove_cols = (
    ["genre"],
    ["dataset", "splits", "vocals"],
    ["fnames"],
)

Load and process dataset

In [5]:
# Build the encoded frames for both datasets from their preprocessed CSVs.
df_iad = load_process_datasets(
    ds="IADSED_preprocessed.csv",
    cat_cols=iad_cat_cols,
    label_cols=iad_label_cols,
    remove_cols=iad_remove_cols,
)
df_emo = load_process_datasets(
    ds="EmoSounds_preprocessed.csv",
    cat_cols=emo_cat_cols,
    label_cols=emo_label_cols,
    remove_cols=emo_remove_cols,
)

Step 0 - Hold out Test Set (Create Validation Set)

In [6]:
"""
  Load target features on y and drop target features on X.
  Perform shuffle on first split to prevent bias. Split 80-20 for train-test.
  Split again 80-20 on train to get the validation set. So 64-16-20 overall.
"""
def train_test_valid_split(df, targets, test_size=0.20, val_size=0.20, random_state=101):
  """Split a frame into train / test / validation features and targets.

  The target columns become y and every other column becomes X. The data is
  first shuffled and split into train+validation vs. test, then the
  train+validation part is split again into train vs. validation.

  Note that `val_size` is a fraction of the data remaining after the test
  split, not of the whole frame. With the defaults the overall proportions
  are therefore 64% train / 16% validation / 20% test; pass `val_size=0.25`
  for an exact 60/20/20 split.

  Args:
    df: DataFrame holding both feature and target columns.
    targets: list of target column names.
    test_size: fraction of all rows held out as the test set.
    val_size: fraction of the remaining rows held out as the validation set.
    random_state: seed used for both splits so they are reproducible.

  Returns:
    (X_train, X_test, X_val, y_train, y_test, y_val)
  """
  y = df[targets]
  X = df.drop(columns=targets)

  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=test_size, random_state=random_state, shuffle=True)
  X_train, X_val, y_train, y_val = train_test_split(
      X_train, y_train, test_size=val_size, random_state=random_state)

  return X_train, X_test, X_val, y_train, y_test, y_val

Perform the splits for train, test, and validation sets.

In [7]:
# Train / test / validation partitions of the IAD frame.
(
    X_train_iad, X_test_iad, X_val_iad,
    y_train_iad, y_test_iad, y_val_iad,
) = train_test_valid_split(df=df_iad, targets=iad_targets)

In [8]:
# Train / test / validation partitions of the Emo frame.
(
    X_train_emo, X_test_emo, X_val_emo,
    y_train_emo, y_test_emo, y_val_emo,
) = train_test_valid_split(df=df_emo, targets=emo_targets)

Random Forest Regression Model

In [9]:
"""
  Creates random forest regression instance with default hyperparameters.
  If regressor_cv is provided, then use the best hyperparameters from grid search results.
  If asked to do grid search and parameter grid is provided, then we will perform grid search
  to find the best parameters. After grid search is complete, print the best parameters.
  Otherwise, train the model with our training data.
"""
def RFRegressor(X_train, y_train, grid_search=False, param_grid=None, regressor_cv=None, random_state=None):
  """Fit a random forest regressor, optionally via grid search.

  Args:
    X_train: training features.
    y_train: training targets.
    grid_search: when truthy AND `param_grid` is given, wrap the forest in a
      5-fold `GridSearchCV`, fit it and print the best parameters.
    param_grid: parameter grid handed to `GridSearchCV`. If it is None the
      function falls back to a plain fit even when `grid_search` is set.
    regressor_cv: a fitted search object; when given, its `best_params_`
      are used as the forest's hyperparameters instead of the defaults.
    random_state: optional seed for the forest so runs are reproducible.
      Left as None (the default) the forest is unseeded, as before. A
      `random_state` already present in `best_params_` takes precedence.

  Returns:
    The fitted `GridSearchCV` object when a search was run, otherwise the
    fitted `RandomForestRegressor`.
  """
  # Copy so the search object's best_params_ dict is never mutated.
  rf_params = {} if regressor_cv is None else dict(regressor_cv.best_params_)
  if random_state is not None:
    rf_params.setdefault("random_state", random_state)
  regressor = RandomForestRegressor(**rf_params)

  if grid_search and param_grid is not None:
    regressor = GridSearchCV(regressor, param_grid, cv=5, n_jobs=-1)
    regressor.fit(X_train, y_train)
    print("Best Parameters:", regressor.best_params_)
  else:
    regressor.fit(X_train, y_train)

  return regressor

In [10]:
"""
  Evaluates/predicts with the test or validation set.
  Returns MSE and RMSE metrics.
"""
def RFPredictions(regressor, X_test, y_test):
  """Predict on an evaluation set, print and return MSE and RMSE.

  Args:
    regressor: fitted estimator exposing `predict`.
    X_test: features of the test or validation set.
    y_test: true targets for `X_test`.

  Returns:
    Tuple `(mse, rmse)` so callers can compare runs programmatically
    instead of reading the printed text.
  """
  predictions = regressor.predict(X_test)

  mse = mean_squared_error(y_test, predictions)
  print(f'Mean Squared Error: {mse}')

  # Computed by scikit-learn rather than as sqrt(mse): with several target
  # columns the two differ, as the recorded notebook outputs show.
  rmse = root_mean_squared_error(y_test, predictions)
  print(f'Root Mean Squared Error: {rmse}')

  return mse, rmse

Train the random forest model on the datasets.

In [11]:
# Baseline IAD model: default hyperparameters, no grid search.
regressor_iad = RFRegressor(X_train_iad, y_train_iad)

In [12]:
# Baseline Emo model: default hyperparameters, no grid search.
regressor_emo = RFRegressor(X_train_emo, y_train_emo)

Make the predictions with validation set first and then test set to compare metrics.

In [13]:
# Baseline IAD model: validation metrics first, then held-out test metrics.
RFPredictions(regressor_iad, X_val_iad, y_val_iad)

RFPredictions(regressor_iad, X_test_iad, y_test_iad)

Mean Squared Error: 0.5085698669735604
Root Mean Squared Error: 0.7118608225507689
Mean Squared Error: 0.5496569030847793
Root Mean Squared Error: 0.7386268197247444


In [14]:
# Baseline Emo model: validation metrics first, then held-out test metrics.
RFPredictions(regressor_emo, X_val_emo, y_val_emo)

RFPredictions(regressor_emo, X_test_emo, y_test_emo)

Mean Squared Error: 0.082807002172962
Root Mean Squared Error: 0.28243620173760237
Mean Squared Error: 0.0925010437372613
Root Mean Squared Error: 0.29391875215673985


(Steps 1 and 2): 5-Fold Cross-Validation & Hyperparameter Tuning

In [15]:
"""
  Merges training and validation data, since we are now performing K-Fold CV.
"""
def concat_train_valid(X_train, y_train, X_val, y_val):
  """Stack the validation rows underneath the training rows.

  Used before K-fold cross-validation, where a separate validation set is
  no longer needed. Row labels are kept as they are (no index reset).

  Returns:
    (features, targets): the concatenated X and y frames.
  """
  merged_X, merged_y = (
      pd.concat(pair) for pair in ([X_train, X_val], [y_train, y_val])
  )
  return merged_X, merged_y

Merge the training and validation data for both datasets.

In [16]:
# Fold the IAD validation rows back into the training set for K-fold CV.
# Guarded so the cell is safe to re-run: once every validation row label is
# already present in the training frame, merging again would duplicate rows.
# Relies on row labels being unique per sample (default read_csv index).
if not X_val_iad.index.isin(X_train_iad.index).all():
  X_train_iad, y_train_iad = concat_train_valid(X_train_iad, y_train_iad, X_val_iad, y_val_iad)

In [17]:
# Fold the Emo validation rows back into the training set for K-fold CV.
# Guarded so the cell is safe to re-run: once every validation row label is
# already present in the training frame, merging again would duplicate rows.
# Relies on row labels being unique per sample (default read_csv index).
if not X_val_emo.index.isin(X_train_emo.index).all():
  X_train_emo, y_train_emo = concat_train_valid(X_train_emo, y_train_emo, X_val_emo, y_val_emo)

Parameter Grid Setup

In [18]:
# Hyperparameter search space shared by both datasets
# (2 * 3 * 2 * 3 * 3 * 1 = 108 combinations).
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    max_features=['sqrt', 'log2'],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True],
)

Grid Search to find the best hyperparameters

In [19]:
# 5-fold grid search over param_grid on the merged IAD training data.
regressor_cv_iad = RFRegressor(
    X_train_iad,
    y_train_iad,
    grid_search=True,
    param_grid=param_grid,
)

Best Parameters: {'bootstrap': True, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


In [20]:
# 5-fold grid search over param_grid on the merged Emo training data.
regressor_cv_emo = RFRegressor(
    X_train_emo,
    y_train_emo,
    grid_search=True,
    param_grid=param_grid,
)

Best Parameters: {'bootstrap': True, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}


Step 3 - Retrain model with best parameters.

In [21]:
# Refit a fresh IAD forest using the hyperparameters the grid search selected.
regressor_best_iad = RFRegressor(
    X_train=X_train_iad,
    y_train=y_train_iad,
    regressor_cv=regressor_cv_iad,
)

In [22]:
# Refit a fresh Emo forest using the hyperparameters the grid search selected.
regressor_best_emo = RFRegressor(
    X_train=X_train_emo,
    y_train=y_train_emo,
    regressor_cv=regressor_cv_emo,
)

Step 4 - Evaluate again with test set.

In [23]:
# Held-out test metrics for the tuned IAD model.
RFPredictions(regressor_best_iad, X_test_iad, y_test_iad)

Mean Squared Error: 0.5555540511254435
Root Mean Squared Error: 0.743366947612894


In [24]:
# Held-out test metrics for the tuned Emo model.
RFPredictions(regressor_best_emo, X_test_emo, y_test_emo)

Mean Squared Error: 0.0855562304558306
Root Mean Squared Error: 0.284842957107509
