In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

from sklearn.preprocessing import LabelEncoder, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, root_mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

Dataset Setup

In [2]:
"""
  Load the dataset. One-Hot encodes all nominal categorical features
  and label encodes all ordinal categorical features.
  Also removes any unwanted cols. (ie: text features like description or name)
"""
def load_process_datasets(ds, cat_cols, label_cols, remove_cols):
  """
  Read a CSV file and return a model-ready DataFrame.

  Steps, in order:
    1. one-hot encode the columns in `cat_cols` (first level dropped),
    2. replace each column in `label_cols` with integer codes produced by
       a fresh LabelEncoder fitted on that column,
    3. drop the columns listed in `remove_cols`.

  Parameters:
    ds          -- path or buffer passed straight to `pd.read_csv`
    cat_cols    -- names of the columns to one-hot encode
    label_cols  -- names of the columns to label encode
    remove_cols -- names of the columns to discard

  Returns:
    The processed DataFrame.
  """
  frame = pd.get_dummies(pd.read_csv(ds), columns=cat_cols, drop_first=True)

  for name in label_cols:
    # One encoder per column so the codes of different columns stay independent.
    frame[name] = LabelEncoder().fit_transform(frame[name])

  return frame.drop(columns=remove_cols)

Set the names of the target features, nominal categorical features, ordinal categorical features, and unwanted features.

In [3]:
# Columns to predict for each dataset.
emo_targets = ["arousal", "valence"]
iad_targets = [*emo_targets, "dominance"]

In [4]:
# IADS-E: columns to one-hot encode, label encode, and drop.
iad_cat_cols, iad_label_cols, iad_remove_cols = (
  ["category", "BE_Classification"],
  ["source"],
  ["description", "fname"],
)

# EmoSounds: columns to one-hot encode, label encode, and drop.
emo_cat_cols, emo_label_cols, emo_remove_cols = (
  ["genre"],
  ["dataset", "splits", "vocals"],
  ["fnames"],
)

Load and process dataset

In [5]:
# Paths are relative to the notebook's working directory.
df_iad = load_process_datasets(
  ds="../IADSED_preprocessed.csv",
  cat_cols=iad_cat_cols,
  label_cols=iad_label_cols,
  remove_cols=iad_remove_cols,
)
df_emo = load_process_datasets(
  ds="../EmoSounds_preprocessed.csv",
  cat_cols=emo_cat_cols,
  label_cols=emo_label_cols,
  remove_cols=emo_remove_cols,
)

Step 0 - Hold out Test Set (Create Validation Set)

In [6]:
"""
  Load target features on y and drop target features on X.
  Perform shuffle on first split to prevent bias. Split 80-20 for train-test.
  Split the training part 80-20 again to get the validation set, so roughly 64-16-20 (train-validation-test).
"""
def train_test_valid_split(df, targets, test_size=0.20, val_size=0.20, random_state=101):
  """
  Split a DataFrame into train, test and validation features/targets.

  The target columns are separated from the features, a test set is held
  out first, and the remaining rows are split again into train and
  validation sets.

  Note that `val_size` is a fraction of the rows left AFTER the test set
  has been removed. With the defaults (0.20 and 0.20) the result is
  therefore roughly 64% train / 16% validation / 20% test, not 60/20/20.
  Pass `val_size=0.25` to obtain a 60/20/20 split.

  Parameters:
    df           -- DataFrame holding both features and targets
    targets      -- list of target column names
    test_size    -- fraction of all rows held out as the test set
    val_size     -- fraction of the remaining rows used for validation
    random_state -- seed passed to both splits for reproducibility

  Returns:
    (X_train, X_test, X_val, y_train, y_test, y_val)
  """
  y = df[targets]
  X = df.drop(targets, axis=1)

  # First split: hold out the test set (explicitly shuffled).
  X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state, shuffle=True
  )
  # Second split: carve the validation set out of the remaining training rows.
  X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=val_size, random_state=random_state
  )

  return X_train, X_test, X_val, y_train, y_test, y_val

Perform the splits for train, test, and validation sets.

In [7]:
# Return order of the helper is train, test, val for X and then for y.
(
  X_train_iad, X_test_iad, X_val_iad,
  y_train_iad, y_test_iad, y_val_iad,
) = train_test_valid_split(df=df_iad, targets=iad_targets)

In [8]:
# Return order of the helper is train, test, val for X and then for y.
(
  X_train_emo, X_test_emo, X_val_emo,
  y_train_emo, y_test_emo, y_val_emo,
) = train_test_valid_split(df=df_emo, targets=emo_targets)

Polynomial Regression Model

In [9]:
def _build_poly_pipeline(degree):
  """Return an unfitted PolynomialFeatures -> LinearRegression pipeline."""
  return Pipeline([
    ('poly_features', PolynomialFeatures(degree=degree, include_bias=False)),
    ('linear_regression', LinearRegression())
  ])


def PolyRegressor(X_train, y_train, degree=2, search=False, degree_range=None, degree_cv=None):
  """
  Fit a polynomial regression pipeline, optionally searching for the degree.

  Degree selection, in increasing priority:
    1. `degree` (default 2),
    2. `degree_cv`, when given, replaces `degree`,
    3. when `search` is truthy and `degree_range` is given, every degree in
       `degree_range` is scored with 5-fold cross-validation (mean squared
       error) and the one with the lowest mean MSE is used.

  Parameters:
    X_train      -- training features
    y_train      -- training targets
    degree       -- polynomial degree used when nothing overrides it
    search       -- run the cross-validated degree search
    degree_range -- iterable of candidate degrees (list, range, NumPy array, ...)
    degree_cv    -- previously selected degree that overrides `degree`

  Returns:
    (regressor, degree): the pipeline fitted on all of `X_train`/`y_train`
    and the degree it was built with.
  """
  # `is not None` rather than `!= None`: comparing a NumPy array to None is
  # element-wise and makes the surrounding `if` raise "truth value is ambiguous".
  if degree_cv is not None:
    degree = degree_cv

  if search and degree_range is not None:
    best_degree = degree
    best_mse = float('inf')

    for d in degree_range:
      scores = cross_val_score(_build_poly_pipeline(d), X_train, y_train, cv=5,
                               scoring='neg_mean_squared_error', n_jobs=-1)
      # The scorer returns negated MSE, so flip the sign back.
      mse = -scores.mean()

      print(f'Degree {d}: MSE = {mse}')

      if mse < best_mse:
        best_mse = mse
        best_degree = d

    print(f"\nBest Degree: {best_degree} with MSE: {best_mse}")
    degree = best_degree

  # Final model: refit on the full training data with the chosen degree.
  regressor = _build_poly_pipeline(degree)
  regressor.fit(X_train, y_train)

  return regressor, degree

In [10]:
def PolyPredictions(regressor, X_test, y_test):
  """
  Predict with a fitted regressor and print MSE and RMSE for the given data.

  Despite the parameter names, any evaluation split may be passed in.
  Nothing is returned; the two metrics are only printed.

  NOTE(review): for multi-column targets scikit-learn averages the
  per-target scores by default, so the printed RMSE is presumably the mean
  of per-target RMSEs and need not equal the square root of the printed MSE.

  Parameters:
    regressor -- fitted estimator exposing `predict`
    X_test    -- features to predict on
    y_test    -- true targets for `X_test`
  """
  y_pred = regressor.predict(X_test)

  print(f'Mean Squared Error: {mean_squared_error(y_test, y_pred)}')
  print(f'Root Mean Squared Error: {root_mean_squared_error(y_test, y_pred)}')

Train the polynomial regression model on the datasets with default degree=2.

In [11]:
# Baseline fit: no search and no degree override, so the default degree applies.
regressor_iad, degree_iad = PolyRegressor(X_train_iad, y_train_iad)

In [12]:
# Baseline fit: no search and no degree override, so the default degree applies.
regressor_emo, degree_emo = PolyRegressor(X_train_emo, y_train_emo)

Evaluate on the validation set first and then on the test set to compare the metrics.

In [13]:
# Validation set
PolyPredictions(regressor_iad, X_val_iad, y_val_iad)

# Held-out test set
PolyPredictions(regressor_iad, X_test_iad, y_test_iad)

Mean Squared Error: 2.379180729202904
Root Mean Squared Error: 1.5222124859896882
Mean Squared Error: 3.9313919230388943
Root Mean Squared Error: 1.9783424454315022


In [14]:
# Validation set
PolyPredictions(regressor_emo, X_val_emo, y_val_emo)

# Held-out test set
PolyPredictions(regressor_emo, X_test_emo, y_test_emo)

Mean Squared Error: 0.28027615481713064
Root Mean Squared Error: 0.5215130626609217
Mean Squared Error: 0.33344554622753153
Root Mean Squared Error: 0.5590652193700567


(Steps 1 and 2): 5-Fold Cross-Validation & Hyperparameter Tuning

In [15]:
"""
  Merges training and validation data, since we are now performing K-Fold CV.
"""
def concat_train_valid(X_train, y_train, X_val, y_val):
  """
  Stack the validation rows under the training rows for K-Fold CV.

  The inputs are not modified; two new objects are built with `pd.concat`,
  which keeps the original index labels of both parts.

  Parameters:
    X_train, y_train -- training features and targets
    X_val, y_val     -- validation features and targets

  Returns:
    (X, y): the combined features and the combined targets.
  """
  merged_X = pd.concat([X_train, X_val])
  merged_y = pd.concat([y_train, y_val])
  return merged_X, merged_y

Merge the training and validation data for both datasets.

In [16]:
# Caution: this rebinds X_train_iad / y_train_iad to a result built from
# themselves, so running the cell a second time appends the validation rows again.
X_train_iad, y_train_iad = concat_train_valid(
  X_train=X_train_iad,
  y_train=y_train_iad,
  X_val=X_val_iad,
  y_val=y_val_iad,
)

In [17]:
# Caution: this rebinds X_train_emo / y_train_emo to a result built from
# themselves, so running the cell a second time appends the validation rows again.
X_train_emo, y_train_emo = concat_train_valid(
  X_train=X_train_emo,
  y_train=y_train_emo,
  X_val=X_val_emo,
  y_val=y_val_emo,
)

Degree Range Setup for Hyperparameter Search

In [18]:
# Candidate polynomial degrees for the cross-validated search: 1, 2 and 3.
degree_range = list(range(1, 4))

Search to find the best polynomial degree

In [19]:
# Score every candidate degree with cross-validation and keep the best one.
regressor_cv_iad, best_degree_iad = PolyRegressor(
  X_train_iad,
  y_train_iad,
  search=True,
  degree_range=degree_range,
)

Degree 1: MSE = 0.5394347154641282
Degree 2: MSE = 7.271672386217401
Degree 3: MSE = 48.38224830763294

Best Degree: 1 with MSE: 0.5394347154641282


In [20]:
# Score every candidate degree with cross-validation and keep the best one.
regressor_cv_emo, best_degree_emo = PolyRegressor(
  X_train_emo,
  y_train_emo,
  search=True,
  degree_range=degree_range,
)

Degree 1: MSE = 0.13406034140360362
Degree 2: MSE = 0.3645045623136086
Degree 3: MSE = 0.9997742728528056

Best Degree: 1 with MSE: 0.13406034140360362


Step 3 - Retrain model with best degree.

In [21]:
# Refit with the degree chosen by the search; the returned degree is not needed.
regressor_best_iad, _ = PolyRegressor(
  X_train=X_train_iad,
  y_train=y_train_iad,
  degree_cv=best_degree_iad,
)

In [22]:
# Refit with the degree chosen by the search; the returned degree is not needed.
regressor_best_emo, _ = PolyRegressor(
  X_train=X_train_emo,
  y_train=y_train_emo,
  degree_cv=best_degree_emo,
)

Step 4 - Evaluate again with test set.

In [23]:
# Final check of the tuned model on the held-out test set.
PolyPredictions(regressor_best_iad, X_test_iad, y_test_iad)

Mean Squared Error: 0.47035366204415324
Root Mean Squared Error: 0.6836375862859109


In [24]:
# Final check of the tuned model on the held-out test set.
PolyPredictions(regressor_best_emo, X_test_emo, y_test_emo)

Mean Squared Error: 0.104060633144751
Root Mean Squared Error: 0.32023695579124756
