# Part 1: Training Model

In [90]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

from sklearn.preprocessing import LabelEncoder, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, root_mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE

## Step 1.0 - Hold out Test Set (Create Validation Set)

In [91]:
# Columns every model in this notebook predicts; all other columns are features.
targets = [
  "arousal",
  "valence",
]

In [92]:
# Preprocessed feature tables, one per sound corpus (paths are relative to the notebook).
df_iad = pd.read_csv(filepath_or_buffer="../data/preprocessed/IADSED_preprocessed.csv")
df_emo = pd.read_csv(filepath_or_buffer="../data/preprocessed/EmoSounds_preprocessed.csv")

In [93]:
def train_test_valid_split(df, targets, test_size=0.20, val_size=0.20, random_state=101):
  """
  Split a dataframe into train, test and validation features and targets.

  The target columns are separated from the features, a test set is held
  out first, and the remaining rows are split again into train and
  validation. Both splits shuffle the rows and use the same seed.

  Note that val_size is a fraction of the rows left after the test
  hold-out, so the defaults give roughly 64% train, 16% validation and
  20% test of the original rows (not 60-20-20).

  Parameters:
    df: dataframe holding both the feature columns and the target columns.
    targets: list of target column names.
    test_size: fraction of all rows held out as the test set.
    val_size: fraction of the remaining rows held out as the validation set.
    random_state: seed passed to both splits so they are reproducible.

  Returns:
    X_train, X_test, X_val, y_train, y_test, y_val (in that order).
  """
  y = df[targets]
  X = df.drop(columns=targets)

  # First cut: hold out the test set from the full data.
  X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state, shuffle=True)
  # Second cut: carve the validation set out of what is left.
  X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=val_size, random_state=random_state)

  return X_train, X_test, X_val, y_train, y_test, y_val

Perform the splits for train, test, and validation sets.

In [94]:
# IADSED: features/targets for the train, test and validation partitions.
(
  X_train_iad, X_test_iad, X_val_iad,
  y_train_iad, y_test_iad, y_val_iad,
) = train_test_valid_split(df=df_iad, targets=targets)

In [95]:
# EmoSounds: features/targets for the train, test and validation partitions.
(
  X_train_emo, X_test_emo, X_val_emo,
  y_train_emo, y_test_emo, y_val_emo,
) = train_test_valid_split(df=df_emo, targets=targets)

Polynomial Regression Model

In [96]:
def _poly_pipeline(degree, final_estimator):
  """Chain polynomial feature expansion (no bias column) with the given final estimator."""
  return Pipeline([
    ('poly_features', PolynomialFeatures(degree=degree, include_bias=False)),
    ('linear_regression', final_estimator),
  ])


def PolyRegressor(X_train, y_train, degree=2, search=False, degree_range=None, degree_cv=None, recursive_fe=False, top_k=10):
  """
  Build and fit a polynomial regression pipeline.

  With no optional arguments, a degree-2 polynomial regression is trained.

  Parameters:
    X_train: training features.
    y_train: training targets.
    degree: polynomial degree used when nothing else overrides it.
    search: when true and degree_range is given, every degree in
      degree_range is scored with 5-fold cross-validation (mean squared
      error) and the best one is used for the final fit. The search always
      scores a plain linear regression, even when recursive_fe is set.
      If degree_range is None the search is skipped.
    degree_range: iterable of candidate degrees for the search.
    degree_cv: degree found by an earlier search; overrides `degree`.
    recursive_fe: when true, the final estimator is wrapped in recursive
      feature elimination, which drops 10% of the polynomial-expanded
      features per iteration until top_k remain.
    top_k: number of expanded features RFE keeps.

  Returns:
    (regressor, degree): the fitted pipeline and the degree it was built with.
  """
  # A degree chosen by an earlier cross-validated search wins over the default.
  if degree_cv is not None:
    degree = degree_cv

  # `is not None` (rather than `!= None`) keeps this test valid when
  # degree_range is a NumPy array, where `!=` compares element-wise.
  if search and degree_range is not None:
    best_degree = degree
    best_mse = float('inf')

    for d in degree_range:
      scores = cross_val_score(_poly_pipeline(d, LinearRegression()), X_train, y_train, cv=5,
                               scoring='neg_mean_squared_error', n_jobs=-1)
      # The scorer returns negated MSE (higher is better); flip the sign back.
      mse = -scores.mean()

      print(f'Degree {d}: MSE = {mse}')

      if mse < best_mse:
        best_mse = mse
        best_degree = d

    print(f"\nBest Degree: {best_degree} with MSE: {best_mse}")
    degree = best_degree

  if recursive_fe:
    final_estimator = RFE(estimator=LinearRegression(), n_features_to_select=top_k, step=0.1)
  else:
    final_estimator = LinearRegression()

  regressor = _poly_pipeline(degree, final_estimator)
  regressor.fit(X_train, y_train)

  return regressor, degree

In [97]:
def PolyPredictions(regressor, X_test, y_test, ds_type, ds_name):
  """
  Predict on an evaluation set, print its MSE and RMSE, and return both.

  Parameters:
    regressor: fitted model exposing predict().
    X_test: features of the set being evaluated (validation or test).
    y_test: true targets for X_test.
    ds_type: label for the kind of set, used only in the printed lines.
    ds_name: dataset name, used only in the printed lines.

  Returns:
    (mse, rmse) as computed by scikit-learn.

  NOTE(review): with more than one target column scikit-learn averages the
  per-target scores by default, so the RMSE is generally not sqrt(MSE).
  """
  predictions = regressor.predict(X_test)

  mse = mean_squared_error(y_test, predictions)
  print(f'{ds_name} - {ds_type} MSE: {mse}')

  rmse = root_mean_squared_error(y_test, predictions)
  print(f'{ds_name} - {ds_type} RMSE: {rmse}')

  # Hand the numbers back so callers can compare runs programmatically.
  return mse, rmse

Train the polynomial regression model on the datasets with default degree=2.

In [98]:
# Baseline IADSED model: every optional argument left at its default.
regressor_iad, degree_iad = PolyRegressor(X_train_iad, y_train_iad)

In [99]:
# Baseline EmoSounds model: every optional argument left at its default.
regressor_emo, degree_emo = PolyRegressor(X_train_emo, y_train_emo)

Make the predictions with validation set first and then test set to compare metrics.

In [100]:
# Validation first, then the held-out test set, so both scores print together.
PolyPredictions(regressor_iad, X_val_iad, y_val_iad, ds_type="Validation", ds_name="IADSED")
PolyPredictions(regressor_iad, X_test_iad, y_test_iad, ds_type="Test", ds_name="IADSED")

IADSED - Validation MSE: 1.360300238650361
IADSED - Validation RMSE: 1.1648821809096601
IADSED - Test MSE: 3.4092959820949806
IADSED - Test RMSE: 1.8463565785305116


In [101]:
# Validation first, then the held-out test set, so both scores print together.
PolyPredictions(regressor_emo, X_val_emo, y_val_emo, ds_type="Validation", ds_name="EmoSounds")
PolyPredictions(regressor_emo, X_test_emo, y_test_emo, ds_type="Test", ds_name="EmoSounds")

EmoSounds - Validation MSE: 0.28027615481713025
EmoSounds - Validation RMSE: 0.5215130626609215
EmoSounds - Test MSE: 0.333445546227532
EmoSounds - Test RMSE: 0.5590652193700572


## Steps 1.1 and 1.2 - 5-Fold Cross-Validation & Hyperparameter Tuning

In [102]:
def concat_train_valid(X_train, y_train, X_val, y_val):
  """
  Merge training and validation data into one training set.

  Used before K-fold cross-validation, which creates its own validation
  folds. Validation rows are appended after the training rows and the
  original row labels are kept.

  Returns:
    (X, y): the combined features and the combined targets.
  """
  X_merged = pd.concat([X_train, X_val])
  y_merged = pd.concat([y_train, y_val])

  return X_merged, y_merged

Merge the training and validation data for both datasets.

In [103]:
# Fold the validation rows back into the IADSED training set for K-fold CV.
# The names are reassigned in place, so without the check a second run of
# this cell would append the validation rows again.
# Assumes row labels are unique across the splits - TODO confirm.
if not X_val_iad.index.isin(X_train_iad.index).any():
  X_train_iad, y_train_iad = concat_train_valid(X_train_iad, y_train_iad, X_val_iad, y_val_iad)

In [104]:
# Fold the validation rows back into the EmoSounds training set for K-fold CV.
# The names are reassigned in place, so without the check a second run of
# this cell would append the validation rows again.
# Assumes row labels are unique across the splits - TODO confirm.
if not X_val_emo.index.isin(X_train_emo.index).any():
  X_train_emo, y_train_emo = concat_train_valid(X_train_emo, y_train_emo, X_val_emo, y_val_emo)

Degree Range Setup for Hyperparameter Search

In [105]:
# Candidate polynomial degrees for the search: 1 through 3 inclusive.
degree_range = list(range(1, 4))

Search to find the best polynomial degree

In [106]:
# Cross-validated degree search on the merged IADSED training data.
regressor_cv_iad, best_degree_iad = PolyRegressor(
  X_train_iad,
  y_train_iad,
  search=True,
  degree_range=degree_range,
)

Degree 1: MSE = 0.5018885603893477
Degree 2: MSE = 7.17592183773894
Degree 3: MSE = 17.831311409908448

Best Degree: 1 with MSE: 0.5018885603893477


In [107]:
# Cross-validated degree search on the merged EmoSounds training data.
regressor_cv_emo, best_degree_emo = PolyRegressor(
  X_train_emo,
  y_train_emo,
  search=True,
  degree_range=degree_range,
)

Degree 1: MSE = 0.13406034140360368
Degree 2: MSE = 0.3645045623136089
Degree 3: MSE = 0.9997742728528076

Best Degree: 1 with MSE: 0.13406034140360368


## Step 1.3 - Retrain model with best degree

In [108]:
# Refit IADSED with the degree picked by the search; the returned degree is not needed.
regressor_best_iad, _ = PolyRegressor(X_train=X_train_iad, y_train=y_train_iad, degree_cv=best_degree_iad)

In [109]:
# Refit EmoSounds with the degree picked by the search; the returned degree is not needed.
regressor_best_emo, _ = PolyRegressor(X_train=X_train_emo, y_train=y_train_emo, degree_cv=best_degree_emo)

## Step 1.4 - Evaluate again with test set

In [110]:
# Score the tuned IADSED model on the held-out test set.
PolyPredictions(regressor_best_iad, X_test_iad, y_test_iad, ds_type="Test", ds_name="IADSED")

IADSED - Test MSE: 0.44688096006082545
IADSED - Test RMSE: 0.6673765786622908


In [111]:
# Score the tuned EmoSounds model on the held-out test set.
PolyPredictions(regressor_best_emo, X_test_emo, y_test_emo, ds_type="Test", ds_name="EmoSounds")

EmoSounds - Test MSE: 0.10406063314475089
EmoSounds - Test RMSE: 0.3202369557912474


# Part 2: Training with Feature Selection

In [112]:
# Separate frames for the feature-selection experiments, read from the same
# preprocessed CSV files as Part 1.
df_iad_fs = pd.read_csv(filepath_or_buffer="../data/preprocessed/IADSED_preprocessed.csv")
df_emo_fs = pd.read_csv(filepath_or_buffer="../data/preprocessed/EmoSounds_preprocessed.csv")

## Step 2.0 - Hold out Test Set (Create Validation Set)

In [113]:
# Train/test/validation partitions for the feature-selection runs (IADSED, then EmoSounds).
(
  X_train_iad_fs, X_test_iad_fs, X_val_iad_fs,
  y_train_iad_fs, y_test_iad_fs, y_val_iad_fs,
) = train_test_valid_split(df=df_iad_fs, targets=targets)

(
  X_train_emo_fs, X_test_emo_fs, X_val_emo_fs,
  y_train_emo_fs, y_test_emo_fs, y_val_emo_fs,
) = train_test_valid_split(df=df_emo_fs, targets=targets)

Run Recursive Feature Elimination with top 20 features selected.

In [114]:
# Degree stays at the function default; RFE keeps 20 of the expanded features.
regressor_iad_fs, degree_iad_fs = PolyRegressor(X_train_iad_fs, y_train_iad_fs, recursive_fe=True, top_k=20)
regressor_emo_fs, degree_emo_fs = PolyRegressor(X_train_emo_fs, y_train_emo_fs, recursive_fe=True, top_k=20)

In [115]:
# RFE model on IADSED: validation score, then held-out test score.
PolyPredictions(regressor_iad_fs, X_val_iad_fs, y_val_iad_fs, ds_type="Validation", ds_name="IADSED")
PolyPredictions(regressor_iad_fs, X_test_iad_fs, y_test_iad_fs, ds_type="Test", ds_name="IADSED")

IADSED - Validation MSE: 0.9157543253596759
IADSED - Validation RMSE: 0.955327508099057
IADSED - Test MSE: 0.8815280427172095
IADSED - Test RMSE: 0.9388065967109473


In [116]:
# RFE model on EmoSounds: validation score, then held-out test score.
PolyPredictions(regressor_emo_fs, X_val_emo_fs, y_val_emo_fs, ds_type="Validation", ds_name="EmoSounds")
PolyPredictions(regressor_emo_fs, X_test_emo_fs, y_test_emo_fs, ds_type="Test", ds_name="EmoSounds")

EmoSounds - Validation MSE: 0.13562239547006466
EmoSounds - Validation RMSE: 0.3681119521944267
EmoSounds - Test MSE: 0.13472645014694584
EmoSounds - Test RMSE: 0.36532237604723083


## Steps 2.1 and 2.2 - 5-Fold Cross-Validation & Hyperparameter Tuning

In [117]:
# Fold the validation rows back into the Part 2 training sets for K-fold CV.
# The names are reassigned in place, so without the checks a second run of
# this cell would append the validation rows again.
# Assumes row labels are unique across the splits - TODO confirm.
if not X_val_iad_fs.index.isin(X_train_iad_fs.index).any():
  X_train_iad_fs, y_train_iad_fs = concat_train_valid(X_train_iad_fs, y_train_iad_fs, X_val_iad_fs, y_val_iad_fs)

if not X_val_emo_fs.index.isin(X_train_emo_fs.index).any():
  X_train_emo_fs, y_train_emo_fs = concat_train_valid(X_train_emo_fs, y_train_emo_fs, X_val_emo_fs, y_val_emo_fs)

In [118]:
# Cross-validated degree search on the merged Part 2 IADSED training data.
regressor_cv_iad_fs, best_degree_iad_fs = PolyRegressor(
  X_train_iad_fs,
  y_train_iad_fs,
  search=True,
  degree_range=degree_range,
)

Degree 1: MSE = 0.5018885603893477
Degree 2: MSE = 7.17592183773894
Degree 3: MSE = 17.831311409908448

Best Degree: 1 with MSE: 0.5018885603893477


In [119]:
# Cross-validated degree search on the merged Part 2 EmoSounds training data.
regressor_cv_emo_fs, best_degree_emo_fs = PolyRegressor(
  X_train_emo_fs,
  y_train_emo_fs,
  search=True,
  degree_range=degree_range,
)

Degree 1: MSE = 0.13406034140360368
Degree 2: MSE = 0.3645045623136089
Degree 3: MSE = 0.9997742728528076

Best Degree: 1 with MSE: 0.13406034140360368


## Step 2.3 - Retrain model with best degree

In [120]:
# Retrain with RFE on Part 2's own merged training data and the degree found
# by Part 2's search, rather than on the variables left over from Part 1.
regressor_best_iad, _ = PolyRegressor(X_train_iad_fs, y_train_iad_fs, degree_cv=best_degree_iad_fs, recursive_fe=True, top_k=20)

In [121]:
# Retrain with RFE on Part 2's own merged training data and the degree found
# by Part 2's search, rather than on the variables left over from Part 1.
regressor_best_emo, _ = PolyRegressor(X_train_emo_fs, y_train_emo_fs, degree_cv=best_degree_emo_fs, recursive_fe=True, top_k=20)

## Step 2.4 - Evaluate again with test set

In [122]:
# Evaluate on the test split created in Part 2 instead of the Part 1 variables.
PolyPredictions(regressor=regressor_best_iad, X_test=X_test_iad_fs, y_test=y_test_iad_fs, ds_type="Test", ds_name="IADSED")

IADSED - Test MSE: 0.46225363473491354
IADSED - Test RMSE: 0.6771810925753063


In [123]:
# Evaluate on the test split created in Part 2 instead of the Part 1 variables.
PolyPredictions(regressor=regressor_best_emo, X_test=X_test_emo_fs, y_test=y_test_emo_fs, ds_type="Test", ds_name="EmoSounds")

EmoSounds - Test MSE: 0.08784366499208404
EmoSounds - Test RMSE: 0.2932493528705145
