# Preliminary operations

## Import libraries

In [11]:
# importing libraries
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt 
import seaborn as sns
#scaling, normalization
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

## Mount Google Drive

In [12]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Import dataset

In [13]:
# importing dataset
df = pd.read_csv("drive/MyDrive/Progetto Data Mining 1/Dataset/ravdess_features.csv")
df_prep = df.copy()
df_prep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2452 entries, 0 to 2451
Data columns (total 38 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   modality             2452 non-null   object 
 1   vocal_channel        2256 non-null   object 
 2   emotion              2452 non-null   object 
 3   emotional_intensity  2452 non-null   object 
 4   statement            2452 non-null   object 
 5   repetition           2452 non-null   object 
 6   actor                1326 non-null   float64
 7   sex                  2452 non-null   object 
 8   channels             2452 non-null   int64  
 9   sample_width         2452 non-null   int64  
 10  frame_rate           2452 non-null   int64  
 11  frame_width          2452 non-null   int64  
 12  length_ms            2452 non-null   int64  
 13  frame_count          2452 non-null   float64
 14  intensity            1636 non-null   float64
 15  zero_crossings_sum   2452 non-null   i

#Data Preparation

##Remove useless data

In [14]:
nunique = df_prep.nunique()
cols_to_drop = nunique[nunique == 1].index
df_prep.drop(cols_to_drop, axis=1, inplace=True)
df_prep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2452 entries, 0 to 2451
Data columns (total 34 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   vocal_channel        2256 non-null   object 
 1   emotion              2452 non-null   object 
 2   emotional_intensity  2452 non-null   object 
 3   statement            2452 non-null   object 
 4   repetition           2452 non-null   object 
 5   actor                1326 non-null   float64
 6   sex                  2452 non-null   object 
 7   channels             2452 non-null   int64  
 8   frame_width          2452 non-null   int64  
 9   length_ms            2452 non-null   int64  
 10  frame_count          2452 non-null   float64
 11  intensity            1636 non-null   float64
 12  zero_crossings_sum   2452 non-null   int64  
 13  mfcc_mean            2452 non-null   float64
 14  mfcc_std             2452 non-null   float64
 15  mfcc_min             2452 non-null   f

##Delete high correlation variables

In [15]:
cor_matrix = df_prep.corr().abs()
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape),k=1).astype(np.bool))
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.75)]
print(to_drop)
df_prep.drop(df_prep[to_drop].columns, axis=1, inplace=True)

df_prep.info()

['frame_width', 'frame_count', 'mfcc_std', 'mfcc_min', 'sc_skew', 'stft_skew', 'std', 'min', 'max']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2452 entries, 0 to 2451
Data columns (total 25 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   vocal_channel        2256 non-null   object 
 1   emotion              2452 non-null   object 
 2   emotional_intensity  2452 non-null   object 
 3   statement            2452 non-null   object 
 4   repetition           2452 non-null   object 
 5   actor                1326 non-null   float64
 6   sex                  2452 non-null   object 
 7   channels             2452 non-null   int64  
 8   length_ms            2452 non-null   int64  
 9   intensity            1636 non-null   float64
 10  zero_crossings_sum   2452 non-null   int64  
 11  mfcc_mean            2452 non-null   float64
 12  mfcc_max             2452 non-null   float64
 13  sc_mean              2452 non-null   f

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape),k=1).astype(np.bool))


In [16]:
df_prep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2452 entries, 0 to 2451
Data columns (total 25 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   vocal_channel        2256 non-null   object 
 1   emotion              2452 non-null   object 
 2   emotional_intensity  2452 non-null   object 
 3   statement            2452 non-null   object 
 4   repetition           2452 non-null   object 
 5   actor                1326 non-null   float64
 6   sex                  2452 non-null   object 
 7   channels             2452 non-null   int64  
 8   length_ms            2452 non-null   int64  
 9   intensity            1636 non-null   float64
 10  zero_crossings_sum   2452 non-null   int64  
 11  mfcc_mean            2452 non-null   float64
 12  mfcc_max             2452 non-null   float64
 13  sc_mean              2452 non-null   float64
 14  sc_std               2452 non-null   float64
 15  sc_min               2452 non-null   f

##Split Dataset

In [17]:
df = df_prep

In [18]:
# get data columns
df_data = df.loc[:, "length_ms":"skew"]

In [19]:
X = df_data.dropna().drop("intensity", axis=1).values
y = df_data["intensity"].dropna().values
X_missing_values = df_data[df_data["intensity"].isna()].drop("intensity", axis=1).values

# split train- and test-set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1, 
    random_state=0,
)

# standardizing the new training set and the test set
std_sclr = StandardScaler()
std_sclr.fit(X_train)
X_train = std_sclr.transform(X_train)
X_test = std_sclr.transform(X_test)

In [30]:
print(len(X_train))
print(len(y_train))
print()
print(len(X_missing_values))

1472
1472

816


#Regression

##Import libraries

In [23]:
# stats
from scipy import stats
# repeated stratified kfold
from sklearn.model_selection import RepeatedKFold
# randomized and grid search cv
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
# Dummy clf
from sklearn.dummy import DummyRegressor

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)

In [24]:
from sklearn import linear_model

In [None]:
# get two baseline (for validation- and for test- set)
dummy_reg = DummyRegressor(strategy="median")
dummy_reg.fit(X_train, y_train)

print(dummy_reg.score(X=None, y=y_train))

-0.007917016080284833


##Linear

In [54]:
from sklearn.linear_model import LinearRegression

In [55]:
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

LinearRegression()

In [56]:
y_pred_test = linear_reg.predict(X_test)

In [57]:
print(r2_score(y_pred_test, y_test))
print(mean_absolute_error(y_test, y_pred_test))
print(mean_squared_error(y_test, y_pred_test))

0.8744580496153072
2.2209797719146853
8.054092895006088


## Ridge

In [21]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from scipy.stats import uniform, randint, loguniform
from sklearn.linear_model import Ridge

In [43]:
param_grid = {
    "alpha": np.logspace(2, -5, 1000)
}

RR = Ridge(
    random_state = 0
)

RKF = RepeatedKFold(
        n_splits = 20, 
        n_repeats = 3, 
        random_state = 0
)

grid = GridSearchCV(
    RR,
    param_grid=param_grid,
    cv=RKF,
    n_jobs=-1,
    refit=True,
    verbose = 1
)

grid.fit(X_train, y_train)

Fitting 60 folds for each of 1000 candidates, totalling 60000 fits


GridSearchCV(cv=RepeatedKFold(n_repeats=3, n_splits=20, random_state=0),
             estimator=Ridge(random_state=0), n_jobs=-1,
             param_grid={'alpha': array([1.00000000e+02, 9.83995230e+01, 9.68246612e+01, 9.52750047e+01,
       9.37501502e+01, 9.22497005e+01, 9.07732653e+01, 8.93204600e+01,
       8.78909065e+01, 8.64842328e+01, 8.51000725e+01, 8.37380654e+01,
       8.23978568e+01, 8.10790981e+01, 7...
       1.44930957e-05, 1.42611371e-05, 1.40328908e-05, 1.38082977e-05,
       1.35872990e-05, 1.33698374e-05, 1.31558562e-05, 1.29452998e-05,
       1.27381132e-05, 1.25342427e-05, 1.23336350e-05, 1.21362380e-05,
       1.19420003e-05, 1.17508713e-05, 1.15628013e-05, 1.13777413e-05,
       1.11956432e-05, 1.10164595e-05, 1.08401436e-05, 1.06666496e-05,
       1.04959323e-05, 1.03279473e-05, 1.01626509e-05, 1.00000000e-05])},
             verbose=1)

In [44]:
print(grid.best_params_)
print(grid.best_score_)
print(grid.best_estimator_)

{'alpha': 1.71488196987054}
0.8650431019769479
Ridge(alpha=1.71488196987054, random_state=0)


In [48]:
RR = Ridge(
    alpha = 1.71488196987054,
    random_state = 0
)

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

RR.fit(X_train, y_train)
y_pred_test = RR.predict(X_test)

print(r2_score(y_pred_test, y_test))
print(mean_absolute_error(y_test, y_pred_test))
print(mean_squared_error(y_test, y_pred_test))

0.8741342184924075
2.22334458388791
8.044533775233974


##Lasso

In [45]:
from sklearn.linear_model import Lasso

In [46]:
param_grid = {
    "alpha": np.logspace(2, -5, 1000)
}

LR = Lasso(
    random_state = 0
)

RKF = RepeatedKFold(
        n_splits = 20, 
        n_repeats = 3, 
        random_state = 0
)

grid = GridSearchCV(
    LR,
    param_grid=param_grid,
    cv=RKF,
    n_jobs=-1,
    refit=True,
    verbose = 1
)

grid.fit(X_train, y_train)

Fitting 60 folds for each of 1000 candidates, totalling 60000 fits


GridSearchCV(cv=RepeatedKFold(n_repeats=3, n_splits=20, random_state=0),
             estimator=Lasso(random_state=0), n_jobs=-1,
             param_grid={'alpha': array([1.00000000e+02, 9.83995230e+01, 9.68246612e+01, 9.52750047e+01,
       9.37501502e+01, 9.22497005e+01, 9.07732653e+01, 8.93204600e+01,
       8.78909065e+01, 8.64842328e+01, 8.51000725e+01, 8.37380654e+01,
       8.23978568e+01, 8.10790981e+01, 7...
       1.44930957e-05, 1.42611371e-05, 1.40328908e-05, 1.38082977e-05,
       1.35872990e-05, 1.33698374e-05, 1.31558562e-05, 1.29452998e-05,
       1.27381132e-05, 1.25342427e-05, 1.23336350e-05, 1.21362380e-05,
       1.19420003e-05, 1.17508713e-05, 1.15628013e-05, 1.13777413e-05,
       1.11956432e-05, 1.10164595e-05, 1.08401436e-05, 1.06666496e-05,
       1.04959323e-05, 1.03279473e-05, 1.01626509e-05, 1.00000000e-05])},
             verbose=1)

In [47]:
print(grid.best_params_)
print(grid.best_score_)
print(grid.best_estimator_)

{'alpha': 0.01355601785329369}
0.8652701580722426
Lasso(alpha=0.01355601785329369, random_state=0)


In [49]:
LR = Lasso(
    alpha = 0.01355601785329369,
    random_state = 0
)

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

LR.fit(X_train, y_train)
y_pred_test = LR.predict(X_test)

print(r2_score(y_pred_test, y_test))
print(mean_absolute_error(y_test, y_pred_test))
print(mean_squared_error(y_test, y_pred_test))

0.8735021515254487
2.2309958455815773
8.037143099336424


##Predictions

In [None]:
reg.fit(X_train, y_train)

LinearRegression()

In [None]:
y_pred = reg.predict(X_test)

In [None]:
len(y_pred)

816

In [None]:
reg.score(X_test, y_test)

1.0