# Preliminary operations

## Import libraries

In [1]:
# importing libraries
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt 
import seaborn as sns
#scaling, normalization
from sklearn.preprocessing import StandardScaler, MinMaxScaler

## Mount Google Drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Import dataset

In [3]:
# importing dataset
df = pd.read_csv("drive/MyDrive/Progetto Data Mining 1/Dataset/ravdess_features.csv")
df_prep = df.copy()
df_prep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2452 entries, 0 to 2451
Data columns (total 38 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   modality             2452 non-null   object 
 1   vocal_channel        2256 non-null   object 
 2   emotion              2452 non-null   object 
 3   emotional_intensity  2452 non-null   object 
 4   statement            2452 non-null   object 
 5   repetition           2452 non-null   object 
 6   actor                1326 non-null   float64
 7   sex                  2452 non-null   object 
 8   channels             2452 non-null   int64  
 9   sample_width         2452 non-null   int64  
 10  frame_rate           2452 non-null   int64  
 11  frame_width          2452 non-null   int64  
 12  length_ms            2452 non-null   int64  
 13  frame_count          2452 non-null   float64
 14  intensity            1636 non-null   float64
 15  zero_crossings_sum   2452 non-null   i

#Data Preparation

##Remove useless data

In [4]:
nunique = df_prep.nunique()
cols_to_drop = nunique[nunique == 1].index
df_prep.drop(cols_to_drop, axis=1, inplace=True)
df_prep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2452 entries, 0 to 2451
Data columns (total 34 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   vocal_channel        2256 non-null   object 
 1   emotion              2452 non-null   object 
 2   emotional_intensity  2452 non-null   object 
 3   statement            2452 non-null   object 
 4   repetition           2452 non-null   object 
 5   actor                1326 non-null   float64
 6   sex                  2452 non-null   object 
 7   channels             2452 non-null   int64  
 8   frame_width          2452 non-null   int64  
 9   length_ms            2452 non-null   int64  
 10  frame_count          2452 non-null   float64
 11  intensity            1636 non-null   float64
 12  zero_crossings_sum   2452 non-null   int64  
 13  mfcc_mean            2452 non-null   float64
 14  mfcc_std             2452 non-null   float64
 15  mfcc_min             2452 non-null   f

##Delete high correlation variables

In [5]:
cor_matrix = df_prep.corr().abs()
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape),k=1).astype(np.bool))
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.75)]
print(to_drop)
df_prep.drop(df_prep[to_drop].columns, axis=1, inplace=True)

df_prep.info()

['frame_width', 'frame_count', 'mfcc_std', 'mfcc_min', 'sc_skew', 'stft_skew', 'std', 'min', 'max']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2452 entries, 0 to 2451
Data columns (total 25 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   vocal_channel        2256 non-null   object 
 1   emotion              2452 non-null   object 
 2   emotional_intensity  2452 non-null   object 
 3   statement            2452 non-null   object 
 4   repetition           2452 non-null   object 
 5   actor                1326 non-null   float64
 6   sex                  2452 non-null   object 
 7   channels             2452 non-null   int64  
 8   length_ms            2452 non-null   int64  
 9   intensity            1636 non-null   float64
 10  zero_crossings_sum   2452 non-null   int64  
 11  mfcc_mean            2452 non-null   float64
 12  mfcc_max             2452 non-null   float64
 13  sc_mean              2452 non-null   f

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape),k=1).astype(np.bool))


In [6]:
df_prep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2452 entries, 0 to 2451
Data columns (total 25 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   vocal_channel        2256 non-null   object 
 1   emotion              2452 non-null   object 
 2   emotional_intensity  2452 non-null   object 
 3   statement            2452 non-null   object 
 4   repetition           2452 non-null   object 
 5   actor                1326 non-null   float64
 6   sex                  2452 non-null   object 
 7   channels             2452 non-null   int64  
 8   length_ms            2452 non-null   int64  
 9   intensity            1636 non-null   float64
 10  zero_crossings_sum   2452 non-null   int64  
 11  mfcc_mean            2452 non-null   float64
 12  mfcc_max             2452 non-null   float64
 13  sc_mean              2452 non-null   float64
 14  sc_std               2452 non-null   float64
 15  sc_min               2452 non-null   f

##Split Dataset

In [7]:
df = df_prep

In [8]:
# get data columns
df_data = df.loc[:, "length_ms":"skew"]

In [9]:
X_train = df_data.dropna().drop("intensity", axis=1).values
y_train = df_data["intensity"].dropna().values
y_train = np.array(y_train, dtype=np.float64, order="C")

X_test = df_data[df_data["intensity"].isna()].drop("intensity", axis=1).values

In [10]:
# train, test, split
from sklearn.model_selection import train_test_split
X = df_data.dropna().drop("intensity", axis=1).values
y = df_data["intensity"].dropna().values
X_missing_values = df_data[df_data["intensity"].isna()].drop("intensity", axis=1).values

# split train- and test-set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1, 
    random_state=0,
)

# standardizing the new training set and the test set
std_sclr = StandardScaler()
std_sclr.fit(X_train)
X_train = std_sclr.transform(X_train)
X_test = std_sclr.transform(X_test)

In [11]:
print(len(X_train))
print(len(y_train))
print()
print(len(X_test))

1472
1472

164


#Regression

##Import libraries

In [12]:
# import main libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

In [13]:
# stats
from scipy import stats
#scaling, normalization
from sklearn.preprocessing import StandardScaler
# train, test, split
from sklearn.model_selection import train_test_split
# repeated stratified kfold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import StratifiedKFold
# randomized and grid search cv
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
# Dummy clf
from sklearn.dummy import DummyRegressor

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)

In [14]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler

##Hyperparameters tuning

In [15]:
# get two baseline (for validation- and for test- set)
dummy_clf = DummyRegressor(strategy="median")
dummy_clf.fit(X_train, y_train)

print(dummy_clf.score(X=None, y=y_train))

-0.007917016080284833


In [16]:
# GridSearchCV
param_grid = {
    "n_neighbors": range(1, X_train.shape[0]//2),
    "weights": ["uniform", "distance"],
    "metric": ["cityblock", "euclidean"]
}

KNR = KNeighborsRegressor()

RKF = RepeatedKFold(
        n_splits = 20, 
        n_repeats = 3, 
        random_state = 0
)

grid = RandomizedSearchCV(
    KNR,
    n_iter=1500,
    param_distributions = param_grid,
    cv=RKF,
    n_jobs = -1,
    refit = True,
    verbose=1,
    random_state=0
)

# fit GridSearchCV
grid.fit(X_train, y_train)

Fitting 60 folds for each of 1500 candidates, totalling 90000 fits


RandomizedSearchCV(cv=RepeatedKFold(n_repeats=3, n_splits=20, random_state=0),
                   estimator=KNeighborsRegressor(), n_iter=1500, n_jobs=-1,
                   param_distributions={'metric': ['cityblock', 'euclidean'],
                                        'n_neighbors': range(1, 736),
                                        'weights': ['uniform', 'distance']},
                   random_state=0, verbose=1)

In [17]:
print(grid.best_params_)
print(grid.best_score_)
print(grid.best_estimator_)

{'weights': 'distance', 'n_neighbors': 6, 'metric': 'cityblock'}
0.8042371121109305
KNeighborsRegressor(metric='cityblock', n_neighbors=6, weights='distance')


###Test-set performance

In [18]:
KNR = KNeighborsRegressor(
    metric='cityblock', 
    n_neighbors=6, 
    weights='distance'
)

In [22]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

KNR.fit(X_train, y_train)
y_pred_test = KNR.predict(X_test)

print(r2_score(y_pred_test, y_test))
print(mean_absolute_error(y_test, y_pred_test))
print(mean_squared_error(y_test, y_pred_test))

0.7458136936158765
2.7894339645704544
13.020627556564255
