Calculate the correlation between each feature and the mean valence and arousal scores to determine which features are the most important.

In [1]:
import pandas as pd

In [4]:
# Relative location of the CSV that combines the features with the valence/arousal scores
features_file = '../data/combined_features_scores.csv'

# Load the full table into a DataFrame; column names are kept exactly as written in the header
features_df = pd.read_csv(filepath_or_buffer=features_file)

In [5]:
# Columns that are not input features: the song id plus the valence/arousal statistics.
# The leading spaces are part of the column names as they are used throughout this notebook.
score_columns = ['song_id', ' valence_mean', ' valence_std', ' arousal_mean', ' arousal_std']

# Everything else, in its original column order, is treated as a feature
feature_cols = features_df.columns[~features_df.columns.isin(score_columns)].tolist()

## Valence
Start by computing the Pearson correlation scores for feature columns.

In [6]:
# Pearson correlation of every feature column with the mean valence score
correlations = {
    name: features_df[name].corr(features_df[" valence_mean"], method="pearson")
    for name in feature_cols
}

# One-column table indexed by feature name, highest correlation first
corr_df = (
    pd.Series(correlations, name="Pearson_corr")
    .to_frame()
    .sort_values(by="Pearson_corr", ascending=False)
)

print(corr_df.head(10))

                                              Pearson_corr
audspec_lengthL1norm_sma_de_stddev                0.501772
audspec_lengthL1norm_sma_amean                    0.481517
logHNR_sma_stddev                                 0.442174
audspec_lengthL1norm_sma_stddev                   0.417220
pcm_fftMag_spectralFlux_sma_de_stddev             0.413104
pcm_fftMag_spectralRollOff90.0_sma_de_stddev      0.393641
pcm_fftMag_spectralFlux_sma_amean                 0.393134
pcm_fftMag_spectralRollOff90.0_sma_stddev         0.391356
pcm_zcr_sma_de_stddev                             0.383540
pcm_fftMag_spectralRollOff90.0_sma_amean          0.383252


Inspecting the `combined_features_scores` dataframe shows that most of the features are not normally distributed. Pearson correlation only measures linear association and is sensitive to skewed data and outliers, so in addition to Pearson we also compute the rank-based Spearman correlation coefficients for the feature analysis.

In [34]:
# Spearman (rank) correlation of every feature column with the mean valence score
correlations = {
    name: features_df[name].corr(features_df[" valence_mean"], method="spearman")
    for name in feature_cols
}

# One-column table indexed by feature name, highest correlation first
corr_df = (
    pd.Series(correlations, name="Spearman_corr")
    .to_frame()
    .sort_values(by="Spearman_corr", ascending=False)
)

# Top of the ranking: the largest positive coefficients
print(corr_df.head(15))

# Bottom of the ranking: the smallest (most negative) coefficients
print(corr_df.tail(10))

                                              Spearman_corr
audspec_lengthL1norm_sma_de_stddev                 0.553357
pcm_fftMag_spectralEntropy_sma_de_stddev           0.499674
pcm_fftMag_spectralVariance_sma_amean              0.491416
pcm_fftMag_psySharpness_sma_de_stddev              0.490746
pcm_fftMag_spectralCentroid_sma_de_stddev          0.485142
pcm_fftMag_spectralRollOff90.0_sma_de_stddev       0.480478
pcm_zcr_sma_de_stddev                              0.479764
audspec_lengthL1norm_sma_amean                     0.478373
logHNR_sma_stddev                                  0.467139
pcm_fftMag_spectralFlux_sma_de_stddev              0.465329
pcm_fftMag_spectralRollOff75.0_sma_de_stddev       0.463878
pcm_fftMag_spectralRollOff90.0_sma_stddev          0.457785
pcm_fftMag_spectralVariance_sma_de_stddev          0.457591
pcm_fftMag_psySharpness_sma_stddev                 0.453474
pcm_fftMag_spectralCentroid_sma_stddev             0.452366
                                        

## Arousal

In [8]:
# Pearson correlation of every feature column with the mean arousal score
correlations = {
    name: features_df[name].corr(features_df[" arousal_mean"], method="pearson")
    for name in feature_cols
}

# One-column table indexed by feature name, highest correlation first
corr_df = (
    pd.Series(correlations, name="Pearson_corr")
    .to_frame()
    .sort_values(by="Pearson_corr", ascending=False)
)

print(corr_df.head(10))

                                              Pearson_corr
logHNR_sma_stddev                                 0.488601
audspec_lengthL1norm_sma_de_stddev                0.476519
audspec_lengthL1norm_sma_amean                    0.431377
pcm_zcr_sma_amean                                 0.426807
pcm_fftMag_spectralRollOff90.0_sma_amean          0.420858
pcm_fftMag_spectralVariance_sma_amean             0.418729
pcm_fftMag_spectralRollOff90.0_sma_de_stddev      0.416844
pcm_fftMag_spectralRollOff90.0_sma_stddev         0.406181
pcm_fftMag_spectralFlux_sma_de_stddev             0.401050
pcm_zcr_sma_de_stddev                             0.400079


In [33]:
# Spearman (rank) correlation of every feature column with the mean arousal score
correlations = {
    name: features_df[name].corr(features_df[" arousal_mean"], method="spearman")
    for name in feature_cols
}

# One-column table indexed by feature name, highest correlation first
corr_df = (
    pd.Series(correlations, name="Spearman_corr")
    .to_frame()
    .sort_values(by="Spearman_corr", ascending=False)
)

# Top of the ranking: the largest positive coefficients
print(corr_df.head(15))

# Bottom of the ranking: the smallest (most negative) coefficients
print(corr_df.tail(10))

                                              Spearman_corr
pcm_fftMag_spectralVariance_sma_amean              0.561099
pcm_fftMag_spectralEntropy_sma_de_stddev           0.544063
audspec_lengthL1norm_sma_amean                     0.539378
audspec_lengthL1norm_sma_de_stddev                 0.518857
pcm_zcr_sma_amean                                  0.513950
logHNR_sma_stddev                                  0.512589
pcm_fftMag_psySharpness_sma_de_stddev              0.510206
pcm_fftMag_spectralCentroid_sma_de_stddev          0.505381
pcm_fftMag_spectralRollOff90.0_sma_de_stddev       0.503448
pcm_fftMag_spectralRollOff90.0_sma_amean           0.497857
pcm_zcr_sma_de_stddev                              0.493612
pcm_fftMag_spectralRollOff75.0_sma_de_stddev       0.490480
pcm_fftMag_spectralRollOff90.0_sma_stddev          0.478314
pcm_fftMag_spectralVariance_sma_de_stddev          0.475591
pcm_fftMag_spectralCentroid_sma_stddev             0.465296
                                        

The top correlation scores are all below 0.6, i.e. no single feature is strongly correlated with either valence or arousal. The next step is to perform dimensionality reduction to investigate underlying factors.

## PCA Analysis

In [20]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [21]:
# Extract the feature columns as a NumPy array and standardize them in one step,
# so `x` only ever refers to the scaled matrix
x = StandardScaler().fit_transform(features_df[feature_cols].to_numpy())

In [22]:
# Dimensions of the standardized feature matrix: (number of rows, number of feature columns)
x.shape

(1802, 261)

In [27]:
# Project the standardized features onto their first five principal components.
# With few components requested from a matrix this size, scikit-learn's default
# svd_solver="auto" may pick the randomized solver, so random_state is pinned to keep
# the component scores and explained-variance ratios identical across re-runs.
pca = PCA(n_components=5, random_state=0)
pca_components = pca.fit_transform(x)

In [28]:
# Wrap the component scores in a DataFrame with one "PC<k>" column per component.
# The labels are derived from the array width so they stay in sync with the number of
# components requested from PCA instead of being hard-coded.
pca_df = pd.DataFrame(
    data=pca_components,
    columns=[f"PC{k}" for k in range(1, pca_components.shape[1] + 1)],
)
pca_df.tail()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5
1797,0.682946,-6.517677,2.211251,0.049311,0.617322
1798,9.166552,6.928986,-2.943468,0.898038,0.357143
1799,1.003915,-0.602328,0.183225,-1.881518,-0.548672
1800,-0.400704,1.23297,-1.890257,3.780309,2.43064
1801,-1.985406,-1.403166,-0.617912,1.992864,2.693162


In [29]:
# Report the share of the total variance captured by each fitted principal component
print(f'Explained variability per principal component: {pca.explained_variance_ratio_}')

Explained variability per principal component: [0.2969799  0.09378534 0.08328836 0.05999501 0.04804354]


## Factor Analysis

In [31]:
# Apply Bartlett's test of sphericity. Its null hypothesis is that the correlation matrix
# of the variables is an identity matrix, i.e. that the variables are uncorrelated.
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity

# Run the test on the feature columns only: song_id and the valence/arousal score columns
# are excluded from the feature set everywhere else in this notebook (feature_cols), so
# they must not enter the correlation matrix being tested here either.
chi_square_value, p_value = calculate_bartlett_sphericity(features_df[feature_cols])
print(f'Chi-square value: {chi_square_value}\nP-value: {p_value}')

Chi-square value: inf
P-value: 0.0


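We see that the p-value is < 0.05, so we reject the null hypothesis of Bartlett's test (that the correlation matrix is an identity matrix): the variables are significantly correlated, which means the data are suitable for factor analysis. Note, however, that the chi-square value is infinite, which suggests the determinant of the correlation matrix is numerically zero (highly collinear features), so this result should be interpreted with caution.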