# Predecir tonalidad en función del resto de las variables

(dataset con ontología json-ld)

In [1]:
import pandas as pd
import numpy as np

from sklearn import linear_model
import statsmodels.api as sm

# Print NumPy floating-point values with two digits of precision.
np.set_printoptions(precision=2)

## Dataset con ontología json-ld

In [19]:
# Load the dataset and discard, in a single pass, the columns that are not
# used for now (the three confidence columns and the Loop column).
data = (
    pd.read_csv('snd-dataset-from-json-ld.csv')  # original dataset
    # pd.read_csv('snd-segmented-dataset-from-json-ld.csv')  # alternative input
    .drop(columns=[
        "Tempo.confidence",
        "Pitch.confidence",
        "Key.confidence",
        "Loop",
    ])
)

data.head()

Unnamed: 0,Duration,Loudness,LogAttackTime,Tempo,TemporalCentroid,Pitch,Key
0,24.688,-9.428771,0.643956,122,0.554729,150.475891,A minor
1,9.743673,-15.724195,-1.538928,99,0.515523,172.650284,A# major
2,2.965,-41.118546,-0.108449,98,0.436209,1856.814087,C# minor
3,212.834106,-13.990514,1.079394,94,0.471911,233.854294,B major
4,1.985306,-12.687588,-0.707071,125,0.517171,269.756226,C major


In [20]:
# Dimensions of the frame as (rows, columns).
data.shape

(1711, 7)

## Correlación entre variables

In [21]:
# Pearson (the standard linear correlation) between the numeric columns.
# The frame still contains the text column 'Key' (e.g. "A minor"); pandas >= 2.0
# raises on non-numeric columns instead of silently skipping them, so the
# numeric columns are selected explicitly. The resulting table is the same.
data.select_dtypes(include='number').corr(method='pearson', min_periods=1)

Unnamed: 0,Duration,Loudness,LogAttackTime,Tempo,TemporalCentroid,Pitch
Duration,1.0,0.125187,0.520356,0.096701,0.229799,-0.133888
Loudness,0.125187,1.0,0.120566,0.05565,0.02433,-0.184486
LogAttackTime,0.520356,0.120566,1.0,0.068165,0.352514,-0.149583
Tempo,0.096701,0.05565,0.068165,1.0,0.044982,0.006298
TemporalCentroid,0.229799,0.02433,0.352514,0.044982,1.0,-0.00867
Pitch,-0.133888,-0.184486,-0.149583,0.006298,-0.00867,1.0


Se observa una correlación muy baja entre las variables; la única excepción es una correlación moderada (≈ 0,52) entre Duration y LogAttackTime.

# Intento de predicción del Tempo a partir de la duración

In [22]:
# Predictor (independent variable, x).
Duration = np.asarray(data['Duration'])
# Response (dependent variable, y): the quantity being predicted.
Tempo = np.asarray(data['Tempo'])

In [23]:
# Simple linear fit by ordinary least squares: Tempo explained by Duration.
# add_constant puts an intercept column in front of the single predictor.
linearRegresionTest = sm.add_constant(Duration, prepend=True)
model = sm.OLS(endog=Tempo, exog=linearRegresionTest).fit()

model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.009
Model:,OLS,Adj. R-squared:,0.009
Method:,Least Squares,F-statistic:,16.13
Date:,"Thu, 21 Feb 2019",Prob (F-statistic):,6.16e-05
Time:,12:55:59,Log-Likelihood:,-8083.9
No. Observations:,1711,AIC:,16170.0
Df Residuals:,1709,BIC:,16180.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,113.0455,0.943,119.933,0.000,111.197,114.894
x1,0.0168,0.004,4.016,0.000,0.009,0.025

0,1,2,3
Omnibus:,32.962,Durbin-Watson:,1.938
Prob(Omnibus):,0.0,Jarque-Bera (JB):,65.947
Skew:,-0.014,Prob(JB):,4.78e-15
Kurtosis:,3.961,Cond. No.,322.0


In [24]:
# Take the adjusted R-squared from the fitted model rather than retyping the
# number shown in the summary, so it cannot go stale when the data changes.
# Rounded to three decimals to match the value displayed in the summary table.
AdjRSquared = round(model.rsquared_adj, 3)

print("El R cuadrado es muy bajo!")

El R cuadrado es muy bajo!


## Regresión múltiple (varias variables)

In [25]:
# https://stackoverflow.com/questions/11479064/multiple-linear-regression-in-python
def reg_multiple(y, x):
    """Fit an ordinary least squares model of ``y`` on several predictors.

    Adapted from
    https://stackoverflow.com/questions/11479064/multiple-linear-regression-in-python

    Parameters
    ----------
    y : array-like of length n
        Observed values of the response variable.
    x : sequence of array-likes, or 2-D array with one predictor per row
        The predictors, each of length n. Must contain at least one.

    Returns
    -------
    The fitted results object produced by ``sm.OLS(y, X).fit()``.

    Notes
    -----
    The design matrix holds the predictors in REVERSE input order followed by
    an intercept column of ones. In ``summary()`` this means ``x1`` is the
    last predictor passed in, the highest-numbered ``x`` is ``x[0]``, and
    ``const`` comes last.
    """
    intercept = np.ones(len(x[0]))
    # Build the matrix in one stack. The earlier per-predictor
    # sm.add_constant(...) calls were no-ops: with the ones column already
    # present, add_constant's default has_constant='skip' returns its input
    # unchanged. Column order is kept exactly as before.
    design = np.column_stack(list(x[::-1]) + [intercept])
    return sm.OLS(y, design).fit()

## Predicción de Tempo a partir de Duration y TemporalCentroid

In [26]:
# Additional columns pulled out of the frame as plain NumPy arrays.
TemporalCentroid = np.asarray(data['TemporalCentroid'])
Pitch = np.asarray(data['Pitch'])

# Response is Tempo; X holds one predictor per row.
y = Tempo
X = np.array([Duration, TemporalCentroid])
#X = np.array([Pitch, TemporalCentroid])  # alternative predictor pair

# reg_multiple stacks the predictors in reverse order, so in the summary
# x1 is TemporalCentroid, x2 is Duration and const is the intercept.
reg_multiple(y, X).summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.01
Model:,OLS,Adj. R-squared:,0.009
Method:,Least Squares,F-statistic:,8.537
Date:,"Thu, 21 Feb 2019",Prob (F-statistic):,0.000204
Time:,12:56:00,Log-Likelihood:,-8083.5
No. Observations:,1711,AIC:,16170.0
Df Residuals:,1708,BIC:,16190.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,9.4355,9.714,0.971,0.332,-9.617,28.488
x2,0.0159,0.004,3.686,0.000,0.007,0.024
const,108.4761,4.798,22.609,0.000,99.066,117.886

0,1,2,3
Omnibus:,31.469,Durbin-Watson:,1.94
Prob(Omnibus):,0.0,Jarque-Bera (JB):,61.797
Skew:,-0.006,Prob(JB):,3.81e-14
Kurtosis:,3.931,Cond. No.,3680.0


In [27]:
# Takeaway from the summary above: R-squared is still only about 0.01.
print("El R cuadrado es muy bajo!")

El R cuadrado es muy bajo!
