# Predecir tonalidad en función el resto de las variables

(dataset SEGMENTADO/PARTICIONADO con ontología json-ld)

In [3]:
import pandas as pd
import numpy as np

from sklearn import linear_model
import statsmodels.api as sm

np.set_printoptions(precision=2)

## Dataset con ontología json-ld

In [6]:
#data = pd.read_csv('snd-dataset-from-json-ld.csv')
data = pd.read_csv('snd-segmented-dataset-from-json-ld.csv')

# Descarto columnas que no se van a usar por ahora
# Tip: axis number (0 for rows and 1 for columns)
data = data.drop("Tempo.confidence", axis=1);
data = data.drop("Pitch.confidence", axis=1);
data = data.drop("Key.confidence", axis=1);
data = data.drop("Loop", axis=1);

data.head()

Unnamed: 0,Duration,Loudness,LogAttackTime,Tempo,TemporalCentroid,Pitch,Key
0,5.0,-15.304321,0.662552,103,0.572003,384.759399,G major
1,5.0,-19.805204,-0.860892,99,0.539102,67.521309,A major
2,5.0,-24.740988,-1.379621,116,0.431652,427.887085,F# major
3,5.0,-10.838598,0.478131,146,0.497631,263.397491,C major
4,5.0,-35.42086,0.560322,98,0.49889,126.194229,A# minor


In [7]:
data.shape

(56083, 7)

## Correlación entre variables

In [8]:
data.corr(method='pearson', min_periods=1) # pearson -> método estándar

Unnamed: 0,Duration,Loudness,LogAttackTime,Tempo,TemporalCentroid,Pitch
Duration,1.0,0.007493,0.058741,0.019578,0.038726,-0.030447
Loudness,0.007493,1.0,-0.081416,0.046157,0.057607,-0.077931
LogAttackTime,0.058741,-0.081416,1.0,-0.001458,0.290779,-0.025086
Tempo,0.019578,0.046157,-0.001458,1.0,-0.017688,-0.005988
TemporalCentroid,0.038726,0.057607,0.290779,-0.017688,1.0,0.006011
Pitch,-0.030447,-0.077931,-0.025086,-0.005988,0.006011,1.0


Se observa muy baja correlación excepto algo apenas aceptable entre Duration y LogAttackTime

## Escalado de los datos

In [9]:
from sklearn.preprocessing import scale

X = data.drop('Key', axis=1)

In [10]:
X_scaled = scale(X, axis=0, with_mean=True, with_std=True, copy=True)
df = pd.DataFrame(X_scaled)
df.corr(method='pearson', min_periods=1) # pearson -> método estándar

  """Entry point for launching an IPython kernel.


Unnamed: 0,0,1,2,3,4,5
0,1.0,0.007493,0.058741,0.019578,0.038726,-0.030447
1,0.007493,1.0,-0.081416,0.046157,0.057607,-0.077931
2,0.058741,-0.081416,1.0,-0.001458,0.290779,-0.025086
3,0.019578,0.046157,-0.001458,1.0,-0.017688,-0.005988
4,0.038726,0.057607,0.290779,-0.017688,1.0,0.006011
5,-0.030447,-0.077931,-0.025086,-0.005988,0.006011,1.0


# Intento de predicción del Tempo a partir de la duración

In [11]:
Duration = np.asarray( data.loc[:, 'Duration' ] )# body ---> variable dependiente --> x 
Tempo = np.asarray( data.loc[:, 'Tempo' ] ) # heart weight ---> variable independiente --> y (lo que predigo)

In [12]:
# esto es un ajuste lineal (cuadrados mínimos)
linearRegresionTest = sm.add_constant( Duration )
model = sm.OLS(Tempo, linearRegresionTest).fit()

model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.0
Model:,OLS,Adj. R-squared:,0.0
Method:,Least Squares,F-statistic:,21.5
Date:,"Thu, 21 Feb 2019",Prob (F-statistic):,3.54e-06
Time:,19:06:25,Log-Likelihood:,-262940.0
No. Observations:,56083,AIC:,525900.0
Df Residuals:,56081,BIC:,525900.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,100.4540,4.495,22.347,0.000,91.644,109.265
x1,4.1721,0.900,4.637,0.000,2.409,5.935

0,1,2,3
Omnibus:,2375.301,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2683.508
Skew:,0.534,Prob(JB):,0.0
Kurtosis:,2.905,Cond. No.,210.0


In [13]:
#AdjRSquared = 0.009
#AdjRSquared

print("El R cuadrado es muy bajo!")

El R cuadrado es muy bajo!


## Regresión múltiple (varias variables)

In [14]:
# https://stackoverflow.com/questions/11479064/multiple-linear-regression-in-python
def reg_multiple(y, x):
    ones = np.ones(len(x[0]))
    X = sm.add_constant(np.column_stack((x[0], ones)))
    for ele in x[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))
    results = sm.OLS(y, X).fit()
    return results

## Predicción de Tempo a partir de Duration y TemporalCentroid

In [15]:
TemporalCentroid = np.asarray( data.loc[:, 'TemporalCentroid' ] )
Pitch = np.asarray( data.loc[:, 'Pitch' ] )

y = Tempo

X = np.array( [ Duration, TemporalCentroid])
#X = np.array( [ Pitch, TemporalCentroid])

reg_multiple(y, X).summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.001
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,20.32
Date:,"Thu, 21 Feb 2019",Prob (F-statistic):,1.51e-09
Time:,19:06:26,Log-Likelihood:,-262930.0
No. Observations:,56083,AIC:,525900.0
Df Residuals:,56080,BIC:,525900.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,-7.9495,1.818,-4.373,0.000,-11.513,-4.387
x2,4.3245,0.900,4.804,0.000,2.560,6.089
const,103.6648,4.554,22.764,0.000,94.739,112.591

0,1,2,3
Omnibus:,2372.803,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2680.367
Skew:,0.533,Prob(JB):,0.0
Kurtosis:,2.905,Cond. No.,214.0


In [16]:
print("El R cuadrado es muy bajo!")

El R cuadrado es muy bajo!
