# PCA

In [3]:
from pandas import read_csv

In [4]:
# Load the dataset CSV into a DataFrame and preview its first rows.
data = read_csv('snd-dataset-from-plain-json.csv')

data.head()

Unnamed: 0,Duration,Loudness,LogAttackTime,Tempo,Tempo.confidence,TemporalCentroid,SingleEvent,Loop,Tonality,Tonality.confidence,DynamicRange,Note.midi,Note.frequency,Note.confidence,Genre,Mood
0,24.218412,-16.581459,0.769376,95,0.133154,0.498596,False,False,G major,0.524679,9.689243,55,197.9729,0.0,Genre B,Mood B
1,243.983673,-16.891335,1.618665,65,0.545527,0.479576,False,False,G major,0.785114,5.247044,40,85.456451,0.0,Genre A,Mood A
2,15.281632,-21.658251,0.582658,63,0.996905,0.492315,True,True,C minor,0.698095,1.060242,50,151.972198,0.352345,Genre B,Mood B
3,2.0,-10.525232,-1.590209,119,0.0,0.468918,False,False,G# minor,0.64668,0.0,41,91.402817,0.0,Genre A,Mood A
4,1.45415,-28.335722,-0.492548,152,0.0,0.502481,True,False,F# minor,0.408481,0.0,107,3984.657227,0.695633,Genre A,Mood A


In [5]:
# Dataset dimensions as (rows, columns): 16 data columns
data.shape

(1017, 16)

In [6]:
# Discard the columns that will not be used for now.
# Tip: axis number (0 for rows and 1 for columns)
#
# All labels are dropped in ONE call so the operation is atomic: if any label
# is missing pandas raises KeyError and `data` is left untouched, instead of
# being reassigned after each successful drop and ending up half-trimmed.
# NOTE(review): the recorded run of this cell failed with
# KeyError "['Pitch.confidence'] not found in axis", and the CSV preview lists
# 'Note.confidence' / 'Tonality.confidence' instead of 'Pitch.confidence' /
# 'Key.confidence' -- confirm which column names the current dataset uses.
data = data.drop(
    ["Tempo.confidence", "Pitch.confidence", "Key.confidence", "Loop"],
    axis=1,
)

data.head()

KeyError: "['Pitch.confidence'] not found in axis"

In [131]:
# Ordered catalogue of the 24 tonality names; the position of each entry in
# this list is the numeric category assigned to it.
key_to_number_list = [
    'A minor', 'A major', 'A# minor', 'A# major',
    'B minor', 'B major',
    'C minor', 'C major', 'C# minor', 'C# major',
    'D minor', 'D major', 'D# minor', 'D# major',
    'E minor', 'E major',
    'F minor', 'F major', 'F# minor', 'F# major',
    'G minor', 'G major', 'G# minor', 'G# major',
]


def keyToNumber(x_value):
    """Return the position of ``x_value`` inside ``key_to_number_list``.

    Args:
        x_value: tonality name to look up (e.g. ``'B minor'``).

    Returns:
        The 0-based index of the first list entry equal to ``x_value``.

    Raises:
        IndexError: if no entry of the list equals ``x_value``.
    """
    matching_positions = []
    for position, tonality in enumerate(key_to_number_list):
        if tonality == x_value:
            matching_positions.append(position)
    # Taking element 0 of an empty list is what raises IndexError for
    # values that are not in the catalogue.
    return matching_positions[0]

In [132]:
# Quick check: 'B minor' is the fifth entry of key_to_number_list, so this should give 4
keyToNumber('B minor')

4

In [133]:
# Map the tonalities given 'as text' to numeric categories
# (each value of the 'Key' column is replaced by what keyToNumber returns for it)
data['Key'] = data['Key'].map(keyToNumber)

In [134]:
# Preview the frame after mapping 'Key' to numbers
data.head()

Unnamed: 0,Duration,Loudness,LogAttackTime,Tempo,TemporalCentroid,Pitch,Key
0,24.688,-9.428771,0.643956,122,0.554729,150.475891,0
1,9.743673,-15.724195,-1.538928,99,0.515523,172.650284,3
2,2.965,-41.118546,-0.108449,98,0.436209,1856.814087,8
3,212.834106,-13.990514,1.079394,94,0.471911,233.854294,5
4,1.985306,-12.687588,-0.707071,125,0.517171,269.756226,7


In [172]:
# Use the whole frame as the PCA input (X is another name for the same object as data)
X = data
#y = data["Key"]

In [151]:
from sklearn.decomposition import PCA

# Fit a PCA with default parameters on the unscaled X
pca = PCA()
pca.fit(X)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [152]:
# Display the estimator and its parameters
pca

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [153]:
# Number of components of the fitted PCA
pca.n_components_

6

In [154]:
# Note: in sklearn, PCA is implemented as an SVD decomposition
# Book "Hands-On ML", p.214 (chapter 8)
# Keep only two components, project X onto them, and show the
# explained-variance ratio of each one.
pca = PCA(n_components = 2)
X2D = pca.fit_transform(X)
pca.explained_variance_ratio_

array([0.92436908, 0.07324825])

# Conclusión parcial

Hasta acá está mal porque 'Key' es una variable categórica
y no se normalizaron los datos.


# Rearmado

## Escalado de los datos (normalización)
Media = 0 y varianza = 1

    http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.scale.html



In [173]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
import numpy as np

# Standardise every column to zero mean and unit variance before running PCA.
# The former `X = data.drop('Key', axis=1)` was overwritten by `X = data` on
# the very next line, so it never had any effect; the dead assignment is
# removed and X keeps all the columns of `data`, as before.
# NOTE(review): 'Key' (a categorical code) is therefore still part of X --
# use `X = data.drop('Key', axis=1)` here if it should be excluded from the PCA.
X = data
X_scaled = scale(X, axis=0, with_mean=True, with_std=True, copy=True)

In [174]:
import pandas as pd
# Wrap the scaled values in a DataFrame for inspection (no column names are
# passed, so the columns get default integer labels) and check its dimensions.
df = pd.DataFrame(X_scaled)

df.shape

(1711, 7)

In [175]:
# Preview the first rows of the scaled data
df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,-0.863956,0.980326,-0.274263,0.228131,0.77503,-0.274007,-1.569148
1,-0.958866,0.083658,-2.273533,-0.611371,0.213123,-0.233947,-1.148529
2,-1.001917,-3.533299,-0.963379,-0.647871,-0.923631,2.808588,-0.447497
3,0.330945,0.33059,0.124548,-0.793872,-0.411946,-0.123379,-0.868116
4,-1.008139,0.516167,-1.511648,0.337631,0.236739,-0.05852,-0.587703


In [176]:
# Note: in sklearn, PCA is implemented as an SVD decomposition
# Book "Hands-On ML", p.214 (chapter 8)

# Fit a PCA with default parameters on the scaled data
pca = PCA()
pca.fit(X_scaled)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [177]:
# number of resulting components
pca.n_components_

7

In [178]:
# these are the eigenvalues (sorted from largest to smallest)
pca.explained_variance_

array([1.86825943, 1.12846829, 1.02609944, 0.9663424 , 0.80478083,
       0.75220448, 0.4579387 ])

In [179]:
# Share of the summed eigenvalues that corresponds to the first one
# ("avalores" is short for "autovalores", i.e. eigenvalues)
avalores = pca.explained_variance_
avalores[0]/sum(avalores)

0.2667382165108527

In [180]:
# and this shows the 'variability' contributed by each component
# (what it represents as a fraction of the total)
pca.explained_variance_ratio_

array([0.26673822, 0.16111554, 0.14649996, 0.13796823, 0.1149015 ,
       0.10739498, 0.06538158])

In [181]:
# And the cumulative total as dimensions are added
np.cumsum(pca.explained_variance_ratio_)

array([0.26673822, 0.42785375, 0.57435371, 0.71232195, 0.82722344,
       0.93461842, 1.        ])

# Conclusión

*Recién con 4 dimensiones tengo algo interesante* (≈ 71 % de la varianza acumulada).
Pero en realidad hacen falta 5 (≈ 83 %).



### Ecuación del modelo



In [164]:
# component 0 (column 0 of the transposed components_ matrix)
pca.components_.T[:,0]

array([ 0.5669665 ,  0.24186696,  0.60294714,  0.14727468,  0.41608323,
       -0.24837278])

In [165]:
# component 1 (eigenvector of eigenvalue 1)
pca.components_.T[:,1]

array([ 0.08071659, -0.62663587,  0.14574249,  0.04287633,  0.40996551,
        0.64004793])

# Conclusión parcial


---

**Siguiente:** [4 - Conclusión final](4%20-%20Conclusión%20final.ipynb)