
<img src="img/viu_logo.png" width="200">

## 04EPPY - Ciencia de Datos e Inteligencia Artificial
### Unsupervised Learning

![logo](img/python_logo.png)

*Òscar Garibo*

# PCA
- Análisis de Componentes Principales (Principal Component Analysis)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the iris dataset through seaborn and preview five random rows.
df_iris = sns.load_dataset("iris")
df_iris.sample(n=5)

In [None]:
# Pairwise relationships between the variables, colored by species.
sns.pairplot(df_iris, hue="species")

In [None]:
# Correlation matrix of the numeric variables.
# The string column 'species' must be excluded explicitly: since pandas 2.0
# DataFrame.corr() no longer drops non-numeric columns silently and raises.
display(df_iris.select_dtypes(include="number").corr())

In [None]:
# Heatmap of the correlation matrix.
# Only numeric columns are correlated; the string column 'species' would make
# DataFrame.corr() raise on pandas >= 2.0.
sns.heatmap(
    df_iris.select_dtypes(include="number").corr(),
    square=True,
    annot=True,
)

In [None]:
# Remove the 'species' label column so only the measurements go into PCA.
df_pca = df_iris.drop(columns="species")
display(df_pca)

In [None]:
from sklearn.preprocessing import StandardScaler, scale

In [None]:
# IPython help query: show the documentation of StandardScaler.
StandardScaler?

In [None]:
# IPython help query: show the documentation of scale.
scale?

In [None]:
# Standardize the features with StandardScaler, then print the shape and
# the contents of the scaled matrix.
x_scaled = StandardScaler()
scaled = x_scaled.fit(df_pca).transform(df_pca)
print(scaled.shape, scaled, sep="\n")

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Fit a three-component PCA model on the standardized data.
n_components = 3
pca = PCA(n_components=n_components)
pca.fit(X=scaled)

In [None]:
# Project the standardized data onto the fitted principal components
# and show the shape of the result.
x_pca = pca.transform(X=scaled)
x_pca.shape

In [None]:
# Echo the projected data (the notebook displays the cell's last expression).
x_pca

In [None]:
# Plot the cumulative explained variance against the number of components.
# The x-values are derived from the fitted model rather than hard-coded as
# [1, 2, 3], so the cell keeps working when n_components changes.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
fig, ax = plt.subplots()
ax.plot(range(1, len(cumulative_variance) + 1), cumulative_variance)
plt.show()

In [None]:
# Collect the explained-variance ratio of each component into a plain list.
columns = list(pca.explained_variance_ratio_)
columns

In [None]:
# Build the column names AV1, AV2, ... — one per fitted component.
columns = [f"AV{k}" for k in range(1, len(pca.explained_variance_ratio_) + 1)]
columns

In [None]:
# Wrap the projected data in a DataFrame labelled with the component names,
# then display it together with its shape.
df = pd.DataFrame(data=x_pca, columns=columns)
display(df)
df.shape

In [None]:
# Scatter plot of the first two components.
plt.scatter(x=df["AV1"], y=df["AV2"])
plt.show()

In [None]:
# Re-attach the 'species' column that was dropped before PCA (joined on the
# index) and preview five random rows.
df_joined = df.join(df_iris.species, how="inner")
display(df_joined.sample(n=5))

In [None]:
# First vs. third component, colored by species.
sns.scatterplot(x="AV1", y="AV3", hue="species", data=df_joined)

In [None]:
# First vs. second component, colored by species with the 'magma' palette.
sns.scatterplot(
    x="AV1",
    y="AV2",
    hue="species",
    palette="magma",
    data=df_joined,
)

In [None]:
from mpl_toolkits.mplot3d import Axes3D

In [None]:
# 3D scatter plot of the three components, colored by species.
color_labels = df_joined['species'].unique()
# Size the palette from the data instead of hard-coding three species.
rgb_values = sns.color_palette("magma", len(color_labels))
color_map = dict(zip(color_labels, rgb_values))

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(projection='3d')
ax.set_title('Iris Dataset PCA', size=14)
ax.scatter(
    df_joined['AV1'],
    df_joined['AV2'],
    df_joined['AV3'],
    c=df_joined['species'].map(color_map),
)
ax.set_xlabel('AutoVector_1')
ax.set_ylabel('AutoVector_2')
ax.set_zlabel('AutoVector_3')
# Hide the tick labels. The old w_xaxis / w_yaxis / w_zaxis aliases were
# deprecated and later removed from Matplotlib (3.8); xaxis / yaxis / zaxis
# are the supported attributes and work on older releases as well.
ax.xaxis.set_ticklabels([])
ax.yaxis.set_ticklabels([])
ax.zaxis.set_ticklabels([])
plt.show()