
# Exemple PCA et t-SNE

Analyse en Composantes Principales (PCA) et t-SNE appliquées au dataset des Iris.


In [1]:
import numpy as np
from time import time

from sklearn import decomposition
from sklearn import datasets
from sklearn.cluster import KMeans

#Eviter les warnings
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Load the Iris dataset bundled with scikit-learn.
iris = datasets.load_iris()
# Feature matrix and target vector, unpacked in a single statement.
X, y = iris.data, iris.target

# Preview the first ten rows of the feature matrix.
X[0:10]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1]])

In [2]:
# Time the PCA fit plus the projection onto the two retained components.
t0 = time()
# `fit` returns the estimator itself, so construction and fitting can be chained.
pca = decomposition.PCA(n_components=2).fit(X)
X_pca = pca.transform(X)
print(f"Durée PCA : {round(time()-t0,4)} s.")
# Explained-variance ratio of each retained component.
print(pca.explained_variance_ratio_)
# First ten projected samples.
print(X_pca[0:10])

Durée PCA : 0.002 s.
[0.92461872 0.05306648]
[[-2.68412563  0.31939725]
 [-2.71414169 -0.17700123]
 [-2.88899057 -0.14494943]
 [-2.74534286 -0.31829898]
 [-2.72871654  0.32675451]
 [-2.28085963  0.74133045]
 [-2.82053775 -0.08946138]
 [-2.62614497  0.16338496]
 [-2.88638273 -0.57831175]
 [-2.6727558  -0.11377425]]


In [3]:
# Boolean masks over `y`, one per target code (0, 1, 2), bound in that order
# to the setosa / versicolor / virginica names used by the plotting cells.
filtre_setosa, filtre_versicolor, filtre_virginica = (
    y == code for code in (0, 1, 2)
)
print(filtre_virginica)

[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True]


In [4]:
import plotly.graph_objs as go  # More modern alternative to matplotlib


# One colour per species, indexed in the order setosa, versicolor, virginica.
palette = ['navy','red','maroon']

# One scatter trace per species so that each one gets its own legend entry
# and colour. Rows are selected with the boolean species masks.
acp_setosa = go.Scatter(
    x=X_pca[filtre_setosa, 0], y=X_pca[filtre_setosa, 1],
    name='Setosa', text='Setosa', opacity=0.9,
    marker=dict(color=palette[0], size=5), mode='markers',
)
acp_versicolor = go.Scatter(
    x=X_pca[filtre_versicolor, 0], y=X_pca[filtre_versicolor, 1],
    name='Versicolor', text='Versicolor', opacity=0.9,
    marker=dict(color=palette[1], size=5), mode='markers',
)
acp_virginica = go.Scatter(
    x=X_pca[filtre_virginica, 0], y=X_pca[filtre_virginica, 1],
    name='Virginica', text='Virginica', opacity=0.9,
    marker=dict(color=palette[2], size=5), mode='markers',
)

# The flat `titlefont` layout attribute is deprecated and rejected by recent
# Plotly releases; the nested `title=dict(text=..., font=...)` form is the
# supported spelling. Axis titles are set so the figure is readable on its own.
layout = go.Layout(
    title=dict(text="PCA - Iris", font=dict(size=40)),
    xaxis=dict(title=dict(text="Composante principale 1")),
    yaxis=dict(title=dict(text="Composante principale 2")),
    autosize=False, width=1000, height=600,
)

data = [acp_setosa, acp_versicolor, acp_virginica]
fig = go.Figure(data=data, layout=layout)
fig.show()

## Exemple T-SNE
T-SNE appliqué au dataset des Iris.

In [5]:
from sklearn import manifold

# Time the t-SNE embedding of the raw Iris features.
t0 = time()
X = iris.data
# - `n_iter=1000` was dropped: it equals the library default, and the keyword
#   was renamed to `max_iter` in recent scikit-learn releases, so passing it
#   explicitly only works on some versions.
# - `random_state` is fixed because t-SNE is stochastic; without it the
#   embedding (and the plot built from it) changes on every run.
tsne = manifold.TSNE(n_components=2, verbose=0, perplexity=3, random_state=42)
X_tsne = tsne.fit_transform(X)
print(f"Durée T-SNE : {round(time()-t0,4)} s.")

Durée T-SNE : 0.2402 s.


In [6]:
import plotly.graph_objs as go  # More modern alternative to matplotlib

# One colour per species, indexed in the order setosa, versicolor, virginica.
palette = ['navy','red','maroon']

# NOTE: the `acp_` prefix is kept only for backward compatibility of the
# notebook's variable names; in this cell the traces hold the t-SNE embedding
# (`X_tsne`), not the PCA projection.
acp_setosa = go.Scatter(
    x=X_tsne[filtre_setosa, 0], y=X_tsne[filtre_setosa, 1],
    name='Setosa', text='Setosa', opacity=0.9,
    marker=dict(color=palette[0], size=5), mode='markers',
)
acp_versicolor = go.Scatter(
    x=X_tsne[filtre_versicolor, 0], y=X_tsne[filtre_versicolor, 1],
    name='Versicolor', text='Versicolor', opacity=0.9,
    marker=dict(color=palette[1], size=5), mode='markers',
)
acp_virginica = go.Scatter(
    x=X_tsne[filtre_virginica, 0], y=X_tsne[filtre_virginica, 1],
    name='Virginica', text='Virginica', opacity=0.9,
    marker=dict(color=palette[2], size=5), mode='markers',
)

# The flat `titlefont` layout attribute is deprecated and rejected by recent
# Plotly releases; the nested `title=dict(text=..., font=...)` form is the
# supported spelling. Axis titles are set so the figure is readable on its own.
layout = go.Layout(
    title=dict(text="T-SNE - Iris", font=dict(size=40)),
    xaxis=dict(title=dict(text="t-SNE 1")),
    yaxis=dict(title=dict(text="t-SNE 2")),
    autosize=False, width=1000, height=600,
)

data = [acp_setosa, acp_versicolor, acp_virginica]
fig = go.Figure(data=data, layout=layout)
fig.show()

## Exemple K-Means

In [7]:

# Inertia of the fitted model for each candidate number of clusters.
inertias = []

# Fit KMeans for k = 1..10 and record the inertia of each model.
# `n_init` is set explicitly: its default changed between scikit-learn
# releases (10 -> 'auto'), so leaving it implicit makes the curve depend on
# the installed version.
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X)
    inertias.append(kmeans.inertia_)

## Affichage de l'inertie

In [8]:

# Elbow plot: inertia as a function of the number of clusters, drawn with Plotly.
nb_clusters = list(range(1, 11))
courbe_inertie = go.Scatter(
    x=nb_clusters,
    y=inertias,
    mode='lines+markers',
    line={'dash': 'dash', 'color': 'blue'},
    marker={'size': 8},
)

# Build the figure directly from the single trace, then style it.
fig = go.Figure(data=[courbe_inertie])
fig.update_layout(
    title="Elbow Plot pour déterminer le nombre optimal de clusters",
    xaxis_title="Nombre de clusters",
    yaxis_title="Inertie Intra",
    xaxis=dict(tickmode='linear'),
    template="plotly_white",
    width=800,   # figure width
    height=600,  # figure height
)

# Render the figure.
fig.show()

## Affichage des 3 clusters

In [10]:
import pandas as pd
import plotly.express as px

# Cluster the raw features into 3 groups.
# `n_init` is set explicitly: its default changed between scikit-learn
# releases (10 -> 'auto'), so leaving it implicit makes the labels depend on
# the installed version.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(X)

# Display the clusters in the 2-D PCA plane: the clustering is computed on
# `X`, while the coordinates plotted come from `X_pca`.
df = pd.DataFrame(X_pca, columns=["X1", "X2"])
df["Cluster"] = kmeans.labels_  # attach the cluster label of each sample

# Labels are cast to str so Plotly Express treats them as discrete categories
# rather than a continuous colour scale.
fig = px.scatter(df, x="X1", y="X2", color=df["Cluster"].astype(str),
                 title="Clustering K-Means",
                 labels={"color": "Cluster"})

fig.show()

In [None]:
# Bounded preview instead of dumping the whole feature matrix into the output.
X[:10]