<a href="https://colab.research.google.com/github/MachineLearnia/Python-Machine-Learning/blob/master/24%20-%20Sklearn%20%3A%20Apprentissage%20Non-supervis%C3%A9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn import metrics
import pandas as pd
import seaborn as sns
from scipy.stats import f_oneway, kruskal, chi2_contingency

In [2]:
# Load the raw housing table; read_table parses tab-separated text by default.
# NOTE(review): the file is named .csv but is parsed as tab-separated — confirm the delimiter.
raw_data = pd.read_table("input/housing.csv")
raw_data.head()

FileNotFoundError: [Errno 2] No such file or directory: 'input/housing.csv'

In [None]:
# (number of rows, number of columns) of the raw table
raw_data.shape

In [None]:
# Column names, dtypes and non-null counts of the raw table
raw_data.info()

In [None]:
# Names of the columns stored with the generic "object" dtype
stringcols = raw_data.select_dtypes(include="object").columns
print(stringcols)

In [None]:
# Work on a copy so the raw table stays untouched.
raw_data_conv = raw_data.copy()
# Cast directly to the nullable "string" dtype. Going through astype(str)
# first would turn every missing value into the literal text "nan", which
# isnull() no longer reports as missing.
raw_data_conv[stringcols] = raw_data_conv[stringcols].astype("string")
raw_data_conv.info()

In [None]:
# Boolean mask of missing cells, computed once and reused below
missing_mask = raw_data_conv.isnull()
# Number of missing values per column, largest first
print(missing_mask.sum().sort_values(ascending=False))
# Proportion of missing values per column, largest first
missing_mask.mean().sort_values(ascending=False)

In [None]:
# Fill the gaps in total_bedrooms with the column median, then summarise the result
bedrooms = raw_data_conv["total_bedrooms"]
data_median = bedrooms.fillna(bedrooms.median())
data_median.describe()

# 24/30 Apprentissage Non-Supervisé

In [None]:
def compare_clusters_full_with_total(df, cluster_col,
                                     numeric_vars=None,
                                     cat_vars=None,
                                     test_type="anova"):
    """Profile clusters variable by variable and test for between-cluster differences.

    For each numeric variable the function draws one box plot per cluster plus
    a pooled "Total" box, prints ``describe()`` per group, and runs a one-way
    ANOVA (``test_type="anova"``) or a Kruskal-Wallis test (any other value).
    For each categorical variable it prints and plots the cluster x category
    contingency table and runs a chi-square test on it.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the cluster labels and the variables to analyse.
        It is not modified; the function works on a copy.
    cluster_col : column label
        Column of ``df`` containing the cluster labels (cast to ``str``).
    numeric_vars : list of column labels, optional
        Numeric columns to compare across clusters.
    cat_vars : list of column labels, optional
        Categorical columns to cross-tabulate against the clusters.
    test_type : str, default "anova"
        ``"anova"`` selects ``f_oneway``; anything else selects ``kruskal``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Numeric and categorical test summaries, one row per variable, with
        columns ``variable``, ``test``, ``stat`` and ``p_value``. The columns
        are present even when a summary is empty.

    Notes
    -----
    The summaries are shown with ``display``, which the notebook kernel
    provides; it is only called when there is something to show.
    """
    result_columns = ["variable", "test", "stat", "p_value"]
    df_copy = df.copy()
    numeric_results = []
    categorical_results = []

    # Cluster labels become strings so they can sit next to the "Total" label.
    df_copy[cluster_col] = df_copy[cluster_col].astype(str)
    unique_clusters = df_copy[cluster_col].unique().tolist()

    # Tests run on real clusters only; computed once because it does not
    # depend on the variable being analysed.
    clusters_only = df_copy[df_copy[cluster_col] != "Total"]

    if numeric_vars:
        # Colours are only needed for the box plots.
        palette = sns.color_palette("Set2", n_colors=len(unique_clusters))
        palette_dict = dict(zip(unique_clusters, palette))
        palette_dict["Total"] = "blue"

        if test_type == "anova":
            test_name, test_func = "ANOVA", f_oneway
        else:
            test_name, test_func = "Kruskal-Wallis", kruskal

        print("\n=== Analyse des variables numériques ===\n")
        for var in numeric_vars:
            # Duplicate every row under a "Total" label to get a pooled box.
            total_df = df_copy[[var]].copy()
            total_df[cluster_col] = "Total"
            df_with_total = pd.concat([df_copy[[cluster_col, var]], total_df], axis=0)

            plt.figure(figsize=(10, 5))
            sns.boxplot(data=df_with_total, x=cluster_col, y=var, hue=cluster_col, palette=palette_dict, legend=False)
            plt.title(f'Distribution de {var} par cluster (incluant Total)')
            plt.tight_layout()
            plt.show()

            # Descriptive statistics, pooled "Total" group included
            print(f"\n Statistiques pour '{var}' :")
            print(df_with_total.groupby(cluster_col)[var].describe())

            grouped_data = [group[var].dropna() for _, group in clusters_only.groupby(cluster_col)]
            if len(grouped_data) >= 2:
                stat, p = test_func(*grouped_data)
            else:
                # Both tests need at least two groups; report NaN instead of raising.
                stat, p = float("nan"), float("nan")

            numeric_results.append({
                "variable": var,
                "test": test_name,
                "stat": stat,
                "p_value": p
            })

    if cat_vars:
        print("\n=== Analyse des variables catégorielles ===\n")
        for var in cat_vars:
            # Contingency table shown to the reader
            contingency_table = pd.crosstab(df_copy[cluster_col], df_copy[var], dropna=False)
            print(f"\n Tableau croisé pour '{var}' (sans Total dans test) :")
            print(contingency_table)

            plt.figure(figsize=(10, 4))
            sns.heatmap(contingency_table, annot=True, fmt='d', cmap='Blues')
            plt.title(f'Tableau croisé: {var} vs {cluster_col}')
            plt.tight_layout()
            plt.show()

            # Chi-square test on the real clusters only
            contingency = pd.crosstab(clusters_only[cluster_col], clusters_only[var], dropna=False)
            chi2, p, dof, expected = chi2_contingency(contingency)

            categorical_results.append({
                "variable": var,
                "test": "Chi2",
                "stat": chi2,
                "p_value": p
            })

    # Fixed column order, so an empty summary still exposes "p_value" etc.
    numeric_summary = pd.DataFrame(numeric_results, columns=result_columns)
    categorical_summary = pd.DataFrame(categorical_results, columns=result_columns)

    if numeric_results:
        print("\n Résumé des tests numériques :")
        display(numeric_summary.sort_values("p_value"))

    if categorical_results:
        print("\n Résumé des tests catégoriels :")
        display(categorical_summary.sort_values("p_value"))

    return numeric_summary, categorical_summary


# Data Prep

In [None]:
# Generate synthetic data (100 samples, 3 centers, fixed seed) and plot the raw points
X, y = make_blobs(n_samples=100, centers=3, cluster_std=0.4, random_state=0)
plt.scatter(X[:,0], X[:,1])

# Modélisation

## 1. K-Means Clustering

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Build and fit the model.
# random_state pins the centroid initialisation so the clusters are the same
# on every run; n_init is set explicitly because its default differs between
# scikit-learn releases.
# NOTE(review): the data above is generated with 3 centers while 4 clusters
# are requested here — confirm this is intentional.
model_kmeans = KMeans(n_clusters=4, n_init=10, random_state=0)
model_kmeans.fit(X)

### Catégorisation

In [None]:
# Attach the cluster label of each sample to a copy of the features.
X_km = pd.DataFrame(X.copy())
# Reuse the labels of the model fitted above instead of calling fit_predict,
# which would train the model a second time.
clustering_labels_km = model_kmeans.labels_
X_km['clusters'] = clustering_labels_km
X_km['clusters'].value_counts()

In [None]:
# Preview the first rows only; head(100) printed the whole 100-sample table
X_km.head()

### Visualisation des clusters

In [None]:
# Number of clusters the model was configured with
model_kmeans.n_clusters

In [None]:
# Plot the samples coloured by cluster, then overlay the fitted centroids in red
centroids = model_kmeans.cluster_centers_
plt.scatter(X[:, 0], X[:, 1], c=clustering_labels_km)
plt.scatter(centroids[:, 0], centroids[:, 1], c='r')

### Evaluation KMEANS

In [None]:
# Score the clustering on the features only: leaving the 'clusters' column in
# the feature matrix would feed the labels into the distances being scored.
metrics.silhouette_score(X_km.drop(columns='clusters'), X_km['clusters'])

#### Elbow Method

In [None]:
# Search for a suitable k with the elbow method: fit one model per candidate k
# and record its inertia. A fixed random_state and an explicit n_init keep the
# curve identical from one run to the next.
K_range = range(1, 20)
inertia = [
    KMeans(n_clusters=k, n_init=10, random_state=0).fit(X).inertia_
    for k in K_range
]

In [None]:
# Plot inertia against the number of clusters to look for the elbow of the curve
plt.plot(K_range, inertia)
plt.xlabel('nombre de clusters')
plt.ylabel('Inertia')

### Qualification

In [None]:
# Profile the K-Means clusters on the two feature columns (labelled 0 and 1).
# cat_vars is left unset, so only the numeric analysis runs.
numeric_vars = [0, 1]

numeric_summary, cat_summary = compare_clusters_full_with_total(
    X_km, 'clusters', numeric_vars=numeric_vars, test_type='anova'
)

## 2. Détection d'anomalies avec Isolation Forest

In [None]:
from sklearn.ensemble import IsolationForest

In [None]:
# Generate one tight blob of 50 samples, then overwrite the last sample with a
# hand-picked point so the data contains an injected outlier
X, y = make_blobs(n_samples=50, centers=1, cluster_std=0.1, random_state=0)
X[-1] = [2.25, 5]

plt.scatter(X[:, 0], X[:, 1])

In [None]:
# Build the Isolation Forest model.
# contamination is the expected share of outliers. A single anomalous point was
# injected into X, so use 1 / n_samples rather than 0.5, which labels half of
# the samples as anomalies. random_state makes the forest reproducible.
# NOTE(review): assumes X is still the dataset with the single injected outlier — confirm.
model_isofo = IsolationForest(contamination=1 / len(X), random_state=0)

# fit() returns the estimator itself, so `anomalies` is the fitted model
anomalies = model_isofo.fit(X)

In [None]:
# Label every sample with the fitted Isolation Forest, then display the labels
clusters_ano = anomalies.predict(X)

clusters_ano

### Catégorisation

In [None]:
# Copy the features into a frame and append the anomaly label of each sample
X_ano = pd.DataFrame(X.copy()).assign(clusters=clusters_ano)
X_ano['clusters'].value_counts()

### Visualisation des clusters

In [None]:
# Plot the samples coloured by the label the Isolation Forest assigned to each one
plt.scatter(X_ano[0], X_ano[1], c=clusters_ano)

### Evaluation

In [None]:
# Score on the features only: leaving the 'clusters' column in the feature
# matrix would feed the labels into the distances being scored.
metrics.silhouette_score(X_ano.drop(columns='clusters'), X_ano['clusters'])

### Qualification

In [None]:
# Profile the inlier / outlier groups on the two feature columns (labelled 0 and 1).
# cat_vars is left unset, so only the numeric analysis runs.
numeric_vars = [0, 1]

numeric_summary, cat_summary = compare_clusters_full_with_total(
    X_ano, 'clusters', numeric_vars=numeric_vars, test_type='anova'
)

## 3. DBSCAN

In [None]:
from sklearn.cluster import DBSCAN

In [None]:
db_model = DBSCAN(eps=0.5, min_samples=4)

# Fit once and read the labels from the fitted model; calling fit_predict
# afterwards would run the whole clustering a second time.
clustering = db_model.fit(X)

clustering_labels = clustering.labels_

### Catégorisation

In [None]:

# Copy the features into a frame and append the DBSCAN label of each sample
X_db = pd.DataFrame(X.copy()).assign(clusters=clustering_labels)
X_db['clusters'].value_counts()

### Visualisation des clusters

In [None]:
# Plot the samples coloured by the label stored on the fitted DBSCAN model
plt.scatter(X_db[0], X_db[1], c=clustering.labels_ )

### Evaluation

In [None]:
# Score on the features only: leaving the 'clusters' column in the feature
# matrix would feed the labels into the distances being scored.
# NOTE(review): any noise label produced by DBSCAN is scored here as if it
# were a cluster of its own — confirm that is acceptable.
metrics.silhouette_score(X_db.drop(columns='clusters'), X_db['clusters'])

### Qualification

In [None]:
# Profile the DBSCAN groups on the two feature columns (labelled 0 and 1).
# cat_vars is left unset, so only the numeric analysis runs.
numeric_vars = [0, 1]

numeric_summary, cat_summary = compare_clusters_full_with_total(
    X_db, 'clusters', numeric_vars=numeric_vars, test_type='anova'
)