# 4. Window clustering
In this notebook we:
- Load the stats created in step 2
- Create a clustering of the various windows according to mean, std, min, max, word count and bins.

## Setup

In [None]:
# Installs
import sys
!echo "Purging pip environment and installing packages..."
!{sys.executable} -m pip cache purge 
!{sys.executable} -m pip uninstall -y jhutils 
!{sys.executable} -m pip install -q seaborn
!{sys.executable} -m pip install -q git+https://github.com/jdchart/jh-py-utils.git

# Imports
print("Importing packages...")
import os
from jhutils.local_files import collect_files
import matplotlib.pyplot as plt
import pandas as pd
import utils
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
print("Ready!")

## Load data

In [None]:
# Root folder; each of its sub-folders is scanned for window-statistics CSV files.
STATS = "/Users/jacob/Documents/Repos/dps/projects/data/output/norm-stats"

# os.listdir() returns entries in arbitrary order, so the list is sorted to make
# positional lookups such as stat_folders[INDEX] reproducible between runs and machines.
# "dps_curve_manual" is excluded by filtering rather than list.remove(), which would
# raise ValueError if that folder is not present.
stat_folders = sorted(
    f for f in os.listdir(STATS)
    if os.path.isdir(os.path.join(STATS, f)) and f != "dps_curve_manual"
)
print(f"Sucsessfully found {len(stat_folders)} statistics folders!")

## Analysis config

In [None]:
# Number of clusters requested from KMeans (passed as n_clusters) in the clustering cells.
NUM_CLUSTERS = 4

## Clustering

In [None]:
# Position in stat_folders of the statistics folder explored in this cell.
INDEX = 3

# Load every window-statistics CSV found in the selected folder.
files = collect_files(os.path.join(STATS, stat_folders[INDEX]), ["csv"])
window_stats = [pd.read_csv(file, index_col=0) for file in files]
if not window_stats:
    # pd.concat would otherwise fail with an opaque "No objects to concatenate".
    raise ValueError(f"No CSV files found in {os.path.join(STATS, stat_folders[INDEX])}")

combined_df = pd.concat(window_stats, ignore_index=True)

# combined_df is already a DataFrame, so it is rounded directly (no re-wrapping needed).
df_windows = combined_df.round(2)

# Feature matrix: summary statistics plus every column whose name starts with "perc".
features = ['mean', 'std', 'min', 'max', 'count_words'] + [col for col in df_windows.columns if col.startswith("perc")]
# Missing values are replaced by 0 before scaling.
X = df_windows[features].fillna(0)

# Standardise features so that no single column dominates the distance computation.
X_scaled = StandardScaler().fit_transform(X)

# Clustering
kmeans = KMeans(n_clusters = NUM_CLUSTERS, random_state = 0)
labels = kmeans.fit_predict(X_scaled)
df_windows["cluster"] = labels

# Dimensionality reduction (2-D embedding, used only for the scatter plot).
# t-SNE requires perplexity < n_samples; clamp it so folders with 10 windows or
# fewer do not raise. For larger folders the original value of 10 is used.
perplexity = min(10, len(X_scaled) - 1)
X_tsne = TSNE(n_components=2, perplexity=perplexity, random_state=0).fit_transform(X_scaled)

# Project-local helper; presumably draws the embedding coloured by cluster label.
utils.display_scatter(X_tsne, labels)

In [None]:
# Timeline figure: one horizontal lane per CSV file of the selected folder,
# with each window drawn as a segment coloured by its cluster.
ordered_files = sorted(
    collect_files(os.path.join(STATS, stat_folders[INDEX]), ["csv"]),
    key=os.path.basename,
)
# File names without extension, in lane order.
stems = [os.path.splitext(os.path.basename(path))[0] for path in ordered_files]

plt.figure(figsize=(14, len(ordered_files) * 0.5))

for lane, stem in enumerate(stems):
    # Rows are matched to a CSV through the "file" column, which holds "<stem>.json".
    lane_windows = df_windows[df_windows["file"] == f"{stem}.json"]

    segments = zip(
        lane_windows["start_time_s"],
        lane_windows["end_time_s"],
        lane_windows["cluster"],
    )
    for start_s, end_s, cluster_id in segments:
        plt.plot(
            [start_s / 60, end_s / 60],  # divided by 60 to match the "minutes" axis label
            [lane, lane],
            color=utils.cluster_colors_hex[int(cluster_id)],
            linewidth=6,
        )

# Lane labels are the file stems truncated to 12 characters.
plt.yticks(range(len(stems)), [stem[:12] for stem in stems])
plt.xlabel("Time (minutes)")
plt.title("Timeline des clusters rythmiques (par fenêtre)")
plt.grid(True, axis='x', linestyle='--', alpha=0.3)
plt.tight_layout()
plt.show()

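## Process all
Run the same clustering on every statistics folder and save one cluster timeline image per folder.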

In [None]:
# Folder that receives one timeline PNG per statistics folder.
OUTPUT_DEST = "/Users/jacob/Documents/Repos/dps/projects/data/output/clusters"
os.makedirs(OUTPUT_DEST, exist_ok = True)


def _cluster_windows(csv_files, num_clusters):
    """Load window statistics from CSV files and assign a KMeans cluster to each row.

    Args:
        csv_files: Paths of the CSV files to read; rows are concatenated in this order.
        num_clusters: Value passed to KMeans as n_clusters.

    Returns:
        The concatenated statistics rounded to 2 decimals, with an added "cluster" column.

    Raises:
        ValueError: Propagated from pandas when csv_files is empty, or from
            scikit-learn (e.g. when there are fewer rows than clusters).
    """
    frames = [pd.read_csv(path, index_col=0) for path in csv_files]
    windows = pd.concat(frames, ignore_index=True).round(2)

    # Summary statistics plus every column whose name starts with "perc".
    feature_cols = ['mean', 'std', 'min', 'max', 'count_words'] + [
        col for col in windows.columns if col.startswith("perc")
    ]
    scaled = StandardScaler().fit_transform(windows[feature_cols].fillna(0))

    windows["cluster"] = KMeans(n_clusters=num_clusters, random_state=0).fit_predict(scaled)
    return windows


def _save_cluster_timeline(windows, ordered_csv_files, png_path):
    """Draw one lane per CSV file, coloured by window cluster, and save it as an image.

    Args:
        windows: Frame with "file", "start_time_s", "end_time_s" and "cluster" columns.
        ordered_csv_files: CSV paths in the order the lanes are drawn (bottom to top).
        png_path: Destination path of the saved figure.
    """
    stems = [os.path.splitext(os.path.basename(path))[0] for path in ordered_csv_files]

    fig = plt.figure(figsize=(14, len(ordered_csv_files) * 0.5))

    for lane, stem in enumerate(stems):
        # Rows are matched to a CSV through the "file" column, which holds "<stem>.json".
        lane_windows = windows[windows["file"] == f"{stem}.json"]

        for _, row in lane_windows.iterrows():
            color = utils.cluster_colors_hex[int(row["cluster"])]
            plt.plot([row["start_time_s"] / 60, row["end_time_s"] / 60],
                     [lane, lane], color=color, linewidth=6)

    plt.yticks(range(len(stems)), [stem[:12] for stem in stems])
    plt.xlabel("Time (minutes)")
    plt.title("Timeline des clusters rythmiques (par fenêtre)")
    plt.grid(True, axis='x', linestyle='--', alpha=0.3)
    plt.tight_layout()
    plt.savefig(png_path, dpi=300, bbox_inches='tight')
    # Close this specific figure so figures do not accumulate across the batch.
    plt.close(fig)


# Loop-local names (folder_*) are used on purpose: reusing df_windows / files /
# ordered_files here would overwrite the state of the exploratory cells above.
for sub_folder in stat_folders:
    folder_files = list(collect_files(os.path.join(STATS, sub_folder), ["csv"]))
    if not folder_files:
        # An empty folder would make pd.concat raise and abort the whole batch.
        print(f"Skipping {sub_folder}: no CSV files found.")
        continue

    # Files are loaded in the order collect_files returned them, so the rows fed to
    # KMeans are ordered as before; sorting is applied only to the lanes of the plot.
    # No t-SNE embedding is computed here: nothing in the batch output uses it.
    folder_windows = _cluster_windows(folder_files, NUM_CLUSTERS)

    _save_cluster_timeline(
        folder_windows,
        sorted(folder_files, key=os.path.basename),
        os.path.join(OUTPUT_DEST, f"{sub_folder}.png"),
    )

print("👍 Finished!")