In [1]:
import os
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.manifold import TSNE
import numpy as np
from tqdm import tqdm
from itertools import product
import hdbscan
from sklearn.metrics import silhouette_score
import plotly.express as px

# Load datasets
# The data directory can be overridden with the PLR_DATA_DIR environment
# variable; it defaults to the original location so existing runs are unchanged.
data_dir = os.environ.get("PLR_DATA_DIR", "/Users/leo/Programming/PLR/Leo/data")

# Maps 'Dataset_1'..'Dataset_4' to the matching dataset_<i>.csv, with the
# 'Unnamed: 0' column removed (presumably a saved index column — confirm).
datasets = {
    f"Dataset_{i}": pd.read_csv(os.path.join(data_dir, f"dataset_{i}.csv")).drop(columns=['Unnamed: 0'])
    for i in range(1, 5)
}

# Artificial labels for each dataset
# One random 0/1 label per row. They are drawn from a seeded generator so the
# labels — and therefore the forest's feature ranking that is fitted on them —
# are identical on every run (the forest and t-SNE below are already seeded).
label_rng = np.random.default_rng(42)
labels = {name: label_rng.integers(0, 2, size=len(data)) for name, data in datasets.items()}

# Hyper-parameter grid explored by the search
n_estimators_range = [50, 100, 150, 200]      # RandomForestClassifier n_estimators
perplexities = [5, 30, 50, 100, 150, 200]     # t-SNE perplexity
learning_rates = [10, 100, 200, 300, 500]     # t-SNE learning_rate
n_iterations = [500, 1000, 2000]              # t-SNE n_iter
top_features_range = [10, 15, 20, 25, 30]     # number of top-ranked features kept

# Total number of runs (used as the progress-bar total): every point of the
# grid, once per dataset
total_combinations = len(datasets) * len(list(product(
    n_estimators_range,
    perplexities,
    learning_rates,
    n_iterations,
    top_features_range,
)))

# Accumulates one record (a dict) per grid-search run
results = []

# Figures go to an 'output' folder under the current working directory;
# make sure it exists before the search starts
output_dir = os.path.join(os.getcwd(), 'output')
os.makedirs(output_dir, exist_ok=True)

# Grid search over every dataset and hyper-parameter combination.
# For each run: rank features with a Random Forest, embed the top-N features
# with t-SNE, cluster the 2-D embedding with HDBSCAN, score the clustering,
# record everything in `results`, and save a scatter plot to `output_dir`.
# A single tqdm progress bar covers all combinations.
with tqdm(total=total_combinations, desc='Grid Search Progress') as pbar:
    for dataset_name, dataset in datasets.items():
        for n_estimators in n_estimators_range:
            # The forest depends only on (dataset, n_estimators) and is seeded,
            # so it is fitted once here instead of once per t-SNE combination.
            # Iteration order is the same as a single product() over all five axes.
            forest = RandomForestClassifier(n_estimators=n_estimators, random_state=42)
            forest.fit(dataset, labels[dataset_name])
            importances = forest.feature_importances_
            # Column positions sorted by ascending importance
            ranked_indices = np.argsort(importances)

            for perplexity, learning_rate, n_iter, N in product(perplexities, learning_rates, n_iterations, top_features_range):
                # Keep the N most important features (all of them if N exceeds the column count)
                indices = ranked_indices[-N:]
                top_features = dataset.columns[indices]

                # Apply t-SNE with current parameters
                # NOTE(review): recent scikit-learn versions require perplexity < n_samples —
                # confirm every dataset has more rows than max(perplexities).
                tsne = TSNE(n_components=2, perplexity=perplexity, learning_rate=learning_rate, n_iter=n_iter, random_state=42)
                tsne_results = tsne.fit_transform(dataset[top_features])

                # Apply HDBSCAN clustering (default parameters) on the 2-D embedding
                clusterer = hdbscan.HDBSCAN()
                cluster_labels = clusterer.fit_predict(tsne_results)

                # Compute silhouette score; undefined when only one label is present
                # NOTE(review): points labelled -1 (HDBSCAN noise) are scored as if they
                # formed a cluster of their own — confirm this is intended.
                if len(set(cluster_labels)) > 1:
                    sil_score = silhouette_score(tsne_results, cluster_labels)
                else:
                    sil_score = None

                # Store the results including top feature names and silhouette score
                results.append({
                    'dataset': dataset_name,
                    'n_estimators': n_estimators,
                    'perplexity': perplexity,
                    'learning_rate': learning_rate,
                    'n_iter': n_iter,
                    'N_top_features': N,
                    'tsne_results': tsne_results,
                    'top_features': top_features.tolist(),
                    'silhouette_score': sil_score,
                    'cluster_labels': cluster_labels
                })

                # Label -1 is mapped to white; other labels map to None
                # (presumably so Plotly falls back to its default colours — confirm)
                unique_labels = np.unique(cluster_labels)
                colors = {str(label): 'white' if label == -1 else None for label in unique_labels}

                # Plot the results for the current iteration with cluster colors using Plotly
                fig = px.scatter(
                    x=tsne_results[:, 0], y=tsne_results[:, 1], color=cluster_labels.astype(str),
                    color_discrete_map=colors,
                    labels={'color': 'Cluster'},
                    title=f"{dataset_name} - t-SNE with RF ({n_estimators} Estimators, Top {N} Features) - Clusters"
                )
                fig.update_traces(marker=dict(size=5))

                # Save the figure to the output directory
                fig_file_name = os.path.join(output_dir, f"{dataset_name}_tsne_rf_{n_estimators}_perp_{perplexity}_lr_{learning_rate}_iter_{n_iter}_top{N}.png")
                fig.write_image(fig_file_name)

                # Update the tqdm progress bar
                pbar.update(1)


Grid Search Progress:   1%|          | 54/7200 [09:13<18:40:30,  9.41s/it]