In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Seed forwarded to PCA (through doPCA) for every projection in this notebook.
random_state = 2025

In [3]:
# Select seaborn's 'darkgrid' style as the plotting theme for this notebook.
sns.set_theme(style='darkgrid')

In [4]:
# Run configuration for this notebook.
TASK_NAME = 'Kd'  # 'Ki' or 'Kd'
START_FOLD = 1    # first fold to process (inclusive)
END_FOLD = 5      # last fold to process (inclusive)
# Project root containing the 'results' tree. The MMFDL_WORK_DIR environment
# variable overrides the default so the notebook can run on another machine
# without editing this cell.
WORK_DIR = os.environ.get('MMFDL_WORK_DIR', '/home/rlawlsgurjh/hdd/work/MMFDL')

print(f"[INFO] Task: {TASK_NAME}, Folds: {START_FOLD} to {END_FOLD}")

[INFO] Task: Kd, Folds: 1 to 5


In [5]:
def doPCA(embs, y, random_state):
    """Project embeddings onto their first two principal components.

    Args:
        embs: Sample-by-feature data handed to ``PCA.fit_transform``.
        y: Per-sample values stored unchanged in the ``Ssel`` column.
        random_state: Seed forwarded to the PCA estimator.

    Returns:
        A DataFrame with the columns ``PC1``, ``PC2`` and ``Ssel``.
    """
    reducer = PCA(n_components=2, random_state=random_state)
    projected = reducer.fit_transform(embs)
    # Column order matters to callers that read the frame positionally.
    return pd.DataFrame({
        'PC1': projected[:, 0],
        'PC2': projected[:, 1],
        'Ssel': y,
    })

def scatterplot(df, filepath, title='Embedding Distribution'):
    """Save a PC1-vs-PC2 scatter plot coloured by ``Ssel`` to ``filepath``.

    Args:
        df: Table providing the ``PC1``, ``PC2`` and ``Ssel`` columns
            (for example the frame returned by ``doPCA``).
        filepath: Destination path handed to ``Figure.savefig``.
        title: Title drawn above the axes.

    The figure is addressed explicitly through ``fig`` rather than pyplot's
    implicit "current figure", and it is closed even when drawing or saving
    raises, so repeated calls in a loop do not accumulate open figures.
    """
    fig, ax = plt.subplots(1, 1, figsize=(8, 6))
    try:
        scatter = ax.scatter(df['PC1'], df['PC2'], c=df['Ssel'], cmap='rainbow', alpha=0.6, s=20)
        ax.set_xlabel('PC1', fontsize=15)
        ax.set_ylabel('PC2', fontsize=15)
        ax.set_title(title, fontsize=16)

        # The colorbar maps point colours back to their Ssel values.
        cbar = fig.colorbar(scatter, ax=ax)
        cbar.set_label('Ssel', fontsize=12)

        fig.tight_layout()
        fig.savefig(filepath, dpi=300, bbox_inches='tight')
    finally:
        # Release the figure whether or not the save succeeded.
        plt.close(fig)
    print(f"[INFO] Saved plot to {filepath}")

In [6]:
# Render PCA scatter plots of the saved embeddings for every requested fold.
# Per fold: locate the two embedding files, load them, project each split to
# 2-D with doPCA, save one figure per split, and print summary statistics.
for fold_num in range(START_FOLD, END_FOLD + 1):
    print("\n" + "=" * 80)
    print(f"[Fold {fold_num}] Processing...")
    print("=" * 80)

    embeddings_dir = os.path.join(WORK_DIR, 'results', 'SGD', 'selectivity', TASK_NAME,
                                   f'fold{fold_num}', 'embeddings')

    tr_val_path = os.path.join(embeddings_dir, 'tr_val_embeddings.npy')
    te_path = os.path.join(embeddings_dir, 'te_embeddings.npy')

    # Skip folds whose embedding files are missing instead of aborting the run.
    if not (os.path.exists(tr_val_path) and os.path.exists(te_path)):
        print(f"[WARNING] Embedding files not found for fold {fold_num}")
        continue

    print(f"[INFO] Loading embeddings from {embeddings_dir}")
    # SECURITY: allow_pickle=True unpickles arbitrary Python objects, which can
    # execute code. Only load files produced by this project's own pipeline.
    # .item() unwraps the loaded array into the mapping indexed below
    # (keys 'embeddings' and 'Ssel').
    tr_val_data = np.load(tr_val_path, allow_pickle=True).item()
    te_data = np.load(te_path, allow_pickle=True).item()

    emb_train_val = tr_val_data['embeddings']
    y_train_val = tr_val_data['Ssel']

    emb_test = te_data['embeddings']
    y_test = te_data['Ssel']

    print(f"Train+Val embeddings shape: {emb_train_val.shape}")
    print(f"Test embeddings shape: {emb_test.shape}")

    print("[INFO] Performing PCA...")
    # NOTE(review): each doPCA call fits its own PCA, so PC1/PC2 in the
    # train+val figure and in the test figure are different axes and the two
    # plots are not directly comparable - confirm this is intended.
    df_train_val_pca = doPCA(emb_train_val, y_train_val, random_state)
    df_test_pca = doPCA(emb_test, y_test, random_state)

    # Figures are written next to the embeddings, under a 'PCA' subfolder.
    figure_dir = os.path.join(embeddings_dir, 'PCA')
    os.makedirs(figure_dir, exist_ok=True)

    scatterplot(df_train_val_pca,
                os.path.join(figure_dir, 'train_val_embeddings.png'),
                title=f'Train+Val Embeddings Distribution (Fold {fold_num})')

    scatterplot(df_test_pca,
                os.path.join(figure_dir, 'test_embeddings.png'),
                title=f'Test Embeddings Distribution (Fold {fold_num})')

    print(f"\n[INFO] Fold {fold_num} Statistics:")
    print(f"  Train+Val embeddings: shape={emb_train_val.shape}, mean={emb_train_val.mean():.4f}, std={emb_train_val.std():.4f}")
    print(f"  Test embeddings: shape={emb_test.shape}, mean={emb_test.mean():.4f}, std={emb_test.std():.4f}")
    print(f"  Train+Val Ssel: min={y_train_val.min():.4f}, max={y_train_val.max():.4f}, mean={y_train_val.mean():.4f}")
    print(f"  Test Ssel: min={y_test.min():.4f}, max={y_test.max():.4f}, mean={y_test.mean():.4f}")

    print(f"[Fold {fold_num}] Completed!")

print("\n" + "=" * 80)
print("[INFO] All folds processed!")
print("=" * 80)


[Fold 1] Processing...
[INFO] Loading embeddings from /home/rlawlsgurjh/hdd/work/MMFDL/results/SGD/selectivity/Kd/fold1/embeddings
Train+Val embeddings shape: (1459, 540)
Test embeddings shape: (365, 540)
[INFO] Performing PCA...
[INFO] Saved plot to /home/rlawlsgurjh/hdd/work/MMFDL/results/SGD/selectivity/Kd/fold1/embeddings/PCA/train_val_embeddings.png
[INFO] Saved plot to /home/rlawlsgurjh/hdd/work/MMFDL/results/SGD/selectivity/Kd/fold1/embeddings/PCA/test_embeddings.png

[INFO] Fold 1 Statistics:
  Train+Val embeddings: shape=(1459, 540), mean=0.0073, std=0.3915
  Test embeddings: shape=(365, 540), mean=0.0072, std=0.3913
  Train+Val Ssel: min=0.0001, max=4.1790, mean=0.5696
  Test Ssel: min=0.0001, max=2.6119, mean=0.5521
[Fold 1] Completed!

[Fold 2] Processing...
[INFO] Loading embeddings from /home/rlawlsgurjh/hdd/work/MMFDL/results/SGD/selectivity/Kd/fold2/embeddings
Train+Val embeddings shape: (1459, 540)
Test embeddings shape: (365, 540)
[INFO] Performing PCA...
[INFO] Save