In [1]:
import os
import numpy as np
import pandas as pd

# Compatibility patch for a scikit-learn / umap-learn version mismatch.
# The sklearn-side patch below is installed before umap is imported.
from sklearn.utils import check_array as sklearn_check_array
from sklearn.utils import validation

# check_array의 ensure_all_finite 파라미터를 처리하는 wrapper
def patched_check_array(*args, **kwargs):
    # ensure_all_finite 파라미터가 있으면 제거
    if 'ensure_all_finite' in kwargs:
        kwargs.pop('ensure_all_finite')
    return sklearn_check_array(*args, **kwargs)

# Replace check_array on the sklearn.utils.validation module with the wrapper
validation.check_array = patched_check_array

# Importing umap.umap_ loads the umap package; the check_array name held by that
# module is then rebound to the wrapper as well
import umap.umap_ as umap_module
umap_module.check_array = patched_check_array

# Bind the top-level package name `umap` (used later as umap.UMAP)
import umap

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import warnings
# Suppress every warning of category UserWarning from this point on
warnings.filterwarnings('ignore', category=UserWarning)

In [3]:
# Seed passed to doUMAP (and from there to umap.UMAP) by the fold loop
random_state = 2025

In [4]:
# Select seaborn's 'darkgrid' style for the figures drawn in this notebook
sns.set_theme(style='darkgrid')

In [5]:
TASK_NAME = 'Ki'  # 'Ki' or 'Kd'
START_FOLD = 1  # first fold to process (inclusive)
END_FOLD = 5    # last fold to process (inclusive)
# Project root that contains the `results/` tree. The default is the original
# machine-specific location; set the MMFDL_WORK_DIR environment variable to run
# the notebook elsewhere without editing this cell.
WORK_DIR = os.environ.get('MMFDL_WORK_DIR', '/home/rlawlsgurjh/hdd/work/MMFDL')

print(f"[INFO] Task: {TASK_NAME}, Folds: {START_FOLD} to {END_FOLD}")

[INFO] Task: Ki, Folds: 1 to 5


In [6]:
# 2. UMAP helper
def doUMAP(embs, y, random_state, n_neighbors=15, min_dist=0.1):
    """Reduce embeddings to 2-D with UMAP and pair each point with its label.

    Parameters
    ----------
    embs : array-like, shape (n_samples, n_features)
        Embedding matrix. It is converted to a float32 NumPy array; NaN and
        +/-Inf entries are replaced with 0 (a warning line is printed).
    y : array-like of length n_samples, or scalar
        Values stored in the ``'Ssel'`` column. Inputs with more than one
        dimension (e.g. shape ``(n_samples, 1)``) are flattened first.
    random_state : int or None
        Forwarded to ``umap.UMAP`` as ``random_state``.
    n_neighbors : int, default 15
        Requested neighbourhood size. It is clamped to at most
        ``n_samples - 1`` and at least 2 before being passed to ``umap.UMAP``.
    min_dist : float, default 0.1
        Forwarded to ``umap.UMAP`` as ``min_dist``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'UMAP1'``, ``'UMAP2'`` (the fitted 2-D coordinates) and
        ``'Ssel'`` (the values from ``y``).

    Raises
    ------
    ValueError
        If ``embs`` is not a non-empty 2-D array, or if ``y`` does not contain
        exactly one value per row of ``embs``. Both checks run before the UMAP
        fit so that bad input fails fast.
    """
    # Explicit float32 conversion of the input
    embs = np.asarray(embs, dtype=np.float32)
    if embs.ndim != 2 or embs.shape[0] == 0:
        raise ValueError(
            f"embs must be a non-empty 2-D array, got shape {embs.shape}"
        )
    n_samples = embs.shape[0]

    # Column-vector style labels cannot be used as a DataFrame column; flatten them.
    label_ndim = np.ndim(y)
    if label_ndim > 1:
        y = np.asarray(y).reshape(-1)
    # Scalars (ndim 0) are left alone; pandas broadcasts them over the rows.
    if label_ndim >= 1 and len(y) != n_samples:
        raise ValueError(
            f"y has {len(y)} values but embs has {n_samples} rows"
        )

    # Replace non-finite entries so the reducer only sees finite numbers
    if not np.isfinite(embs).all():
        print("[WARNING] Found NaN or Inf values in embeddings, replacing with 0")
        embs = np.nan_to_num(embs, nan=0.0, posinf=0.0, neginf=0.0)

    # Keep n_neighbors below the number of points, but never under 2
    n_neighbors = max(2, min(n_neighbors, n_samples - 1))

    reducer = umap.UMAP(
        n_components=2,
        random_state=random_state,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
    )

    # Fit, then read the coordinates from the fitted reducer's embedding_ attribute
    reducer.fit(embs)
    points = reducer.embedding_

    df_points = pd.DataFrame({
        'UMAP1': points[:, 0],
        'UMAP2': points[:, 1],
        'Ssel': y,
    })
    return df_points

# 3. Scatter plot helper
def scatterplot(df, filepath, title='Embedding Distribution'):
    """Save a scatter plot of the UMAP coordinates coloured by ``Ssel``.

    Parameters
    ----------
    df : mapping / pandas.DataFrame
        Must provide ``'UMAP1'``, ``'UMAP2'`` and ``'Ssel'`` entries.
    filepath : str or os.PathLike
        Destination passed to ``Figure.savefig``. When it is a path, its parent
        directory is created if it does not exist yet.
    title : str, default 'Embedding Distribution'
        Title drawn above the axes.

    The figure is always closed, even if drawing or saving raises, so repeated
    calls in a loop do not accumulate open figures.
    """
    # Create the output directory up front; skipped for bare file names and
    # for non-path targets.
    if isinstance(filepath, (str, os.PathLike)):
        out_dir = os.path.dirname(filepath)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    fig, ax = plt.subplots(1, 1, figsize=(8, 6))
    try:
        scatter = ax.scatter(df['UMAP1'], df['UMAP2'], c=df['Ssel'], cmap='rainbow', alpha=0.6, s=20)
        ax.set_xlabel('UMAP1', fontsize=15)
        ax.set_ylabel('UMAP2', fontsize=15)
        ax.set_title(title, fontsize=16)

        cbar = fig.colorbar(scatter, ax=ax)
        cbar.set_label('Ssel', fontsize=12)

        fig.tight_layout()
        fig.savefig(filepath, dpi=300, bbox_inches='tight')
    finally:
        # Close this specific figure to release its memory
        plt.close(fig)
    print(f"[INFO] Saved plot to {filepath}")

In [7]:
# 4. Run the UMAP + plotting pipeline once per fold
separator = "=" * 80

for fold_num in range(START_FOLD, END_FOLD + 1):
    print("\n" + separator)
    print(f"[Fold {fold_num}] Processing...")
    print(separator)

    # Location of this fold's saved embeddings
    embeddings_dir = os.path.join(
        WORK_DIR, 'results', 'SGD', 'selectivity', TASK_NAME,
        f'fold{fold_num}', 'embeddings',
    )
    tr_val_path = os.path.join(embeddings_dir, 'tr_val_embeddings.npy')
    te_path = os.path.join(embeddings_dir, 'te_embeddings.npy')

    # Both files are required; otherwise skip this fold
    if not (os.path.exists(tr_val_path) and os.path.exists(te_path)):
        print(f"[WARNING] Embedding files not found for fold {fold_num}")
        continue

    # Each .npy file holds a pickled object; .item() unwraps it and it is then indexed by key.
    # NOTE(review): allow_pickle=True can execute code while unpickling —
    # only load files from a trusted source.
    print(f"[INFO] Loading embeddings from {embeddings_dir}")
    tr_val_data = np.load(tr_val_path, allow_pickle=True).item()
    te_data = np.load(te_path, allow_pickle=True).item()

    emb_train_val = tr_val_data['embeddings']
    y_train_val = tr_val_data['Ssel']

    emb_test = te_data['embeddings']
    y_test = te_data['Ssel']

    print(f"Train+Val embeddings shape: {emb_train_val.shape}")
    print(f"Test embeddings shape: {emb_test.shape}")

    # Project each split separately with UMAP
    print("[INFO] Performing UMAP...")
    df_train_val_umap = doUMAP(emb_train_val, y_train_val, random_state)
    df_test_umap = doUMAP(emb_test, y_test, random_state)

    # Figures go into a UMAP/ sub-directory next to the embeddings
    figure_dir = os.path.join(embeddings_dir, 'UMAP')
    os.makedirs(figure_dir, exist_ok=True)
    train_val_png = os.path.join(figure_dir, 'train_val_embeddings.png')
    test_png = os.path.join(figure_dir, 'test_embeddings.png')

    # One plot per split, with the fold number in the title
    scatterplot(
        df_train_val_umap,
        train_val_png,
        title=f'Train+Val Embeddings Distribution (Fold {fold_num})',
    )
    scatterplot(
        df_test_umap,
        test_png,
        title=f'Test Embeddings Distribution (Fold {fold_num})',
    )

    # Summary statistics for the embeddings and the Ssel values
    print(f"\n[INFO] Fold {fold_num} Statistics:")
    print(f"  Train+Val embeddings: shape={emb_train_val.shape}, mean={emb_train_val.mean():.4f}, std={emb_train_val.std():.4f}")
    print(f"  Test embeddings: shape={emb_test.shape}, mean={emb_test.mean():.4f}, std={emb_test.std():.4f}")
    print(f"  Train+Val Ssel: min={y_train_val.min():.4f}, max={y_train_val.max():.4f}, mean={y_train_val.mean():.4f}")
    print(f"  Test Ssel: min={y_test.min():.4f}, max={y_test.max():.4f}, mean={y_test.mean():.4f}")

    print(f"[Fold {fold_num}] Completed!")

print("\n" + separator)
print("[INFO] All folds processed!")
print(separator)


[Fold 1] Processing...
[INFO] Loading embeddings from /home/rlawlsgurjh/hdd/work/MMFDL/results/SGD/selectivity/Ki/fold1/embeddings
Train+Val embeddings shape: (1459, 540)
Test embeddings shape: (365, 540)
[INFO] Performing UMAP...
[INFO] Saved plot to /home/rlawlsgurjh/hdd/work/MMFDL/results/SGD/selectivity/Ki/fold1/embeddings/UMAP/train_val_embeddings.png
[INFO] Saved plot to /home/rlawlsgurjh/hdd/work/MMFDL/results/SGD/selectivity/Ki/fold1/embeddings/UMAP/test_embeddings.png

[INFO] Fold 1 Statistics:
  Train+Val embeddings: shape=(1459, 540), mean=-0.0163, std=0.3577
  Test embeddings: shape=(365, 540), mean=-0.0155, std=0.3563
  Train+Val Ssel: min=0.0001, max=4.1790, mean=0.5696
  Test Ssel: min=0.0001, max=2.6119, mean=0.5521
[Fold 1] Completed!

[Fold 2] Processing...
[INFO] Loading embeddings from /home/rlawlsgurjh/hdd/work/MMFDL/results/SGD/selectivity/Ki/fold2/embeddings
Train+Val embeddings shape: (1459, 540)
Test embeddings shape: (365, 540)
[INFO] Performing UMAP...
[INFO