In [None]:

from google.colab import drive

# Mount Google Drive at /content/drive; later cells read and write under this path.
drive.mount('/content/drive')


# Triplet loss: entrenamiento y guardado de embeddings

In [None]:
import os
import shutil
import kagglehub

# Destination folder on Drive that receives a copy of the Kaggle dataset.
dataset_dir = "/content/drive/MyDrive/Mendoza_Serey(2025)/Dataset"
os.makedirs(dataset_dir, exist_ok=True)

# Download the dataset through kagglehub (returns the local download path).
print("Descargando dataset...")
download_path = kagglehub.dataset_download("masoudnickparvar/brain-tumor-mri-dataset")
print(f"Descarga completa en: {download_path}")

# Mirror every top-level entry of the download into the Drive folder:
# plain files are copied with metadata, directories are merged recursively.
for filename in os.listdir(download_path):
    src_file = os.path.join(download_path, filename)
    dst_file = os.path.join(dataset_dir, filename)
    if not os.path.isdir(src_file):
        shutil.copy2(src_file, dst_file)
        continue
    shutil.copytree(src_file, dst_file, dirs_exist_ok=True)

print(f"Dataset copiado a: {dataset_dir}")

In [None]:
import os
import glob
import random
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split, Sampler
import torchvision.transforms as transforms
from PIL import Image
import torch.nn as nn
import torch.nn.functional as F
import timm
from torch.optim.lr_scheduler import CosineAnnealingLR

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Usando dispositivo: {device}")

In [None]:
import os
import glob
import pandas as pd
from sklearn.model_selection import train_test_split

# Root folder with one sub-directory per class.
# NOTE(review): the download cell copies into ".../Dataset" while this reads
# ".../Datasets" — confirm which folder holds the per-class layout.
ruta = "/content/drive/MyDrive/Mendoza_Serey(2025)/Datasets" # New path
data = []

# Collect every file of every class directory. Both listings are sorted because
# os.listdir / glob return entries in arbitrary order, and the seeded split below
# is only reproducible if the rows always arrive in the same order.
for category in sorted(os.listdir(ruta)):
    category_path = os.path.join(ruta, category)
    if os.path.isdir(category_path):
        images = sorted(glob.glob(os.path.join(category_path, "*.*")))
        data.extend([{"path": img, "category": category} for img in images])

# Convert to a DataFrame with columns "path" and "category".
df = pd.DataFrame(data)
if df.empty:
    # Fail early with a clear message instead of a KeyError on df["category"].
    raise FileNotFoundError(f"No se encontraron imagenes en: {ruta}")

# Stratified 80% / 20% train/test split by category.
df_train, df_test = train_test_split(df, test_size=0.2, stratify=df["category"], random_state=42)

# Show the per-class counts of both partitions.
print("Clases en train:\n", df_train['category'].value_counts())
print("Clases en test:\n",  df_test['category'].value_counts())


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from PIL import Image

# Aseg√∫rate de que df_train ya fue generado desde la estructura actual
clases = df_train['category'].unique()
num_ejemplos = 6

# squeeze=False keeps `axes` 2-D even when there is a single class.
fig, axes = plt.subplots(len(clases), num_ejemplos + 1,
                         figsize=((num_ejemplos + 1) * 2, len(clases) * 2),
                         squeeze=False)
fig.suptitle("TRAINING DATA", fontsize=16, fontweight='bold')

# Hide every axis up front so cells left empty (small classes, unreadable files) stay blank.
for ax in axes.ravel():
    ax.axis('off')

for i, clase in enumerate(clases):
    filas_clase = df_train[df_train['category'] == clase]
    # Never request more rows than the class has; DataFrame.sample would raise.
    muestras = filas_clase.sample(min(num_ejemplos, len(filas_clase)), random_state=42)
    axes[i, 0].text(0.5, 0.5, clase, fontsize=14, fontweight='bold', ha='center', va='center')

    # The loop variable is deliberately NOT called `ruta`: that name holds the
    # dataset root defined in the scan cell and must not be overwritten here.
    for j, img_path in enumerate(muestras['path']):
        try:
            with Image.open(img_path) as handle:
                img = handle.convert("RGB")
            axes[i, j + 1].imshow(img)
        except Exception as e:
            print(f"Error abriendo {img_path}: {e}")

plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()


In [None]:
class BrainDataset(Dataset):
    """Image dataset backed by a DataFrame with 'path' and 'category' columns.

    Args:
        dataframe: rows to serve; its index is reset so positions run 0..len-1.
        transform: optional callable applied to the RGB PIL image.

    Attributes set here:
        classes: sorted unique category names.
        class_to_idx: category name -> integer label (position in `classes`).
    """

    def __init__(self, dataframe, transform=None):
        self.df = dataframe.reset_index(drop=True)
        self.transform = transform
        self.classes = sorted(self.df['category'].unique())
        self.class_to_idx = {c:i for i,c in enumerate(self.classes)}

    def __len__(self):
        """Number of rows in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return `(image, label)` for the row at position `idx`."""
        row = self.df.iloc[idx]
        # Open inside a context manager so the file handle is closed right after
        # decoding; convert() returns an independent in-memory image.
        with Image.open(row['path']) as handle:
            img = handle.convert("RGB")
        if self.transform:
            img = self.transform(img)
        label = self.class_to_idx[row['category']]
        return img, label

# Training pipeline: resize plus light augmentation, then scale to [-1, 1].
train_transform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.RandomHorizontalFlip(0.5),
    transforms.RandomRotation(10),
    transforms.ColorJitter(0.1,0.1,0.1),
    transforms.ToTensor(),
    transforms.Normalize([0.5]*3, [0.5]*3)
])
# Evaluation pipeline: deterministic, no augmentation.
val_transform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize([0.5]*3, [0.5]*3)
])

dataset_full = BrainDataset(df_train, transform=train_transform)
test_ds      = BrainDataset(df_test,  transform=val_transform)

# Hold out 10% of the training rows for validation.
val_frac = 0.10
n_total  = len(dataset_full)
n_val    = int(n_total * val_frac)
n_train  = n_total - n_val

train_ds, _val_split = random_split(dataset_full, [n_train, n_val],
                                    generator=torch.Generator().manual_seed(42))

# random_split returns views of `dataset_full`, so using its validation half
# directly would apply the random training augmentation to validation images.
# Re-wrap the same row positions over a copy of the data that uses
# `val_transform`; both BrainDataset instances reset the index of the same
# frame, so positions line up.
val_ds = torch.utils.data.Subset(
    BrainDataset(df_train, transform=val_transform),
    _val_split.indices,
)
print(f"Train: {len(train_ds)}, Val: {len(val_ds)}, Test: {len(test_ds)}")

In [None]:
class BalancedBatchSampler(Sampler):
    """
    Batch sampler that yields lists of indices with `k` examples per class.

    - labels: list/array with the label of each dataset index
    - batch_size: positive multiple of the number of classes
    - oversample: if True, repeat indices of minority classes (once, at
      construction) so every class has as many entries as the largest one
    - shuffle: shuffle each class's indices at the start of every iteration
    - drop_last: if True, discard the final batch when it is incomplete
    """
    def __init__(self, labels, batch_size, oversample=True, shuffle=True, drop_last=True):
        self.labels   = np.array(labels)
        self.classes  = np.unique(self.labels)
        self.C        = len(self.classes)
        assert self.C > 0, "labels no puede estar vacio"
        # A zero batch size would give k == 0 and make __iter__ yield empty batches forever.
        assert batch_size > 0, "batch_size debe ser positivo"
        assert batch_size % self.C == 0, f"batch_size debe ser m√∫ltiplo de {self.C}"
        self.k        = batch_size // self.C
        self.oversamp = oversample
        self.shuffle  = shuffle
        self.drop_last= drop_last
        self.idcs_by_class = {c: np.where(self.labels==c)[0].tolist() for c in self.classes}
        if self.oversamp:
            self._oversample()

    def _oversample(self):
        """Pad every class list with random repeats up to the largest class size."""
        max_len = max(len(lst) for lst in self.idcs_by_class.values())
        for c, lst in self.idcs_by_class.items():
            deficit = max_len - len(lst)
            if deficit>0:
                lst.extend(random.choices(lst, k=deficit))

    def __iter__(self):
        """Yield index batches until some class cannot supply `k` more indices."""
        if self.shuffle:
            for lst in self.idcs_by_class.values():
                random.shuffle(lst)
        ptr = {c:0 for c in self.classes}
        while True:
            batch = []
            exhausted = False
            for c in self.classes:
                end = ptr[c] + self.k
                if end > len(self.idcs_by_class[c]):
                    exhausted = True
                    break
                batch.extend(self.idcs_by_class[c][ptr[c]:end])
                ptr[c] = end
            if not exhausted:
                yield batch
                continue
            # Final, incomplete batch: it only holds the classes visited before
            # the exhausted one, so it is not class-balanced.
            if batch and not self.drop_last:
                yield batch
            return

    def __len__(self):
        """Number of batches one pass of __iter__ produces."""
        full_batches = min(len(v)//self.k for v in self.idcs_by_class.values())
        if self.drop_last:
            return full_batches
        # With drop_last=False a trailing partial batch is emitted exactly when
        # the first class can still supply k indices after the full batches.
        first_class = self.idcs_by_class[self.classes[0]]
        has_partial = len(first_class)//self.k > full_batches
        return full_batches + (1 if has_partial else 0)

In [None]:
# `train_ds` is a Subset: recover the wrapped BrainDataset and the selected row positions.
orig_ds      = train_ds.dataset
train_idx    = train_ds.indices
# Integer label of every training sample, in the order of `train_idx`.
train_labels = (
    orig_ds.df['category'].iloc[train_idx].map(orig_ds.class_to_idx).tolist()
)

batch_size   = 44
num_classes  = len(orig_ds.classes)
k            = batch_size // num_classes  # examples per class

# Custom class-balanced batch sampler for training.
sampler = BalancedBatchSampler(
    labels=train_labels,
    batch_size=batch_size,
    oversample=True,
    shuffle=True,
    drop_last=True,
)

# Training loader: batches come from the sampler.
train_loader = DataLoader(train_ds, batch_sampler=sampler, num_workers=2, pin_memory=True)

# Validation and test loaders share the same fixed-order settings.
_eval_loader_kwargs = dict(batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True)
val_loader  = DataLoader(val_ds,  **_eval_loader_kwargs)
test_loader = DataLoader(test_ds, **_eval_loader_kwargs)

print("DataLoaders preparados:")
print(" ‚Ä¢ Train samples:", len(train_ds))
print(" ‚Ä¢ Train batches:", len(train_loader))
print(" ‚Ä¢ Val batches:  ", len(val_loader))
print(" ‚Ä¢ Test batches: ", len(test_loader))

In [None]:
from collections import Counter

# Per-class counts of the first five training batches.
print("Distribuci√≥n por batch (primeros 5):")
_batch_iter = enumerate(train_loader)
for i, (_, labels) in _batch_iter:
    counts = Counter(labels.tolist())
    print(f"Batch {i}: {counts}")
    if i >= 4:
        break

# Per-class totals over one full pass of the training loader.
total_counts = Counter()
for _, labels in train_loader:
    for lbl in labels.tolist():
        total_counts[lbl] += 1
print("Distribuci√≥n total por √©poca:", total_counts)

In [None]:
class BackboneEmbedder(nn.Module):
    """timm backbone followed by a 512 -> 256 -> 128 -> 64 projection stack.

    The backbone's classifier is removed (`reset_classifier(0)`); its former
    input width is kept in `in_features` and feeds the first linear layer.
    """

    def __init__(self, model_name="inception_v4"):
        super().__init__()
        self.backbone = timm.create_model(model_name, pretrained=True)
        self.in_features = self.backbone.get_classifier().in_features
        self.backbone.reset_classifier(0)

        # Projection head; the final layer has no activation.
        self.fc1 = nn.Linear(self.in_features, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 64)
        self.relu = nn.ReLU()

    def forward(self, x, return_all=False):
        """Return the 64-d output, or every intermediate level when `return_all`.

        With `return_all=True` the result is a dict keyed by
        'backbone', '512', '256', '128' and '64' (in that order).
        """
        levels = {'backbone': self.backbone(x)}
        levels['512'] = self.relu(self.fc1(levels['backbone']))
        levels['256'] = self.relu(self.fc2(levels['512']))
        levels['128'] = self.relu(self.fc3(levels['256']))
        levels['64'] = self.fc4(levels['128'])
        return levels if return_all else levels['64']

In [None]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BackboneEmbedder(model_name="inception_v4").to(device)
model.eval()

# Take a few real images from the validation loader. The previous version
# stacked a global `images`, which at this point is not a list of image tensors
# (the only earlier assignment is the list of file paths from the dataset scan),
# so the cell failed on a fresh kernel.
images_batch, _ = next(iter(val_loader))
images_tensor = images_batch[:4].to(device)

# Forward pass only: no gradients needed for a shape check.
with torch.no_grad():
    outputs = model(images_tensor, return_all=True)

# Show the shape (and values) of every embedding level.
for key, tensor in outputs.items():
    print(f"{key}: shape = {tensor.shape}")
    print(tensor)

In [None]:
pip install pytorch_metric_learning

In [None]:
import torch

# Directory where the trained model was saved.
save_dir = "/content/drive/MyDrive/Mendoza_Serey(2025)/Modelos/mri"
# NOTE(review): the training cell in this notebook saves 'best_model_triplet.pth';
# confirm that the '_cifar' file is really the checkpoint intended here.
model_path = os.path.join(save_dir, 'best_model_triplet_cifar.pth')

# Create the model instance.
# NOTE(review): the training cell re-creates `model` from scratch, so the weights
# loaded here do not carry over into training — confirm this cell is still needed.
model = BackboneEmbedder(model_name="efficientnet_b0").to(device)

# Load the trained weights.
model.load_state_dict(torch.load(model_path, map_location=device))

# Switch to evaluation mode (optional, but recommended when training will not continue).
model.eval()

print("Modelo cargado correctamente desde:", model_path)


In [None]:
import matplotlib.pyplot as plt
import os
from tqdm.auto import tqdm
import torch
from torch.optim.lr_scheduler import MultiStepLR
from pytorch_metric_learning.losses import TripletMarginLoss
from pytorch_metric_learning.miners import TripletMarginMiner
from pytorch_metric_learning.samplers import MPerClassSampler

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Base model
model = BackboneEmbedder(model_name="efficientnet_b0")
model = model.to(device)

# --- Miner and loss share the same margin ---
margin = 0.2
miner = TripletMarginMiner(margin=margin, type_of_triplets="hard")
criterion = TripletMarginLoss(margin=margin)

# Adam with a step schedule: LR is multiplied by 0.1 at epochs 20 and 40.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
scheduler = MultiStepLR(optimizer, milestones=[20, 40], gamma=0.1)

# Checkpoint location and training / early-stopping bookkeeping.
save_dir = "/content/drive/MyDrive/Mendoza_Serey(2025)/Modelos/mri"
num_epochs, patience = 50, 10
best_val_loss = float('inf')
epochs_no_improve = 0
train_losses, val_losses = [], []

# --- Sampler for class-balanced training ---
# Read the labels from the path DataFrame behind the Subset. Iterating
# `train_ds` for this would open, decode and augment every image only to throw
# the pixels away.
_base_ds = train_ds.dataset
train_labels = (
    _base_ds.df['category'].iloc[train_ds.indices].map(_base_ds.class_to_idx).tolist()
)
sampler = MPerClassSampler(train_labels, m=4, length_before_new_iter=len(train_ds))
train_loader = torch.utils.data.DataLoader(train_ds, batch_size=32, sampler=sampler)

# --- Training ---
# One pass over train_loader and val_loader per epoch, with checkpointing on the
# best validation loss and early stopping after `patience` epochs without improvement.
for epoch in tqdm(range(1, num_epochs + 1), desc="Epochs"):
    model.train()
    running_loss = 0.0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()

        # L2-normalise the embeddings before mining and computing the loss.
        embeddings = model(images)
        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)

        hard_triplets = miner(embeddings, labels)

        # Skip the batch when the miner returns no hard triplets.
        if len(hard_triplets[0]) == 0:
            continue

        loss = criterion(embeddings, labels, hard_triplets)
        loss.backward()
        optimizer.step()

        # Accumulate the batch loss weighted by batch size.
        running_loss += loss.item() * images.size(0)

        # Extra per-batch diagnostics during the first epoch only.
        if epoch == 1:
            print(f"[Depuraci√≥n] Triplets encontrados: {len(hard_triplets[0])}")
            print(f"[Depuraci√≥n] Loss en batch: {loss.item():.4f}")
            print(f"[Depuraci√≥n] Embedding std: {embeddings.std().item():.4f}")

    # NOTE(review): the divisor is the full dataset size, so skipped batches
    # count as zero loss — confirm this is the intended averaging.
    train_loss = running_loss / len(train_ds)
    train_losses.append(train_loss)

    # --- Validation ---
    model.eval()
    val_running = 0.0
    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            embeddings = model(images)
            embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)

            hard_triplets = miner(embeddings, labels)
            if len(hard_triplets[0]) == 0:
                continue

            v_loss = criterion(embeddings, labels, hard_triplets).item()
            val_running += v_loss * images.size(0)

    # Same averaging as for training: batches without hard triplets add nothing.
    val_loss = val_running / len(val_ds)
    val_losses.append(val_loss)

    scheduler.step()

    tqdm.write(
        f"Epoch {epoch}/{num_epochs}  "
        f"Train Loss: {train_loss:.4f}  "
        f"Val Loss: {val_loss:.4f}  "
        f"LR: {scheduler.get_last_lr()[0]:.2e}"
    )

    # Early stopping + checkpoint: save the weights whenever validation improves.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
        os.makedirs(save_dir, exist_ok=True)
        torch.save(model.state_dict(), os.path.join(save_dir, 'best_model_triplet.pth'))
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            tqdm.write(f"Deteniendo tras {patience} √©pocas sin mejora.")
            break

# --- Plot the loss curves ---
# One x position per completed epoch (early stopping may cut the run short).
epochs = list(range(1, len(train_losses) + 1))
_fig, _ax = plt.subplots()
for _series, _name in ((train_losses, 'Train Loss'), (val_losses, 'Val Loss')):
    _ax.plot(epochs, _series, label=_name)
_ax.set_xlabel('Epoch')
_ax.set_ylabel('Loss')
_ax.legend()
_ax.grid(True)
_fig.tight_layout()
plt.show()


In [None]:
import os
import torch

# Path to the saved checkpoint. The file name must match what the training cell
# writes ('best_model_triplet.pth'); the previous '_cifar' name pointed at a
# file this notebook never creates.
save_model_dir = "/content/drive/MyDrive/Mendoza_Serey(2025)/Modelos/mri"
model_path = os.path.join(save_model_dir, 'best_model_triplet.pth')

# Build the model with the same backbone used for training and move it to the device.
model = BackboneEmbedder(model_name="efficientnet_b0").to(device)

# Load the model weights.
model.load_state_dict(torch.load(model_path, map_location=device))

# Switch to evaluation mode.
model.eval()

print(f"Modelo cargado correctamente desde: {model_path}")



In [None]:
import os
import pandas as pd
import torch.nn.functional as F
from tqdm.auto import tqdm
import numpy as np

# === Ruta donde guardar los archivos
save_dir = "/content/drive/MyDrive/Mendoza_Serey(2025)/embeddings"
os.makedirs(save_dir, exist_ok=True)

def extract_embeddings_dict(loader, model):
    """Run `model` over `loader` and collect every embedding level per sample.

    Args:
        loader: iterable of `(images, labels)` tensor batches.
        model: module whose `model(x, return_all=True)` returns a dict with the
            keys 'backbone', '512', '256', '128' and '64'.

    Returns:
        DataFrame with one row per sample: one column per embedding level
        (each cell a 1-D numpy vector) plus a 'label' column.
    """
    level_keys = ('backbone', '512', '256', '128', '64')
    collected = {key: [] for key in level_keys}
    collected['label'] = []

    model.eval()
    with torch.no_grad():
        for imgs, labels in tqdm(loader):
            outputs = model(imgs.to(device), return_all=True)
            for key in level_keys:
                # extend() over a 2-D array appends one row vector per sample.
                collected[key].extend(outputs[key].cpu().numpy())
            collected['label'].extend(labels.cpu().numpy())

    return pd.DataFrame(collected)

# === Run and save ===
def _deterministic_loader(subset, batch_size=64):
    """Loader over the rows of `subset`, un-augmented and in fixed order.

    `subset` is a torch Subset of a BrainDataset. The same row positions are
    re-wrapped over a copy of the data that uses `val_transform`, so each sample
    is embedded exactly once and without random augmentation.
    """
    clean_ds = BrainDataset(subset.dataset.df, transform=val_transform)
    return DataLoader(
        torch.utils.data.Subset(clean_ds, subset.indices),
        batch_size=batch_size,
        shuffle=False,
        num_workers=2,
        pin_memory=True,
    )

# `train_loader` must not be used here: it draws samples through a random
# class-balanced sampler and applies training augmentation, so the saved
# "train" embeddings would contain repeats, miss images and change between runs.
# NOTE: these assignments overwrite the path DataFrames `df_train` / `df_test`
# created by the split cell; re-run that cell before rebuilding datasets.
df_train = extract_embeddings_dict(_deterministic_loader(train_ds), model)
df_val   = extract_embeddings_dict(_deterministic_loader(val_ds), model)
df_test  = extract_embeddings_dict(test_loader, model)

df_train.to_pickle(os.path.join(save_dir, "embeddings_vectors_train_mri.pkl"))
df_val.to_pickle(os.path.join(save_dir, "embeddings_vectors_val_mri.pkl"))
df_test.to_pickle(os.path.join(save_dir, "embeddings_vectors_test_mri.pkl"))

print("Embeddings guardados correctamente en:", save_dir)


# Evaluación de otros clasificadores

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

# === Paths
base_dir = "/content/drive/MyDrive/Mendoza_Serey(2025)/embeddings"
df_train = pd.read_pickle(os.path.join(base_dir, "embeddings_vectors_train_mri.pkl"))
df_val   = pd.read_pickle(os.path.join(base_dir, "embeddings_vectors_val_mri.pkl"))
df_test  = pd.read_pickle(os.path.join(base_dir, "embeddings_vectors_test_mri.pkl"))

# === Embedding levels to evaluate; train and val are merged for fitting
dims = ['backbone', '512', '256', '128', '64']
df_all_train = pd.concat([df_train, df_val], ignore_index=True)

# === Encode labels
le = LabelEncoder()
df_all_train['label_enc'] = le.fit_transform(df_all_train['label'])
df_test['label_enc'] = le.transform(df_test['label'])
n_classes = len(le.classes_)

# === Classifiers to evaluate (grid-searched with 5-fold CV where a grid is given)
classifiers = {
    'Logistic Regression': GridSearchCV(LogisticRegression(max_iter=1000, solver='liblinear'),
                                        {'C': [0.01, 0.1, 1, 10, 100]}, cv=5),

    'KNN': GridSearchCV(KNeighborsClassifier(),
                        {'n_neighbors': list(range(1, 16))}, cv=5),

    'Random Forest': GridSearchCV(RandomForestClassifier(random_state=42),
                                  {'n_estimators': [50, 100], 'max_depth': [None, 10, 20]}, cv=5),

    'SVM': GridSearchCV(SVC(probability=True),
                        {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}, cv=5),

    'Naive Bayes': GaussianNB(),

    'XGBoost': XGBClassifier(
        use_label_encoder=False,
        eval_metric='mlogloss',
        tree_method='hist',
        random_state=42,
        eta=0.01,
        max_depth=6,
        n_estimators=100,
    )
}

# === Overall results, one record per (dimension, classifier)
results = []

for dim in dims:
    print(f"\n === Evaluando dimensi√≥n: {dim} ===")
    X_train = np.stack(df_all_train[dim].values)
    y_train = df_all_train['label_enc'].values
    X_test = np.stack(df_test[dim].values)
    y_test = df_test['label_enc'].values

    for name, clf in classifiers.items():
        print(f"\n Entrenando: {name}...")

        # Named `fitted_clf`, not `model`: `model` is the torch embedder used by
        # earlier cells and must not be replaced by a scikit-learn estimator.
        fitted_clf = clf.fit(X_train, y_train)
        y_pred = fitted_clf.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        report_dict = classification_report(
            y_test, y_pred,
            target_names=le.classes_.astype(str),
            digits=6,
            output_dict=True
        )
        report_str = classification_report(
            y_test, y_pred,
            target_names=le.classes_.astype(str),
            digits=6,
            output_dict=False
        )

        print(f"\n===== Reporte {name} - Dimensi√≥n: {dim} =====")
        print(report_str)

        results.append({
            'dimension': dim,
            'model': name,
            'accuracy': acc,
            'macro_f1': report_dict['macro avg']['f1-score'],
            'macro_precision': report_dict['macro avg']['precision'],
            'macro_recall': report_dict['macro avg']['recall']
        })

# === Show the summary table
df_results = pd.DataFrame(results)
pd.set_option("display.precision", 6)
print("\n Tabla resumen por modelo y dimensi√≥n:\n")
print(df_results)

# === Save
df_results.to_csv(os.path.join(base_dir, "multi_model_metrics_summary.csv"), index=False)




In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.mixture import GaussianMixture
from sklearn.metrics import classification_report, accuracy_score

# === Paths
base_dir = "/content/drive/MyDrive/Mendoza_Serey(2025)/embeddings"
df_train = pd.read_pickle(os.path.join(base_dir, "embeddings_vectors_train_mri.pkl"))
df_val   = pd.read_pickle(os.path.join(base_dir, "embeddings_vectors_val_mri.pkl"))
df_test  = pd.read_pickle(os.path.join(base_dir, "embeddings_vectors_test_mri.pkl"))

# === Embedding levels to evaluate
dims = ['backbone', '512', '256', '128', '64']

# === Encoded labels
le = LabelEncoder()
df_train['y'] = le.fit_transform(df_train['label'])
df_val['y'] = le.transform(df_val['label'])
df_test['y'] = le.transform(df_test['label'])
n_classes = len(le.classes_)

# === Methods and number of mixture components
methods = ['SDGM-D', 'DGMMC-S']
components_list = [1, 2]

# === Summary records, one per (dimension, method, components)
summary = []

# === Evaluate
for dim in dims:
    print(f"\n Dimensi√≥n: {dim}")

    X_train = np.stack(df_train[dim].values)
    y_train = df_train['y'].values
    X_test  = np.stack(df_test[dim].values)
    y_test  = df_test['y'].values

    for method in methods:
        for n_comp in components_list:
            if method == 'SDGM-D':
                # One GMM per class; predict the class whose GMM scores the sample highest.
                gmms = []
                for c in range(n_classes):
                    gmm = GaussianMixture(n_components=n_comp, covariance_type='full', random_state=42)
                    gmm.fit(X_train[y_train == c])
                    gmms.append(gmm)

                log_probs = np.array([gmm.score_samples(X_test) for gmm in gmms]).T
                y_pred = np.argmax(log_probs, axis=1)

            elif method == 'DGMMC-S':
                # One shared GMM; each component is labelled with the majority
                # class of the training points assigned to it.
                gmm = GaussianMixture(n_components=n_comp * n_classes, covariance_type='full', random_state=42)
                gmm.fit(X_train)

                component_labels = np.zeros(gmm.n_components, dtype=int)
                comp_assignments = gmm.predict(X_train)
                # `comp_id`, not `k`: `k` is the examples-per-class global set by
                # the DataLoader cell and must not be overwritten here.
                for comp_id in range(gmm.n_components):
                    indices = np.where(comp_assignments == comp_id)[0]
                    if len(indices) > 0:
                        labels_k = y_train[indices]
                        most_common = np.bincount(labels_k).argmax()
                        component_labels[comp_id] = most_common
                    else:
                        # Components with no training points default to class 0.
                        component_labels[comp_id] = 0

                comp_preds = gmm.predict(X_test)
                y_pred = component_labels[comp_preds]

            acc = accuracy_score(y_test, y_pred)
            report_dict = classification_report(
                y_test,
                y_pred,
                target_names=le.classes_.astype(str),
                digits=6,
                output_dict=True
            )
            report_str = classification_report(
                y_test,
                y_pred,
                target_names=le.classes_.astype(str),
                digits=6,
                output_dict=False
            )

            print(f"\n===== Reporte {method} ({n_comp} comp) - Dimensi√≥n: {dim} =====")
            print(report_str)

            # Store the summary record
            summary.append({
                'dimension': dim,
                'method': method,
                'components': n_comp,
                'accuracy': acc,
                'macro_f1': report_dict['macro avg']['f1-score'],
                'macro_precision': report_dict['macro avg']['precision'],
                'macro_recall': report_dict['macro avg']['recall']
            })

# === Show the summary table
df_summary = pd.DataFrame(summary)
pd.set_option("display.precision", 6)
print("\n Tabla resumen (accuracy, macro f1, etc):\n")
print(df_summary)

# === Save CSV (optional)
df_summary.to_csv(os.path.join(base_dir, "gmm_metrics_summary.csv"), index=False)



# Clasificador jerárquico

## Data augmentation en embeddings

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

# === Absolute paths to the pickles on Google Drive
base_dir = "/content/drive/MyDrive/Mendoza_Serey(2025)/embeddings"
pickle_train_path = os.path.join(base_dir, "embeddings_vectors_train_mri.pkl")
pickle_val_path   = os.path.join(base_dir, "embeddings_vectors_val_mri.pkl")

# === Load the embeddings
df_train = pd.read_pickle(pickle_train_path)
df_val   = pd.read_pickle(pickle_val_path)

# === Parameters
dims = ['backbone', '512', '256', '128', '64']
N_NEIGHBORS = 5
EPSILON = 0.1
VAL_FRACTION = 1  # Use 0.1 to generate only 10% of the validation-set size
AUG_SEED = 42

# Seed NumPy's global generator so the selected rows (and the mixing done
# afterwards with np.random) are the same on every run.
np.random.seed(AUG_SEED)

# === Select the training rows that will be augmented
N = len(df_train)
# Sampling without replacement cannot draw more rows than exist.
n_aug = min(int(len(df_val) * VAL_FRACTION), N)
sel_idx = np.random.choice(N, n_aug, replace=False)
Y_all = df_train['label'].values
Y_sel = Y_all[sel_idx]

# === Extract the vectors of every embedding level
X_all = {dim: np.stack(df_train[dim].values) for dim in dims}
X_sel = {dim: X_all[dim][sel_idx] for dim in dims}

# === Nearest neighbours, computed on the backbone level.
# One extra neighbour is requested because each selected row is itself part of
# the fitted set; the caller drops the first neighbour.
knn = NearestNeighbors(n_neighbors=N_NEIGHBORS + 1).fit(X_all['backbone'])
neighbors = knn.kneighbors(X_sel['backbone'], return_distance=False)

# === Funci√≥n de mezcla tipo Mixup
def apply_mixup(x_i, x_j, alpha=0.4, lam=None):
    """Mixup-style convex combination `lam * x_i + (1 - lam) * x_j`.

    Args:
        x_i, x_j: arrays of the same shape.
        alpha: Beta(alpha, alpha) parameter used when `lam` is not given.
        lam: optional mixing coefficient; pass it to reuse the same coefficient
            across several calls. When None a fresh value is drawn.

    Returns:
        The mixed array.
    """
    if lam is None:
        lam = np.random.beta(alpha, alpha)
    return lam * x_i + (1 - lam) * x_j

def build_augmented_df(name, strategy_fn, X_source, alpha=0.4):
    """Build a mixed-embedding dataset and pickle it under `base_dir`.

    For each of the `n_aug` selected rows, `strategy_fn(i)` names a partner row
    `j` in `X_source` (or None to skip). A single mixing coefficient is drawn per
    sample and applied to every embedding level, so all levels of one synthetic
    sample describe the same mixture. The label is that of the dominant
    component (the selected row when lam >= 0.5, otherwise the partner).

    Args:
        name: suffix of the output file `embedding_vectors_augmented_{name}.pkl`.
        strategy_fn: callable `i -> j` (index into `X_source` / `Y_all`) or None.
        X_source: dict level -> 2-D array holding the partner vectors.
        alpha: Beta(alpha, alpha) parameter for the mixing coefficient.

    Reads the globals `dims`, `n_aug`, `X_sel`, `Y_sel`, `Y_all` and `base_dir`.
    """
    df_aug = {'label': [], **{dim: [] for dim in dims}}

    for i in range(n_aug):
        j = strategy_fn(i)
        if j is None:
            continue

        # One coefficient per synthetic sample, shared by all levels.
        lam = np.random.beta(alpha, alpha)
        for dim in dims:
            df_aug[dim].append(apply_mixup(X_sel[dim][i], X_source[dim][j], lam=lam))

        df_aug['label'].append(Y_sel[i] if lam >= 0.5 else Y_all[j])

    df_out = pd.DataFrame(df_aug)

    # === Save to Drive
    save_path = os.path.join(base_dir, f'embedding_vectors_augmented_{name}.pkl')
    df_out.to_pickle(save_path)
    print(f" Guardado: {save_path}")

# === Ejecutar Neighborhood Mixup
build_augmented_df('neighborhood_mixup', lambda i: np.random.choice(neighbors[i][1:]), X_all)

In [None]:
import os
import numpy as np
import pandas as pd
import joblib
from tqdm.auto import tqdm
from sklearn.preprocessing import Normalizer, LabelEncoder
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import NearestNeighbors
from scipy import linalg

# === Paths
BASE_DIR = "/content/drive/MyDrive/Mendoza_Serey(2025)/embeddings"
# NOTE(review): VERSION only feeds CHECKPOINT_DIR; the output file names further
# down hard-code "version_5" instead of using this constant.
VERSION = "version_5"
CHECKPOINT_DIR = f"/content/drive/MyDrive/Mendoza_Serey(2025)/{VERSION}"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# === Input pickles (train / val / test embeddings plus the mixup-augmented set)
pickle_train_path = os.path.join(BASE_DIR, "embeddings_vectors_train_mri.pkl")
pickle_val_path   = os.path.join(BASE_DIR, "embeddings_vectors_val_mri.pkl")
pickle_test_path  = os.path.join(BASE_DIR, "embeddings_vectors_test_mri.pkl")
pickle_aug_path   = os.path.join(BASE_DIR, "embedding_vectors_augmented_neighborhood_mixup.pkl")

# === Parameters
dims = ['backbone', '512', '256', '128', '64']  # embedding levels to process
N_NEIGHBORS = 10                                # neighbours appended to each feature row
MAX_COMPONENTS_PER_CLASS = 10                   # upper bound of the per-class BIC search
normalizer = Normalizer(norm='l2')
le = LabelEncoder()

# === Funci√≥n para normalizar matrices por fila
def row_normalize(matrix):
    """Divide every row of `matrix` by its row sum.

    A small constant (1e-10) is added to each sum so all-zero rows do not
    divide by zero.
    """
    return matrix / (np.sum(matrix, axis=1, keepdims=True) + 1e-10)

# === Load pickles; the augmented rows are appended to the training set
df_train = pd.read_pickle(pickle_train_path)
df_val   = pd.read_pickle(pickle_val_path)
df_test  = pd.read_pickle(pickle_test_path)
df_aug   = pd.read_pickle(pickle_aug_path)
df_train_aug = pd.concat([df_train, df_aug], ignore_index=True)

# === Labels (encoder fitted on train + augmented labels and saved for reuse)
y_train_raw = df_train_aug['label'].values
y_test_raw  = df_test['label'].values
y_val_raw   = df_val['label'].values
y_train_enc = le.fit_transform(y_train_raw)
y_test_enc  = le.transform(y_test_raw)
y_val_enc   = le.transform(y_val_raw)
joblib.dump(le, os.path.join(CHECKPOINT_DIR, 'label_encoder_version_5.pkl'))

# === Process each embedding level
for dim in dims:
    print(f"\n Procesando dimensi√≥n: {dim}")

    # Output paths
    ftr_tr_path = os.path.join(CHECKPOINT_DIR, f'ftr_train_{dim}_version_5.npy')
    ftr_te_path = os.path.join(CHECKPOINT_DIR, f'ftr_test_{dim}_version_5.npy')
    ftr_val_path = os.path.join(CHECKPOINT_DIR, f'ftr_val_{dim}_version_5.npy')
    gmm_path    = os.path.join(CHECKPOINT_DIR, f'gmm_global_{dim}_version_5.pkl')
    map_path    = os.path.join(CHECKPOINT_DIR, f'component_to_class_{dim}_version_5.pkl')

    # Resume support: skip levels whose five artefacts already exist.
    if all(os.path.exists(p) for p in [ftr_tr_path, ftr_te_path, ftr_val_path, gmm_path, map_path]):
        print(f" Dimensi√≥n {dim} ya procesada. Saltando...")
        continue

    # === Data, L2-normalised per row
    X_train = np.stack(df_train_aug[dim].values)
    X_test  = np.stack(df_test[dim].values)
    X_val   = np.stack(df_val[dim].values)

    X_train_norm = normalizer.fit_transform(X_train)
    X_test_norm  = normalizer.transform(X_test)
    X_val_norm   = normalizer.transform(X_val)

    print(" Entrenando GMMs por clase con BIC...")
    all_means, all_covs, all_weights, comp_to_class = [], [], [], []
    for cls in np.unique(y_train_enc):
        X_cls = X_train_norm[y_train_enc == cls]
        # Pick the component count with the lowest BIC for this class.
        # The loop variable is not called `k`, so the examples-per-class global
        # `k` from the DataLoader cell is left untouched.
        best_gmm, best_bic = None, np.inf
        for n_comp in range(1, MAX_COMPONENTS_PER_CLASS + 1):
            candidate = GaussianMixture(n_components=n_comp, covariance_type='full', reg_covar=1e-6, random_state=42)
            candidate.fit(X_cls)
            bic = candidate.bic(X_cls)
            if bic < best_bic:
                best_bic = bic
                best_gmm = candidate
        # Keep only components that actually own training points of the class;
        # the weight is the fraction of the class assigned to the component.
        preds = best_gmm.predict(X_cls)
        for i in range(best_gmm.n_components):
            idx = preds == i
            if np.sum(idx) == 0:
                continue
            all_means.append(best_gmm.means_[i])
            all_covs.append(best_gmm.covariances_[i])
            all_weights.append(np.mean(idx))
            comp_to_class.append(cls)

    print(" Consolidando GMM global y mapa componente->clase...")
    # Assemble one mixture from the per-class components by setting the fitted
    # attributes directly instead of calling fit().
    gmm = GaussianMixture(n_components=len(all_means), covariance_type='full', reg_covar=1e-6)
    gmm.weights_ = np.array(all_weights) / np.sum(all_weights)
    gmm.means_ = np.array(all_means)
    gmm.covariances_ = np.array(all_covs)
    gmm.precisions_cholesky_ = np.linalg.cholesky(np.linalg.inv(gmm.covariances_))

    joblib.dump(gmm, gmm_path)
    joblib.dump(comp_to_class, map_path)

    # === Class memberships: sum the component posteriors of each class
    def get_class_probs(X):
        probs = gmm.predict_proba(X)
        class_probs = np.zeros((X.shape[0], len(np.unique(y_train_enc))))
        for c, cls in enumerate(comp_to_class):
            class_probs[:, cls] += probs[:, c]
        return row_normalize(class_probs)

    print(" Buscando vecinos en set de entrenamiento...")
    knn = NearestNeighbors(n_neighbors=N_NEIGHBORS)
    knn.fit(X_train_norm)

    def enrich(X_query, X_base):
        """Feature rows: [x, class_probs(x), then (neighbour, class_probs(neighbour)) per neighbour].

        Class probabilities are computed in two batched calls (queries and base
        set) and gathered by neighbour index, instead of one predict_proba call
        per row and per neighbour; the row layout is unchanged.
        """
        _, idxs = knn.kneighbors(X_query)
        query_probs = get_class_probs(X_query)
        base_probs = get_class_probs(X_base)
        # Shape (n_query, n_neighbors, dim + n_classes): vector then probabilities.
        neighbor_block = np.concatenate([X_base[idxs], base_probs[idxs]], axis=2)
        return np.concatenate(
            [X_query, query_probs, neighbor_block.reshape(X_query.shape[0], -1)],
            axis=1,
        )

    print(" Enriqueciendo caracter√≠sticas...")
    ftr_tr  = enrich(X_train_norm, X_train_norm)
    ftr_te  = enrich(X_test_norm,  X_train_norm)
    ftr_val = enrich(X_val_norm,   X_train_norm)

    np.save(ftr_tr_path,  ftr_tr)
    np.save(ftr_te_path,  ftr_te)
    np.save(ftr_val_path, ftr_val)
    print(f" Guardado: {ftr_tr_path}, {ftr_te_path}, {ftr_val_path}")




In [None]:
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.preprocessing import LabelEncoder

BASE_DIR = "/content/drive/MyDrive/Mendoza_Serey(2025)/embeddings"
CHECKPOINT_DIR = "/content/drive/MyDrive/Mendoza_Serey(2025)/version_5"

# np.save / joblib.dump below do not create missing folders.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Load the pickled embedding tables; only their 'label' column is read here.
df_train = pd.read_pickle(os.path.join(BASE_DIR, "embeddings_vectors_train_mri.pkl"))
df_aug   = pd.read_pickle(os.path.join(BASE_DIR, "embedding_vectors_augmented_neighborhood_mixup.pkl"))
df_val   = pd.read_pickle(os.path.join(BASE_DIR, "embeddings_vectors_val_mri.pkl"))
df_test  = pd.read_pickle(os.path.join(BASE_DIR, "embeddings_vectors_test_mri.pkl"))

# NOTE: five stray, indented path assignments (ftr_*, gmm_global_*,
# component_to_class_*) used to follow here. They referenced `dim`, which this
# cell never defines, and their indentation made the whole cell fail with an
# IndentationError. None of those names is read below, so they were dropped.

# Original training rows followed by the augmented ones.
df_train_aug = pd.concat([df_train, df_aug], ignore_index=True)

# Fit the encoder on train + augmentation and reuse it for val and test, so
# every split shares one label -> integer mapping.
le = LabelEncoder()
y_train_enc = le.fit_transform(df_train_aug['label'].values)
y_val_enc   = le.transform(df_val['label'].values)
y_test_enc  = le.transform(df_test['label'].values)

# Persist the encoded labels and the fitted encoder.
np.save(os.path.join(CHECKPOINT_DIR, 'y_train.npy'), y_train_enc)
np.save(os.path.join(CHECKPOINT_DIR, 'y_val.npy'),   y_val_enc)
np.save(os.path.join(CHECKPOINT_DIR, 'y_test.npy'),  y_test_enc)
joblib.dump(le, os.path.join(CHECKPOINT_DIR, 'label_encoder_version_5.pkl'))

print(" Etiquetas recuperadas y guardadas exitosamente.")


In [None]:
import os
import numpy as np
import pandas as pd
import joblib

# === Paths and embedding levels to concatenate
CHECKPOINT_DIR = '/content/drive/MyDrive/Mendoza_Serey(2025)/version_5'
dims = ['backbone', '512', '256', '128', '64']

# === Load the embedding tables (only their 'label' column is used here)
df_train = pd.read_pickle(os.path.join(CHECKPOINT_DIR, '../embeddings/embeddings_vectors_train_mri.pkl'))
df_test  = pd.read_pickle(os.path.join(CHECKPOINT_DIR, '../embeddings/embeddings_vectors_test_mri.pkl'))
df_val   = pd.read_pickle(os.path.join(CHECKPOINT_DIR, '../embeddings/embeddings_vectors_val_mri.pkl'))
df_aug   = pd.read_pickle(os.path.join(CHECKPOINT_DIR, '../embeddings/embedding_vectors_augmented_neighborhood_mixup.pkl'))

# Training rows followed by the augmented rows.
df_train_aug = pd.concat([df_train, df_aug], ignore_index=True)

# === Encode labels with the previously saved encoder
le = joblib.load(os.path.join(CHECKPOINT_DIR, 'label_encoder_version_5.pkl'))
y_train = le.transform(df_train_aug['label'].values)
y_test  = le.transform(df_test['label'].values)


# === Join the enriched features of every embedding level
def concat_features(split):
    """Load ftr_{split}_{dim}_version_5.npy for each entry of dims and stack them column-wise."""
    return np.hstack([
        np.load(os.path.join(CHECKPOINT_DIR, f'ftr_{split}_{dim}_version_5.npy'))
        for dim in dims
    ])


print("🔄 Concatenando características enriquecidas...")
X_train = concat_features("train")
X_test  = concat_features("test")

# Fail before writing anything if features and labels do not line up; the
# training cell would otherwise only discover the mismatch after loading.
if X_train.shape[0] != len(y_train) or X_test.shape[0] != len(y_test):
    raise ValueError(
        "Filas y etiquetas no coinciden: "
        f"X_train={X_train.shape[0]} vs y_train={len(y_train)}, "
        f"X_test={X_test.shape[0]} vs y_test={len(y_test)}"
    )

# === Save the matrices used for training
np.save(os.path.join(CHECKPOINT_DIR, 'X_train_version_5.npy'), X_train)
np.save(os.path.join(CHECKPOINT_DIR, 'X_test_version_5.npy'),  X_test)
np.save(os.path.join(CHECKPOINT_DIR, 'y_train_version_5.npy'), y_train)
np.save(os.path.join(CHECKPOINT_DIR, 'y_test_version_5.npy'),  y_test)

print("✅ Datos jerárquicos concatenados y guardados.")
print(f"   X_train shape: {X_train.shape}")
print(f"   X_test shape : {X_test.shape}")


In [None]:
import os
import numpy as np
import joblib
import xgboost as xgb
import pandas as pd
from sklearn.metrics import classification_report

# === Working folder on Google Drive
CHECKPOINT_DIR = '/content/drive/MyDrive/Mendoza_Serey(2025)/version_5'
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# === Read back the concatenated features, the labels and the label encoder
X_train, X_test, y_train, y_test = (
    np.load(os.path.join(CHECKPOINT_DIR, array_file))
    for array_file in (
        'X_train_version_5.npy',
        'X_test_version_5.npy',
        'y_train_version_5.npy',
        'y_test_version_5.npy',
    )
)
le = joblib.load(os.path.join(CHECKPOINT_DIR, 'label_encoder_version_5.pkl'))

# === Wrap both splits as DMatrix objects
dtrain_full = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

# === Booster configuration: task/runtime settings first ...
best_params = {
    "objective": "multi:softprob",
    "num_class": len(le.classes_),
    "tree_method": "hist",
    "device": "cuda",  # the author notes this can be ignored when no GPU is present
    "eval_metric": "mlogloss",
    "seed": 42,
}
# ... then the tuned values (labelled by the author as Optuna trial 0).
best_params.update({
    "eta": 0.0013292918943162175,
    "max_depth": 8,
    "min_child_weight": 8,
    "subsample": 0.7993292420985183,
    "colsample_bytree": 0.5780093202212182,
    "gamma": 1.7699302940633311e-07,
    "alpha": 2.9152036385288193e-08,
    "lambda": 0.08499808989182997,
    "max_bin": 64,
})

# === Train on the complete training matrix
# NOTE(review): the only evaluation set is the training data itself, so the
# early-stopping criterion tracks training loss rather than held-out loss.
bst_final = xgb.train(
    params=best_params,
    dtrain=dtrain_full,
    num_boost_round=1000,
    evals=[(dtrain_full, "train")],
    early_stopping_rounds=20,
    verbose_eval=True,
)

# === Persist the trained booster
model_path = os.path.join(CHECKPOINT_DIR, 'xgboost_booster_version_5_optuna_retrain.json')
bst_final.save_model(model_path)
print(f" Modelo guardado en: {model_path}")

# === Predict the test split and take the most probable class per row
preds_prob = bst_final.predict(dtest)
preds = preds_prob.argmax(axis=1)

# === Per-class metrics as a table
report = classification_report(
    y_test,
    preds,
    target_names=le.classes_.astype(str),
    output_dict=True,
)
df_report = pd.DataFrame(report).T

# === Write the report next to the model
report_path = os.path.join(CHECKPOINT_DIR, 'report_version_5_optuna_retrain.csv')
df_report.to_csv(report_path)

print(f" Reporte guardado en: {report_path}")
print(df_report)


# Try 2: XGBoost on the backbone-only enriched features

In [None]:
import os
import numpy as np
import pandas as pd
import joblib

# === Paths and the single embedding level used in this experiment
CHECKPOINT_DIR = '/content/drive/MyDrive/Mendoza_Serey(2025)/version_5'
DIM_BACKBONE = 'backbone'

# === Load the embedding tables (only their 'label' column is used here)
df_train = pd.read_pickle(os.path.join(CHECKPOINT_DIR, '../embeddings/embeddings_vectors_train_mri.pkl'))
df_test  = pd.read_pickle(os.path.join(CHECKPOINT_DIR, '../embeddings/embeddings_vectors_test_mri.pkl'))
df_val   = pd.read_pickle(os.path.join(CHECKPOINT_DIR, '../embeddings/embeddings_vectors_val_mri.pkl'))
df_aug   = pd.read_pickle(os.path.join(CHECKPOINT_DIR, '../embeddings/embedding_vectors_augmented_neighborhood_mixup.pkl'))

# Training rows followed by the augmented rows.
df_train_aug = pd.concat([df_train, df_aug], ignore_index=True)

# === Encode labels with the previously saved encoder
le = joblib.load(os.path.join(CHECKPOINT_DIR, 'label_encoder_version_5.pkl'))
y_train = le.transform(df_train_aug['label'].values)
y_test  = le.transform(df_test['label'].values)


# === Load only the backbone-level enriched features
def load_backbone_features(split):
    """Return the array stored in ftr_{split}_{DIM_BACKBONE}_version_5.npy."""
    fpath = os.path.join(CHECKPOINT_DIR, f'ftr_{split}_{DIM_BACKBONE}_version_5.npy')
    return np.load(fpath)


print("🔄 Cargando características solo de dimensión 'backbone'...")
X_train_backbone = load_backbone_features("train")
X_test_backbone  = load_backbone_features("test")

# Fail before writing anything if features and labels do not line up.
if X_train_backbone.shape[0] != len(y_train) or X_test_backbone.shape[0] != len(y_test):
    raise ValueError(
        "Filas y etiquetas no coinciden: "
        f"X_train_backbone={X_train_backbone.shape[0]} vs y_train={len(y_train)}, "
        f"X_test_backbone={X_test_backbone.shape[0]} vs y_test={len(y_test)}"
    )

# === Save the backbone-only copies under their own file names
np.save(os.path.join(CHECKPOINT_DIR, 'X_train_backbone_version_5.npy'), X_train_backbone)
np.save(os.path.join(CHECKPOINT_DIR, 'X_test_backbone_version_5.npy'),  X_test_backbone)
np.save(os.path.join(CHECKPOINT_DIR, 'y_train_backbone_version_5.npy'), y_train)
np.save(os.path.join(CHECKPOINT_DIR, 'y_test_backbone_version_5.npy'),  y_test)

print("✅ Características 'backbone' guardadas.")
print(f"   X_train_backbone shape: {X_train_backbone.shape}")
print(f"   X_test_backbone shape : {X_test_backbone.shape}")


In [None]:
import os
import numpy as np
import joblib
import xgboost as xgb
import pandas as pd
from sklearn.metrics import classification_report

# === Working folder on Google Drive
CHECKPOINT_DIR = '/content/drive/MyDrive/Mendoza_Serey(2025)/version_5'
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# === Load the backbone-only features, the labels and the label encoder
X_train = np.load(os.path.join(CHECKPOINT_DIR, 'X_train_backbone_version_5.npy'))
X_test  = np.load(os.path.join(CHECKPOINT_DIR, 'X_test_backbone_version_5.npy'))
y_train = np.load(os.path.join(CHECKPOINT_DIR, 'y_train_backbone_version_5.npy'))
y_test  = np.load(os.path.join(CHECKPOINT_DIR, 'y_test_backbone_version_5.npy'))
le      = joblib.load(os.path.join(CHECKPOINT_DIR, 'label_encoder_version_5.pkl'))

# === Wrap both splits as DMatrix objects
dtrain_full = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# === Hyper-parameters (labelled by the author as Optuna trial 0)
best_params = {
    "objective": "multi:softprob",
    "num_class": len(le.classes_),
    "tree_method": "hist",
    "device": "cuda",       # the author notes this can be ignored when no GPU is present
    "eval_metric": "mlogloss",
    "seed": 42,
    "eta": 0.0013292918943162175,
    "max_depth": 8,
    "min_child_weight": 8,
    "subsample": 0.7993292420985183,
    "colsample_bytree": 0.5780093202212182,
    "gamma": 1.7699302940633311e-07,
    "alpha": 2.9152036385288193e-08,
    "lambda": 0.08499808989182997,
    "max_bin": 64
}

# === Train on the complete training matrix
# NOTE(review): the only evaluation set is the training data itself, so the
# early-stopping criterion tracks training loss rather than held-out loss.
bst_final = xgb.train(
    best_params,
    dtrain_full,
    num_boost_round=1000,
    evals=[(dtrain_full, "train")],
    early_stopping_rounds=20,
    verbose_eval=True
)

# === Persist the trained booster (backbone variant)
model_path = os.path.join(CHECKPOINT_DIR, 'xgboost_booster_backbone_version_5.json')
bst_final.save_model(model_path)
# Status messages carried mis-decoded bytes in place of the check-mark emoji;
# they are restored here.
print(f"✅ Modelo backbone guardado en: {model_path}")

# === Predict the test split and take the most probable class per row
preds_prob = bst_final.predict(dtest)
preds = np.argmax(preds_prob, axis=1)

# === Per-class metrics as a table
report = classification_report(
    y_test, preds,
    target_names=le.classes_.astype(str),
    output_dict=True
)
df_report = pd.DataFrame(report).transpose()

# === Write the report next to the model (backbone variant)
report_path = os.path.join(CHECKPOINT_DIR, 'report_backbone_version_5.csv')
df_report.to_csv(report_path)

print(f"✅ Reporte backbone guardado en: {report_path}")
print(df_report)


In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import seaborn as sns
import numpy as np

# === Embedding file to visualise: this is the TRAINING split.
# The cell previously stored it in `df_test` and titled the figure
# "Set de Test", which mislabelled the plotted data.
pkl_path = "/content/drive/MyDrive/Mendoza_Serey(2025)/embeddings/embeddings_vectors_train_mri.pkl"

# === Load the embeddings table
df_embeddings = pd.read_pickle(pkl_path)

# === Stack the 64-d vectors into a matrix and collect the labels
X = np.vstack(df_embeddings['64'].tolist())
y = np.array(df_embeddings['label'].tolist())

# === Project to 2-D with t-SNE (seeded for repeatable layouts)
tsne = TSNE(n_components=2, perplexity=30, random_state=42, init='pca')
X_tsne = tsne.fit_transform(X)

# === Plot, coloured by class label
plt.figure(figsize=(10, 8))
sns.scatterplot(x=X_tsne[:, 0], y=X_tsne[:, 1], hue=y, palette='tab10', s=60, alpha=0.8)
plt.title("t-SNE de Embeddings (dim=64) - Set de Entrenamiento", fontsize=16)
plt.xlabel("t-SNE 1")
plt.ylabel("t-SNE 2")
plt.legend(title="Clase", loc="best")
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA

# === Embedding file of the test split
pkl_path = "/content/drive/MyDrive/Mendoza_Serey(2025)/embeddings/embeddings_vectors_test_mri.pkl"

# === Load the embeddings table
df_test = pd.read_pickle(pkl_path)

# === Embedding levels to plot and the shared labels
dimensiones = ['backbone', '512', '256', '128', '64']
label = df_test['label'].tolist()
# The labels do not change between embedding levels, so build the array once.
y = np.array(label)

# === One 2-D PCA scatter per embedding level
for dim in dimensiones:
    X = np.vstack(df_test[dim].values)

    # Seeded so repeated runs give the same projection.
    pca = PCA(n_components=2, random_state=42)
    X_pca = pca.fit_transform(X)

    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=y, palette='tab10', s=60, alpha=0.8)
    # The accented character in this title was stored as mis-decoded bytes.
    plt.title(f"PCA - Dimensión {dim}", fontsize=14)
    plt.xlabel("PCA 1")
    plt.ylabel("PCA 2")
    plt.legend(title="Clase", loc="best")
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [None]:
import os
import pickle
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import numpy as np

# === Location of the stored embeddings ===
save_dir = "/content/drive/MyDrive/Mendoza_Serey(2025)/embeddings"
file_path = os.path.join(save_dir, "embeddings_vectors_train_mri.pkl")

# === Load the embeddings table ===
df = pd.read_pickle(file_path)

# === Columns that hold embedding vectors ===
embedding_keys = ["backbone", "512", "256", "128", "64"]

# Integer colour codes work whether 'label' holds numbers or class names;
# sort=True keeps the code order equal to the sorted label order.
y = df["label"].values
label_codes, label_names = pd.factorize(y, sort=True)

# === 2 x 3 grid: one panel per embedding level ===
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

for i, key in enumerate(embedding_keys):
    X = np.vstack(df[key].values)   # (n_samples, n_features) matrix from the per-row vectors

    # Seeded like the per-dimension PCA cell so the projection is repeatable.
    pca = PCA(n_components=2, random_state=42)
    X_pca = pca.fit_transform(X)

    sc = axes[i].scatter(X_pca[:, 0], X_pca[:, 1], c=label_codes, cmap="tab10", s=10, alpha=0.7)
    axes[i].set_title(f"PCA - {key} ({X.shape[1]}D)", fontsize=12)
    axes[i].set_xlabel("PC1")
    axes[i].set_ylabel("PC2")

# The grid has more cells than embedding levels; hide the unused ones instead
# of leaving empty frames in the figure.
for unused_ax in axes[len(embedding_keys):]:
    unused_ax.set_visible(False)

# Shared legend: one handle per distinct colour code, labelled with the class value.
handles, _ = sc.legend_elements(num=None)
fig.legend(handles, [str(name) for name in label_names], title="Clases", bbox_to_anchor=(1.02, 0.9))

plt.tight_layout()
plt.show()


In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# === Location of the stored embeddings ===
save_dir = "/content/drive/MyDrive/Mendoza_Serey(2025)/embeddings"
file_path = os.path.join(save_dir, "embeddings_vectors_train_mri.pkl")

# === Load the embeddings table ===
df = pd.read_pickle(file_path)

# === Columns that hold embedding vectors ===
embedding_keys = ["backbone", "512", "256", "128", "64"]

# Integer colour codes work whether 'label' holds numbers or class names;
# sort=True keeps the code order equal to the sorted label order.
y = df["label"].values
label_codes, label_names = pd.factorize(y, sort=True)
legend_labels = [str(name) for name in label_names]

for key in embedding_keys:
    X = np.vstack(df[key].values)   # (n_samples, n_features) matrix from the per-row vectors

    # Seeded like the per-dimension PCA cell so the projection is repeatable.
    pca = PCA(n_components=2, random_state=42)
    X_pca = pca.fit_transform(X)

    # One figure per embedding level.
    plt.figure(figsize=(8, 6))
    sc = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=label_codes, cmap="tab10", s=10, alpha=0.7)

    plt.title(f"PCA - {key} ({X.shape[1]}D)", fontsize=14)
    plt.xlabel("PC1")
    plt.ylabel("PC2")

    # Legend to the right of the axes: one handle per distinct colour code.
    handles, _ = sc.legend_elements(num=None)
    plt.legend(handles, legend_labels, title="Clases", bbox_to_anchor=(1.05, 1), loc="upper left")

    plt.tight_layout()
    plt.show()
