In [1]:
# Load IPython's autoreload extension in mode 2, so imported project modules
# are re-imported automatically before each cell executes.
%load_ext autoreload
%autoreload 2

In [11]:
# Paths and environment configuration

import os
import sys
from pathlib import Path

# Make the project package importable. The membership check keeps the cell
# idempotent: re-running it no longer appends duplicate entries to sys.path.
project_root = (Path.cwd() / 'chest_ct_retrieval').resolve()
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# CUDA debugging switches. They are set before torch is imported anywhere in
# this notebook.
# NOTE(review): PYTORCH_NO_CUDA_MEMORY_CACHING=1 presumably makes the
# PYTORCH_CUDA_ALLOC_CONF setting below irrelevant while it is active —
# confirm which of the two is actually intended.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ['TORCH_USE_CUDA_DSA'] = "1"
os.environ['PYTORCH_NO_CUDA_MEMORY_CACHING'] = "1"
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Base directories
PATH_BASE = '/dataChivo/batch_01/bigdata_02/datareleases/'
USER_HOME_PATH = '/dataChivo/jpmunoz/'
PROJECT_BASE_PATH = os.path.join(USER_HOME_PATH, 'chest_ct_retrieval')

# Data release 2156
DR2156_DATARELEASE_PATH = os.path.join(PATH_BASE, 'DR2156')
DR2156_DATARELEASE_CT_PATH = os.path.join(DR2156_DATARELEASE_PATH, 'DR2156_studies')
DR2156_DATARELEASE_LABELS_PATH = os.path.join(DR2156_DATARELEASE_PATH, 'DR2156_labels', 'DR2156_labels.csv')

# DR2156 ResNet18 embeddings / preprocessed volumes, 300x300x300 variant
DR2156_300_RESNET_EMBEDDINGS_PATH = os.path.join(PROJECT_BASE_PATH, 'data', 'DR2156', 'DR2156_300_resnet18_embeddings')
DR2156_300_PREPROCESSED_PATH = os.path.join(PROJECT_BASE_PATH, 'data', 'DR2156', 'DR2156_300_preprocessed')

# DR2156 ResNet18 embeddings / preprocessed volumes, 512x512x300 variant
DR2156_512_RESNET_EMBEDDINGS_PATH = os.path.join(PROJECT_BASE_PATH, 'data', 'DR2156', 'DR2156_512_resnet18_embeddings')
DR2156_512_PREPROCESSED_PATH = os.path.join(PROJECT_BASE_PATH, 'data', 'DR2156', 'DR2156_512_preprocessed')

# Output location for triplet-based models trained on DR2156
DR2156_TRIPLET_RUNS_PATH = os.path.join(PROJECT_BASE_PATH, 'runs')

# The training, checkpoint-saving and best-model cells reference the two names
# below, which previously existed only as commented-out lines and therefore
# raised NameError.
# NOTE(review): placing them under the runs directory is an assumption (the old
# commented-out lines pointed at PATH_BASE/retrieval_models/...); adjust if the
# checkpoints and TensorBoard logs live elsewhere.
TRIPLET_CHECKPOINTS_PATH = os.path.join(DR2156_TRIPLET_RUNS_PATH, 'checkpoints')
DR2156_TRIPLET_TENSORBOARD_LOGS_DIR = os.path.join(DR2156_TRIPLET_RUNS_PATH, 'logs')


In [None]:
import numpy as np
from datasets.constants import PROXIMITY_VECTOR_LABELS

%matplotlib inline
import matplotlib.pyplot as plt

proximity_vector_labels = PROXIMITY_VECTOR_LABELS.items()
#proximity_vector_labels = [[0,0,0], [1,0,0], [0,1,0], [0,0,1], [1,0,1], [1,1,0], [0,1,1], [1,1,1]]
#proximity_vector_labels = [[1,0,0], [0,1,0], [0,0,1]]
#proximity_classes = ['(s/a)', 'c', 'n', 'q', 'c+n', 'c+q', 'n+q', 'c+n+q']
proximity_classes = ['(sin anomalías)', 'Condensación', 'Nódulos', 'Quistes', 'Cond.+Nóds.', 'Cond.+Quis.', 'Nóds.+Quis.', 'Cond.+Nóds.+Quis.']
proximity_colors = ['#000000', '#ff0000', '#00ff00', '#0000ff',
              '#ff00ff', '#ffff00', '#00ffff', '#ffffff']
#proximity_colors = ['#ff0000', '#00ff00', '#0000ff']

mnist_classes = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728',
              '#9467bd', '#8c564b', '#e377c2', '#7f7f7f',
              '#bcbd22', '#17becf']

def plot_embeddings(embeddings, targets, xlim=None, ylim=None, zlim=None):
    """Scatter-plot the first two embedding dimensions, one colour per class id.

    Parameters
    ----------
    embeddings : array indexed as ``embeddings[:, 0]`` / ``embeddings[:, 1]``
        Only columns 0 and 1 are drawn.
    targets : sequence of class ids, one per embedding row
        Each id is used as an index into the module-level ``proximity_colors``
        and ``proximity_classes`` lists.
    xlim, ylim : (min, max) tuple or None
        Explicit axis limits. When None, the data range widened by 10 % around
        its centre is used. (Previously these arguments were silently
        overwritten.)
    zlim : ignored
        Accepted only for backward compatibility; the plot is 2-D.
    """
    def _padded_limits(values, scale=1.1):
        # Data range widened symmetrically around its centre.
        low, high = values.min(), values.max()
        radius = (high - low) / 2
        center = low + radius
        return (center - scale * radius, center + scale * radius)

    targets = np.asarray(targets)

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.set_facecolor('#D1D1D1')

    if xlim is None:
        xlim = _padded_limits(embeddings[:, 0])
    if ylim is None:
        ylim = _padded_limits(embeddings[:, 1])

    # np.unique yields the class ids present, in ascending order. Each scatter
    # carries its own label so the legend stays correct even when some classes
    # are absent from `targets`.
    for class_id in np.unique(targets):
        class_id = int(class_id)
        rows = np.where(targets == class_id)[0]
        ax.scatter(
            embeddings[rows, 0],
            embeddings[rows, 1],
            alpha=0.6,
            edgecolors=proximity_colors[class_id],
            color=proximity_colors[class_id],
            label=proximity_classes[class_id],
        )

    ax.set_xlim(xlim[0], xlim[1])
    ax.set_ylim(ylim[0], ylim[1])
    ax.legend()
    plt.show()

# Experimento 3: Entrenamiento por tripletas sobre dataset de volúmenes de tres clases

Objetivo: detección de esferas, cubos y tetraedros en el volumen de entrada. Las clases no se excluyen entre sí, es decir, el dataset es multietiqueta.

In [None]:
# torch is imported in this cell because torch.cuda.is_available() is called
# below; without it the cell raises NameError on a fresh kernel.
import torch

from config.config import load_config
from utils.seed import set_seed
from training.data_setup import load_dataset, create_loaders
from training.model_setup import initialize_model

# NOTE(review): the config path is relative to the kernel's working directory —
# confirm it resolves when the notebook is started from the project root.
cfg = load_config("../config/base.yaml")
set_seed(cfg["seed"])
cuda = torch.cuda.is_available()

train_set, test_set, neg_compatibles = load_dataset(cfg["volume_dir"], cfg["seed"], cfg["train_fraction"])
loaders = create_loaders(train_set, test_set, cfg["n_classes"], cfg["n_samples"], cuda)

model, loss_fn, optimizer, scheduler = initialize_model(
    embedding_size=cfg["embedding_size"],
    margin=cfg["margin"],
    lr=cfg["learning_rate"],
    weight_decay=cfg["weight_decay"],
    negative_compatibles_dict=neg_compatibles,
    print_interval=10,
    cuda=cuda
)


In [None]:
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
import torch
from torchvision.transforms import v2
from torch.optim import lr_scheduler
import torch.optim as optim
from torch.utils.data import default_collate
from utils.compatbility import determine_negative_compatibles
from utils.seed import set_seed
from datasets.constants import PROXIMITY_VECTOR_LABELS_FOR_TRAINING, PROXIMITY_CLASS_NAMES, PROXIMITY_CLASS_NAMES
# Set up the network and training parameters
from losses.losses import OnlineTripletLoss
from utils.selectors.triplet_selector import HardestNegativeTripletSelector, SemihardNegativeTripletSelector
from eval.metrics import AverageNonzeroTripletsMetric, TotalNonzeroTripletsMetric, Loss, NDCG, Recall, AllMetrics
from datasets.ct_volume_dataset import ProximityCTEmbeddingTripletDataset
from datasets.loaders import *
from datasets.samplers import *
from models.networks import Proximity300x300

# GPU availability; used further down for model placement and DataLoader options.
cuda = torch.cuda.is_available()

# Fixed seed, passed to set_seed() here and to the train/test split later.
seed = 0
set_seed(seed)

# Every *.npz file in the 300-variant preprocessed directory, in sorted path order.
preprocessed_dir = Path(DR2156_300_PREPROCESSED_PATH)
volumes_path_list = sorted(preprocessed_dir.glob('*.npz'))

# Map a label vector to its class id
def get_class_id(label_vector):
    """Return the key of PROXIMITY_VECTOR_LABELS_FOR_TRAINING whose value equals ``label_vector``.

    The comparison is done with ``np.array_equal`` after converting both sides
    to arrays, so a list, tuple or ndarray holding the same values all match,
    and vectors of a different length simply do not match (instead of depending
    on how ``==`` behaves between mismatched containers).

    Parameters
    ----------
    label_vector : sequence of numbers (or a scalar)

    Returns
    -------
    The first matching dictionary key, or None when no entry matches.
    """
    candidate = np.asarray(label_vector)
    for class_id, reference in PROXIMITY_VECTOR_LABELS_FOR_TRAINING.items():
        if np.array_equal(candidate, np.asarray(reference)):
            return class_id
    return None

# Parse each volume file name. The name (without extension) is split on '_' and
# read as: field 0 = numeric file id, field 1 = volume id (not used here),
# fields 2..5 = integer flags (sin_anomalias, condensacion, nodulos, quistes).
parsed_volumes = []
for p in volumes_path_list:
    _, fname = os.path.split(p)
    info = ''.join(fname.split('.')[0:-1]).split('_')
    if len(info) < 6:
        raise ValueError(f'Unexpected volume file name (need at least 6 "_"-separated fields): {fname}')
    fid = int(info[0])
    label_vector = [int(flag) for flag in info[2:6]]
    parsed_volumes.append((fid, p, label_vector))

# Order samples by file id. For ids that are exactly 1..N this yields the same
# order as writing each entry to slot `fid - 1`, but it also works when ids are
# missing, repeated or not 1-based, where the slot approach left empty entries,
# wrapped around with index -1, or raised IndexError.
parsed_volumes.sort(key=lambda item: item[0])
samples_path_list = [path for _, path, _ in parsed_volumes]
labels_list = [vector for _, _, vector in parsed_volumes]

# Fail early, naming the offending files, when a label vector has no class id;
# otherwise None would reach torch.LongTensor and fail with an unclear error.
class_ids = [get_class_id(l) for l in labels_list]
unmapped = [str(path) for path, class_id in zip(samples_path_list, class_ids) if class_id is None]
if unmapped:
    raise ValueError(f'No class id found for the label vector of: {unmapped}')
labels_as_classes = torch.LongTensor(class_ids)

# Compatibility table handed to the triplet loss later on.
negative_compatibles_dict = determine_negative_compatibles(PROXIMITY_VECTOR_LABELS_FOR_TRAINING)

# Stratified train/test split over (path, class id) pairs.
train_frac = 0.8
x_train, x_test, y_train, y_test = train_test_split(
    samples_path_list,
    labels_as_classes,
    train_size=train_frac,
    stratify=labels_as_classes,
    random_state=seed,
)

triplet_training_set = ProximityCTEmbeddingTripletDataset(x_train, y_train, preprocessed=True)
triplet_validation_set = ProximityCTEmbeddingTripletDataset(x_test, y_test, preprocessed=True)

# Per-class sample counts and total size for each split.
split_summaries = (
    ('Training', triplet_training_set),
    ('Validation', triplet_validation_set),
)
for position, (split_name, split_set) in enumerate(split_summaries):
    if position > 0:
        print()
    print(f'{split_name} set: count for each label')
    class_counts = np.bincount(sorted(split_set.labels_list))
    for label, count in enumerate(class_counts):
        print(f'label {label}: {count}')
    print(f'{split_name} set size:', len(split_set))

def collate(batch):
    """Collate a batch with ``default_collate`` after discarding entries that are None."""
    kept = [sample for sample in batch if sample is not None]
    return default_collate(kept)

# Balanced batch sampling: batch size is n_classes * n_samples, i.e. each batch
# is meant to hold n_samples samples for each of n_classes different classes.
sampler_n_classes = 2
sampler_n_samples = 4
sampler_batch_size = sampler_n_classes * sampler_n_samples
# NOTE(review): ProximityCTEmbeddingTripletDataset is the class used above to
# build the datasets from (paths, labels, preprocessed=True). Here it is called
# with sampler-style arguments and its result is passed as `batch_sampler`
# below. This looks like the wrong class — presumably a batch sampler exported
# by datasets.samplers was intended; confirm against that module.
train_batch_sampler = ProximityCTEmbeddingTripletDataset(triplet_training_set.labels_list, PROXIMITY_VECTOR_LABELS_FOR_TRAINING, n_classes=sampler_n_classes, n_samples=sampler_n_samples, multilabel=True)
test_batch_sampler = ProximityCTEmbeddingTripletDataset(triplet_validation_set.labels_list, PROXIMITY_VECTOR_LABELS_FOR_TRAINING, n_classes=sampler_n_classes, n_samples=sampler_n_samples, multilabel=True)

# Set up data loaders
# DataLoader options are only supplied when CUDA is available; otherwise the
# DataLoader defaults apply.
kwargs = {'num_workers': 0, 'pin_memory': True} if cuda else {}
# Sequential, unshuffled loaders over each split.
train_eval_loader = torch.utils.data.DataLoader(triplet_training_set, batch_size=sampler_batch_size, shuffle=False, **kwargs)
test_eval_loader = torch.utils.data.DataLoader(triplet_validation_set, batch_size=sampler_batch_size, shuffle=False, **kwargs)
# Loaders whose batches are chosen by the batch samplers defined above.
triplet_train_loader = torch.utils.data.DataLoader(triplet_training_set, batch_sampler=train_batch_sampler, **kwargs)
triplet_test_loader = torch.utils.data.DataLoader(triplet_validation_set, batch_sampler=test_batch_sampler, **kwargs)
# NOTE(review): TripletDataLoader is not imported by name in this notebook —
# presumably it comes from the `datasets.loaders` star import; confirm.
all_triplet_train_loader = TripletDataLoader(triplet_training_set, n_classes=sampler_n_classes, n_samples=sampler_n_samples, **kwargs)
all_triplet_test_loader = TripletDataLoader(triplet_validation_set, n_classes=sampler_n_classes, n_samples=sampler_n_samples, **kwargs)


# Hyperparameters
margin = 0.2
embedding_size = 128
lr = 1e-5
n_epochs = 50
log_interval = 10

# Embedding network; moved to the GPU when one is available.
embedding_net = Proximity300x300(embedding_size=embedding_size)
model = embedding_net
if cuda:
    model.cuda()

# Online triplet loss with semi-hard negative selection, using the same margin for both.
triplet_selector = SemihardNegativeTripletSelector(margin)
loss_fn = OnlineTripletLoss(margin, triplet_selector, negative_compatibles_dict, print_interval=10)

# Adam with weight decay; the scheduler multiplies the learning rate by 0.1 every 50 scheduler steps.
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)
scheduler = lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.1, last_epoch=-1)


In [None]:
# Materialise every batch produced by one pass over the training batch sampler.
batches = []
for batch_indices in train_batch_sampler:
    batches.append(batch_indices)

In [None]:
# Number of batches collected from the training batch sampler in the previous cell.
len(batches)

In [None]:
# Load the TensorBoard notebook extension.
# NOTE(review): %reload_ext directly after %load_ext looks redundant —
# presumably kept so that re-running this cell refreshes the extension; confirm.
%load_ext tensorboard
%reload_ext tensorboard

In [None]:
from datetime import datetime
from trainer import Trainer

# Run identifier: the hyperparameters of this run plus a wall-clock timestamp.
timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
log_subdir = ' '.join([
    "slice_size=100x100",
    f"train_frac={train_frac}",
    f"sampler={sampler_n_classes}x{sampler_n_samples}",
    f"margin={margin}",
    f"embedding_size={embedding_size}",
    f"lr={lr}",
    "distance=euclidean_squared",
    "embedding_l2_norm=1",
    "negative_mining=semihard",
    "batch_sampling=oversample_underrepresented_classes",
    f"timestamp={timestamp}",
])

start_epoch = 0
train_full_loader_switch = False
metrics = [
    AverageNonzeroTripletsMetric(),
    TotalNonzeroTripletsMetric(),
    Loss(),
    AllMetrics(PROXIMITY_VECTOR_LABELS_FOR_TRAINING, PROXIMITY_CLASS_NAMES),
]

# TRIPLET_CHECKPOINTS_PATH and DR2156_TRIPLET_TENSORBOARD_LOGS_DIR must be
# defined by the path-configuration cell before this cell runs.
tensorboard_log_dir = os.path.join(DR2156_TRIPLET_TENSORBOARD_LOGS_DIR, log_subdir)

# Trainer takes its arguments positionally; the order below must not change.
trainer = Trainer(
    # data loaders
    triplet_train_loader,
    triplet_test_loader,
    train_eval_loader,
    test_eval_loader,
    all_triplet_train_loader,
    all_triplet_test_loader,
    # model and optimisation objects
    model,
    loss_fn,
    optimizer,
    scheduler,
    n_epochs,
    cuda,
    log_interval,
    # output locations
    TRIPLET_CHECKPOINTS_PATH,
    tensorboard_log_dir,
    # remaining options
    train_full_loader_switch,
    metrics,
    start_epoch,
)

trainer.fit()

In [None]:
# Save the current model weights under a fixed, hand-written checkpoint name.
checkpoint_name = 'triplets_epoch=32_val-loss=0.3881_avg-nonzero-triplets=348588.0.pth'
checkpoint_path = os.path.join(TRIPLET_CHECKPOINTS_PATH, checkpoint_name)
torch.save(model.state_dict(), checkpoint_path)

In [None]:
from sklearn.manifold import TSNE
from utils import extract_embeddings

# t-SNE settings shared by both splits; only the perplexity differs.
tsne_options = dict(n_components=2, learning_rate="auto", init="random")

# Training split: embed with the current model, project to 2-D, plot.
train_embeddings_tl, train_labels_tl = extract_embeddings(train_eval_loader, model)
train_tsne = TSNE(perplexity=10, **tsne_options)
train_features = train_embeddings_tl.cpu().numpy()
train_embeddings_tsne = train_tsne.fit_transform(train_features)
plot_embeddings(train_embeddings_tsne, train_labels_tl.numpy())

# Validation split: same procedure with a smaller perplexity.
test_embeddings_tl, test_labels_tl = extract_embeddings(test_eval_loader, model)
val_tsne = TSNE(perplexity=5, **tsne_options)
test_features = test_embeddings_tl.cpu().numpy()
test_embeddings_tsne = val_tsne.fit_transform(test_features)
plot_embeddings(test_embeddings_tsne, test_labels_tl.numpy())


In [None]:
from sklearn.decomposition import PCA
from utils import extract_embeddings

# This cell does not recompute embeddings: train_embeddings_tl / test_embeddings_tl
# and their labels must already be in the kernel from an earlier extraction cell.

# Training split: 2-component PCA fitted on the training embeddings.
train_pca = PCA(n_components=2)
train_features = train_embeddings_tl.cpu().numpy()
train_embeddings_pca = train_pca.fit_transform(train_features)
plot_embeddings(train_embeddings_pca, train_labels_tl.numpy())

# Validation split: a separate 2-component PCA fitted on the validation embeddings.
val_pca = PCA(n_components=2)
test_features = test_embeddings_tl.cpu().numpy()
test_embeddings_pca = val_pca.fit_transform(test_features)
plot_embeddings(test_embeddings_pca, test_labels_tl.numpy())

In [None]:
from utils import extract_embeddings
from sklearn.manifold import TSNE

# Path of a previously saved checkpoint inside the checkpoints directory.
BEST_MODEL_PATH = os.path.join(TRIPLET_CHECKPOINTS_PATH, 'triplets_epoch=200_val-loss=38.9467_avg-nonzero-triplets=162.0.pth')

# NOTE(review): SynthDataEmbeddingModel is not imported or defined anywhere in
# this notebook, so this line raises NameError on a fresh kernel. The model
# trained above is Proximity300x300 with embedding_size=128, whereas this one
# is built with embedding_size=8 — presumably this cell is left over from a
# different experiment; confirm which model class and checkpoint belong here.
best_model = SynthDataEmbeddingModel(embedding_size=8).cuda()
best_model.load_state_dict(torch.load(BEST_MODEL_PATH))

# Training split: embed with the loaded model, project to 2-D with t-SNE, plot.
# These assignments overwrite the train_* / test_* variables set by the
# earlier t-SNE cell.
train_embeddings_tl, train_labels_tl = extract_embeddings(train_eval_loader, best_model)
train_tsne = TSNE(n_components=2, perplexity=10, learning_rate="auto", init="random")
train_embeddings_tsne = train_tsne.fit_transform(train_embeddings_tl.cpu().numpy())
plot_embeddings(train_embeddings_tsne, train_labels_tl.numpy())

# Validation split: same procedure with perplexity 5.
test_embeddings_tl, test_labels_tl = extract_embeddings(test_eval_loader, best_model)
val_tsne = TSNE(n_components=2, perplexity=5, learning_rate="auto", init="random")
test_embeddings_tsne = val_tsne.fit_transform(test_embeddings_tl.cpu().numpy())
plot_embeddings(test_embeddings_tsne, test_labels_tl.numpy())


In [None]:
from sklearn.decomposition import PCA
from utils import extract_embeddings

# Uses whatever train_embeddings_tl / test_embeddings_tl (and labels) the most
# recently executed extraction cell left in the kernel.

# Training split
train_pca = PCA(n_components=2)
train_matrix = train_embeddings_tl.cpu().numpy()
train_embeddings_pca = train_pca.fit_transform(train_matrix)
plot_embeddings(train_embeddings_pca, train_labels_tl.numpy())

# Validation split, with its own independently fitted PCA
val_pca = PCA(n_components=2)
test_matrix = test_embeddings_tl.cpu().numpy()
test_embeddings_pca = val_pca.fit_transform(test_matrix)
plot_embeddings(test_embeddings_pca, test_labels_tl.numpy())