### Exploring data release in search of presence of positive and negative pairs in order to estimate quantity of valid triplets to train on.

In [1]:
# Paths & URLs

import os

# Directorio base
#PATH_BASE = '/content/drive/MyDrive/proximity'
PATH_BASE = 'C:\\Users\\User\\Documents\\Proyecto Proximity'

# Data release actual
DATA_RELEASE_PATH = os.path.join(PATH_BASE, 'datalake_sorted')
DR70_PATH = os.path.join(PATH_BASE, 'DR70')
DR70_LABELS_PATH = os.path.join(DR70_PATH, 'labels.csv')


# CTs in Nibabel format
CT_NIBABEL_PATH = os.path.join(PATH_BASE, 'DR70', 'CTs')

# Embeddings visuales de CTs
CT_EMBEDDINGS_PATH = DATA_RELEASE_PATH + '/visual_embeddings'


# Etiquetas de los CTs del data release actual
#CT_LABELS_CSV_PATH = DATA_RELEASE_PATH + '/labels.csv'

# Data release (CTs + etiquetas) organizados en un DataFrame
CT_DATASET_DF_HDF_PATH = os.path.join(PATH_BASE, 'dataset_df.h5')
CT_DATASET_DF_PICKLE_PATH = os.path.join(PATH_BASE, 'dataset_df.pickle')

# URLs de modelos visuales
RESNET18_URL = 'microsoft/resnet-18'

# Path que contiene los resnet50 embeddings de CTs del data release actual
CT_RESNET18_EMBEDDINGS_PATH = os.path.join(DR70_PATH, 'visual_embeddings', 'resnet18')

# Path de modelos entrenados en base a tripletas
TRIPLET_MODELS_PATH = os.path.join(PATH_BASE, 'retrieval_models', 'triplets')

In [15]:
import pandas as pd
import numpy as np

labels_df = pd.read_csv(
        DR70_LABELS_PATH, 
        header=0, 
        index_col=0, 
        dtype={'ct': str, 'condensacion': bool, 'nodulos': bool, 'quistes': bool}
)

In [16]:
labels_df = labels_df.drop(index=[7])
print(len(labels_df))
labels_df = labels_df.reset_index()
for i, data in labels_df.iterrows():
    print('index:', i)
    a = labels_df.iloc[i]
    print(a)

69
index: 0
index                                                           0
CT              1.3.12.2.1107.5.1.4.83504.30000020010601131176...
condensacion                                                False
nodulos                                                     False
quistes                                                     False
Name: 0, dtype: object
index: 1
index                                                           1
CT              1.3.12.2.1107.5.1.4.83504.30000020011005321836...
condensacion                                                False
nodulos                                                     False
quistes                                                      True
Name: 1, dtype: object
index: 2
index                                                           2
CT              1.3.12.2.1107.5.1.4.83504.30000020011611551677...
condensacion                                                False
nodulos                                                     False


In [3]:
from scipy.spatial.distance import hamming

positive_pairs = list()
positive_candidates_label_vectors = np.array(labels_df.iloc[:,1:], dtype=int)
for j in range(len(labels_df)):
    positives = 0
    anchor_label_vector = np.array(labels_df.iloc[j].iloc[1:], dtype=int)
    for i, positive_candidate_label_vector in enumerate(positive_candidates_label_vectors):
        if i == j:
            continue
        distance = hamming(anchor_label_vector, positive_candidate_label_vector)
        if distance == 0.0:
            positives += 1
    positive_pairs.append(positives)

In [34]:
positive_candidates_label_vectors = labels_df.iloc[[6, 16, 2, 7, 0, 5, 46, 34]].iloc[:,1:]
print(positive_candidates_label_vectors)

    condensacion  nodulos  quistes
6          False    False    False
16         False    False    False
2          False    False    False
7          False    False    False
0          False    False    False
5          False    False    False
46          True     True    False
34         False    False    False


In [32]:
def get_triplets(positive_pairs_ids_list):
        b_ids = set(elem for pair in positive_pairs_ids_list for elem in pair)
        print(b_ids)

In [33]:
get_triplets([[1,2],[3,4]])

{1, 2, 3, 4}


In [57]:
labels_df['emb'] = list(range(2000, 2000+len(labels_df)))
    

In [72]:
torch.tensor(labels_df['emb'])

tensor([2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011,
        2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023,
        2024, 2025, 2026, 2027, 2028, 2029, 2030, 2031, 2032, 2033, 2034, 2035,
        2036, 2037, 2038, 2039, 2040, 2041, 2042, 2043, 2044, 2045, 2046, 2047,
        2048, 2049, 2050, 2051, 2052, 2053, 2054, 2055, 2056, 2057, 2058, 2059,
        2060, 2061, 2062, 2063, 2064, 2065, 2066, 2067, 2068, 2069])

In [1]:
import torch
import random
s = torch.randn(10,2)
t = torch.randn(10,2)

In [7]:
s

tensor([[ 1.0992, -0.3237],
        [-1.2417,  0.2403],
        [ 0.2229, -0.0543],
        [-0.1660,  0.7911],
        [-1.1034,  0.7557],
        [ 0.1606,  0.2917],
        [ 0.8792,  1.5907],
        [-1.0642,  0.4998],
        [ 1.0189,  0.8441],
        [ 1.4127,  1.2274]])

In [6]:
s[[6, 1, 2]]

tensor([[ 0.8792,  1.5907],
        [-1.2417,  0.2403],
        [ 0.2229, -0.0543]])

In [23]:
s[[1,0,3]]

tensor([[-0.1351,  1.3503],
        [ 0.9771,  0.4954],
        [ 1.0844,  1.8087]])

In [120]:
t[:,0] = 2
t

tensor([[ 2.0000, -0.7252],
        [ 2.0000,  1.1478],
        [ 2.0000,  2.1088],
        [ 2.0000,  0.2445],
        [ 2.0000,  0.4198],
        [ 2.0000,  1.8224],
        [ 2.0000,  0.9159],
        [ 2.0000, -0.6040],
        [ 2.0000,  0.8472],
        [ 2.0000,  0.8111]])

In [137]:
v = list()
v = torch.tensor([[ab, b, d] for ([ab, b], [ad, d]) in zip(s, t)])
v

tensor([[ 1.0000,  0.8258, -0.7252],
        [ 1.0000, -0.9663,  1.1478],
        [ 1.0000, -0.5372,  2.1088],
        [ 1.0000,  0.7967,  0.2445],
        [ 1.0000, -2.1733,  0.4198],
        [ 1.0000, -1.6740,  1.8224],
        [ 1.0000, -1.5232,  0.9159],
        [ 1.0000, -0.9808, -0.6040],
        [ 1.0000, -0.7235,  0.8472],
        [ 1.0000, -1.4004,  0.8111]])

In [138]:
x = torch.randn(10,3)
x

tensor([[-1.3739,  1.0549, -0.7603],
        [-0.6921,  0.1192, -0.4236],
        [-0.2932, -1.2329, -2.6938],
        [ 1.8017,  0.8833, -1.1871],
        [ 0.8129,  0.9881, -0.9980],
        [-0.3174,  0.6422,  0.1090],
        [ 2.2931, -2.2608,  0.5755],
        [ 0.2727,  0.7897, -0.1437],
        [-0.2199, -0.5634,  2.3380],
        [-0.7901,  0.3831, -0.6932]])

In [140]:
v-x-2

tensor([[ 0.3739, -2.2291, -1.9649],
        [-0.3079, -3.0856, -0.4286],
        [-0.7068, -1.3043,  2.8026],
        [-2.8017, -2.0866, -0.5684],
        [-1.8129, -5.1613, -0.5821],
        [-0.6826, -4.3162, -0.2867],
        [-3.2931, -1.2624, -1.6597],
        [-1.2727, -3.7705, -2.4603],
        [-0.7801, -2.1601, -3.4908],
        [-0.2099, -3.7835, -0.4957]])

In [11]:
w = torch.select(v, 1, 0)
torch.mean(w)

NameError: name 'v' is not defined

In [20]:
torch.mean(v, axis=-2)

NameError: name 'v' is not defined

In [24]:
a = torch.randn(5)
print(a)
print(a.repeat(10, 1))

tensor([ 1.1844, -1.2562, -1.3394, -1.8057, -0.9340])
tensor([[ 1.1844, -1.2562, -1.3394, -1.8057, -0.9340],
        [ 1.1844, -1.2562, -1.3394, -1.8057, -0.9340],
        [ 1.1844, -1.2562, -1.3394, -1.8057, -0.9340],
        [ 1.1844, -1.2562, -1.3394, -1.8057, -0.9340],
        [ 1.1844, -1.2562, -1.3394, -1.8057, -0.9340],
        [ 1.1844, -1.2562, -1.3394, -1.8057, -0.9340],
        [ 1.1844, -1.2562, -1.3394, -1.8057, -0.9340],
        [ 1.1844, -1.2562, -1.3394, -1.8057, -0.9340],
        [ 1.1844, -1.2562, -1.3394, -1.8057, -0.9340],
        [ 1.1844, -1.2562, -1.3394, -1.8057, -0.9340]])


In [32]:
b = torch.randn(5, 1, 3)
b.squeeze(1)

tensor([[ 0.1454, -1.3578, -0.9561],
        [-0.5726,  2.9780,  0.1769],
        [-1.1211,  0.5450, -1.8344],
        [-1.0660,  0.6298,  0.1756],
        [-1.5861,  1.0915, -1.8288]])

In [154]:
em = list()

In [37]:
c = [(1,1,1),(2,2,2),(3,3,3)]
for g, h, j in c:
    print(g, h, j)

1 1 1
2 2 2
3 3 3


In [155]:
em.append(torch.tensor([1]))
em.append(torch.tensor([4]))

In [4]:
import torch
g = torch.tensor([5,6,12])
f = torch.tensor([[12,5,6]])

def _map(batch, triplet_id):
    return batch.index(triplet_id)
        

f.map_(

RuntimeError: a Tensor with 3 elements cannot be converted to Scalar

In [167]:
em.item()

AttributeError: 'list' object has no attribute 'item'

In [168]:
import numpy as np

def recall(labels_gt, labels_prediction):
    """
    set of ground truth labels and list of prediction lists. 
    This is an independent of k way to calculate the recall.
    """
    n = len(labels_prediction)
    recall = 0
    for i in range(n):
        recall += len(set(labels_gt) & set(labels_prediction[i]))/len(labels_gt)
    return recall/n


In [170]:
a = [2,3,5]
b = [[4,5], [1], [2,3], [2,3,5]]

recall(a, b)

0.5

In [173]:
set(a) & set(b[0])

{5}

In [1]:
import torch

a = torch.randn(7, 3)

In [2]:
a

tensor([[ 0.7367,  0.3216,  0.2477],
        [-0.1080, -0.3605, -1.2283],
        [-0.8314, -0.8390, -2.4077],
        [-0.5620, -1.7346, -2.3677],
        [-0.9147, -2.2411,  1.1041],
        [ 1.0680,  0.7939,  0.3732],
        [ 1.1735,  0.7841,  0.3128]])

In [10]:
b = a.select(1,0)
c = a.select(1,1)
d = a.select(1,2)

In [11]:
for i, j, k in zip(b, c, d):
    print(i)

tensor(0.7367)
tensor(-0.1080)
tensor(-0.8314)
tensor(-0.5620)
tensor(-0.9147)
tensor(1.0680)
tensor(1.1735)


In [13]:
a.transpose(0, 1)

tensor([[ 0.7367, -0.1080, -0.8314, -0.5620, -0.9147,  1.0680,  1.1735],
        [ 0.3216, -0.3605, -0.8390, -1.7346, -2.2411,  0.7939,  0.7841],
        [ 0.2477, -1.2283, -2.4077, -2.3677,  1.1041,  0.3732,  0.3128]])

In [15]:
b = torch.randn(3, 7)
b

tensor([[ 0.4534,  0.7670, -0.0280, -2.9217, -0.8123,  0.3854,  1.1842],
        [ 0.4907,  0.2238,  0.4161, -0.5768, -0.8418,  0.1310,  0.6501],
        [-0.1863, -0.4779,  0.7546,  0.7713,  0.6960, -0.8343, -1.5945]])

In [16]:
b.transpose(0, 1)

tensor([[ 0.4534,  0.4907, -0.1863],
        [ 0.7670,  0.2238, -0.4779],
        [-0.0280,  0.4161,  0.7546],
        [-2.9217, -0.5768,  0.7713],
        [-0.8123, -0.8418,  0.6960],
        [ 0.3854,  0.1310, -0.8343],
        [ 1.1842,  0.6501, -1.5945]])

In [17]:
torch.tensor(0)

tensor(0)

In [23]:
a = torch.randn(7)
a

tensor([ 0.7870, -1.2921,  0.6441,  1.6434, -0.7938,  1.3081, -1.1897])

In [24]:
torch.func.vmap(torch.max)(a)

tensor([ 0.7870, -1.2921,  0.6441,  1.6434, -0.7938,  1.3081, -1.1897])

In [42]:
def _max(a, b):
    if a > b:
        return a
    return b

In [48]:
a.apply(torch.zeros(7), _max)

AttributeError: 'Tensor' object has no attribute 'apply'

In [46]:
a - 0.2

tensor([ 0.5870, -0.2000,  0.4441,  1.4434, -0.2000,  1.1081, -0.2000])

In [53]:
torch.mean(a)

tensor(0.6261)

In [56]:
'a'[-1]

'a'

In [9]:
import torch
from torch.utils.data import BatchSampler, SequentialSampler

seq = SequentialSampler(range(5, 14))

print(list(seq))
print(seq[0])

print(list(BatchSampler(SequentialSampler(range(5, 14)), batch_size=3, drop_last=False)))

[0, 1, 2, 3, 4, 5, 6, 7, 8]


TypeError: 'SequentialSampler' object is not subscriptable