In [2]:
"""
BIOINFORMATICS: LAB08
@author: Irene Benedetto
"""
from models import *
from utils import *
from tqdm import tqdm
import tensorflow_addons as tfa


tf.random.set_seed(3)
np.random.seed(3)

In [3]:
transcriptome_df, genome_df, proteome_df, labels_df = create_dataframe()
y_true = labels_df["cluster.id"].values

transcriptome_df = transcriptome_df.astype(np.float32)
genome_df = genome_df.astype(np.float32)
proteome_df = proteome_df.astype(np.float32)

Length of the transcriptome dataframe: (500, 131)
Length of the genome dataframe: (500, 367)
Length of the proteome dataframe: (500, 160)


# Late integration approach 

## Trainscriptome dataset

### Loading the model: MLPClustering

In [4]:
# for the transcriptome dataset
print(f'With transcriptome dataset')
ds_shape = transcriptome_df.shape
model = MLPClustering(input_shape=ds_shape, n_cetroids=5)
loss = ClusteringLoss()
optimizer = tfa.optimizers.SGDW(learning_rate=0.0001,weight_decay=0.1 )
metrics = None
model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

With transcriptome dataset
Number of samples: 500
Number of fts: 131
Number of clusters: 5


### Training

In [5]:
X = np.asarray(transcriptome_df.values).astype(np.float32)
N_EPOCHS = 30
for epoch in range(N_EPOCHS):
    history = model.train_step(X)
    if (epoch+1)%4 == 0 or epoch == 0:
        print(f'Epoch {epoch}/{N_EPOCHS}, loss: {history["loss"]}', end='\r')
        
        
print()
centroids = model(X)
distance = tf.norm(X[:, None, :] - centroids[None, :, :], axis=-1)
y_pred = tf.argmin(distance, axis=1).numpy() + 1
accuracy, best_combination_transcriptome = clustering_accuracy(y_true, y_pred)
transcriptome_probabilities = soft_clustering_weights(X, centroids, best_combination_transcriptome)
print(f'Accuracy: {accuracy}')

Epoch 27/30, loss: 5833.2412109375
Accuracy: 0.8


## Genome dataset

### Loading the model: MLPClustering

In [6]:
# for the genome dataset
print(f'\nWith genome dataset')
ds_shape = genome_df.shape
model = MLPClustering(input_shape=ds_shape, n_cetroids=5)
loss = ClusteringLoss()
optimizer = tfa.optimizers.SGDW(learning_rate=0.0001,weight_decay=0.1 )
metrics = None
model.compile(loss=loss, optimizer=optimizer, metrics=metrics)


With genome dataset
Number of samples: 500
Number of fts: 367
Number of clusters: 5


### Training

In [7]:
X = np.asarray(genome_df.values).astype(np.float32)
N_EPOCHS = 30
for epoch in range(N_EPOCHS):
    history = model.train_step(X)
    if (epoch+1)%4 == 0 or epoch == 0:
        print(f'Epoch {epoch}/{N_EPOCHS}, loss: {history["loss"]}', end='\r')

print()
centroids = model(X)
distance = tf.norm(X[:, None, :] - centroids[None, :, :], axis=-1)
y_pred = tf.argmin(distance, axis=1).numpy() + 1
accuracy, best_combination_genome = clustering_accuracy(y_true, y_pred)
genome_probabilities = soft_clustering_weights(X, centroids, best_combination_genome)
print(f'Accuracy: {accuracy}')

Epoch 27/30, loss: 9694.5146484375
Accuracy: 0.852


## Proteome dataset

### Loading the model: MLPClustering

In [8]:
# for the proteome dataset
print(f'\nWith proteome dataset')
ds_shape = proteome_df.shape
model = MLPClustering(input_shape=ds_shape, n_cetroids=5)
loss = ClusteringLoss()
optimizer = tfa.optimizers.SGDW(learning_rate=0.0001,weight_decay=0.1 )
metrics = None
model.compile(loss=loss, optimizer=optimizer, metrics=metrics)


With proteome dataset
Number of samples: 500
Number of fts: 160
Number of clusters: 5


### Training

In [9]:
X = np.asarray(proteome_df.values).astype(np.float32)
N_EPOCHS = 30
for epoch in range(N_EPOCHS):
    history = model.train_step(X)
    if (epoch + 1) % 4 == 0 or epoch == 0:
        print(f'Epoch {epoch}/{N_EPOCHS}, loss: {history["loss"]}', end='\r')

print()
centroids = model(X)
distance = tf.norm(X[:, None, :] - centroids[None, :, :], axis=-1)
y_pred = tf.argmin(distance, axis=1).numpy() + 1
accuracy, best_combination_proteome = clustering_accuracy(y_true, y_pred)
proteome_probabilities = soft_clustering_weights(X, centroids, best_combination_proteome)
print(f'Accuracy: {accuracy}')

Epoch 27/30, loss: 6390.32275390625
Accuracy: 0.6


## Late integration consensus building

In [10]:
threshold = 0.2
y_pred = []
for sample in range(transcriptome_probabilities.shape[0]):
    # for each sample extract the probabilities according to:
    # - each features (on the colums)
    #  - each class (on the row)
    probabilities = [
        
        transcriptome_probabilities[sample, :],
        genome_probabilities[sample, :],
        proteome_probabilities[sample, :]
    ]

    probabilities = np.array(probabilities).T

    S_a = np.sum(probabilities)
    S_i = np.sum(probabilities, axis=1)
    m = 3
    S_m = S_i / m

    if (np.max(S_i) / S_a < threshold) or (np.max(S_m) < threshold):
        # in order to avoid problems with data type 0 means "unknown"
        y = 0
    else:
        y = np.argmax(S_i) + 1
    y_pred.append(y)

y_pred = np.array(y_pred)
n_unknown = len(np.where(y_pred == 0)[0])
print(f'\nThreshold: {threshold}')
print(f'Number of unknown: {n_unknown}')

accuracy, _ = clustering_accuracy(y_true, y_pred)
print(f'Accuracy: {accuracy}')


Threshold: 0.2
Number of unknown: 0
Accuracy: 1.0
