In [1]:
"""
BIOINFORMATICS: LAB08
@author: Irene Benedetto
"""
from utils import *
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, silhouette_score
import matplotlib
import warnings
matplotlib.use('Agg')
import matplotlib.pyplot as plt
warnings.simplefilter('ignore')

In [2]:
transcriptome_df, genome_df, proteome_df, labels_df = create_dataframe()

transcriptome_df = transcriptome_df.astype(np.float32)
genome_df = genome_df.astype(np.float32)
proteome_df = proteome_df.astype(np.float32)

y_true = labels_df["cluster.id"].values

Length of the transcriptome dataframe: (500, 131)
Length of the genome dataframe: (500, 367)
Length of the proteome dataframe: (500, 160)


# Early integration approach

In [3]:
# EARLY INTEGRATION APPROACH
df = pd.concat([transcriptome_df, genome_df, proteome_df], axis=1)

## Dimensionality reduction

In [4]:
N_FTS = 50
selector = PCA(n_components=N_FTS)
selector.fit(df)
reduced_X_train = selector.transform(df)
print(f'Proportion of variance explained with {N_FTS}: {np.sum(selector.explained_variance_ratio_)}')

Proportion of variance explained with 50: 0.8676226735115051


## Algorithms and hyperparameter tuning

In [5]:
print('Clustering with kmean')
kmean = KMeans(n_clusters=5)
kmean.fit(reduced_X_train)
cetroids = kmean.cluster_centers_
y_pred = kmean.predict(reduced_X_train)
accuracy, best_combination = clustering_accuracy(y_true, y_pred)
silhouette = silhouette_score(df, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Silhouette: {silhouette}')
print(best_combination)

Clustering with kmean
Accuracy: 0.8
Silhouette: 0.3856543302536011
{1: 2, 2: 5, 3: 3, 4: 4, 5: 1}


# Late integration approach

## Algorithms and hyperparameter tuning

In [6]:
# for each dataset transcriptome_df, genome_df, proteome_df we need to find the optimal
# for the transcriptome dataframe
print('\nTranscriptome dataset')
# implementing 4 different classifiers
kmean = KMeans(n_clusters=5)
kmean.fit(transcriptome_df)
centroids = kmean.cluster_centers_
y_pred = kmean.predict(transcriptome_df)
accuracy, best_combination_transcriptome = clustering_accuracy(y_true, y_pred)
silhouette = silhouette_score(transcriptome_df, y_pred)

transcriptome_probabilities = soft_clustering_weights(transcriptome_df.values, centroids, best_combination_transcriptome)

print(f'Accuracy: {accuracy}')
print(f'Silhouette: {silhouette}')

# for the genome dataframe
print('Genome dataset')
# implementing 4 different classifiers
kmean = KMeans(n_clusters=5)
kmean.fit(genome_df)
centroids = kmean.cluster_centers_
y_pred = kmean.predict(genome_df)
accuracy, best_combination_genome = clustering_accuracy(y_true, y_pred)
silhouette = silhouette_score(genome_df, y_pred)

genome_probabilities = soft_clustering_weights(genome_df.values, centroids, best_combination_genome)
print(f'Accuracy: {accuracy}')
print(f'Silhouette: {silhouette}')

# for the proteome dataframe
print('Proteome dataset')
kmean = KMeans(n_clusters=5)
kmean.fit(proteome_df)
centroids = kmean.cluster_centers_
y_pred = kmean.predict(proteome_df)
accuracy, best_combination_proteome = clustering_accuracy(y_true, y_pred)
silhouette = silhouette_score(genome_df, y_pred)

proteome_probabilities = soft_clustering_weights(proteome_df.values, centroids, best_combination_proteome)
print(f'Accuracy: {accuracy}')
print(f'Silhouette: {silhouette}')



Transcriptome dataset
Accuracy: 0.8
Silhouette: 0.394550085067749
Genome dataset
Accuracy: 0.85
Silhouette: 0.32901129126548767
Proteome dataset
Accuracy: 0.8
Silhouette: 0.32901129126548767


## Late integration consensus building

In [7]:
threshold = 0.001
y_pred = []
for sample in range(transcriptome_probabilities.shape[0]):
    # for each sample extract the probabilities according to:
    # - each features (on the colums)
    #  - each class (on the row)
    probabilities = [
        # (500, 5)
        transcriptome_probabilities[sample, :],
        genome_probabilities[sample, :],
        proteome_probabilities[sample, :]
    ]

    probabilities = np.array(probabilities).T

    S_a = np.sum(probabilities)
    S_i = np.sum(probabilities, axis=1)
    m = 3
    S_m = S_i / m

    if (np.max(S_i) / S_a < threshold) or (np.max(S_m) < threshold):
        # in order to avoid problems with data type 0 means "unknown"
        y = 0
    else:
        y = np.argmax(S_i) + 1
    y_pred.append(y)

y_pred = np.array(y_pred)
n_unknown = len(np.where(y_pred == 0)[0])
print(f'\nThreshold: {threshold}')
print(f'Number of unknown: {n_unknown}')

accuracy, _ = clustering_accuracy(y_true, y_pred)
print(f'Accuracy: {accuracy}')



Threshold: 0.001
Number of unknown: 0
Accuracy: 0.786
