In [1]:
"""
BIOINFORMATICS: LAB08
@author: Irene Benedetto
"""
from utils import *
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import matplotlib
import warnings
matplotlib.use('Agg')
import matplotlib.pyplot as plt
warnings.simplefilter('ignore')

In [2]:
transcriptome_df, genome_df, proteome_df, labels_df = create_dataframe()

transcriptome_df = transcriptome_df.astype(np.float32)
genome_df = genome_df.astype(np.float32)
proteome_df = proteome_df.astype(np.float32)

plt.figure(figsize=(10, 5))
labels_df.groupby(['cluster.id']).size().plot(kind='bar', title='Class distribution in the dataset')
plt.savefig('class_distribution.png')

Length of the transcriptome dataframe: (500, 131)
Length of the genome dataframe: (500, 367)
Length of the proteome dataframe: (500, 160)


# Early integration approach

## Loading data

In [3]:
df = pd.concat([transcriptome_df, genome_df, proteome_df], axis=1)

print(f'Length of the complete dataframe: {df.shape}')
print(f'Length of the labels: {labels_df["cluster.id"].shape}')

X_train, X_test, y_train, y_test = train_test_split(df.values, labels_df["cluster.id"].values, stratify=labels_df["cluster.id"].values)

Length of the complete dataframe: (500, 658)
Length of the labels: (500,)


## Dimensionality reduction

In [4]:
explained_vars = {}

plt.figure(figsize=(15, 7))

MAX_FTS = 375
for i in range(1, MAX_FTS):
    pca = PCA(n_components=i)
    pca.fit(X_train)
    explained_vars[i] = pca.explained_variance_ratio_[-1]

plt.plot(list(explained_vars.keys()), list(explained_vars.values()))
plt.plot(list(explained_vars.keys()), np.cumsum(list(explained_vars.values())))
plt.xlabel('Principal component')
plt.ylabel('Explained variance')
plt.title('Proportion of explained variance (ratio and cumulative) over the components')
plt.grid()
plt.savefig('explained_variance.png')

In [5]:
# the number of features is setted to 20
N_FTS = 50
selector = PCA(n_components=N_FTS)
selector.fit(X_train)
reduced_X_train = selector.transform(X_train)
reduced_X_test = selector.transform(X_test)
print(f'Proportion of variance explained with {N_FTS}: {np.sum(selector.explained_variance_ratio_)}')

Proportion of variance explained with 50: 0.8753693103790283


## Algorithms and hyperparameter tuning

In [6]:
# search for the best classifier and its best configuration of hyperparameters
_ = hyperparameter_tuning(reduced_X_train, y_train, reduced_X_test, y_test)

MLPClassifier
Best configuration: {'hidden_layer_sizes': (50,), 'learning_rate_init': 0.001, 'solver': 'adam'}
Accuracy on validation set (5 folds): 1.0

KNeighborsClassifier
Best configuration: {'n_neighbors': 1}
Accuracy on validation set (5 folds): 1.0

RandomForestClassifier
Best configuration: {'criterion': 'gini', 'n_estimators': 20, 'oob_score': True}
Accuracy on validation set (5 folds): 1.0

LogisticRegression
Best configuration: {'C': 0.001}
Accuracy on validation set (5 folds): 1.0

Best classifier: MLPClassifier
Accuracy on the test set: 1.0


# Late integration approach

## Algorithms and hyperparameter tuning

In [7]:
# LATE INTEGRATION APPROACH
print('\n\nLATE INTEGRATION APPROACH')

# for each dataset transcriptome_df, genome_df, proteome_df we need to find the optimal

# for the transcriptome dataframe
print('\nTranscriptome dataset')
X_train, X_test, y_train, y_test = train_test_split(transcriptome_df, labels_df["cluster.id"].values, shuffle=False)
# implementing 4 different classifiers
best_clf = hyperparameter_tuning(X_train, y_train, X_test, y_test, verbose=False)
transcriptome_probabilities = best_clf.predict_proba(X_test)
transcriptome_probabilities = np.array(transcriptome_probabilities)

# for the genome dataframe
print('\nGenome dataset')
X_train, X_test, y_train, y_test = train_test_split(genome_df, labels_df["cluster.id"].values, shuffle=False)
# implementing 4 different classifiers
best_clf = hyperparameter_tuning(X_train, y_train, X_test, y_test, verbose=False)
genome_probabilities = best_clf.predict_proba(X_test)
genome_probabilities = np.array(genome_probabilities)

# for the proteome dataframe
print('\nProteome dataset')
X_train, X_test, y_train, y_test = train_test_split(proteome_df, labels_df["cluster.id"].values, shuffle=False)
# implementing 4 different classifiers
best_clf = hyperparameter_tuning(X_train, y_train, X_test, y_test, verbose=False)
proteome_probabilities = best_clf.predict_proba(X_test)
proteome_probabilities = np.array(proteome_probabilities)



LATE INTEGRATION APPROACH

Transcriptome dataset
Best classifier: MLPClassifier
Accuracy on the test set: 1.0

Genome dataset
Best classifier: MLPClassifier
Accuracy on the test set: 1.0

Proteome dataset
Best classifier: MLPClassifier
Accuracy on the test set: 1.0


## Late integration consensus building

In [8]:

threshold = 0.99
y_pred = []
for sample in range(transcriptome_probabilities.shape[0]):
    # for each sample extract the probabilities according to:
    # - each features (on the colums)
    #  - each class (on the row)
    probabilities = [
        transcriptome_probabilities[sample],
        genome_probabilities[sample],
        proteome_probabilities[sample]
    ]

    probabilities = np.array(probabilities).T

    S_a = np.sum(probabilities)
    S_i = np.sum(probabilities, axis=1)
    m = 3
    S_m = S_i / m

    if (np.max(S_i) / S_a < threshold) or (np.max(S_m) < threshold):
        y = 'Unknown'
    else:
        y = str(np.argmax(S_i) + 1)
    y_pred.append(y)

y_pred = np.array(y_pred)

n_unknown = len(np.where(y_pred == 'Unknown')[0])
print(f'\nNumber of unknown: {n_unknown}')

for c in range(1, 6):
    correct = len(np.where((y_pred == f'{c}') * (y_test == c))[0])
    n = len(np.where(y_test == c)[0])
    print(f'Class {str(c)}, accuracy: {round(correct / n, 2)}')


Number of unknown: 3
Class 1, accuracy: 0.93
Class 2, accuracy: 1.0
Class 3, accuracy: 1.0
Class 4, accuracy: 0.97
Class 5, accuracy: 1.0
