## Performance per sample

In [None]:
import sys
sys.path.append('../Src/')
import numpy as np
import pandas as pd

##### Read the test-set OTU tables: original, reconstructed by the autoencoder, and predicted from environmental features

In [None]:
# Load the three test-set OTU tables. Each TSV is read with its first column as
# the index and then transposed, so the header row of the file becomes the row
# index of the resulting frame.
df_otus_original = pd.read_csv(
    '../Results/ReconstructionAndPredictionMicrobialComposition/otus_original_test.tsv',
    sep='\t', index_col=0).transpose()
df_otus_reconstructed = pd.read_csv(
    '../Results/ReconstructionAndPredictionMicrobialComposition/otus_reconstAEfromBiome.tsv',
    sep='\t', index_col=0).transpose()
df_otus_predicted = pd.read_csv(
    '../Results/ReconstructionAndPredictionMicrobialComposition/otus_predFromDomain.tsv',
    sep='\t', index_col=0).transpose()

In [None]:
# Convert each table to a float32 NumPy array (same row/column order as the
# DataFrame); these arrays are what the normalisation and metric functions
# further down operate on.
data_otus_original = df_otus_original.to_numpy(dtype=np.float32)
data_otus_reconstructed = df_otus_reconstructed.to_numpy(dtype=np.float32)
data_otus_predicted = df_otus_predicted.to_numpy(dtype=np.float32)

In [None]:
# Sanity check: show the (rows, columns) shape of each table, one per line.
print(
    df_otus_original.shape,
    df_otus_reconstructed.shape,
    df_otus_predicted.shape,
    sep='\n',
)

In [None]:
# Absolute abundance transformed to TSS (with epsilon=1E-6 by default)
def transform_to_rel_abundance(dataset, epsilon=1E-6):
    """Convert a 2-D abundance array to per-row relative abundances (TSS).

    A pseudocount ``epsilon`` is added to every entry before normalising, and
    the row totals are increased by ``num_OTUs * epsilon`` accordingly, so
    each output row sums to 1 and an all-zero row becomes a uniform row
    instead of causing a division by zero.

    Parameters
    ----------
    dataset : numpy.ndarray
        2-D array; each row is normalised independently and the last axis
        is the one being summed over.
    epsilon : float, optional
        Pseudocount added to every entry. Defaults to 1E-6, the value that
        was previously hard-coded.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``dataset`` holding relative abundances.
    """
    num_OTUs = np.shape(dataset)[-1]
    # Denominator = row total of the pseudocount-shifted data.
    sum_per_sample = dataset.sum(axis=1) + (num_OTUs * epsilon)
    # [:, None] turns the row totals into a column so the division broadcasts
    # across each row.
    return np.divide(dataset + epsilon, sum_per_sample[:, None])

In [None]:
# Normalise the original test-set table so that every row sums to 1
# (TSS with the pseudocount used by transform_to_rel_abundance).
# NOTE(review): only the original table is normalised here; the reconstructed
# and predicted tables are used as loaded -- presumably they already hold
# relative abundances; confirm.
data_otus_original_rel = transform_to_rel_abundance(data_otus_original)

#### Pearson correlation per sample

In [None]:
from scipy import stats

def compute_pearson_correlation_per_sample(original,predicted):
    """Compute the Pearson correlation between matching rows of two arrays.

    Rows of ``original`` and ``predicted`` are paired in order and
    ``scipy.stats.pearsonr`` is evaluated on each pair. The mean over all
    rows with a defined (non-NaN) correlation is printed.

    Parameters
    ----------
    original, predicted : numpy.ndarray
        2-D arrays whose rows are compared pairwise.

    Returns
    -------
    numpy.ndarray
        float32 vector with one entry per row of ``original``. Rows whose
        correlation is undefined (``pearsonr`` returned NaN) are reported as
        NaN rather than 0.0, so they cannot be mistaken for a genuine zero
        correlation; they are also excluded from the printed average.
    """
    # Start from NaN so that any row without a defined correlation stays
    # visibly "missing" in the output.
    corr_sample = np.full(original.shape[0], np.nan, dtype=np.float32)
    tot = 0.0
    count = 0
    for i, (actual, pred) in enumerate(zip(original, predicted)):
        r, _ = stats.pearsonr(actual, pred)
        corr_sample[i] = r
        if not np.isnan(r):
            count += 1
            tot += r
    # Guard against empty input / all-NaN correlations, which previously
    # raised ZeroDivisionError.
    r_final = tot / count if count else float('nan')
    print('Average Pearson: ',r_final)
    return corr_sample

In [None]:
# Pearson correlation per sample: TSS-normalised original vs. autoencoder
# reconstruction (reconstructed array is passed as loaded).
pearson_per_sample_reconst = compute_pearson_correlation_per_sample(data_otus_original_rel,data_otus_reconstructed)
# Pearson correlation per sample: TSS-normalised original vs. prediction from
# environmental features (predicted array is passed as loaded).
pearson_per_sample_predicted = compute_pearson_correlation_per_sample(data_otus_original_rel,data_otus_predicted)

#### Bray-Curtis per sample

In [None]:
from skbio.diversity import beta_diversity

def compute_braycurtis_distance_per_sample(original,predicted):
    """Compute the Bray-Curtis distance between matching rows of two arrays.

    Rows of ``original`` and ``predicted`` are paired in order; for each pair
    a two-sample distance matrix is built with scikit-bio's
    ``beta_diversity`` and its off-diagonal entry is taken as the distance.
    The mean distance over all rows is printed.

    Parameters
    ----------
    original, predicted : numpy.ndarray
        2-D arrays whose rows are compared pairwise.

    Returns
    -------
    numpy.ndarray
        float32 vector with one Bray-Curtis distance per row of ``original``.
    """
    bc_sample = np.zeros(original.shape[0], dtype=np.float32)
    tot = 0.0
    count = 0
    for i, (actual, pred) in enumerate(zip(original, predicted)):
        # Source: http://scikit-bio.org/docs/0.4.2/diversity.html
        bc = beta_diversity("braycurtis", [actual, pred])[0, 1]
        count += 1
        tot += bc
        bc_sample[i] = bc
    # Guard against empty input, which previously raised ZeroDivisionError.
    bc_final = tot / count if count else float('nan')
    print('Average Bray-Curtis: ',bc_final)
    return bc_sample

In [None]:
# Bray-Curtis distance per sample: TSS-normalised original vs. autoencoder
# reconstruction
braycurtis_per_sample_reconst = compute_braycurtis_distance_per_sample(data_otus_original_rel,data_otus_reconstructed)
# Bray-Curtis distance per sample: TSS-normalised original vs. prediction from
# environmental features
braycurtis_per_sample_predicted = compute_braycurtis_distance_per_sample(data_otus_original_rel,data_otus_predicted)

###### Save the per-sample performance to a file
The resulting TSV file is used to plot a graph in R.

In [None]:
def save_performance_per_sample(perf1,perf2,perf3,perf4,sample_names,suffix=''):
    """Collect four per-sample metric vectors in one table and write it as TSV.

    Parameters
    ----------
    perf1, perf2, perf3, perf4 : array-like
        One value per sample; stored, in this order, in the columns
        ``pearson_reconstructed``, ``pearson_predicted``,
        ``braycurtis_reconstructed`` and ``braycurtis_predicted``.
    sample_names : array-like or pandas.Index
        Row labels, one per sample, in the same order as the metric vectors.
    suffix : str, optional
        Despite its name this is the output path *without* extension;
        ``'.tsv'`` is appended to it.

    Returns
    -------
    pandas.DataFrame
        The table that was written (one row per sample, four columns).
    """
    # Build all columns positionally in one step. The previous chain of
    # index-based joins multiplied rows whenever sample_names contained
    # duplicate labels.
    df_performance = pd.DataFrame(
        {
            'pearson_reconstructed': perf1,
            'pearson_predicted': perf2,
            'braycurtis_reconstructed': perf3,
            'braycurtis_predicted': perf4,
        },
        index=sample_names,
    )
    df_performance.to_csv(suffix+'.tsv', index=True, header=True, sep='\t')
    return df_performance

# Write the four per-sample metric vectors, labelled with the row index of the
# original table, to performance_per_sample.tsv in the results folder.
df_perf = save_performance_per_sample(
    pearson_per_sample_reconst,
    pearson_per_sample_predicted,
    braycurtis_per_sample_reconst,
    braycurtis_per_sample_predicted,
    df_otus_original.index,
    '../Results/ReconstructionAndPredictionMicrobialComposition/performance_per_sample',
)