## Review Benchmarking Results

#### Environment Setup

In [1]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import dataframe_image as dfi

#### Import and Clean Results File

In [2]:
# Load the benchmarking results; the first CSV column becomes the row index.
metrics = pd.read_csv(
    "full_results.csv",
    index_col=0,
)

In [3]:
# Rename Index
# Ordered (raw substring, display text) pairs, applied one after another to every
# row label. Gene-set prefixes come first; each method name gains a leading '-',
# which the next cell uses as the separator when it splits the label.
label_substitutions = [
    ('hvg2000_', '2000 Genes'),
    ('hvg4000_', '4000 Genes'),
    ('hvg6000_', '6000 Genes'),
    ('hvg8000_', '8000 Genes'),
    ('hvg10000_', '10000 Genes'),
    ('all-data_', 'All Genes'),
    ('harmony', '-Harmony'),
    ('uncorrected', '-Uncorrected'),
    ('scvi_sample', '-scVI (Sample)'),
    ('scvi_study', '-scVI (Study)'),
    ('scanvi_sample', '-scANVI (Sample)'),
    ('scanvi_study', '-scANVI (Study)'),
]

index_names = list(metrics.index)
for raw_text, display_text in label_substitutions:
    index_names = [label.replace(raw_text, display_text) for label in index_names]

In [4]:
# Turn each '<gene set>-<method>' label into '<method> - <gene set>'.
# Labels are split on '-' and only the first two pieces are used.
new_index = [
    pieces[1] + ' - ' + pieces[0]
    for pieces in (label.split('-') for label in index_names)
]

In [5]:
# Swap the raw row labels of `metrics` for the display labels in `new_index`.
# `new_index` is positional (one label per row), so the lengths must agree.
assert len(new_index) == len(metrics.index), "new_index must have one label per row of metrics"
# One rename with the complete old -> new mapping. Renaming row by row copied the
# whole frame on every iteration and looked labels up in the partially renamed
# frame, so a new label equal to a later raw label could be renamed twice.
metrics = metrics.rename(index=dict(zip(metrics.index, new_index)))

#### Visualise Results Table

In [6]:
# Cast every column to float, then display the table with the "Blues" colour gradient.
metrics = metrics.astype(dtype=float)
metrics.style.background_gradient(cmap="Blues")

Unnamed: 0,Silhouette label,cLISI,Silhouette batch,iLISI,KBET,Graph connectivity,PCR comparison,Batch correction,Bio conservation,Total
Harmony - All Genes,0.43637,0.968399,0.855282,0.019165,0.32919,0.337658,0.839754,0.47621,0.702385,0.611915
scANVI (Sample) - All Genes,0.440428,0.980825,0.836541,0.009433,0.286381,0.630657,0.430946,0.438792,0.710626,0.601892
scANVI (Study) - All Genes,0.450249,0.998682,0.802837,0.000218,0.172613,0.687917,0.145333,0.361784,0.724465,0.579393
scVI (Sample) - All Genes,0.452091,0.991411,0.854938,0.004182,0.184939,0.684127,0.395585,0.424754,0.721751,0.602953
scVI (Study) - All Genes,0.452679,0.999284,0.809511,8.6e-05,0.134639,0.68497,0.121725,0.350186,0.725981,0.575663
Uncorrected - All Genes,0.46391,1.0,0.853227,8.9e-05,0.189429,0.591828,0.0,0.326915,0.731955,0.569939
Harmony - 2000 Genes,0.435325,0.966038,0.863379,0.019482,0.336493,0.288929,0.819886,0.465634,0.700681,0.606662
scANVI (Sample) - 2000 Genes,0.42804,0.972049,0.797743,0.015036,0.327418,0.596752,0.351288,0.417648,0.700044,0.587086
scANVI (Study) - 2000 Genes,0.440769,0.992854,0.813611,0.002745,0.236356,0.662097,0.115207,0.366003,0.716811,0.576488
scVI (Sample) - 2000 Genes,0.460908,0.989918,0.885495,0.004803,0.263607,0.678827,0.423596,0.451266,0.725413,0.615754


#### Scale Results Table

Each metric (column) is min-max scaled to the range 0-1 across all methods, where 1 is the best performing method and 0 is the worst.

In [7]:
# Min-max scale each metric column to [0, 1], round to 4 decimals and rank
# the methods by their scaled "Total" score (highest first).
metrics_scaled = (
    metrics.sub(metrics.min())
    .div(metrics.max() - metrics.min())
    .round(4)
    .sort_values(by=["Total"], ascending=False)
)
# Persist the scaled table, then display it with the "Blues" colour gradient.
metrics_scaled.to_csv('results_scaled.csv')
metrics_scaled.style.background_gradient(cmap="Blues")

Unnamed: 0,Silhouette label,cLISI,Silhouette batch,iLISI,KBET,Graph connectivity,PCR comparison,Batch correction,Bio conservation,Total
scVI (Sample) - 2000 Genes,0.906,0.7031,1.0,0.2446,0.6389,0.946,0.5011,0.8411,0.7899,1.0
scVI (Sample) - 6000 Genes,0.8582,0.7694,0.744,0.1903,0.5061,0.9801,0.5244,0.8202,0.798,0.9763
scVI (Sample) - 10000 Genes,0.8094,0.7579,0.7626,0.1976,0.5475,1.0,0.5095,0.8275,0.7644,0.9725
scVI (Sample) - 4000 Genes,0.8449,0.759,0.8628,0.1982,0.5065,0.96,0.5066,0.8042,0.7849,0.9507
Harmony - 6000 Genes,0.1891,0.1253,0.5579,0.9625,0.9112,0.1535,1.0,1.0,0.0795,0.9256
Harmony - All Genes,0.2296,0.0695,0.6557,0.9837,0.9638,0.1182,0.9933,0.9993,0.0729,0.9221
Harmony - 4000 Genes,0.2296,0.0695,0.6557,0.9837,0.9635,0.1182,0.9933,0.9992,0.0729,0.922
Harmony - 10000 Genes,0.1535,0.1487,0.5136,0.9459,0.8596,0.2086,0.987,0.9963,0.0717,0.9179
Harmony - 8000 Genes,0.1598,0.1468,0.5283,0.9538,0.868,0.1833,0.9912,0.9916,0.0743,0.9128
scVI (Sample) - 8000 Genes,0.7854,0.7704,0.7517,0.1899,0.2659,0.921,0.5165,0.7203,0.7574,0.8326


#### Set Some Performance Thresholds

Select the methods which perform best overall: keep only those whose scaled batch correction and bio conservation scores both exceed 0.7.

In [8]:
# Filter for methods with best overall performance
# A method is kept only if both of its scaled aggregate scores exceed 0.7;
# the survivors are then ranked by scaled "Total" score, highest first.
metrics_scaled = metrics_scaled.loc[
    metrics_scaled["Batch correction"].gt(0.7)
    & metrics_scaled["Bio conservation"].gt(0.7)
].sort_values(by=["Total"], ascending=False)
metrics_scaled.style.background_gradient(cmap="Blues")

Unnamed: 0,Silhouette label,cLISI,Silhouette batch,iLISI,KBET,Graph connectivity,PCR comparison,Batch correction,Bio conservation,Total
scVI (Sample) - 2000 Genes,0.906,0.7031,1.0,0.2446,0.6389,0.946,0.5011,0.8411,0.7899,1.0
scVI (Sample) - 6000 Genes,0.8582,0.7694,0.744,0.1903,0.5061,0.9801,0.5244,0.8202,0.798,0.9763
scVI (Sample) - 10000 Genes,0.8094,0.7579,0.7626,0.1976,0.5475,1.0,0.5095,0.8275,0.7644,0.9725
scVI (Sample) - 4000 Genes,0.8449,0.759,0.8628,0.1982,0.5065,0.96,0.5066,0.8042,0.7849,0.9507
scVI (Sample) - 8000 Genes,0.7854,0.7704,0.7517,0.1899,0.2659,0.921,0.5165,0.7203,0.7574,0.8326


#### Export Filtered Methods Table

In [9]:
# Rank the table by scaled "Total" score (highest first) so the exported image
# and the displayed table share the same row order.
metrics_scaled = metrics_scaled.sort_values(by=["Total"], ascending=False)
# Save the table as a PNG at dpi=300 using dataframe_image's "matplotlib"
# table conversion. The plain DataFrame (not a Styler) is what gets exported.
dfi.export(metrics_scaled,"filtered_scaled_metrics_table.png", dpi=300, table_conversion="matplotlib")
# Only the cell's last expression is displayed, so the styled view is built once, here.
metrics_scaled.style.background_gradient(cmap="Blues")

Unnamed: 0,Silhouette label,cLISI,Silhouette batch,iLISI,KBET,Graph connectivity,PCR comparison,Batch correction,Bio conservation,Total
scVI (Sample) - 2000 Genes,0.906,0.7031,1.0,0.2446,0.6389,0.946,0.5011,0.8411,0.7899,1.0
scVI (Sample) - 6000 Genes,0.8582,0.7694,0.744,0.1903,0.5061,0.9801,0.5244,0.8202,0.798,0.9763
scVI (Sample) - 10000 Genes,0.8094,0.7579,0.7626,0.1976,0.5475,1.0,0.5095,0.8275,0.7644,0.9725
scVI (Sample) - 4000 Genes,0.8449,0.759,0.8628,0.1982,0.5065,0.96,0.5066,0.8042,0.7849,0.9507
scVI (Sample) - 8000 Genes,0.7854,0.7704,0.7517,0.1899,0.2659,0.921,0.5165,0.7203,0.7574,0.8326
