In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams

In [2]:
# Input tables; both paths are relative, so they resolve against the
# directory the kernel was started in.
MI_RESULTS_CSV = 'collect_mi_results.csv'
NOISE_FITS_CSV = 'analysis/final_results/scaling_plots_u_bar_138.109_I_max_1.419.csv'

# df supplies the 'signal' and 'umis_per_cell' columns used for the
# "Actual UMIs" column of the final table.
df = pd.read_csv(MI_RESULTS_CSV)
# noise_df holds one fit per (dataset, method, metric, size) row with
# fitted_u_bar / fitted_I_max and their error columns.
noise_df = pd.read_csv(NOISE_FITS_CSV)

In [8]:
# Preview the fit table read from the scaling-plots CSV; pandas elides the
# middle rows of long frames in the rendered output.
noise_df

Unnamed: 0,dataset,method,metric,size,fitted_u_bar,fitted_I_max,u_bar_error,I_max_error
0,shendure,Geneformer,author_day,100,116.021,0.802,24.386,0.036
1,shendure,Geneformer,author_day,359,207.217,1.243,20.798,0.026
2,shendure,Geneformer,author_day,1291,271.351,1.504,44.861,0.051
3,shendure,Geneformer,author_day,4641,403.606,1.794,42.211,0.038
4,shendure,Geneformer,author_day,16681,342.730,1.938,18.838,0.020
...,...,...,...,...,...,...,...,...
195,PBMC,SCVI,protein_counts,4641,1366.756,3.896,294.280,0.109
196,PBMC,SCVI,protein_counts,10000,3339.672,4.588,419.178,0.072
197,PBMC,SCVI,protein_counts,21544,6967.881,5.162,814.322,0.074
198,PBMC,SCVI,protein_counts,46415,5215.633,5.106,569.952,0.067


In [19]:
def uq(ubar, Imax, q=0.9):
    """Evaluate ubar * (2^(2I) - 1) / (2^(2*Imax) - 2^(2I)) at I = Imax + log2(q).

    Since 2^(2I) equals q^2 * 2^(2*Imax), the denominator vanishes at q = 1.

    Parameters
    ----------
    ubar : numeric
        Multiplicative scale of the result.
    Imax : numeric
        Value that I is offset from (by log2(q)).
    q : numeric, default 0.9
        Factor applied through the log2(q) offset.

    Returns
    -------
    The value of the expression above, in whatever numeric type the
    numpy arithmetic produces for the given inputs.
    """
    shifted = Imax + np.log2(q)
    pow_shifted = 2**(2*shifted)
    pow_max = 2**(2*Imax)
    ratio = (pow_shifted - 1)/(pow_max - pow_shifted)
    return ubar*ratio

def uq_error(ubar, Imax, ubar_error, Imax_error, q=0.9):
    """First-order propagated error of the quantity computed by ``uq``.

    The two input errors are combined in quadrature,
    sqrt((du/dubar * ubar_error)^2 + (du/dImax * Imax_error)^2),
    with no covariance term between ubar and Imax.

    Parameters
    ----------
    ubar, Imax : numeric
        Point at which the partial derivatives are evaluated.
    ubar_error, Imax_error : numeric
        Errors on ubar and Imax; the result is on the same scale
        (e.g. 2-sigma in gives 2-sigma out).
    q : numeric, default 0.9
        Same meaning as in ``uq``.

    Returns
    -------
    The propagated error (non-negative, as it is a square root).
    """
    I = Imax + np.log2(q)

    top = 2**(2*I) - 1
    bottom = 2**(2*Imax) - 2**(2*I)

    # The expression is linear in ubar, so this partial is just the ratio.
    sens_ubar = top / bottom

    # Quotient rule for the Imax partial. I moves one-for-one with Imax
    # (dI/dImax = 1), so both powers differentiate with the same 2*ln(2) factor.
    top_slope = 2**(2*I) * 2 * np.log(2)
    bottom_slope = 2**(2*Imax) * 2 * np.log(2) - 2**(2*I) * 2 * np.log(2)
    sens_Imax = ubar * (top_slope * bottom - top * bottom_slope) / (bottom**2)

    var_from_ubar = (sens_ubar * ubar_error)**2
    var_from_Imax = (sens_Imax * Imax_error)**2
    return np.sqrt(var_from_ubar + var_from_Imax)

# Distinct metric and method labels, in order of first appearance in noise_df.
metrics = noise_df['metric'].unique()
methods = noise_df['method'].unique()

# q passed to both uq and uq_error.
# NOTE(review): the results are named and captioned "u90" while q is 0.95
# (uq's own default is 0.9) — confirm which value is intended.
Q_LEVEL = 0.95

# Nested {metric: {method: value}} tables for the estimate and its error.
u90_dict = {metric: {} for metric in metrics}
u90_error_dict = {metric: {} for metric in metrics}

for metric in metrics:
    for method in methods:
        pair_mask = (noise_df['metric'] == metric) & (noise_df['method'] == method)
        pair_rows = noise_df[pair_mask]

        # Use only the fit(s) at the largest size available for this pair,
        # averaging the numeric columns if several rows share that size.
        at_largest_size = pair_rows[pair_rows['size'] == pair_rows['size'].max()]
        averaged = at_largest_size.mean(numeric_only=True)

        u_bar_fit = averaged['fitted_u_bar']
        i_max_fit = averaged['fitted_I_max']
        u_bar_ci = averaged['u_bar_error']  # 2-sigma CI
        i_max_ci = averaged['I_max_error']  # 2-sigma CI

        u90_dict[metric][method] = uq(u_bar_fit, i_max_fit, q=Q_LEVEL)
        u90_error_dict[metric][method] = uq_error(
            u_bar_fit, i_max_fit, u_bar_ci, i_max_ci, q=Q_LEVEL
        )

# actual
# Largest 'umis_per_cell' recorded for each signal; shown as "Actual UMIs".
max_umis = df.groupby('signal').max(numeric_only=True)['umis_per_cell'].to_dict()


def _latex_escape(text):
    """Backslash-escape characters that are special in LaTeX text mode.

    Labels such as ``author_day`` contain underscores, which make LaTeX
    fail with "Missing $ inserted" when emitted unescaped outside math mode.
    """
    return "".join("\\" + ch if ch in "&%$#_{}" else ch for ch in str(text))


# latex
print("\\begin{table}[h]")
print("\\centering")
# One left-aligned label column, one right-aligned column per method,
# plus one for the actual UMI count.
print("\\begin{tabular}{l" + "r" * (len(methods) + 1) + "}")
print("\\hline")

header = "Metric & " + " & ".join(_latex_escape(m) for m in methods) + " & Actual UMIs \\\\"
print(header)
print("\\hline")

for metric in metrics:
    row = _latex_escape(metric)
    for method in methods:
        u90 = u90_dict[metric][method]
        u90_err = u90_error_dict[metric][method]
        row += f" & ${u90:.2f} \\pm {u90_err:.2f}$"
    actual_umi = max_umis.get(metric, "N/A")
    # Series.to_dict() may hand back numpy scalars depending on the pandas
    # version; np.int64 is not an `int`, so accept numpy numbers explicitly.
    if isinstance(actual_umi, (int, float, np.integer, np.floating)):
        row += f" & {actual_umi:.0f}"
    else:
        row += f" & {actual_umi}"
    row += " \\\\"
    print(row)

print("\\hline")
print("\\end{tabular}")
print("\\caption{$u_{90}$ values by metric and method with 2$\\sigma$ error bars}")
print("\\label{tab:u90}")
print("\\end{table}")

\begin{table}[h]
\centering
\begin{tabular}{lrrrrr}
\hline
Metric & Geneformer & PCA & RandomProjection & SCVI & Actual UMIs \\
\hline
author_day & $2571.69 \pm 201.05$ & $5606.36 \pm 993.97$ & $8012.47 \pm 1028.85$ & $2815.46 \pm 198.79$ & 2500 \\
celltype.l3 & $4228.60 \pm 677.56$ & $15491.45 \pm 2172.25$ & $34352.42 \pm 4126.74$ & $8112.99 \pm 647.37$ & 8100 \\
clone & $4212.89 \pm 854.17$ & $20190.04 \pm 3764.73$ & $7339.64 \pm 1673.02$ & $5073.33 \pm 456.98$ & 2580 \\
ng_idx & $2212.04 \pm 537.27$ & $32107.48 \pm 31922.48$ & $1339.94 \pm 177.00$ & $5000.44 \pm 830.70$ & 367 \\
protein_counts & $8090.07 \pm 1510.31$ & $46266.58 \pm 9330.52$ & $46989.55 \pm 5007.84$ & $44958.50 \pm 6071.71$ & 8100 \\
\hline
\end{tabular}
\caption{$u_{90}$ values by metric and method with 2$\sigma$ error bars}
\label{tab:u90}
\end{table}


In [11]:
# Tabulate the nested dict: outer keys (metrics) become columns,
# inner keys (methods) become the row index.
pd.DataFrame.from_dict(u90_dict, orient='columns')

Unnamed: 0,author_day,celltype.l3,clone,ng_idx,protein_counts
Geneformer,2571.686831,4228.60227,4212.893279,2212.041026,8090.068957
PCA,5606.356827,15491.454246,20190.041003,32107.479742,46266.58123
RandomProjection,8012.472092,34352.424179,7339.642309,1339.942664,46989.546775
SCVI,2815.463231,8112.987307,5073.334062,5000.442171,44958.500087


In [12]:
# Tabulate the propagated errors: outer keys (metrics) become columns,
# inner keys (methods) become the row index.
pd.DataFrame.from_dict(u90_error_dict, orient='columns')

Unnamed: 0,author_day,celltype.l3,clone,ng_idx,protein_counts
Geneformer,201.046401,677.555046,854.168723,537.265068,1510.309037
PCA,993.970446,2172.252364,3764.730875,31922.477577,9330.515202
RandomProjection,1028.853347,4126.735534,1673.020498,177.001686,5007.84134
SCVI,198.78913,647.373801,456.976572,830.698176,6071.70861


In [20]:
# Inspect the per-signal maximum of 'umis_per_cell' taken from df.
max_umis

{'author_day': 2500,
 'celltype.l3': 8100,
 'clone': 2580,
 'ng_idx': 367,
 'protein_counts': 8100}