In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams

In [2]:
# Load the two CSV inputs used by the cells below.
# `df` is later grouped by its 'dataset' / 'signal' columns to read 'umis_per_cell'.
df = pd.read_csv('collect_mi_results.csv')
# `noise_df` supplies the 'metric', 'method', 'size', 'fitted_u_bar' and
# 'fitted_I_max' columns that are consumed when computing uq(...).
noise_df = pd.read_csv('analysis/final_results/scaling_plots_u_bar_138.109_I_max_1.419.csv')

In [3]:
def uq(ubar, Imax, q=0.9):
    """Evaluate ubar * (2**(2*q*Imax) - 1) / (2**(2*Imax) - 2**(2*q*Imax)).

    Presumably this is the depth at which the fitted information curve reaches
    the fraction ``q`` of its plateau ``Imax`` -- TODO confirm against the
    model that produced ``fitted_u_bar`` / ``fitted_I_max``.

    Parameters
    ----------
    ubar : scalar scale parameter; multiplies the ratio.
    Imax : plateau value; enters only through ``2**(2*Imax)``.
    q : fraction of ``Imax`` to target (default 0.9).

    Returns
    -------
    The value of the expression above.

    Notes
    -----
    The denominator is zero when ``q == 1`` or ``Imax == 0``; nothing here
    guards against that.
    """
    target = q*Imax
    gained = 2**(2*target) - 1
    headroom = 2**(2*Imax) - 2**(2*target)
    return ubar*(gained/headroom)

In [12]:
# For every (metric, method) pair, take the rows with the largest `size`,
# average their fitted parameters, and print uq(...) at q=0.95.
# The value is labelled u95 because q=0.95 is what is passed to uq; the
# previous "u90" label did not match the quantile actually computed.
all_methods = noise_df['method'].unique()  # loop-invariant, so computed once
for metric in noise_df['metric'].unique():
    for method in all_methods:
        subset = noise_df[(noise_df['metric'] == metric) & (noise_df['method'] == method)]
        # Keep only the rows fitted on the largest sample size for this pair.
        subset = subset[subset['size'] == subset['size'].max()]
        means = subset.mean(numeric_only=True)
        fitted_u_bar = means['fitted_u_bar']
        fitted_I_max = means['fitted_I_max']
        u95 = uq(fitted_u_bar, fitted_I_max, q=0.95)
        print(f'Metric: {metric}, Method: {method}, u95: {u95}')
    print()

# Largest observed UMIs per cell in each dataset, for comparison with the
# estimates printed above. Selecting the column before aggregating avoids
# computing the max of every other numeric column.
df.groupby('dataset')['umis_per_cell'].max()

Metric: author_day, Method: Geneformer, u90: 1826.9760880813888
Metric: author_day, Method: PCA, u90: 7006.158266932281
Metric: author_day, Method: RandomProjection, u90: 15477.372184533328
Metric: author_day, Method: SCVI, u90: 1624.5033818393458

Metric: celltype.l3, Method: Geneformer, u90: 1916.675353651136
Metric: celltype.l3, Method: PCA, u90: 6565.81994751731
Metric: celltype.l3, Method: RandomProjection, u90: 18421.12738870599
Metric: celltype.l3, Method: SCVI, u90: 3100.337250169485

Metric: clone, Method: Geneformer, u90: 4800.557538161478
Metric: clone, Method: PCA, u90: 15455.880260119675
Metric: clone, Method: RandomProjection, u90: 22622.515629645422
Metric: clone, Method: SCVI, u90: 3973.292483975075

Metric: ng_idx, Method: Geneformer, u90: 2106.8848377341365
Metric: ng_idx, Method: PCA, u90: 13466.024792120035
Metric: ng_idx, Method: RandomProjection, u90: 3042.005134584293
Metric: ng_idx, Method: SCVI, u90: 2775.1500069287904

Metric: protein_counts, Method: Geneforme

dataset
PBMC        8100
larry       2580
merfish      367
shendure    2500
Name: umis_per_cell, dtype: int64

In [14]:
# Characters that must be backslash-escaped in LaTeX text mode. Metric names
# such as 'author_day' contain '_' and would otherwise break the table.
LATEX_SPECIALS = {'&': r'\&', '%': r'\%', '$': r'\$', '#': r'\#', '_': r'\_'}


def latex_escape(text):
    """Return ``str(text)`` with LaTeX text-mode special characters escaped."""
    return ''.join(LATEX_SPECIALS.get(ch, ch) for ch in str(text))


# Get unique metrics and methods
metrics = noise_df['metric'].unique()
methods = noise_df['method'].unique()

# uq(...) at q=0.95 for every (metric, method) pair, i.e. the u95 values that
# the table caption refers to. The dictionary keeps its `u90_dict` name so
# that any other cell referring to it keeps working.
u90_dict = {}

for metric in metrics:
    u90_dict[metric] = {}
    for method in methods:
        subset = noise_df[(noise_df['metric'] == metric) & (noise_df['method'] == method)]
        # Use only the fits obtained at the largest sample size for this pair.
        subset = subset[subset['size'] == subset['size'].max()]
        means = subset.mean(numeric_only=True)
        u90_dict[metric][method] = uq(means['fitted_u_bar'], means['fitted_I_max'], q=0.95)

# Max UMIs per cell keyed by the 'signal' column; the rows below look this
# mapping up by metric name, so 'signal' values are expected to match metrics.
max_umis = df.groupby('signal')['umis_per_cell'].max().to_dict()

# Generate LaTeX table
print("\\begin{table}[h]")
print("\\centering")
print("\\begin{tabular}{l" + "r" * (len(methods) + 1) + "}")
print("\\hline")

# Header row
header = " & ".join(["Metric", *(latex_escape(m) for m in methods), "Actual UMIs"]) + " \\\\"
print(header)
print("\\hline")

# Data rows
for metric in metrics:
    cells = [latex_escape(metric)]
    cells.extend(f"{u90_dict[metric][method]:.2f}" for method in methods)
    # Metrics with no matching 'signal' entry are shown as N/A. Formatting
    # with :.0f works for both Python and numpy numeric scalars.
    actual_umi = max_umis.get(metric)
    cells.append("N/A" if actual_umi is None else f"{actual_umi:.0f}")
    print(" & ".join(cells) + " \\\\")

print("\\hline")
print("\\end{tabular}")
print("\\caption{u95 values by metric and method}")
print("\\label{tab:u95}")
print("\\end{table}")

\begin{table}[h]
\centering
\begin{tabular}{lrrrrr}
\hline
Metric & Geneformer & PCA & RandomProjection & SCVI & Actual UMIs \\
\hline
author_day & 1826.98 & 7006.16 & 15477.37 & 1624.50 & 2500 \\
celltype.l3 & 1916.68 & 6565.82 & 18421.13 & 3100.34 & 8100 \\
clone & 4800.56 & 15455.88 & 22622.52 & 3973.29 & 2580 \\
ng_idx & 2106.88 & 13466.02 & 3042.01 & 2775.15 & 367 \\
protein_counts & 2912.21 & 15212.56 & 20493.15 & 11248.87 & 8100 \\
\hline
\end{tabular}
\caption{u95 values by metric and method}
\label{tab:u95}
\end{table}


In [10]:
# Display the mapping of maximum 'umis_per_cell' values.
# NOTE(review): the saved output below is keyed by dataset name, whereas the
# table cell builds `max_umis` with groupby('signal'). This cell's execution
# count is lower than that cell's, so the output looks stale -- re-run the
# notebook top to bottom to refresh it.
max_umis

{'PBMC': 8100, 'larry': 2580, 'merfish': 367, 'shendure': 2500}