In [6]:
import pandas as pd

# Load the CSV file
df = pd.read_csv(
    "/home/jupyter/igor_repos/exploration/noise_scaling_laws/Scaling-up-measurement-noise-scaling-laws/collect_mi_results.csv"
)

In [7]:
# Rename columns for clarity
df = df.rename(columns={"algorithm": "method", "signal": "metric"})
df

Unnamed: 0,dataset,size,quality,method,metric,seed,mi_value,umis_per_cell
0,PBMC,100,0.001235,Geneformer,celltype.l3,42,0.53784,10
1,PBMC,100,0.001235,Geneformer,celltype.l3,2701,0.39424,10
2,PBMC,100,0.001235,Geneformer,protein_counts,42,0.71425,10
3,PBMC,100,0.001235,Geneformer,protein_counts,2701,0.63226,10
4,PBMC,100,0.001235,PCA,celltype.l3,42,0.30650,10
...,...,...,...,...,...,...,...,...
4839,shendure,10000000,1.000000,PCA,author_day,42,1.14821,2500
4840,shendure,10000000,1.000000,PCA,author_day,1404,1.09550,2500
4841,shendure,10000000,1.000000,RandomProjection,author_day,42,0.62163,2500
4842,shendure,10000000,1.000000,RandomProjection,author_day,1404,0.62163,2500


In [8]:
import matplotlib.pyplot as plt
import numpy as np
from lmfit import Model, Parameters


def info_scaling(x, A, B):
    """
    Information scaling function: I(x) = 0.5 * log2((x*B + 1)/(1 + A*x))
    """
    return 0.5 * np.log2((x * B + 1) / (1 + A * x))


def fit_noise_scaling_model(u_values, mi_values):
    """
    Fit the noise scaling model to data and return I_max and u_bar with uncertainties.

    Parameters:
    u_values: array of UMI per cell values
    mi_values: array of mutual information values

    Returns:
    dict with I_max, u_bar and their uncertainties, plus fit success status
    """

    # Define the noise scaling function for fitting
    def info_scaling_local(x, A, B):
        """
        Information scaling function: I(x) = 0.5 * log2((x*B + 1)/(1 + A*x))
        """
        return 0.5 * np.log2((x * B + 1) / (1 + A * x))

    def info_max(A, B):
        """Calculate maximum information"""
        return 0.5 * np.log2(B / A)

    # Create lmfit model
    model = Model(info_scaling_local)

    # Set up parameters with initial values and bounds
    params = model.make_params(A=1e-2, B=1e-2)  # initial guesses
    params["A"].min = 0  # set bounds
    params["B"].min = 0

    # Fit the curve
    try:
        result = model.fit(mi_values, params, x=u_values)
        A_fit = result.params["A"].value
        B_fit = result.params["B"].value
        A_err = result.params["A"].stderr
        B_err = result.params["B"].stderr

        # Calculate derived quantities
        I_max = info_max(A_fit, B_fit)
        u_bar = 1 / A_fit

        # Calculate uncertainties using error propagation
        # For I_max = 0.5 * log2(B/A), error propagation gives:
        I_max_err = 0.5 * np.sqrt((A_err / A_fit) ** 2 + (B_err / B_fit) ** 2) / np.log(2)

        # For u_bar = 1/A, error is |du_bar/dA| * A_err = A_err/A^2
        u_bar_err = A_err / (A_fit**2)

        return {
            "I_max": I_max,
            "I_max_err": I_max_err,
            "u_bar": u_bar,
            "u_bar_err": u_bar_err,
            "fit_success": True,
            "result": result,
            "A_fit": A_fit,
            "B_fit": B_fit,
        }

    except Exception as e:
        print(f"    Fitting failed: {e}")
        return {
            "I_max": np.nan,
            "I_max_err": np.nan,
            "u_bar": np.nan,
            "u_bar_err": np.nan,
            "fit_success": False,
            "result": None,
        }

In [11]:
# Create a loop to iterate over different datasets, methods, metrics and cell sizes
import os

# Get unique combinations of dataset, size, method, and metric
unique_combinations = df.groupby(["dataset", "size", "method", "metric"]).size().reset_index(name="count")

print(f"Found {len(unique_combinations)} unique combinations to fit")

# Initialize list to store all results
all_fit_results = []

# Loop through each combination
for idx, row in unique_combinations.iterrows():
    dataset = row["dataset"]
    size = row["size"]
    method = row["method"]
    metric = row["metric"]

    print(f"Processing {idx+1}/{len(unique_combinations)}: {dataset}, {size}, {method}, {metric}")

    # Filter data for current combination
    filtered_data = df[
        (df["dataset"] == dataset) & (df["size"] == size) & (df["method"] == method) & (df["metric"] == metric)
    ]

    # Skip if not enough data points
    if len(filtered_data) < 3:
        print(f"  Skipping - insufficient data points ({len(filtered_data)})")
        continue

    # Extract data for fitting
    u_values = filtered_data["umis_per_cell"].values
    mi_values = filtered_data["mi_value"].values

    # Fit the model
    fit_results = fit_noise_scaling_model(u_values, mi_values)

    if fit_results["fit_success"]:
        print(f"  Success: I_max={fit_results['I_max']:.4f}, u_bar={fit_results['u_bar']:.1f}")

        # Calculate fitted MI values for the original data points
        fitted_mi_values = info_scaling(u_values, fit_results["A_fit"], fit_results["B_fit"])

        # Add new columns to the filtered data
        filtered_data_with_fits = filtered_data.copy()
        filtered_data_with_fits["fitted_mi_value"] = fitted_mi_values
        filtered_data_with_fits["I_max"] = fit_results["I_max"]
        filtered_data_with_fits["I_max_err"] = fit_results["I_max_err"]
        filtered_data_with_fits["u_bar"] = fit_results["u_bar"]
        filtered_data_with_fits["u_bar_err"] = fit_results["u_bar_err"]
        filtered_data_with_fits["A_fit"] = fit_results["A_fit"]
        filtered_data_with_fits["B_fit"] = fit_results["B_fit"]

        # Append to results list
        all_fit_results.append(filtered_data_with_fits)

    else:
        print(f"  Failed: Fit unsuccessful")

# Combine all results into a single DataFrame
if all_fit_results:
    combined_results = pd.concat(all_fit_results, ignore_index=True)

    # Save to CSV
    output_filename = "/home/jupyter/igor_repos/exploration/noise_scaling_laws/Scaling-up-measurement-noise-scaling-laws/fitted_data_with_results.csv"
    combined_results.to_csv(output_filename, index=False)
    print(f"\nAll fitted data saved to: {output_filename}")

    # Display summary
    print(f"\nSuccessfully fitted {len(all_fit_results)} combinations")
    print(f"Total data points with fits: {len(combined_results)}")

    # Display first few rows
    print("\nFirst few rows of combined results:")
    display(combined_results.head())

else:
    print("\nNo successful fits to save")

Found 200 unique combinations to fit
Processing 1/200: PBMC, 100, Geneformer, celltype.l3
  Success: I_max=2.0316, u_bar=166.2
Processing 2/200: PBMC, 100, Geneformer, protein_counts
  Success: I_max=1.9125, u_bar=63.0
Processing 3/200: PBMC, 100, PCA, celltype.l3
  Success: I_max=3.0905, u_bar=1624.5
Processing 4/200: PBMC, 100, PCA, protein_counts
  Success: I_max=3.7364, u_bar=3435.4
Processing 5/200: PBMC, 100, RandomProjection, celltype.l3
  Success: I_max=2.6343, u_bar=3660.6
Processing 6/200: PBMC, 100, RandomProjection, protein_counts
  Success: I_max=3.1474, u_bar=4874.1
Processing 7/200: PBMC, 100, SCVI, celltype.l3
  Success: I_max=2.8794, u_bar=1095.3
Processing 8/200: PBMC, 100, SCVI, protein_counts
  Success: I_max=3.0906, u_bar=1029.4
Processing 9/200: PBMC, 215, Geneformer, celltype.l3
  Success: I_max=2.1771, u_bar=202.1
Processing 10/200: PBMC, 215, Geneformer, protein_counts
  Success: I_max=2.2610, u_bar=134.9
Processing 11/200: PBMC, 215, PCA, celltype.l3
  Success

Unnamed: 0,dataset,size,quality,method,metric,seed,mi_value,umis_per_cell,fitted_mi_value,I_max,I_max_err,u_bar,u_bar_err,A_fit,B_fit
0,PBMC,100,0.001235,Geneformer,celltype.l3,42,0.53784,10,0.459889,2.031599,0.120924,166.228424,22.113866,0.006016,0.100563
1,PBMC,100,0.001235,Geneformer,celltype.l3,2701,0.39424,10,0.459889,2.031599,0.120924,166.228424,22.113866,0.006016,0.100563
2,PBMC,100,0.002598,Geneformer,celltype.l3,42,0.71172,21,0.733065,2.031599,0.120924,166.228424,22.113866,0.006016,0.100563
3,PBMC,100,0.002598,Geneformer,celltype.l3,2701,0.76171,21,0.733065,2.031599,0.120924,166.228424,22.113866,0.006016,0.100563
4,PBMC,100,0.005468,Geneformer,celltype.l3,42,1.08677,44,1.050387,2.031599,0.120924,166.228424,22.113866,0.006016,0.100563
