In [1]:
import pandas as pd
import numpy as np
from lmfit import Model
import glob
import re

In [None]:
def info_scaling_model(x, A, B):
    """Two-parameter information scaling curve.

    Evaluates ``0.5 * log2((B*x + 1) / (A*x + 1))``. The value is 0 at
    ``x = 0`` and tends to ``0.5 * log2(B / A)`` as ``x`` grows.

    Parameters
    ----------
    x : scalar or numpy array
        Independent variable at which the curve is evaluated.
    A, B : float
        Curve parameters.

    Returns
    -------
    Same shape as ``x``: the model value at each point.
    """
    numerator = x*B + 1
    denominator = 1 + A*x
    return 0.5 * np.log2(numerator / denominator)

def fit_info_model(x_data, y_data):
    """Fit ``info_scaling_model`` to one curve and summarise the fit.

    Parameters
    ----------
    x_data, y_data : array-like
        Independent variable and measured values of the curve.

    Returns
    -------
    tuple
        ``(1/A, 0.5*log2(B/A), r_squared)`` when the fit runs and both
        fitted parameters have a standard error smaller than their value.
        ``(None, None, None)`` when the fit raises or that criterion fails.
    """
    import warnings  # local import so the function does not rely on another cell

    x_data = np.asarray(x_data)
    y_data = np.asarray(y_data)

    model = Model(info_scaling_model)
    params = model.make_params(A=1e-2, B=1e-2)
    # Both parameters are constrained to be non-negative.
    params['A'].min = params['B'].min = 0

    try:
        result = model.fit(y_data, params, x=x_data)
        a, b = result.params['A'], result.params['B']

        # Coefficient of determination; undefined (NaN) for a constant y_data.
        ss_res = np.sum((y_data - result.best_fit)**2)
        ss_tot = np.sum((y_data - np.mean(y_data))**2)
        r_squared = 1 - (ss_res / ss_tot) if ss_tot > 0 else np.nan

        # Accept only well-constrained fits. A missing (None) or zero stderr
        # is treated as "not constrained", exactly as before.
        well_constrained = all(p.stderr and p.stderr < p.value for p in (a, b))
        if well_constrained:
            return 1/a.value, 0.5*np.log2(b.value/a.value), r_squared
    except Exception as exc:
        # Catch Exception rather than using a bare `except`, so that
        # KeyboardInterrupt / SystemExit still propagate, and report the
        # failure instead of dropping the curve silently.
        warnings.warn(
            f"info_scaling_model fit failed: {exc!r}",
            RuntimeWarning,
            stacklevel=2,
        )
    return None, None, None

# DATA LOADING

# Remote folders holding the published result tables.
BASE_URL = 'https://raw.githubusercontent.com/igor-sadalski/Scaling-up-measurement-noise-scaling-laws/main/'
CALTECH_URL = 'https://raw.githubusercontent.com/ggdna/scScaling/main/results/'

# Raw identifiers found in the tables -> labels used in the analysis below.
RENAME_DICT = {
    'celltype.l3': 'Cell type MI',
    'protein_counts': 'Protein MI',
    'clone': 'Clonal MI',
    'author_day': 'Temporal MI',
    'ng_idx': 'Spatial MI',
    'RandomProjection': 'Rand. Proj.',
}

# Caltech101: Gaussian-noise and resolution sweeps.
# The 'Scale' column is squared on load.
# NOTE(review): presumably this converts a noise std into a variance - confirm.
gaussian_df = (
    pd.read_csv(f'{CALTECH_URL}Caltech101_Gaussian.csv')
    .assign(Scale=lambda frame: frame['Scale']**2)
)
res_df = pd.read_csv(f'{CALTECH_URL}Caltech101_resolution.csv')

# Transcriptomics: MI measurements plus the precomputed curve parameters
# (the latter are read from a local file).
df = (
    pd.read_csv(f'{BASE_URL}collect_mi_results.csv')
    .replace(RENAME_DICT)
)
sc_param_df_noise = (
    pd.read_csv('analysis/final_results/scaling_plots_u_bar_138.109_I_max_1.419.csv')
    .replace(RENAME_DICT)
)

# Sequence results (local file).
seq_df = pd.read_csv('seq/multisize_gisaid_results.csv')

# TissueMNIST results: one CSV per degradation setting, with the setting
# encoded in the file name as `result_<tag>.csv`.
# glob returns paths in arbitrary order; sort them so the row order of
# combined_df is reproducible across machines and runs.
csv_files = sorted(glob.glob('images/tissuemnist_models/result_*.csv'))
if not csv_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(
        "No TissueMNIST result files match 'images/tissuemnist_models/result_*.csv'"
    )

dfs_tissue = []
for file in csv_files:
    match = re.search(r'result_(.+)\.csv', file)
    # The glob also matches 'result_.csv', which the regex rejects; treat a
    # missing tag like any other unrecognised tag instead of crashing on None.
    tag = match.group(1) if match else ''

    if tag == 'clean':
        downsampling_type, downsampling_level = 'clean', 0.0
    elif 'pix' in tag:
        downsampling_type = 'pixel'
        # Level is the second '_'-separated field with its last character dropped.
        downsampling_level = float(tag.split('_')[1][:-1])
    elif 'gauss' in tag:
        downsampling_type = 'gaussian'
        downsampling_level = float(tag.split('_')[1][:-1])
    else:
        downsampling_type, downsampling_level = 'unknown', 0.0

    # Tag every row of this file with the setting parsed from its name.
    df_temp = pd.read_csv(file)
    df_temp['downsampling_level'] = downsampling_level
    df_temp['downsampling_type'] = downsampling_type
    dfs_tissue.append(df_temp)

combined_df = pd.concat(dfs_tissue, ignore_index=True)

In [10]:
# One record per fitted curve: dataset, curve identifier and R^2.
results = []

# 1. SEQUENCES
# One curve per model size: x is the 'true/error' column, y is 'mutual_information'.
print("Processing sequences...")
for model_size in sorted(seq_df['model_size'].unique()):
    data = seq_df.loc[seq_df['model_size'] == model_size]
    x_data, y_data = data['true/error'].values, data['mutual_information'].values

    _, _, r2 = fit_info_model(x_data, y_data)
    if r2 is None:
        # The fit raised or was rejected by fit_info_model: nothing to record.
        continue

    results.append(dict(
        dataset='sequences',
        curve_identifier=f'model_size={model_size}',
        r_squared=r2,
    ))

# 2. TRANSCRIPTOMICS (precomputed parameters)
# These curves are not refit here: the parameters stored in sc_param_df_noise
# are plugged into info_scaling_model and only R^2 is computed.
print("Processing transcriptomic curves...")
hue_order_metrics = ['Protein MI', 'Clonal MI', 'Temporal MI', 'Spatial MI']
hue_order_methods = ['PCA', 'SCVI', 'Geneformer', 'Rand. Proj.']

for sig in hue_order_metrics:
    for size in df['size'].unique():
        for alg in hue_order_methods:
            data = df[(df['signal']==sig) & (df['size']==size) & (df['algorithm']==alg)]
            # Skip combinations with fewer than 10 measurements.
            if len(data) < 10:
                continue

            # One point per umis_per_cell value: the mean mi_value of its rows.
            avg_data = data.groupby('umis_per_cell').mi_value.mean()
            p = sc_param_df_noise[(sc_param_df_noise['metric']==sig) &
                                   (sc_param_df_noise['method']==alg) &
                                   (sc_param_df_noise['size']==size)]
            if p.empty:
                continue

            # If several parameter rows match, the first one is used.
            xbar, imax = p['fitted_u_bar'].values[0], p['fitted_I_max'].values[0]

            # NOTE: unlike fit_info_model, no standard-error filter is applied
            # here, so poorly constrained parameter sets are included as well.
            x_data = avg_data.index.values
            y_data = avg_data.values

            # Convert (xbar, Imax) back to the model parameters (A, B):
            #   xbar = 1/A
            #   Imax = 0.5*log2(B/A)
            A = 1 / xbar
            B = (2**(2*imax)) * A
            y_pred = info_scaling_model(x_data, A, B)

            # Compute R^2 (NaN when y_data is constant).
            ss_res = np.sum((y_data - y_pred)**2)
            ss_tot = np.sum((y_data - np.mean(y_data))**2)
            r2 = 1 - (ss_res / ss_tot) if ss_tot > 0 else np.nan

            results.append({
                'dataset': 'transcriptomics',
                'curve_identifier': f'signal={sig}, size={size}, method={alg}',
                'r_squared': r2
            })
# 3. CALTECH101
print("Processing Caltech101...")
# (table, column whose reciprocal is the x-axis, label used in the identifier)
caltech_curves = (
    (gaussian_df, 'Scale', 'gaussian'),
    (res_df, 'Factor', 'resolution'),
)
for frame, x_column, degradation in caltech_curves:
    # The last unique class label is skipped.
    # NOTE(review): the reason is not visible in this notebook - confirm.
    for class_label in frame['Class label'].unique()[:-1]:
        data = frame[frame['Class label'] == class_label]
        x_data = 1/data[x_column].values
        y_data = data['MI'].values

        _, _, r2 = fit_info_model(x_data, y_data)
        if r2 is None:
            continue

        results.append({
            'dataset': 'caltech101',
            'curve_identifier': f'class={class_label}, type={degradation}',
            'r_squared': r2
        })

# 4. TISSUEMNIST
print("Processing TissueMNIST...")
for downsampling_type in ['pixel', 'gaussian']:
    is_type = combined_df['downsampling_type'] == downsampling_type
    data_subset = combined_df[is_type].copy()

    # The x-axis is the inverse level, squared for 'pixel' only.
    # NOTE(review): presumably because a pixel factor applies per side - confirm.
    if downsampling_type == 'pixel':
        x_transform = lambda x: x**2
    else:
        x_transform = lambda x: x
    data_subset['inv_factor'] = 1 / data_subset['downsampling_level']

    # Overall score first, then every column whose name contains 'ova_mi_continuous'.
    ova_columns = ['mi_score']
    ova_columns.extend(col for col in data_subset.columns if 'ova_mi_continuous' in col)

    # Curves are processed in reverse column order.
    for col in reversed(ova_columns):
        # Keep rows where both the score and the x value are present.
        mask = data_subset[col].notna() & data_subset['inv_factor'].notna()
        valid_rows = data_subset[mask]
        x_data = x_transform(valid_rows['inv_factor'].values)
        y_data = valid_rows[col].values

        # Fewer than 3 points: skip the curve.
        if len(x_data) < 3:
            continue

        _, _, r2 = fit_info_model(x_data, y_data)
        if r2 is None:
            continue

        results.append({
            'dataset': 'tissuemnist',
            'curve_identifier': f'type={downsampling_type}, column={col}',
            'r_squared': r2
        })

Processing sequences...
Processing transcriptomic curves...
Processing Caltech101...
Processing TissueMNIST...


In [13]:
# Gather every per-curve record into one table.
r2_df = pd.DataFrame(results)

n_curves = len(r2_df)
per_dataset_summary = r2_df.groupby('dataset')['r_squared'].describe()

print(f"\nTotal curves fit: {n_curves}")
print(per_dataset_summary)
print("\nSample of results:")
print(r2_df.head(10))

# Write the full table next to the notebook.
output_path = 'fit_quality_results.csv'
r2_df.to_csv(output_path, index=False)
print(f"\nResults saved to '{output_path}'")


Total curves fit: 191
                 count      mean       std       min       25%       50%  \
dataset                                                                    
caltech101        10.0  0.984489  0.011143  0.963400  0.977610  0.987664   
sequences          3.0  0.991853  0.004972  0.988016  0.989045  0.990073   
tissuemnist       18.0  0.978859  0.021454  0.913129  0.978262  0.986711   
transcriptomics  160.0  0.978969  0.047045  0.453035  0.978904  0.990488   

                      75%       max  
dataset                              
caltech101       0.992934  0.996977  
sequences        0.993771  0.997470  
tissuemnist      0.992125  0.996483  
transcriptomics  0.993686  0.998850  

Sample of results:
           dataset                                 curve_identifier  r_squared
0        sequences                                  model_size=150M   0.988016
1        sequences                                   model_size=35M   0.990073
2        sequences                 

In [14]:
# Summary statistics of R^2 by dataset. describe() reports the standard
# deviation (std), not the standard error of the mean.
r2_df.groupby('dataset')['r_squared'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
caltech101,10.0,0.984489,0.011143,0.9634,0.97761,0.987664,0.992934,0.996977
sequences,3.0,0.991853,0.004972,0.988016,0.989045,0.990073,0.993771,0.99747
tissuemnist,18.0,0.978859,0.021454,0.913129,0.978262,0.986711,0.992125,0.996483
transcriptomics,160.0,0.978969,0.047045,0.453035,0.978904,0.990488,0.993686,0.99885


In [17]:
# Standard error of the mean of R^2, per dataset.
r2_df.groupby('dataset')['r_squared'].sem()

dataset
caltech101         0.003524
sequences          0.002871
tissuemnist        0.005057
transcriptomics    0.003719
Name: r_squared, dtype: float64