In [4]:
import os
import numpy as np
import pandas as pd

def bootstrap_with_noise(input_file, n, output_folder, noise_std=0.05, random_state=None):
    """
    Resample a CSV with replacement and jitter its numeric columns with Gaussian noise.

    Whole rows are resampled, so the values within a row stay together; noise is
    then drawn independently per numeric column. Non-numeric columns (e.g. an
    identifier column) are carried through unchanged. When the file has at least
    two columns, the last two are rounded to integers if they are numeric.

    The result is written to ``<output_folder>/bootstrapped_<n>_<input basename>``.

    Args:
        input_file (str): Path to the input CSV file.
        n (int): Number of rows in the output dataset.
        output_folder (str): Folder to save the new CSV file (created if missing).
        noise_std (float): Noise standard deviation, expressed as a fraction of each
            column's standard deviation in the input file.
        random_state (int or None): Seed for reproducibility. The same seed drives
            both the row resampling and the noise.

    Returns:
        None. The path of the written file is printed.
    """
    df = pd.read_csv(input_file)
    columns = df.columns
    # Only numeric columns can take additive noise; bool is deliberately excluded.
    numeric_cols = set(df.select_dtypes(include="number").columns)

    # Private generator: reproducible for a given seed, without reseeding the
    # process-wide NumPy generator that other cells may depend on.
    rng = np.random.RandomState(random_state)

    # Bootstrap sampling with replacement
    sampled_df = df.sample(n=n, replace=True, random_state=random_state).reset_index(drop=True)
    n_rows = len(sampled_df)

    # Add noise to each numeric column (proportional to the original column std)
    noisy_df = sampled_df.copy()
    for col in columns:
        if col not in numeric_cols:
            continue
        col_std = df[col].std()
        if pd.isna(col_std):
            # std is undefined for fewer than two observed values; adding NaN
            # noise would wipe out the column, so add no noise instead.
            col_std = 0.0
        noise = rng.normal(0, noise_std * col_std, size=n_rows)
        noisy_df[col] = noisy_df[col] + noise

    # Convert last two columns to integer
    if len(columns) >= 2:
        for col in columns[-2:]:
            if col in numeric_cols:
                # Nullable Int64 keeps missing values as missing instead of
                # raising the way a plain int cast does on NaN.
                noisy_df[col] = noisy_df[col].round().astype("Int64")

    # Ensure output folder exists
    os.makedirs(output_folder, exist_ok=True)
    base_name = os.path.basename(input_file)
    output_file = os.path.join(output_folder, f"bootstrapped_{n}_{base_name}")
    noisy_df.to_csv(output_file, index=False)
    print(f"Bootstrapped dataset saved to: {output_file}")


In [5]:
# Resample the T1mdc-vs-T1 score table to 64 rows with 5% relative noise.
# The seed matches the T1xFLAIR cell so both tables are resampled with the same seed.
bootstrap_with_noise(
    input_file="/home/studenti/lia/projects/phuse_thesis_2024/PRL/PRL_preanalysis_T1mdc_vs_T1.csv",
    n=64,
    output_folder="/home/studenti/lia/projects/phuse_thesis_2024/PRL/output",
    noise_std=0.05,
    random_state=42,
)

Bootstrapped dataset saved to: /home/studenti/lia/projects/phuse_thesis_2024/PRL/output/bootstrapped_64_PRL_preanalysis_T1mdc_vs_T1.csv


In [6]:
# Resample the T1mdc-vs-T1xFLAIR score table to 64 rows with 5% relative noise.
# The seed matches the T1 cell so both tables are resampled with the same seed.
bootstrap_with_noise(
    input_file="/home/studenti/lia/projects/phuse_thesis_2024/PRL/PRL_preanalysis_T1mdc_vs_T1xFLAIR.csv",
    n=64,
    output_folder="/home/studenti/lia/projects/phuse_thesis_2024/PRL/output",
    noise_std=0.05,
    random_state=42,
)

Bootstrapped dataset saved to: /home/studenti/lia/projects/phuse_thesis_2024/PRL/output/bootstrapped_64_PRL_preanalysis_T1mdc_vs_T1xFLAIR.csv


In [15]:
from scipy.stats import ttest_rel, wilcoxon

def compare_scores(file1, file2, output_file):
    """
    Compare the scores in two CSV files with one-sided paired tests.

    Both files hold one row per patient and one column per score. Rows are
    paired by their position in the file. Every numeric column present in both
    files is tested with a paired t-test and a Wilcoxon signed-rank test:

    * columns whose name contains 'Hausdorff' (lower is better) test file1 < file2;
    * every other column tests file1 > file2.

    Non-numeric shared columns (e.g. identifiers) are skipped. One row per score
    is written to ``output_file``, in the column order of ``file1``.

    Args:
        file1 (str): Path to the first CSV file.
        file2 (str): Path to the second CSV file.
        output_file (str): Path of the CSV report to write.

    Returns:
        None. The path of the written report is printed.
    """
    df1 = pd.read_csv(file1)
    df2 = pd.read_csv(file2)

    # Walk file1's columns in file order: iterating a set of names would make
    # the row order of the report differ from run to run.
    numeric1 = set(df1.select_dtypes(include="number").columns)
    numeric2 = set(df2.select_dtypes(include="number").columns)
    common_scores = [col for col in df1.columns if col in numeric1 and col in numeric2]

    result_columns = [
        'score', 'mean_file1', 'mean_file2',
        't_statistic', 't_p_value',
        'wilcoxon_statistic', 'wilcoxon_p_value',
        'n_file1', 'n_file2', 'n_paired',
    ]
    results = []

    for score in common_scores:
        data1 = df1[score].dropna()
        data2 = df2[score].dropna()
        # Keep only rows where both files have a value for this score.
        paired_data = pd.concat(
            [data1, data2], axis=1, join='inner', keys=['file1', 'file2']
        ).dropna()
        first, second = paired_data['file1'], paired_data['file2']

        alternative = 'less' if "Hausdorff" in score else 'greater'

        if len(paired_data) >= 2:
            t_stat, t_pval = ttest_rel(first, second, alternative=alternative)
        else:
            # A paired t-test needs at least two pairs.
            t_stat, t_pval = float('nan'), float('nan')

        if (first != second).any():
            try:
                w_stat, w_pval = wilcoxon(first, second, alternative=alternative)
            except ValueError:
                w_stat, w_pval = float('nan'), float('nan')
        else:
            # No non-zero difference (or no pairs at all): the signed-rank test
            # is undefined, so report NaN rather than call into scipy.
            w_stat, w_pval = float('nan'), float('nan')

        results.append({
            'score': score,
            'mean_file1': data1.mean(),
            'mean_file2': data2.mean(),
            't_statistic': t_stat,
            't_p_value': t_pval,
            'wilcoxon_statistic': w_stat,
            'wilcoxon_p_value': w_pval,
            'n_file1': len(data1),
            'n_file2': len(data2),
            # Number of pairs the tests actually ran on.
            'n_paired': len(paired_data),
        })

    # Passing the columns explicitly keeps the header even when nothing was comparable.
    results_df = pd.DataFrame(results, columns=result_columns)
    results_df.to_csv(output_file, index=False)
    print(f"Statistical test results saved to: {output_file}")


In [16]:
# Bootstrapped score tables written by the two bootstrap_with_noise cells.
file_T1mdc_vs_T1 = "/home/studenti/lia/projects/phuse_thesis_2024/PRL/output/bootstrapped_64_PRL_preanalysis_T1mdc_vs_T1.csv"
file_T1mdc_vs_T1xFLAIR = "/home/studenti/lia/projects/phuse_thesis_2024/PRL/output/bootstrapped_64_PRL_preanalysis_T1mdc_vs_T1xFLAIR.csv"
output_path = "/home/studenti/lia/projects/phuse_thesis_2024/PRL/output/statistical_comparison.csv"

# T1xFLAIR is passed as file1, so the one-sided tests ask whether it scores
# higher than T1 alone (lower for Hausdorff columns).
# NOTE(review): both inputs are 64-row bootstrap resamples with added noise, so
# the p-values are computed on resampled data, not on the original cohort —
# confirm this is the intended analysis.
compare_scores(
    file1=file_T1mdc_vs_T1xFLAIR,
    file2=file_T1mdc_vs_T1,
    output_file=output_path,
)

Statistical test results saved to: /home/studenti/lia/projects/phuse_thesis_2024/PRL/output/statistical_comparison.csv


  z = (r_plus - mn) / se
