In [1]:
import os, glob, random
import pandas as pd
import SimpleITK as sitk
import numpy as np
from scipy import stats  

def get_image_path_by_id(patient_id, image_dir):
    """
    Return the path of the first file in `image_dir` whose name contains `patient_id`.

    Parameters:
    - patient_id: Substring to look for in the file names.
    - image_dir: Directory to search (not recursive; sub-directories are skipped).

    Returns:
    - The matching file's path, made relative to the current working directory
      via os.path.relpath.

    Raises:
    - IndexError: if no regular file in `image_dir` has `patient_id` in its name
      (same exception type as the original `[...][0]` lookup, with a clearer message).

    Notes:
    - Matching is a plain substring test, so a short id can match a longer one
      (e.g. "029" is contained in "0299_T2"). Callers should pass ids that are
      unambiguous for their naming scheme.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the choice
    # deterministic when several files match.
    for entry in sorted(os.listdir(image_dir)):
        full_path = os.path.join(image_dir, entry)
        if os.path.isfile(full_path) and patient_id in entry:
            # Stop at the first hit instead of materialising every match.
            return os.path.relpath(full_path)

    raise IndexError(
        f"no file containing {patient_id!r} in its name found in {image_dir!r}"
    )



In [2]:
import numpy as np
from scipy import stats

def compare_arrays_with_stats(v1, v2, label1="v1", label2="v2", alpha=0.05):
    """
    Compare two samples with a two-sample statistical test and report the result.

    Each sample is first checked for normality with Shapiro-Wilk. If both
    p-values exceed `alpha`, an independent two-sample t-test is used;
    otherwise the two-sample Kolmogorov-Smirnov test is used. Both tests treat
    v1 and v2 as independent samples.

    Parameters:
    - v1, v2: Input 1-D numeric arrays (or array-likes) to compare.
    - label1, label2: Labels for the arrays for display purposes.
    - alpha: Significance level used for the normality gate and for the
      "Not Significant" cutoff. Defaults to 0.05, which reproduces the
      previous hard-coded behaviour.

    Outputs:
    - Prints the sample size of v1, mean/std (population std, as computed by
      np.std) of each array, the normality p-values, the test used and its
      p-value with a significance marker.

    Returns:
    - dict with keys:
        "test"          -> "t-test" or "KS test"
        "p_value"       -> p-value of the chosen test (float)
        "significance"  -> the significance marker that was printed
        "shapiro_p"     -> (p-value for v1, p-value for v2) from Shapiro-Wilk

    Raises:
    - ValueError: if `alpha` is not strictly between 0 and 1.
    """
    if not 0 < alpha < 1:
        raise ValueError(f"alpha must be in (0, 1), got {alpha!r}")

    # Check normality for both v1 and v2 (index 1 of the result is the p-value).
    shapiro_v1_p = stats.shapiro(v1)[1]
    shapiro_v2_p = stats.shapiro(v2)[1]

    print(f'Data amount: {len(v1)}')
    print(f'{label1} ||| mean: {np.mean(v1):.2f}, std: {np.std(v1):.2f}')
    print(f'{label2} ||| mean: {np.mean(v2):.2f}, std: {np.std(v2):.2f}')
    print(f'Normality test p-values - {label1}: {shapiro_v1_p:.3f}, {label2}: {shapiro_v2_p:.3f}\n')

    # Decide which statistical test to use based on normality.
    # A NaN p-value compares False here, so it falls through to the KS test.
    if shapiro_v1_p > alpha and shapiro_v2_p > alpha:
        test_name = "t-test"
        _, p_value = stats.ttest_ind(v1, v2)
    else:
        test_name = "KS test"
        _, p_value = stats.ks_2samp(v1, v2)

    # Print the statistical test result
    print(f'Using {test_name}.')
    if p_value > alpha:
        significance = "Not Significant"
    elif p_value > 0.01:
        significance = "* Significant"
    elif p_value > 0.001:
        significance = "** Significant"
    else:
        significance = "*** Significant"

    print(f'{label1} vs {label2}: {test_name} p-value = {p_value:.3f}, {significance}\n')

    # Hand the numbers back so callers can tabulate or assert on them instead
    # of parsing the printed text.
    return {
        "test": test_name,
        "p_value": float(p_value),
        "significance": significance,
        "shapiro_p": (float(shapiro_v1_p), float(shapiro_v2_p)),
    }


In [3]:
# Relative path of the CSV to analyse; resolved against the notebook's
# current working directory.
csv_path = 'MRI_volume_internal_test331.csv'

# NOTE(review): the displayed frame has an "Unnamed: 0" column, which suggests
# the CSV was written together with its index; reading it with index_col=0
# would avoid the extra column — confirm nothing downstream relies on it.
df = pd.read_csv(csv_path)

# df_compare is a second name for the same DataFrame object (no copy is made),
# so any in-place change through one name is visible through the other.
df_compare = df
print(df_compare.shape)
# Last expression of the cell: rendered as the cell's output.
df_compare.head()

(331, 10)


Unnamed: 0,Data_set,CT_id,volume1_gd,volume1_ai,volume2_gd,volume2_ai,volume3_gd,volume3_ai,volume4_gd,volume4_ai
0,MRI515,0299_T2,170.11519,172.20813,194.23134,191.95846,166.86214,166.70588,188.9327,189.35886
1,MRI515,0537_T2,201.85218,201.29868,192.23131,193.20245,177.24146,179.0026,207.6086,209.5207
2,MRI515,0450_T1,243.65192,242.78687,96.87088,95.30644,96.70063,96.84787,218.77276,216.18684
3,MRI515,0019_T2,378.97104,373.09953,155.45229,154.73026,143.26801,141.63341,359.64668,356.35743
4,MRI515,0387_T2,175.05756,175.03249,109.21884,109.46109,114.64449,114.17251,178.15256,178.4324


In [4]:
import numpy as np
from scipy import stats  

# Run the comparison for each of the four volume{i}_gd / volume{i}_ai column pairs.
# NOTE(review): the "_gd" / "_ai" suffixes presumably denote reference vs.
# model-derived volumes — confirm against the data source.
# NOTE(review): both columns of a pair come from the same rows of df_compare,
# while compare_arrays_with_stats applies independent-sample tests; a paired
# test may be more appropriate — confirm with whoever owns the analysis.
for i in range(1, 5):
    column_str1, column_str2 = f'volume{i}_gd', f'volume{i}_ai'

    v1, v2 = df_compare[column_str1], df_compare[column_str2]

    # Pass the column names as labels so each printed block identifies the
    # pair it belongs to (otherwise every block reads "v1 vs v2").
    compare_arrays_with_stats(v1, v2, label1=column_str1, label2=column_str2)
    print('\n\n\n')

Data amount: 331
v1 ||| mean: 250.81, std: 100.27
v2 ||| mean: 251.48, std: 100.81
Normality test p-values - v1: 0.000, v2: 0.000

Using KS test.
v1 vs v2: KS test p-value = 1.000, Not Significant





Data amount: 331
v1 ||| mean: 142.79, std: 34.50
v2 ||| mean: 142.88, std: 34.49
Normality test p-values - v1: 0.000, v2: 0.000

Using KS test.
v1 vs v2: KS test p-value = 1.000, Not Significant





Data amount: 331
v1 ||| mean: 142.30, std: 33.71
v2 ||| mean: 142.53, std: 33.81
Normality test p-values - v1: 0.000, v2: 0.000

Using KS test.
v1 vs v2: KS test p-value = 1.000, Not Significant





Data amount: 331
v1 ||| mean: 259.66, std: 101.31
v2 ||| mean: 259.63, std: 101.00
Normality test p-values - v1: 0.000, v2: 0.000

Using KS test.
v1 vs v2: KS test p-value = 1.000, Not Significant





