In [12]:
import os, glob, random
import pandas as pd
import SimpleITK as sitk
import numpy as np
from scipy import stats  

def get_image_path_by_id(patient_id, image_dir):
    """
    Find the file in ``image_dir`` whose name contains ``patient_id``.

    Parameters:
    - patient_id: String searched for as a substring of each file name.
      NOTE(review): a substring match means an id such as "GL_10" also
      matches "GL_101" -- confirm ids cannot be prefixes of one another.
    - image_dir: Directory whose direct entries are inspected (no recursion).

    Returns:
    - Path of the matching file, relative to the current working directory.
      When several files match, the first one in sorted name order is returned.

    Raises:
    - IndexError: If no regular file in ``image_dir`` contains ``patient_id``.
    """
    # os.listdir yields entries in arbitrary order; sorting makes the choice
    # among multiple matches reproducible across runs and machines.
    matches = [
        os.path.relpath(os.path.join(image_dir, entry))
        for entry in sorted(os.listdir(image_dir))
        if os.path.isfile(os.path.join(image_dir, entry)) and patient_id in entry
    ]
    if not matches:
        # Same exception type as indexing an empty list, but with context.
        raise IndexError(
            f"No file containing '{patient_id}' found in '{image_dir}'"
        )
    return matches[0]

In [22]:
import numpy as np
from scipy import stats

def _drop_missing(values):
    """Return ``values`` as a flat float array without NaNs, plus the number removed."""
    arr = np.asarray(values, dtype=float).ravel()
    keep = ~np.isnan(arr)
    return arr[keep], int(arr.size - keep.sum())


def _significance_label(p_value):
    """Map a p-value to the star-rated verdict string printed in the report."""
    # NaN fails every ``>`` comparison, so without this guard it would fall
    # through to the strongest "*** Significant" verdict.
    if np.isnan(p_value):
        return "Undetermined (p-value is NaN)"
    if p_value > 0.05:
        return "Not Significant"
    if p_value > 0.01:
        return "* Significant"
    if p_value > 0.001:
        return "** Significant"
    return "*** Significant"


def compare_arrays_with_stats(v1, v2, label1="v1", label2="v2"):
    """
    Compare two samples using statistical tests (Shapiro-Wilk for normality,
    independent t-test if both look Gaussian, Kolmogorov-Smirnov otherwise).

    Parameters:
    - v1, v2: 1-D numeric array-likes (numpy arrays, pandas Series, lists).
      Missing values (NaN) are removed from each sample independently
      before any statistic is computed.
    - label1, label2: Labels for the arrays for display purposes.

    Outputs:
    - Prints sample sizes, mean/std, normality p-values, the chosen test and
      the verdict. Returns None.

    Notes:
    - Both tests treat the two samples as independent.
      NOTE(review): if v1 and v2 are paired measurements of the same
      subjects, a paired test (e.g. stats.ttest_rel or stats.wilcoxon) is
      presumably more appropriate -- confirm with the study design.
    - The KS test compares whole distributions, not just their means.
    """
    sample1, dropped1 = _drop_missing(v1)
    sample2, dropped2 = _drop_missing(v2)

    # Check normality for both samples
    shapiro_v1_p = stats.shapiro(sample1)[1]
    shapiro_v2_p = stats.shapiro(sample2)[1]

    # Report both sizes when they differ; the samples need not be equally long.
    if len(sample1) == len(sample2):
        print(f'Data amount: {len(sample1)}')
    else:
        print(f'Data amount: {label1}: {len(sample1)}, {label2}: {len(sample2)}')
    if dropped1 or dropped2:
        print(f'Dropped missing values - {label1}: {dropped1}, {label2}: {dropped2}')
    print(f'{label1} ||| mean: {np.mean(sample1):.2f}, std: {np.std(sample1):.2f}')
    print(f'{label2} ||| mean: {np.mean(sample2):.2f}, std: {np.std(sample2):.2f}')
    print(f'Normality test p-values - {label1}: {shapiro_v1_p:.3f}, {label2}: {shapiro_v2_p:.3f}\n')

    # Decide which statistical test to use based on normality
    if shapiro_v1_p > 0.05 and shapiro_v2_p > 0.05:
        # Neither sample rejects normality: use the t-test
        _, p_value = stats.ttest_ind(sample1, sample2)
        test_name = "t-test"
    else:
        # At least one sample rejects normality: use the KS test
        _, p_value = stats.ks_2samp(sample1, sample2)
        test_name = "KS test"

    # Print the statistical test result
    print(f'Using {test_name}.')
    significance = _significance_label(p_value)

    print(f'{label1} vs {label2}: {test_name} p-value = {p_value:.3f}, {significance}\n')


In [25]:
# CSV to analyse. The alternatives below are other result tables; keep exactly
# one assignment active.
csv_path = 'csv_and_figure/MRI_volume_external_test211_dataset.csv'
# csv_path = 'csv_and_figure/MRI_Bedrest_fat_ratio_72.csv'
# csv_path = 'csv_and_figure/MRI_volume_internal_test331.csv'

df = pd.read_csv(csv_path)


# Optional: restrict the comparison to the rows of a single 'Dataset' value.
# df_compare = df[df['Dataset'] == 'AFL']
# df_compare = df[df['Dataset'] == 'BedRest']
# df_compare = df[df['Dataset'] == 'Tass']

# No filter active: compare over every row of the CSV.
df_compare = df
print(df_compare.shape)
df_compare.head()

(211, 10)


Unnamed: 0,Dataset,CT_id,volume1_gd,volume1_ai,volume2_gd,volume2_ai,volume3_gd,volume3_ai,volume4_gd,volume4_ai
0,AFL,GL_101,354.93566,349.41279,149.88174,154.6536,144.53499,147.41273,348.39595,350.67222
1,AFL,GL_102,468.26079,474.95004,179.84878,178.04438,178.87513,180.49677,471.49742,472.84325
2,AFL,GL_103,325.74621,322.16066,129.83392,128.07937,125.40765,121.38347,328.47109,325.83261
3,AFL,GL_104,485.90606,480.76866,192.52944,196.38083,183.19839,185.0327,490.00003,490.72112
4,AFL,GL_105,378.08718,380.28038,148.91806,147.85137,147.86466,149.44642,372.64739,379.18045


In [26]:
import numpy as np
from scipy import stats  

# Run the comparison for each of the four volume column pairs
# (volume1..volume4; "_gd" vs "_ai" -- presumably reference vs. model output, confirm).
for i in range(1, 5):
    column_str1, column_str2 = f'volume{i}_gd', f'volume{i}_ai'

    v1, v2 = df_compare[column_str1], df_compare[column_str2]

    # Pass the column names as labels so each printed report identifies the
    # pair it belongs to instead of the generic defaults "v1"/"v2".
    compare_arrays_with_stats(v1, v2, label1=column_str1, label2=column_str2)
    print('\n\n\n')

Data amount: 211
v1 ||| mean: 290.29, std: 80.06
v2 ||| mean: 293.04, std: 78.00
Normality test p-values - v1: 0.019, v2: 0.026

Using KS test.
v1 vs v2: KS test p-value = 0.662, Not Significant





Data amount: 211
v1 ||| mean: 124.75, std: 28.38
v2 ||| mean: 121.60, std: 30.23
Normality test p-values - v1: 0.176, v2: 0.143

Using t-test.
v1 vs v2: t-test p-value = 0.271, Not Significant





Data amount: 211
v1 ||| mean: 122.54, std: 27.30
v2 ||| mean: 118.36, std: 28.50
Normality test p-values - v1: 0.381, v2: 0.085

Using t-test.
v1 vs v2: t-test p-value = 0.126, Not Significant





Data amount: 211
v1 ||| mean: 289.61, std: 79.70
v2 ||| mean: 292.14, std: 77.19
Normality test p-values - v1: 0.005, v2: 0.003

Using KS test.
v1 vs v2: KS test p-value = 0.938, Not Significant







In [17]:
import numpy as np
from scipy import stats  

# NOTE: this rebinds csv_path and df_compare, which an earlier cell pointed at
# the volume table; from here on df_compare holds the fat-ratio table.
csv_path = 'csv_and_figure/MRI_Bedrest_fat_ratio_72.csv'
df_compare  = pd.read_csv(csv_path)


# Run the comparison for each of the four ratio column pairs (ratio1..ratio4).
for i in range(1, 5):
    column_str1, column_str2 = f'ratio{i}_gd', f'ratio{i}_ai'
    v1, v2 = df_compare[column_str1], df_compare[column_str2]

    # Pass the column names as labels; without them the report falls back to
    # the generic defaults "v1"/"v2" and the four reports look identical.
    compare_arrays_with_stats(v1, v2, label1=column_str1, label2=column_str2)
    print('\n\n\n')

Data amount: 72
ratio1_gd ||| mean: 0.14, std: 0.05
ratio1_ai ||| mean: 0.13, std: 0.04
Normality test p-values - ratio1_gd: 0.000, ratio1_ai: 0.000

At least one distribution is not Gaussian. Using KS test.
ratio1_gd vs ratio1_ai: ks_p_value = 0.088 Not Significant





Data amount: 72
ratio2_gd ||| mean: 0.15, std: 0.05
ratio2_ai ||| mean: 0.17, std: 0.07
Normality test p-values - ratio2_gd: 0.000, ratio2_ai: 0.000

At least one distribution is not Gaussian. Using KS test.
ratio2_gd vs ratio2_ai: ks_p_value = 0.192 Not Significant





Data amount: 72
ratio3_gd ||| mean: 0.15, std: 0.05
ratio3_ai ||| mean: 0.16, std: 0.06
Normality test p-values - ratio3_gd: 0.003, ratio3_ai: 0.000

At least one distribution is not Gaussian. Using KS test.
ratio3_gd vs ratio3_ai: ks_p_value = 0.372 Not Significant





Data amount: 72
ratio4_gd ||| mean: 0.13, std: 0.05
ratio4_ai ||| mean: 0.13, std: 0.05
Normality test p-values - ratio4_gd: 0.000, ratio4_ai: 0.000

At least one distribution is not G

In [19]:
import numpy as np
from scipy import stats  


# Repeats the ratio1..ratio4 comparison; requires df_compare to hold the table
# with the ratio*_gd / ratio*_ai columns.
for i in range(1, 5):
    column_str1, column_str2 = f'ratio{i}_gd', f'ratio{i}_ai'
    v1, v2 = df_compare[column_str1], df_compare[column_str2]

    # Pass the column names as labels; without them the report falls back to
    # the generic defaults "v1"/"v2" and the four reports look identical.
    compare_arrays_with_stats(v1, v2, label1=column_str1, label2=column_str2)
    print('\n\n\n')



Data amount: 72
ratio1_gd ||| mean: 0.14, std: 0.05
ratio1_ai ||| mean: 0.13, std: 0.04
Normality test p-values - ratio1_gd: 0.000, ratio1_ai: 0.000

At least one distribution is not Gaussian. Using KS test.
ratio1_gd vs ratio1_ai: ks_p_value = 0.088 Not Significant





Data amount: 72
ratio2_gd ||| mean: 0.15, std: 0.05
ratio2_ai ||| mean: 0.17, std: 0.07
Normality test p-values - ratio2_gd: 0.000, ratio2_ai: 0.000

At least one distribution is not Gaussian. Using KS test.
ratio2_gd vs ratio2_ai: ks_p_value = 0.192 Not Significant





Data amount: 72
ratio3_gd ||| mean: 0.15, std: 0.05
ratio3_ai ||| mean: 0.16, std: 0.06
Normality test p-values - ratio3_gd: 0.003, ratio3_ai: 0.000

At least one distribution is not Gaussian. Using KS test.
ratio3_gd vs ratio3_ai: ks_p_value = 0.372 Not Significant





Data amount: 72
ratio4_gd ||| mean: 0.13, std: 0.05
ratio4_ai ||| mean: 0.13, std: 0.05
Normality test p-values - ratio4_gd: 0.000, ratio4_ai: 0.000

At least one distribution is not G

In [6]:
# Preview the first rows of `df` as the cell's displayed output.
df.head()

Unnamed: 0,Dataset,CT_id,volume1_gd,volume1_ai,volume2_gd,volume2_ai,volume3_gd,volume3_ai,volume4_gd,volume4_ai
0,AFL,GL_101,354.93566,349.41279,149.88174,154.6536,144.53499,147.41273,348.39595,350.67222
1,AFL,GL_102,468.26079,474.95004,179.84878,178.04438,178.87513,180.49677,471.49742,472.84325
2,AFL,GL_103,325.74621,322.16066,129.83392,128.07937,125.40765,121.38347,328.47109,325.83261
3,AFL,GL_104,485.90606,480.76866,192.52944,196.38083,183.19839,185.0327,490.00003,490.72112
4,AFL,GL_105,378.08718,380.28038,148.91806,147.85137,147.86466,149.44642,372.64739,379.18045
