In [3]:

import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib.ticker import FuncFormatter
import numpy as np
import os
import seaborn as sns
import warnings
from scipy import stats

# Suppress every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# Directory two levels above the current working directory.
workspace_path = Path.cwd().parent.parent

# Misc functions

In [4]:


def crosstab_with_percentages(rows, columns, x_order=None, y_order=None, table=False, heatmap=False):
    """Cross-tabulate ``rows`` against ``columns`` as counts and column percentages.

    Parameters
    ----------
    rows, columns : list of pandas.Series
        Grouping variables for the table index and columns. Every series is
        cast to ``str`` before tabulating.
    x_order, y_order : sequence, optional
        Explicit ordering of the column / row labels. Labels present in the
        data but missing from the ordering are dropped; labels in the ordering
        that never occur in the data are kept and reported as zero.
    table : bool, default False
        Display the combined "count (percent)" table, transposed so that the
        row categories appear as rows.
    heatmap : bool, default False
        Draw a heatmap of the column-normalised proportions.

    Returns
    -------
    xperc : pandas.DataFrame
        Column-normalised proportions.
    combined : pandas.DataFrame
        "count (percent)" strings, one column per row category plus a
        ``'Total'`` column (the transpose of what ``table=True`` displays).
    """
    cols = [c.astype(str) for c in columns]
    rows = [r.astype(str) for r in rows]

    xabs = pd.crosstab(rows, cols).fillna(0)
    xperc = pd.crosstab(rows, cols, normalize='columns').fillna(0)

    # `is not None` + len() instead of bare truthiness so that numpy arrays and
    # pandas Index objects are accepted; an empty ordering still means "no reorder".
    # fill_value keeps labels absent from the data as 0 instead of NaN, which
    # would otherwise render as "nan (nan%)" and poison the totals.
    if x_order is not None and len(x_order) > 0:
        xabs = xabs.reindex(x_order, axis=1, fill_value=0)
        xperc = xperc.reindex(x_order, axis=1, fill_value=0.0)

    if y_order is not None and len(y_order) > 0:
        xabs = xabs.reindex(y_order, fill_value=0)
        xperc = xperc.reindex(y_order, fill_value=0.0)

    # Loop-invariant: total number of observations in the (reordered) table.
    grand_total = xabs.sum().sum()

    combined_data = {}

    for (adx, rabs), (_, rperc) in zip(xabs.iterrows(), xperc.iterrows()):
        row_data = {col: f"{rabs[col]} ({rperc[col]:.1%})" for col in xabs.columns}

        row_total = 0
        for col in xabs.columns:
            row_total += rabs[col]

        # Guard against an all-zero table so the share is 0% rather than NaN.
        row_share = row_total / grand_total if grand_total else 0.0
        row_data['Total'] = f'{row_total} ({row_share:.1%})'
        combined_data[adx] = row_data

    # Final entry: per-column counts (as strings) and the numeric grand total.
    totals = {col: f'{xabs[col].sum()}' for col in xabs.columns}
    totals['Total'] = grand_total
    combined_data['Total'] = totals

    combined = pd.DataFrame.from_dict(combined_data)

    if table:
        display(combined.T)

    if heatmap:
        sns.heatmap(xperc, annot=True, fmt=".1%", cmap='Blues')
        plt.show()

    return xperc, combined

def hist_comparison(data1, cat1, data2, cat2, bins, title, variable, density=True):
    """Compare one numeric variable between two groups.

    Runs a Mann-Whitney U test on ``variable`` for the two data sets, prints
    the result, and shows side-by-side histograms (group 1 in red, group 2 in
    blue).

    Parameters
    ----------
    data1, data2 : pandas.DataFrame
        The two groups; both must contain the column ``variable``.
    cat1, cat2 : str
        Display names of the groups, used in the printed header and titles.
    bins : int or sequence
        Passed to ``Axes.hist`` for both panels.
    title : str
        Base title for the two panels.
    variable : str
        Column to compare.
    density : bool, default True
        Passed to ``Axes.hist``; plot densities rather than raw counts.
    """
    # Missing values would propagate into the test statistic (scipy's default
    # nan_policy is 'propagate'), so drop them up front.
    group1 = data1[variable].dropna()
    group2 = data2[variable].dropna()

    stat, p_value = stats.mannwhitneyu(group1, group2)
    print(f'Group: {cat1} vs Group: {cat2}')
    print(f"U-statistic: {stat}, p-value: {p_value}")

    fig, ax = plt.subplots(1, 2, figsize=(15, 5))
    panels = zip(ax, (group1, group2), (cat1, cat2), ('red', 'blue'))
    for axis, group, label, colour in panels:
        axis.hist(group, bins=bins, density=density, alpha=0.6, color=colour)
        # Name the group in the title so the two panels can be told apart.
        axis.set_title(f'{title} ({label})')
        axis.set_xlabel(variable)
    plt.show()

def differences_bar_chart(data1, cat1, data2, cat2, variable, title, normalize=False):
    """Bar chart of per-category differences between two groups.

    For every value of ``variable`` the chart shows (frequency in ``data1``)
    minus (frequency in ``data2``). Bars are red where group 1 is larger and
    blue where group 2 is larger.

    Parameters
    ----------
    data1, data2 : pandas.DataFrame
        The two groups; both must contain the column ``variable``.
    cat1, cat2 : str
        Display names of the groups, used in the legend.
    variable : str
        Categorical column to compare.
    title : str
        Chart title.
    normalize : bool, default False
        If True, compare within-group proportions instead of raw counts and
        format the y axis as signed percentages. Useful when the two groups
        differ in size.
    """
    values1 = (
        data1[variable].value_counts(normalize=normalize).sort_index()
        .rename(f'{variable}_{cat1}').to_frame()
    )
    values2 = (
        data2[variable].value_counts(normalize=normalize).sort_index()
        .rename(f'{variable}_{cat2}').to_frame()
    )

    # Outer join so a category seen in only one group still gets a bar.
    values = values1.merge(values2, left_index=True, right_index=True, how='outer').fillna(0)

    differences = values[f'{variable}_{cat1}'] - values[f'{variable}_{cat2}']

    fig, ax = plt.subplots(figsize=(10, 5))

    # Plot against explicit positions and pin the ticks to them, so the tick
    # labels always line up with the bars regardless of the index dtype.
    positions = np.arange(len(differences))
    colours = ['blue' if value < 0 else 'red' for value in differences]
    ax.bar(positions, differences.to_numpy(), color=colours)
    ax.set_xticks(positions)
    ax.set_xticklabels(differences.index, rotation=45)

    ax.set_xlabel('Categories')
    ax.set_ylabel('Difference')
    ax.set_title(title)

    if normalize:
        ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _pos: f'{y:+.0%}'))

    positive_patch = Patch(color='red', label=f'{cat1} greater than {cat2}')
    negative_patch = Patch(color='blue', label=f'{cat2} greater than {cat1}')
    ax.legend(handles=[positive_patch, negative_patch])

    # Show immediately (as hist_comparison does) so figures appear in call order.
    plt.show()

def chi_square_test(data1, cat1, data2, cat2, variable):
    """Chi-square test on the value counts of ``variable`` in two groups.

    Builds a two-column table of counts (one column per group, one row per
    observed value of ``variable``; values missing from a group count as 0)
    and passes it to ``scipy.stats.chi2_contingency``.

    Returns
    -------
    tuple
        ``(chi2, p, dof, expected)`` as produced by ``chi2_contingency``.
    """
    counts_first = data1[variable].value_counts().sort_index().to_frame()
    counts_second = data2[variable].value_counts().sort_index().to_frame()

    # Align both groups on the category index; the suffixes keep the two
    # count columns distinguishable.
    contingency = counts_first.merge(
        counts_second,
        how='outer',
        left_index=True,
        right_index=True,
        suffixes=('_' + cat1, '_' + cat2),
    )
    contingency = contingency.fillna(0).astype(int)

    chi2, p, dof, expected = stats.chi2_contingency(contingency)
    return chi2, p, dof, expected

def run_comparison(data1, cat1, data2, cat2, categorical_dict, numerical_dict, master_bins_dict):
    """Run every numeric and categorical comparison between two groups.

    Parameters
    ----------
    data1, data2 : pandas.DataFrame
        The two groups to compare.
    cat1, cat2 : str
        Display names of the groups.
    categorical_dict, numerical_dict : dict
        Map column name -> display name for the categorical / numeric
        variables to compare.
    master_bins_dict : dict
        Map numeric column name -> histogram bins.
    """
    for variable, name in numerical_dict.items():
        hist_comparison(data1, cat1, data2, cat2, master_bins_dict[variable], name, variable)

    for variable, name in categorical_dict.items():
        chi2, p_value, dof, _expected = chi_square_test(data1, cat1, data2, cat2, variable)
        # Report the test result instead of discarding it.
        print(f'Chi-square test for {name} ({cat1} vs {cat2})')
        print(f'Chi2: {chi2}, p-value: {p_value}, degrees of freedom: {dof}')
        differences_bar_chart(data1, cat1, data2, cat2, variable, name)




# Set up the variable dictionaries we want to examine

In [5]:
# Categorical columns to examine: column name -> short display name.
categorical_dict = {
    'nodule_site': 'site',
    'nodule_type': 'type',
    'management_plan': 'management_plan',
}

# Numeric columns to examine: column name -> short display name.
numerical_dict = {'nodule_diameter_mm': 'diameter'}

# Replicate what we have in Results

First, we show that there is minimal difference between male and female participants, and between ethnic groups, for Test Balanced (actionable nodules only).


## Test Balanced, Training Data Male Vs Female

In [6]:
# Bin edges (mm) for nodule diameter; 999 serves as the upper edge of the last bin.
diameter_cats = [0, 4, 8, 15, 20, 30, 40, 999]
diameter_lbs = [
    '0-4mm',
    '4-8mm',
    '8-15mm',
    '15-20mm',
    '20-30mm',
    '30-40mm',
    '40+mm'
]

# Management plans that make a nodule "actionable" in this analysis.
ACTIONABLE_PLANS = ['3_MONTH_FOLLOW_UP_SCAN', 'URGENT_REFERRAL', 'ALWAYS_SCAN_AT_YEAR_1']


def _load_test_balanced_metadata(filename):
    """Read one test_balanced metadata CSV and add the binned ``nodule_size`` column."""
    metadata_path = workspace_path / 'metadata' / 'summit' / 'test_balanced' / filename
    return (
        pd.read_csv(metadata_path)
        .assign(nodule_size=lambda x: pd.cut(x['nodule_diameter_mm'], bins=diameter_cats, labels=diameter_lbs))
    )


trn_data = _load_test_balanced_metadata('training_metadata.csv')
tst_data = _load_test_balanced_metadata('test_metadata.csv')

# Actionable test-set nodules, split by gender.
is_actionable = tst_data['management_plan'].isin(ACTIONABLE_PLANS)
male_only = tst_data[(tst_data['gender'] == 'MALE') & is_actionable]
female_only = tst_data[(tst_data['gender'] == 'FEMALE') & is_actionable]


In [7]:
# Test-set rows whose management plan is one of the three plans listed here.
actionable_tst_data = tst_data[
    tst_data['management_plan'].isin(['3_MONTH_FOLLOW_UP_SCAN', 'URGENT_REFERRAL', 'ALWAYS_SCAN_AT_YEAR_1'])
]

# Nodule type by gender in the training data: row proportions joined to raw counts.
# With pandas' default merge suffixes, `_x` columns are proportions and `_y` are counts.
type_share_by_gender = pd.crosstab(trn_data['gender'], trn_data['nodule_type'], normalize='index')
type_count_by_gender = pd.crosstab(trn_data['gender'], trn_data['nodule_type'])
type_share_by_gender.merge(type_count_by_gender, left_index=True, right_index=True)


nodule_type,CALCIFIED_x,NON_SOLID_x,PART_SOLID_x,PERIFISSURAL_x,SOLID_x,CALCIFIED_y,NON_SOLID_y,PART_SOLID_y,PERIFISSURAL_y,SOLID_y
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
FEMALE,0.104972,0.07577,0.069061,0.464878,0.28532,266,192,175,1178,723
MALE,0.066633,0.046169,0.044921,0.527577,0.314699,267,185,180,2114,1261


In [9]:

# NOTE(review): replaces a hard-coded absolute path under /Users/john/Projects. This
# assumes workspace_path is the SOTAEvaluationNoduleDetection repository root (i.e. the
# notebook runs from notebooks/FairnessInNoduleDetectionAlgorithms) — confirm.
hits_and_misses_path = (
    workspace_path / 'notebooks' / 'FairnessInNoduleDetectionAlgorithms'
    / 'hits_and_misses' / 'grt123' / 'test_balanced' / 'hits_and_misses_0'
)
df = pd.read_csv(hits_and_misses_path)

# Restrict to the three management plans of interest.
df_a = df[df['management_plan'].isin(['3_MONTH_FOLLOW_UP_SCAN', 'URGENT_REFERRAL', 'ALWAYS_SCAN_AT_YEAR_1'])]

group_keys = [df_a['gender'], df_a['nodule_type'], df_a['diameter_cats']]
outcome_names = {0: 'Hit', 1: 'Miss'}

# a: counts per (gender, nodule type, size bin) with margins.
# p: the same breakdown as row percentages, formatted as strings.
a = pd.crosstab(group_keys, df_a['miss'], margins=True).rename(columns=outcome_names)
p = (
    pd.crosstab(group_keys, df_a['miss'], normalize='index')
    # Column-wise Series.map instead of the deprecated DataFrame.applymap.
    .apply(lambda col: col.map(lambda x: f'{x:.1%}'))
    .rename(columns=outcome_names)
)

# Display order of the rows: gender -> nodule type -> size bins, smallest bin first.
# reindex keeps only the rows listed here (so the 'All' margin row is dropped) and
# fills listed rows that are absent from the data with NaN.
row_order_spec = {
    'FEMALE': {
        'CALCIFIED': ['0-4mm', '4-8mm', '8-15mm', '15-20mm', '20-30mm', '30-40mm'],
        'NON_SOLID': ['4-8mm', '8-15mm'],
        'PART_SOLID': ['4-8mm', '8-15mm', '15-20mm', '20-30mm', '30-40mm'],
        'SOLID': ['4-8mm', '8-15mm', '15-20mm', '20-30mm', '30-40mm', '40+mm'],
    },
    'MALE': {
        'CALCIFIED': ['0-4mm', '4-8mm', '8-15mm', '20-30mm', '30-40mm'],
        'NON_SOLID': ['4-8mm', '8-15mm', '15-20mm', '20-30mm'],
        'PART_SOLID': ['4-8mm', '8-15mm', '20-30mm', '30-40mm'],
        'SOLID': ['4-8mm', '8-15mm', '15-20mm', '20-30mm', '30-40mm'],
    },
}
row_order = [
    (gender, nodule_type, size)
    for gender, sizes_by_type in row_order_spec.items()
    for nodule_type, sizes in sizes_by_type.items()
    for size in sizes
]
a = a.reindex(row_order)


In [10]:
from utils import copy_numpy_from_cluster, show_numpy_candidate_location

# NOTE(review): the filter used the label '5-8mm', which is not among the size-bin
# labels this notebook uses for `diameter_cats` ('0-4mm', '4-8mm', '8-15mm', ...), so
# it selected no rows and the loop never ran. Changed to '4-8mm' — confirm the
# intended bin.
solid_4_8mm = df[(df['nodule_type'] == 'SOLID') & (df['diameter_cats'] == '4-8mm')]

# Misses first, then ordered by gender within each outcome.
ordered_candidates = solid_4_8mm.sort_values(['miss', 'gender'], ascending=[False, True])

for _, row in ordered_candidates.iterrows():
    print(row['scan_id'], row['miss'], row['gender'])
    copy_numpy_from_cluster(row['scan_id'])
    show_numpy_candidate_location(
        row['scan_id'],
        row['row'],
        row['col'],
        row['index'],
        row['diameter'],
        row['miss'],
        row['gender']
    )
    
