In [1]:
import numpy as np
import pandas as pd
from equiflow import *

In [2]:
# Synthetic demo cohort. The global NumPy seed makes every run identical, so the
# order of the random draws below (age, sofa, race, sex, english, var1..var10)
# must not change.
np.random.seed(42)
n = 100000

# np.nan / None entries stand in for missing values in the generated columns.
sofa_values = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, np.nan]
race_values = ['White', 'Black', 'Asian', 'Hispanic', None]
english_values = ['Fluent', 'Limited', None]

data = dict(
    age=np.random.randint(10, 80, size=n),
    sofa=np.random.choice(sofa_values, size=n),
    race=np.random.choice(race_values, size=n),
    sex=np.random.choice(['Male', 'Female'], size=n),
    english=np.random.choice(english_values, size=n),
)

# Ten extra standard-normal columns named var1 .. var10.
data.update({f'var{i}': np.random.randn(n) for i in range(1, 11)})

df = pd.DataFrame(data)

In [3]:
# Track the full synthetic cohort, declaring how each variable is summarised.
eqfl = EquiFlow(
    data=df,
    initial_cohort_label='in MIMIC-IV',
    categorical=['sex', 'race', 'english'],
    normal=['age'],
    nonnormal=['sofa'],
    rename={'sofa': 'SOFA Score'},
)

# The mask is True for rows with a recorded English proficiency.
has_english = df['english'].notna()
eqfl.add_exclusion(
    has_english,
    exclusion_reason='missing English Proficiency',
    new_cohort_label='with English Proficiency data',
)

# Characteristics of each cohort, before and after the exclusion.
eqfl.view_table_characteristics()

Unnamed: 0_level_0,Unnamed: 1_level_0,Cohort,Cohort
Unnamed: 0_level_1,Unnamed: 1_level_1,0,1
Variable,Value,Unnamed: 2_level_2,Unnamed: 3_level_2
Overall,,100000,66397
"sex, N (%)",Female,"49,938 (49.9)","33,271 (50.1)"
"sex, N (%)",Male,"50,062 (50.1)","33,126 (49.9)"
"sex, N (%)",Missing,0 (0.0),0 (0.0)
"race, N (%)",Asian,"20,035 (20.0)","13,315 (20.1)"
"race, N (%)",White,"20,028 (20.0)","13,290 (20.0)"
"race, N (%)",Hispanic,"19,900 (19.9)","13,292 (20.0)"
"race, N (%)",Black,"20,008 (20.0)","13,236 (19.9)"
"race, N (%)",Missing,"20,029 (20.0)","13,264 (20.0)"
"english, N (%)",Limited,"33,154 (33.2)","33,154 (49.9)"


In [4]:
# `eqfl` was already built in the previous cell with exactly this configuration
# and the same English-proficiency exclusion, so reuse it instead of
# re-creating the object and re-applying the exclusion on the 100,000-row frame.
eqfl.plot_flows()

In [5]:
# Variant without `normal` / `rename`: only the categorical variables and the
# non-normal `sofa` column are declared.
eqfl = EquiFlow(
    df,
    initial_cohort_label='in MIMIC-IV',
    categorical=['sex', 'race', 'english'],
    nonnormal=['sofa'],
)

# Step 1: the mask is True for patients with a recorded English proficiency.
has_english = df['english'].notna()
eqfl.add_exclusion(
    mask=has_english,
    exclusion_reason='missing English Proficiency',
    new_cohort_label='with English Proficiency data',
)

# Optional step 2 (left disabled): restrict the cohort to adults.
# eqfl.add_exclusion(
#     mask=df.age >= 18,
#     exclusion_reason='age < 18',
#     new_cohort_label='adults only',
# )

eqfl.plot_flows()

In [6]:
# Minimal variant: one categorical variable (english) plus age and sofa.
eqfl = EquiFlow(
    df,
    initial_cohort_label='MIMIC-IV',
    categorical=['english'],
    normal=['age'],
    nonnormal=['sofa'],
    # Display names are left disabled here; uncomment to relabel the variables.
    # rename={
    #     'sofa': 'SOFA',
    #     'age': 'Age',
    #     'english': 'English Proficiency',
    # },
)

# The mask is True for rows with a recorded English proficiency.
has_english = df['english'].notna()
eqfl.add_exclusion(
    mask=has_english,
    exclusion_reason='missing English Proficiency',
    new_cohort_label='with English Proficiency data',
)

# Optional age filter (left disabled).
# eqfl.add_exclusion(
#     mask=df.age >= 18,
#     exclusion_reason='age < 18',
#     new_cohort_label='adults only',
# )

# Same flow figure as before, this time with the legend switched on.
eqfl.plot_flows(legend=True)

In [5]:
# Cohort-size table for each exclusion step; the recorded output lists the
# initial, removed and resulting n for the "0 to 1" step.
eqfl.view_table_flows()

Cohort Flow,0 to 1
,
"Initial, n",100000.0
"Removed, n",33603.0
"Result, n",66397.0


In [6]:
# Per-cohort characteristics table for the current `eqfl`.
# NOTE(review): the recorded output below lists sex and race rows, but the most
# recent cell that builds `eqfl` declares only categorical=['english'], so this
# output appears to come from an earlier `eqfl`. Re-run top to bottom to confirm.
eqfl.view_table_characteristics()

Unnamed: 0_level_0,Unnamed: 1_level_0,Cohort,Cohort
Unnamed: 0_level_1,Unnamed: 1_level_1,0,1
Variable,Value,Unnamed: 2_level_2,Unnamed: 3_level_2
Overall,,100000,66397
"sex, N (%)",Female,"49,938 (49.9)","33,271 (50.1)"
"sex, N (%)",Male,"50,062 (50.1)","33,126 (49.9)"
"sex, N (%)",Missing,0 (0.0),0 (0.0)
"race, N (%)",Asian,"20,035 (20.0)","13,315 (20.1)"
"race, N (%)",White,"20,028 (20.0)","13,290 (20.0)"
"race, N (%)",Hispanic,"19,900 (19.9)","13,292 (20.0)"
"race, N (%)",Black,"20,008 (20.0)","13,236 (19.9)"
"race, N (%)",Missing,"20,029 (20.0)","13,264 (20.0)"
"english, N (%)",Limited,"33,154 (33.2)","33,154 (49.9)"


In [7]:
# Drift table between consecutive cohorts; the recorded output shows one value
# per variable for the "0 to 1" step.
# NOTE(review): the output shows the label "SOFA Score", which is only set via
# `rename` in the earlier EquiFlow cells, not in the most recent one. This looks
# like stale kernel state; confirm on a fresh run.
eqfl.view_table_drifts(drifts_by_class=True)

Cohort Flow,0 to 1
sex,0.0
race,0.0
english,0.9
age,0.0
SOFA Score,0.0


In [8]:
# With drifts_by_class=False this call raised KeyError: 'Overall' the last time
# the notebook was executed. Catch that error so Restart & Run All can continue,
# and report it instead of leaving a failing cell. If the call succeeds, the
# table is still displayed as the cell's last expression.
drifts_overall = None
try:
    drifts_overall = eqfl.view_table_drifts(drifts_by_class=False)
except KeyError as err:
    print(f"view_table_drifts(drifts_by_class=False) raised KeyError: {err}")
drifts_overall

KeyError: 'Overall'

In [None]:
# Build the successive cohorts by hand for the standalone Table* classes:
#   data_0: every row; data_1: english recorded; data_2: sofa recorded as well.
data_0 = df.copy()

english_known = data_0['english'].notna()
data_1 = data_0.loc[english_known]

sofa_known = data_1['sofa'].notna()
data_2 = data_1.loc[sofa_known]

: 

In [None]:
# Cohort sizes across the hand-built cohorts, without thousands separators.
flows_table = TableFlows(
    dfs=[data_0, data_1, data_2],
    label_suffix=True,
    thousands_sep=False,
)
flows_table.view()

Cohort Flow,0 to 1,1 to 2
,,
"Initial, n",100000.0,49914.0
"Removed, n",50086.0,3770.0
"Result, n",49914.0,46144.0


: 

In [None]:
# Display labels for the raw column names.
display_names = {
    'race': 'Race and Ethnicity',
    'english': 'English Proficiency',
    'sex': 'Sex',
    'sofa': 'SOFA',
    'age': 'Age',
}

# Characteristics of each hand-built cohort, with explicit summary formats.
characteristics_table = TableCharacteristics(
    dfs=[data_0, data_1, data_2],
    categorical=['race', 'sex', 'english'],
    nonnormal=['sofa'],
    normal=['age'],
    format_cat='N (%)',
    format_normal='Mean',
    format_nonnormal='Median [IQR]',
    missingness=True,
    decimals=1,
    label_suffix=True,
    thousands_sep=True,
    rename=display_names,
)
characteristics_table.view()

Unnamed: 0_level_0,Unnamed: 1_level_0,Cohort,Cohort,Cohort
Unnamed: 0_level_1,Unnamed: 1_level_1,0,1,2
Variable,Value,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Overall,,100000,49914,46144
"Race and Ethnicity, N (%)",Asian,"20,035 (20.0)","10,119 (20.3)","9,308 (20.2)"
"Race and Ethnicity, N (%)",White,"20,028 (20.0)","9,917 (19.9)","9,191 (19.9)"
"Race and Ethnicity, N (%)",Hispanic,"19,900 (19.9)","9,985 (20.0)","9,222 (20.0)"
"Race and Ethnicity, N (%)",Black,"20,008 (20.0)","9,937 (19.9)","9,196 (19.9)"
"Race and Ethnicity, N (%)",Missing,"20,029 (20.0)","9,956 (19.9)","9,227 (20.0)"
"Sex, N (%)",Female,"49,938 (49.9)","24,975 (50.0)","23,069 (50.0)"
"Sex, N (%)",Male,"50,062 (50.1)","24,939 (50.0)","23,075 (50.0)"
"Sex, N (%)",Missing,0 (0.0),0 (0.0),0 (0.0)
"English Proficiency, N (%)",Limited,"24,924 (24.9)","24,924 (49.9)","23,047 (49.9)"


: 

In [None]:
# Display labels for the raw column names.
display_names = {
    'race': 'Race and Ethnicity',
    'english': 'English Proficiency',
    'sex': 'Sex',
    'sofa': 'SOFA',
    'age': 'Age',
}

# Drift between consecutive hand-built cohorts, reported to three decimals.
drifts_table = TableDrifts(
    dfs=[data_0, data_1, data_2],
    categorical=['race', 'sex', 'english'],
    nonnormal=['sofa'],
    normal=['age'],
    missingness=True,
    decimals=3,
    rename=display_names,
)
drifts_table.view()

Unnamed: 0_level_0,Cohort Flow,0 to 1,1 to 2
Variable,Value,Unnamed: 2_level_1,Unnamed: 3_level_1
Overall,,,
Race and Ethnicity,Asian,0.006,0.003
Race and Ethnicity,White,0.004,0.001
Race and Ethnicity,Hispanic,0.003,0.0
Race and Ethnicity,Black,0.003,0.001
Sex,Female,0.002,0.001
Sex,Male,0.002,0.001
English Proficiency,Limited,0.535,0.0
English Proficiency,Fluent,0.536,0.0
Age,,-0.0,-0.0


: 

: 