In [1]:
import pandas as pd
import numpy as np


In [61]:
def summarize_demographics(subset_df):
    """Print the complete demographic and treatment summary for one subset.

    Sections are printed in a fixed order: value counts for the categorical
    columns, summary statistics for the numeric columns, usage of the
    single-column drugs, and finally usage of each multi-column drug class.
    """
    # Columns summarized by value counts.
    for categorical_column in ("sex", "rf", "smoking"):
        report_counts(subset_df, categorical_column)

    # Columns summarized by numeric statistics.
    for numeric_column in ("age", "bmi", "crp", "esr", "das28crp"):
        report_statistics(subset_df, numeric_column)

    # Drugs recorded in a single column each.
    for drug_column in ("mtx", "pred"):
        count_drug_usage(subset_df, drug_column)

    # Drug classes made of several columns; a class is reported under its label.
    drug_classes = (
        ("TNFi-bDMARD", ["adalimumab", "certolizumab", "etanercept", "infliximab"]),
        ("non-TNFi-bDMARD", ["abatacept", "rituximab", "tocilizumab"]),
        ("non-MTX-csDMARD", ["aza", "hcq", "lef", "ssz"]),
    )
    for class_label, member_columns in drug_classes:
        count_drug_list_usage(subset_df, member_columns, class_label)
    
    
def report_statistics(data_df, column_of_interest):
    """Print summary statistics for one numeric column.

    Emits four lines: a header with the column name, the mean ± standard
    deviation followed by the [25th–75th percentile] range, the min–max
    range, and the number of missing values. Statistics are rounded to two
    decimals.
    """
    series = data_df[column_of_interest]

    # nanpercentile skips NaN entries when computing the quartiles.
    upper_quartile, lower_quartile = np.nanpercentile(series, [75, 25])

    print("<< %s >> " % column_of_interest)

    centre = round(series.mean(), 2)
    spread = round(series.std(), 2)
    quartile_low = round(lower_quartile, 2)
    quartile_high = round(upper_quartile, 2)
    print("  Mean: %s ± %s [%s–%s]" % (centre, spread, quartile_low, quartile_high))

    smallest = round(series.min(), 2)
    largest = round(series.max(), 2)
    print("  Min–Max: %s–%s" % (smallest, largest))

    missing_total = series.isna().sum()
    print("  N/A:", missing_total)
    
def report_counts(data_df, column_of_interest):
    """Print a header line followed by the value frequencies of one column.

    Missing values are tallied as their own category rather than dropped.
    """
    header = "<< %s >> " % column_of_interest
    print(header)
    print(data_df[column_of_interest].value_counts(dropna=False))
    
def count_drug_usage(data_df, column_of_interest, total=None):
    """Print how many rows of a subset have a single drug column equal to 1.

    Prints a header, the raw value counts of the column (missing values
    included), and a ``"<column> given: <count> (<fraction>)"`` line.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Subset to summarize.
    column_of_interest : str
        Name of the drug column; a value of 1 is counted as "given".
    total : int, optional
        Denominator for the reported fraction. Defaults to the number of
        rows in ``data_df`` so the fraction is correct for any subset size.
    """
    print("<< %s >> " % column_of_interest)
    drug_column = data_df[column_of_interest]
    value_count = drug_column.value_counts(dropna=False)
    print(value_count)

    # Count matching rows directly: looking up label 1 in value_counts raises
    # KeyError when nobody in the subset received the drug.
    given = int((drug_column == 1).sum())

    if total is None:
        total = len(data_df)
    # An empty subset has no meaningful fraction; report NaN instead of
    # failing with a division by zero.
    fraction = given / total if total else float("nan")

    print("%s given: %s (%s)" % (column_of_interest, given, fraction))
    
def count_drug_list_usage(data_df, drug_list, column_of_interest, total=None):
    """Print how many rows have at least one of several drug columns equal to 1.

    Prints a header with the class label, then a
    ``"drug usage: <count> (<fraction>)"`` line. A row is counted once no
    matter how many of the listed columns equal 1.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Subset to summarize.
    drug_list : list of str
        Names of the drug columns that make up the class.
    column_of_interest : str
        Label printed in the header for this drug class.
    total : int, optional
        Denominator for the reported fraction. Defaults to the number of
        rows in ``data_df`` so the fraction is correct for any subset size.
    """
    print("<< %s >> " % column_of_interest)

    # Vectorized replacement for the cell-by-cell scan: a row qualifies when
    # any listed column equals 1. NaN never compares equal to 1, so missing
    # entries are treated as "not used", as before.
    used_any = (data_df[drug_list] == 1).any(axis=1)
    count = int(used_any.sum())

    if total is None:
        total = len(data_df)
    # An empty subset has no meaningful fraction; report NaN instead of
    # failing with a division by zero.
    fraction = count / total if total else float("nan")

    print("drug usage: %s (%s)" % (count, fraction))



In [36]:
# Load the tab-separated patient table that the subsets and summaries
# in the following cells are derived from.
data_file = "../../../preprocessed_data/meta/patient_info_for_statistics.tsv"
data_df = pd.read_csv(data_file, sep ="\t")

In [37]:
# Split the patients by the coded 'acpa' column: 0 -> control_df,
# 1 -> acpa_pos, 2 -> acpa_neg.
control_df = data_df[data_df["acpa"].eq(0)]
acpa_pos = data_df[data_df["acpa"].eq(1)]
acpa_neg = data_df[data_df["acpa"].eq(2)]

In [42]:
# Summary for the control subset (rows where acpa == 0).
summarize_demographics(control_df)

In [59]:
# Summary for the acpa_neg subset (rows where acpa == 2).
summarize_demographics(acpa_neg)

<< TNFi-bDMARD >> 
drug usage: 3 (0.075)
<< non-TNFi-bDMARD >> 
drug usage: 4 (0.1)
<< non-MTX-csDMARD >> 
drug usage: 10 (0.25)


In [62]:
# Summary for the acpa_pos subset (rows where acpa == 1).
summarize_demographics(acpa_pos)

<< mtx >> 
1    22
0    18
Name: mtx, dtype: int64
mtx given: 22 (0.55)
<< pred >> 
0    32
1     8
Name: pred, dtype: int64
pred given: 8 (0.2)
<< TNFi-bDMARD >> 
drug usage: 8 (0.2)
<< non-TNFi-bDMARD >> 
drug usage: 2 (0.05)
<< non-MTX-csDMARD >> 
drug usage: 15 (0.375)
