# Check Old Patience Prevalence

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import os
import random
import dill
import pickle
from tabulate import tabulate
import matplotlib.pyplot as plt


import sys

import warnings
warnings.filterwarnings("ignore")

In [2]:
try:
  from google.colab import drive
  IN_COLAB=True
except:
  IN_COLAB=False

if IN_COLAB:
  print("We're running Colab")

if IN_COLAB:  
  # Mount the Google Drive at mount
  mount='/content/gdrive'
  print("Colab: mounting Google drive on ", mount)
  # connect your colab with the drive
  drive.mount(mount)

 # Switch to the directory on the Google Drive that you want to use
  import os
  path_to_repo = mount + "/My Drive/MIMIC-III Text Mining/mimim_iii_readmission"

else:
   path_to_repo = os.path.dirname(os.getcwd())

  
print(path_to_repo)

C:\Users\luca9\Documents\MIMIC-III Text Mining\mimim_iii_readmission


In [3]:
# PARAMETERS

session_seed = 42 # set seed for our session
include_val = False # set to True if we want to also create a validation set
tune_models = False # set to True if we want to perform parameter tuning

icu_stays = True # set to TRUE if we want to have only ICU stays
lemmatize = True # set to false if we want to do stemming
lemma_tag = str(np.where(lemmatize, "_lemma",""))
heavier_proc = True # if we want a heavier processing
if heavier_proc:
    heavier_tag = '_heavier'
else:
    heavier_tag = ''
    
spacy = True
if spacy: lemma_tag = str(np.where(lemmatize, "_lemma_spacy",""))

seed_tag = f'_{session_seed}'

if tune_models:
  tune_tag = '_tuned'
else:
  tune_tag = ''

random.seed(session_seed)

med_7 = False # set to True if we want to use our Med7 preprocessing

if med_7:
    med_tag = "_med7"
else:
    med_tag = ''
    
feat_select = False # select True if we want to use Lasso as a feature selection method

if feat_select:
    feat_tag = "_featselect"
else:
    feat_tag = ''
    
expanded_def = True # set to True if we want to consider future readmissions and avoid using CMS 

if icu_stays == True:
    icu_folder = 'icu_only'
    if expanded_def:
        icu_folder = 'expanded'
else:
    icu_folder = 'all_hosp'

In [4]:
path_to_data = os.path.join(path_to_repo, "data", icu_folder,"")
print(path_to_data)

C:\Users\luca9\Documents\MIMIC-III Text Mining\mimim_iii_readmission\data\expanded\


In [5]:
path_to_processed = os.path.join(path_to_data,"processed","")
os.makedirs(path_to_processed, exist_ok=True) # we create the directory if it does not exist
print(path_to_processed)

C:\Users\luca9\Documents\MIMIC-III Text Mining\mimim_iii_readmission\data\expanded\processed\


In [6]:
df = pd.read_feather(os.path.join(path_to_data,f"df_cleaned{lemma_tag}{med_tag}{heavier_tag}"))

In [7]:
df.columns

Index(['index', 'subject_id', 'hadm_id', 'admittime', 'dischtime',
       'first_careunit', 'last_careunit', 'age', 'gender', 'marital_status',
       'insurance', 'diagnosis', 'text', 'next_readmit_dt', 'target', 'clean'],
      dtype='object')

In [8]:
df.age

0        76.526788
1        47.845044
2        65.940670
3        50.148292
4        39.866116
           ...    
42820    74.610874
42821    54.459307
42822    54.611874
42823    65.262831
42824    65.377831
Name: age, Length: 42825, dtype: float64

In [9]:
#%% Creating categorical age variable "agecat"

df.loc[(df["age"]<18), "agecat"] = "1(0-17 ans)"
df.loc[(df["age"]>=18) & (df["age"]<45), "agecat"] = "2(18-44 ans)"
df.loc[(df["age"]>=45) & (df["age"]<65), "agecat"] = "3(45-64 ans)"
df.loc[(df["age"]>=65) & (df["age"]<85), "agecat"] = "4(65-84 ans)"
df.loc[(df["age"]>=85), "agecat"] = "5(85 ans et plus)"

In [11]:
df.agecat.value_counts()

4(65-84 ans)         17283
3(45-64 ans)         15074
2(18-44 ans)          6387
5(85 ans et plus)     4081
Name: agecat, dtype: int64

In [12]:
df.marital_status.value_counts()

MARRIED              20594
SINGLE               11141
WIDOWED               5934
DIVORCED              2753
SEPARATED              487
UNKNOWN (DEFAULT)      268
LIFE PARTNER            15
Name: marital_status, dtype: int64

In [14]:
df.insurance.value_counts()

Medicare      23322
Private       13951
Medicaid       3904
Government     1231
Self Pay        417
Name: insurance, dtype: int64

In [16]:
df.gender.value_counts()

M    24170
F    18655
Name: gender, dtype: int64

In [18]:
df.columns

Index(['index', 'subject_id', 'hadm_id', 'admittime', 'dischtime',
       'first_careunit', 'last_careunit', 'age', 'gender', 'marital_status',
       'insurance', 'diagnosis', 'text', 'next_readmit_dt', 'target', 'clean',
       'agecat'],
      dtype='object')

In [36]:
# Features we want to test
categorical_test = ['agecat', 'gender', 'marital_status', 'insurance']
data = df.loc[:,categorical_test + ['target']]

In [37]:
#%% Test proportion difference

def two_prop_test(n1, n2, N1, N2):
    '''
    Implement a proportions difference test between two samples
    Large sample assumed (Z statistics), 2 tailed
    
    Parameters
    ----------
    n1 : counts of success for the first sample
    N1 : first sample size
    n2 : counts of success for the secons sample
    N2 : second sample
    
    Returns
    -------
    Statistic of test and p value
    '''
    
    import numpy as np
    from scipy.stats import norm
    
    p1 = n1/N1
    p2 = n2/N2
    p = (N1*p1 + N2*p2)/(N1+N2)
  
    Z_stat = (p1-p2)/np.sqrt(p*(1-p)/N1 + p*(1-p)/N2)
    p_value = 2*norm.cdf(-np.abs(Z_stat))
    
    return Z_stat, p_value

#%% Multivariate Mahalanobis distance

def mahalanobis(data):
    '''
    Computes the multivariate mahalanobis distance between treatment an control

    Parameters
    ----------
    data : contigency table of proportions
      
    Returns
    -------
    The mahalanobis distance

    '''

    from numpy.linalg import inv
    
    data = data.reset_index()
    T = data["(%+)"]
    C = data["(%-)"]
    
    S_size = data.shape[0]
    
    T = T[[i for i in range(1,S_size)]]
    C = C[[i for i in range(1,S_size)]]
       
    S = np.empty([S_size-1, S_size-1])
    
    for k in range(1,S_size) :
        for l in range(1,S_size):
            if k==l :
                S[k-1,l-1] = (T[k]*(1-T[k])+C[k]*(1-C[k]))/2
            else :
                S[k-1,l-1] = (C[k]*C[l]+T[k]*T[l])/2
    
    print(S)
    
    d_square = np.matmul(np.transpose(T-C),np.matmul(inv(S),(T-C)))
    return np.sqrt(d_square)

#%% Standardized difference

def std_diff(p1, p2):
    '''
    Computes Cohen's d on proportions

    Parameters
    ----------
    p1 : proportion of success in sample 1
    p2 : proportion of success in sample 2

    Returns
    -------
    None.

    '''
    import numpy as np
    d = (p1-p2)/np.sqrt((p1*(1-p1)+p2*(1-p2))/2)
    
    return d


#%% Implement chi-square tests

def los_table(data_quali):
    '''
    Generates a bivariate table with the LOS- or LOS+ as columns
    Then compare proportions with p values and standardized difference

    Parameters
    ----------
    data_quali : a catagorical pandas table with "dureecat" as outcome
    

    Returns
    -------
    Table

    '''

    from scipy.stats import chi2_contingency
    
    restable = []
    
    
    for col in categorical_test:
            
        contable = pd.crosstab(data_quali.loc[:,col], data_quali["target"], margins=True)        
        obs = np.array(contable.iloc[0:(contable.shape[0]-1),0:(contable.shape[1]-1)])       
        chisquare, p_value, df, exp = chi2_contingency(obs)        
        residuals = (obs - exp)/ np.sqrt(exp)       
        p = obs/np.sum(obs, axis=0)        
        phi = np.sqrt(chisquare/data_quali.shape[0])
        
        for i in range(obs.shape[0]) :
            
            Z_stat, mod_p_value = two_prop_test(obs[i,0], obs[i,1], np.sum(obs[:,0], 
                                                                           axis=0), np.sum(obs[:,1], axis=0))
            
            d = std_diff(p[i,1], p[i,0])
            
            restable.append([col, contable.index[i], np.sum(obs, axis=1)[i], 
                             obs[i,0], p[i,0], obs[i,1], p[i,1], p_value, phi, mod_p_value, d])
           
    resdata = pd.DataFrame(restable)
    resdata.columns = ["Variable", "Modality", "N", "NoReadm-", "(%-)", "Readm+", "(%+)", 
                       "p-value", "Size Effect (Phi)", "modality p-value", "Cohen's d"]
    
    return(resdata)   

In [38]:
bivariee = los_table(data)

In [39]:
bivariee

Unnamed: 0,Variable,Modality,N,NoReadm-,(%-),Readm+,(%+),p-value,Size Effect (Phi),modality p-value,Cohen's d
0,agecat,2(18-44 ans),6387,6035,0.149451,352,0.144026,0.369443,0.008573,0.4647097,-0.015333
1,agecat,3(45-64 ans),15074,14237,0.352567,837,0.342471,0.369443,0.008573,0.3102244,-0.021202
2,agecat,4(65-84 ans),17283,16255,0.402541,1028,0.420622,0.369443,0.008573,0.07686084,0.036748
3,agecat,5(85 ans et plus),4081,3854,0.095441,227,0.092881,0.369443,0.008573,0.6755006,-0.008767
4,gender,F,18655,17597,0.435774,1058,0.432897,0.7967375,0.001245,0.7805691,-0.005805
5,gender,M,24170,22784,0.564226,1386,0.567103,0.7967375,0.001245,0.7805691,0.005805
6,marital_status,DIVORCED,2753,2579,0.066512,174,0.07199,0.0002147904,0.024679,0.295413,0.021579
7,marital_status,LIFE PARTNER,15,15,0.000387,0,0.0,0.0002147904,0.024679,0.3334765,-0.027821
8,marital_status,MARRIED,20594,19468,0.502076,1126,0.465867,0.0002147904,0.024679,0.0005517575,-0.072503
9,marital_status,SEPARATED,487,446,0.011502,41,0.016963,0.0002147904,0.024679,0.01595709,0.046116


In [40]:
bivariee.to_excel(f'{path_to_data}check_proportions.xlsx') 