In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
import shutil

### Input files 

In [2]:
clinical_file = '../input/clinical_data.xlsx'
survival_file = '../input/severity_category.xlsx'
oxygen_file = '../input/oxygensuppl_category.xlsx'
ethinicity_file = '../input/ethnicity_category.xlsx'
race_file = '../input/race_category.xlsx'

### Objects

In [3]:
os.chdir('classes')
%run longit.ipynb
ecf = LONGIT()
%run categories.ipynb
cat = CATEGORY()
etnic = ETHNICITY()
race = RACE()
%run correct_pval.ipynb
os.chdir('..')

### Variable of interest upon which perform all analyses 

In [4]:
outdir = '../output/'
targetVar = 'CRP' #WBC, neutrophil, lymphocyte, eosinophil, platelet, BUN, CRP
surv_cat_order = ['Survival','Non-survival']
surv_palet_order = ['#98FAF3','#E8384F']
oxyg_cat_order = ['Room air','Nasal canula', 'NRM','CPAP-BPAP','HiFlow', 'Intubation', 'Non-survival']
oxyg_palet_order = ['#4178BC','#ABB2B9','#D6EAF8','#ABEBC6','#F6DDCC','#D7BDE2','#E8384F']
sever_cat_order = ['Room_air', 'Mild', 'Severe', 'Non-survival']
sever_palet_order = ['#4178BC','#37A862','#D68910','#E8384F']
etnic_cat_order = ['Hispanic', 'Non-hispanic', 'Unknown']
etnic_palette_order = ['#4178BC','#37A862','#D68910']
race_cat_order = ['Black', 'White', 'Other']
race_palette_order = ['#454545','#D68910','#1E8449']

### Initializing directory 

In [5]:
# Initialize directory if exists
if os.path.isdir(outdir + targetVar):
    shutil.rmtree(outdir + targetVar)
os.mkdir(outdir + targetVar)

if not os.path.isdir(outdir + targetVar + '/overview'):
    os.mkdir(outdir + targetVar + '/overview')
if not os.path.isdir(outdir + targetVar + '/overview_win5'):
    os.mkdir(outdir + targetVar + '/overview_win5')

if not os.path.isdir(outdir + targetVar + '/survival_win5'):
    os.mkdir(outdir + targetVar + '/survival_win5')

if not os.path.isdir(outdir + targetVar + '/oxygenation_win5'):
    os.mkdir(outdir + targetVar + '/oxygenation_win5')

if not os.path.isdir(outdir + targetVar + '/severity_win5'):
    os.mkdir(outdir + targetVar + '/severity_win5')
# Ethnicity
if not os.path.isdir(outdir + targetVar + '/ethnicity_win5'):
    os.mkdir(outdir + targetVar + '/ethnicity_win5')
# Race
if not os.path.isdir(outdir + targetVar + '/race_win5'):
    os.mkdir(outdir + targetVar + '/race_win5')# Severity

### Loading the data with variables of interest into dataframe 

In [6]:
df0 = pd.read_excel(clinical_file)
# Preparing the df with the same format as with EC50s
df1 = df0.copy()
for indx, row in df0.iterrows():
    patient_onset = df0.at[indx,'sampleID_onset']
    ls = patient_onset.split('-')
    patient = int(ls[0])
    day = ls[1]
#     print(len(ls))
    df1.at[indx,'Pt#'] = patient
    df1.at[indx,'sampleID_onset'] = day
    df1.at[indx,'sampleID'] = patient_onset
display(df1.head())

Unnamed: 0,sampleID_onset,presentation date,WBC,platelet,neutrophil,lymphocyte,eosinophil,CRP,BUN,Pt#,sampleID
0,7,2020-03-10,,,,,,,,6.0,6-7
1,8,2020-03-10,,,,,,,,6.0,6-8
2,9,2020-03-10,6.8,132.0,5.8,0.6,0.0,,9.0,6.0,6-9
3,10,2020-03-10,7.8,131.0,6.8,0.7,0.0,,10.0,6.0,6-10
4,11,2020-03-10,10.0,169.0,6.3,0.7,0.0,,20.0,6.0,6-11


### Loading survival data

In [7]:
# Survival
# Load
# infile = '/Users/gorkalasso/Dropbox (EinsteinMed)/coronavirus/longitudinal_MMC_cohort/update_aug/patient_category_09152020.xlsx'
df_surv = pd.read_excel(survival_file,index_col=0,header=0)
df_surv_gr = df_surv.groupby(['Outcome'])
#Outcome: (0-home, 1-nursing home, 2-dead)
#Survival groups: groups 0 & 1
survival_indx_ar = np.concatenate([np.array(df_surv_gr.get_group('Mild').index), np.array(df_surv_gr.get_group('Severe').index)])
nonsurvival_indx_ar = np.array(df_surv_gr.get_group('Non-survival').index)

### Loading patient categories

In [8]:
# Oxygen supplementation
df_ox = cat.oxygen_supply_to_df(oxygen_file)

# Ethnicity
df_etnic = etnic.cat_to_df(ethinicity_file)

# Race
df_race = race.cat_to_df(race_file)

### Loading data into df with right format for plotting

In [9]:
# Dataframe for a particular variable
df_var = ecf.transform_df_for_plotting(df1,'Pt#','sampleID_onset',targetVar)
df_var_win5 = ecf.average_EC50_slidingWindow(df_var, 2)

### New dataframes with survival data, oxygenation data and severity data 

In [10]:
# add survival category to new df
df_var_surv_win5 = ecf.add_survival_to_df(df_var_win5, survival_indx_ar, nonsurvival_indx_ar)
df_var_surv_win5.to_excel(outdir + targetVar + '/survival_win5/' + targetVar + '_survival.xlsx')

In [11]:
# Add oxygenation to new df
df_var_ox_win5 = ecf.add_category_to_df(df_var_win5, df_ox)

for index, row in df_var_ox_win5.iterrows():
    if row['Patient'] in nonsurvival_indx_ar:
        df_var_ox_win5.at[index, 'Category'] = 'Non-survival'
        
df_var_ox_win5.to_excel(outdir + targetVar + '/oxygenation_win5/'+ targetVar + '_oxygen_win5.xlsx')

In [12]:
# Add severity to new df
cat_four = ['Room air','Nasal canula', 'NRM','CPAP-BPAP','HiFlow', 'Intubation']
cat_two = ['Room_air','Mild', 'Mild', 'Severe','Severe', 'Severe']
df_var_ox2_win5 = df_var_ox_win5.copy()
df_var_ox2_win5.replace(cat_four, cat_two, inplace=True)

for index, row in df_var_ox2_win5.iterrows():
    if row['Patient'] in nonsurvival_indx_ar:
        df_var_ox2_win5.at[index, 'Category'] = 'Non-survival'

df_var_ox2_win5.to_excel(outdir + targetVar + '/severity_win5/' + targetVar + '_severity_5cat_window5.xlsx')

In [13]:
# Add ethnicity to df
df_var_etnic_win5 = ecf.add_category_to_df(df_var_win5, df_etnic)
df_var_etnic_win5.to_excel(outdir + targetVar + '/ethnicity_win5/' + targetVar + '_ethnicity_win5.xlsx')

In [14]:
# Add race to df
df_var_race_win5 = ecf.add_category_to_df(df_var_win5, df_race)
df_var_race_win5.to_excel(outdir + targetVar + '/race_win5/' + targetVar + '_race_win5.xlsx')

### New df with all info for a given variable of interest 

In [15]:
# Making a dataframe with all the required info
df_var_info = df_var.copy()
df_var_win5_info = df_var_win5.copy()

# Survival data
for item in survival_indx_ar:
    df_var_info.loc[item, 'Survival'] = 'Survival'
    df_var_win5_info.loc[item, 'Survival'] = 'Survival'
for item in nonsurvival_indx_ar:
    df_var_info.loc[item, 'Survival'] = 'Non-Survival'
    df_var_win5_info.loc[item, 'Survival'] = 'Non-Survival'
# Oxygenation type
for patient, row in df_ox.iterrows():
    cat = row[0]
    df_var_info.loc[patient, 'Oxygenation'] = cat
    df_var_win5_info.loc[patient, 'Oxygenation'] = cat
    if df_var_info.loc[patient,'Survival'] == 'Survival':
        if cat == 'Room air':
            df_var_info.loc[patient, 'Category'] = 'Room air'
            df_var_win5_info.loc[patient, 'Category'] = 'Room air'
        elif cat == 'Nasal canula' or cat == 'NRM':
            df_var_info.loc[patient, 'Category'] = 'Mild'
            df_var_win5_info.loc[patient, 'Category'] = 'Mild'
        elif cat == 'HiFlow' or cat == 'Intubation' or cat == 'CPAP-BPAP':
            df_var_info.loc[patient, 'Category'] = 'Severe'
            df_var_win5_info.loc[patient, 'Category'] = 'Severe'
    elif df_var_info.loc[patient,'Survival'] == 'Non-Survival':
        df_var_info.loc[patient, 'Category'] = 'Non-Survival'
        df_var_win5_info.loc[patient, 'Category'] = 'Non-Survival'
# ethnicity
for patient, row in df_etnic.iterrows():
#     print(row[0])
    cat = row[0]
    df_var_info.loc[patient, 'Ethnicity'] = cat
    df_var_win5_info.loc[patient, 'Ethnicity'] = cat
    
# race
for patient, row in df_race.iterrows():
    cat = row[0]
    df_var_info.loc[patient, 'Race'] = cat
    df_var_win5_info.loc[patient, 'Race'] = cat
    
df_var_info.to_excel(outdir + targetVar + '/' + targetVar + '_dataframe.xlsx')
df_var_win5_info.to_excel(outdir + targetVar + '/' + targetVar + '_win5_dataframe.xlsx')

### Plotting data coverage and histograms 

In [16]:
%%capture
# Plot datapoints per day
ecf.plot_points_per_day(df_var, outdir + targetVar + '/overview/' + targetVar + '_histogram_patients_per_onsetday.pdf')
ecf.plot_points_per_day(df_var_win5,outdir + targetVar + '/overview_win5/' + targetVar + '_histogram_patients_per_onsetday_win5.pdf')
# Plot datapoints per patient
ecf.plot_points_per_patient(df_var, outdir + targetVar + '/overview/' + targetVar + '_histogram_patients_per_readoutDays.pdf')
ecf.plot_points_per_patient(df_var_win5, outdir + targetVar + '/overview_win5/' + targetVar + '_histogram_patients_per_readoutDays_win5.pdf')
# Plot available data for each patient in a clustermap
ecf.plot_clustermap_availdata(df_var, outdir + targetVar + '/overview/' + targetVar + '_data_coverage.pdf')
ecf.plot_clustermap_availdata(df_var_win5, outdir + targetVar + '/overview_win5/' + targetVar + '_data_coverage_win5.pdf')

### Plotting raw data for each individual patient 

In [17]:
# Each day no window
ecf.plot_ec50(np.array(df_var.index), df_var, outdir + targetVar + '/time_plot', \
              targetVar + '_timeline_all.pdf', targetVar)
ecf.plot_ec50(np.array(df_var.index), df_var_win5, outdir + targetVar + '/time_plot_win5', \
              targetVar + '_timeline_all_win5.pdf', targetVar)
for indx in np.array(df_var.index):
    outfile = targetVar + '_pat_' + str(indx) + '.pdf'
    outfile_win = targetVar + '_pat_' + str(indx) + '_win5.pdf'
    ecf.plot_ec50([indx], df_var, outdir + targetVar + '/time_plot', outfile, targetVar)
    ecf.plot_ec50([indx], df_var_win5, outdir + targetVar + '/time_plot_win5', outfile_win, targetVar)

### Plotting survival/nonsurvival using window5

In [18]:
%%capture
# Gather all pvalues for multiple testing correction
# apply only to days with a mimimun number of samples per category (survival, non-survival)
min_size_pvalcor = 10
day_2_pval, day_2_pvalcor = ecf.multipletesting_day_to_pval(df_var_surv_win5, min_size_pvalcor, False)

# boxplot for each day, statistically comparing survival Vs nonSurvival
day_st = set(df_var_surv_win5['Day'].tolist())
for day in day_st:
    ecf.plot_ec50_surival_boxplot(day, df_var_surv_win5, \
                                  outdir + targetVar + '/survival_win5/'+ targetVar + '_survival_boxplot_win5',
                                  day_2_pval, day_2_pvalcor, min_size_pvalcor, False, targetVar, 0, 0, surv_cat_order, surv_palet_order)

In [19]:
# boxplot and lineplot trajectories
ecf.plot_ec50_survival_trajectory(df_var_surv_win5, \
                                  outdir + targetVar + '/survival_win5/'+ targetVar + '_survival_lineplot_trajectory_win5_mean.pdf', \
                                  False, targetVar, 'lineplot', 'mean', 0, 0, surv_cat_order, surv_palet_order)
ecf.plot_ec50_survival_trajectory(df_var_surv_win5, \
                                  outdir + targetVar + '/survival_win5/'+ targetVar + '_survival_lineplot_trajectory_win5_median.pdf', \
                                  False, targetVar, 'lineplot', 'median', 0, 0, surv_cat_order, surv_palet_order)

In [20]:
# All against all statistical comparison with mann-whitey
day_st = set(df_var_surv_win5['Day'].tolist())
cat_order = ['Non-survival','Survival']
for day in day_st:
    ecf.all_against_all_pairwise_mann_whitney(df_var_surv_win5, day, outdir + targetVar + '/survival_win5',
                                              '/' + targetVar + '_survival_pairw_stats_win5' + '.txt', \
                                              surv_cat_order)

In [21]:
# Correct P-value
correct = CORRECTPVAL(outdir + targetVar + '/survival_win5/'+ targetVar + '_survival_pairw_stats_win5' + '.txt')
correct.readfile()
correct.correctpval_byday(0.05)
correct.save_correction()

### Plotting oxygen supplementation using window5

In [22]:
# Trajectory
ecf.lineplot_ec50_category_trajectory(df_var_ox_win5, 
                                      outdir + targetVar + '/oxygenation_win5/'+ targetVar + '_oxygenation_lineplot_win5_mean.pdf', \
                                      '', '', targetVar, 'mean', oxyg_cat_order, oxyg_palet_order, 0, 0)
ecf.lineplot_ec50_category_trajectory(df_var_ox_win5, 
                                      outdir + targetVar + '/oxygenation_win5/'+ targetVar + '_oxygenation_lineplot_win5_median.pdf', \
                                      '', '', targetVar, 'median', oxyg_cat_order, oxyg_palet_order, 0, 0)

In [23]:
# boxplot_day_by_day
day_st = set(df_var_ox_win5['Day'].tolist())
for day in day_st:
    ecf.plot_ec50_category_boxplot(day, df_var_ox_win5, outdir + targetVar + '/oxygenation_win5/oxygenation_boxplot_window5', 
                                   'oxygen_suppy', oxyg_cat_order, 0, 0, targetVar, oxyg_palet_order)

In [24]:
# All against all statistical comparison with mann-whitey
day_st = set(df_var_ox_win5['Day'].tolist())
for day in day_st:
    ecf.all_against_all_pairwise_mann_whitney(df_var_ox_win5, day,  outdir + targetVar + '/oxygenation_win5',
                                              '/' + targetVar + '_oxygenation_pairw_stats_win5' + '.txt', \
                                              oxyg_cat_order)

In [25]:
# Correct P-value
correct = CORRECTPVAL(outdir + targetVar + '/oxygenation_win5/' + targetVar + '_oxygenation_pairw_stats_win5' + '.txt')
correct.readfile()
correct.correctpval_byday(0.05)
correct.save_correction()

### Plotting severity using window5

In [26]:
# Trajectories
ecf.lineplot_ec50_category_trajectory(df_var_ox2_win5, \
                                      outdir + targetVar + '/severity_win5/' + targetVar + '_lineplot_severity_5cat_win5_mean.pdf', 
                                      '', '', targetVar, 'mean', sever_cat_order, sever_palet_order, '', '')
ecf.lineplot_ec50_category_trajectory(df_var_ox2_win5, \
                                      outdir + targetVar + '/severity_win5/' + targetVar + '_lineplot_severity_5cat_win5_median.pdf', 
                                      '', '', targetVar, 'median', sever_cat_order, sever_palet_order, '', '')

In [27]:
# boxplot_day_by_day
day_st = set(df_var_ox2_win5['Day'].tolist())
for day in day_st:
    ecf.plot_ec50_category_boxplot(day, df_var_ox2_win5, \
                                   outdir + targetVar + '/severity_win5/severity_5cat_boxplot_win5', 
                                   'oxygen_suppy', sever_cat_order, 0, 0, targetVar, sever_palet_order)

In [28]:
# All against all statistical comparison with mann-whitney
for day in day_st:
    ecf.all_against_all_pairwise_mann_whitney(df_var_ox2_win5, day, outdir + targetVar + '/severity_win5/',
                                              targetVar + '_severity_5cat_pairw_stats_win5' + '.txt', \
                                              sever_cat_order) 

In [29]:
# Correct P-value
correct = CORRECTPVAL(outdir + targetVar + '/severity_win5/' + targetVar + '_severity_5cat_pairw_stats_win5' + '.txt')
correct.readfile()
correct.correctpval_byday(0.05)
correct.save_correction()

### Plotting ethnicity using window5¶

In [30]:
# Trajectories
ecf.lineplot_ec50_category_trajectory(df_var_etnic_win5, \
                                      outdir + targetVar + '/ethnicity_win5/' + targetVar + '_lineplot_ethnicity_3cat_win5_mean.pdf', 
                                      '', '', targetVar, 'mean', etnic_cat_order, etnic_palette_order, '', '')
ecf.lineplot_ec50_category_trajectory(df_var_etnic_win5, \
                                      outdir + targetVar + '/ethnicity_win5/' + targetVar + '_lineplot_ethnicity_3cat_win5_median.pdf',
                                      '', '', targetVar, 'median', etnic_cat_order, etnic_palette_order, '', '')

In [31]:
# boxplot_day_by_day
day_st = set(df_var_etnic_win5['Day'].tolist())
for day in day_st:
    ecf.plot_ec50_category_boxplot(day, df_var_etnic_win5, \
                                   outdir + targetVar + '/ethnicity_win5/ethinicity_boxplot_win5', 
                                   'ethnicity', etnic_cat_order, 0, 0, targetVar, etnic_palette_order)

In [32]:
# All against all statistical comparison with mann-whitney
for day in day_st:
    ecf.all_against_all_pairwise_mann_whitney(df_var_etnic_win5, day, outdir + targetVar + '/ethnicity_win5/',
                                              targetVar + '_ethnicity_pairw_stats_win5' + '.txt', etnic_cat_order)

In [33]:
# Correct P-value
%run correct_pvalue_class.ipynb
correct = CORRECTPVAL(outdir + targetVar + '/ethnicity_win5/' + targetVar + '_ethnicity_pairw_stats_win5' + '.txt')
correct.readfile()
correct.correctpval_byday(0.05)
correct.save_correction()

ERROR:root:File `'correct_pvalue_class.ipynb.py'` not found.


### Plotting race using window5¶

In [34]:
# Trajectories
ecf.lineplot_ec50_category_trajectory(df_var_race_win5, \
                                      outdir + targetVar + '/race_win5/' + targetVar + '_lineplot_race_win5_mean.pdf', 
                                      '', '', targetVar, 'mean', race_cat_order, race_palette_order, '', '')
ecf.lineplot_ec50_category_trajectory(df_var_race_win5, \
                                      outdir + targetVar + '/race_win5/' + targetVar + '_lineplot_race_win5_median.pdf',
                                      '', '', targetVar, 'median', race_cat_order, race_palette_order, '', '')

In [35]:
# boxplot_day_by_day
day_st = set(df_var_race_win5['Day'].tolist())
for day in day_st:
    ecf.plot_ec50_category_boxplot(day, df_var_race_win5, \
                                   outdir + targetVar + '/race_win5/ethinicity_boxplot_win5', 
                                   'race', race_cat_order, 0, 0, targetVar, race_palette_order)

In [36]:
# All against all statistical comparison with mann-whitney
for day in day_st:
    ecf.all_against_all_pairwise_mann_whitney(df_var_race_win5, day, outdir + targetVar + '/race_win5/',
                                              targetVar + '_race_pairw_stats_win5' + '.txt', race_cat_order)

In [37]:
# Correct P-value
correct = CORRECTPVAL(outdir + targetVar + '/race_win5/' + targetVar + '_race_pairw_stats_win5' + '.txt')
correct.readfile()
correct.correctpval_byday(0.05)
correct.save_correction()

### The end

In [38]:
print('Finito')

Finito
