In [1]:
import re
import pandas as pd
import numpy as np

## Cleaning up clinical data table

In [2]:
master_dir = ''
# Load the raw clinical observations export (tab-separated) and preview it.
df = pd.read_csv(
    f"{master_dir}data/validation_metabolites/observations_2020-09-04_1027.tsv",
    sep='\t',
)
df.head(n=5)

Unnamed: 0,source_label,observation_id,incov_redcap_event_name,observation_days_since_enrollment,abdominal_pain,ambulatory,blood_draw_status,blood_draw_status_bad,blood_draw_type,blood_oxygenation,...,pulse,respiratory_rate,respiratory_support,sputum,systolic_bp,temperature,temperature_max,temperature_min,weight,who_ordinal_scale
0,INCOV001,d79653c2-9fd2-42d4-bc55-389c3a195199,admittance_arm_1,-8.42,No,Limitation of activities,,,,96.0,...,69.0,20.0,,No,138.0,36.8,37.1,36.8,97.5,3
1,INCOV001,0136c6b6-5c01-42ca-a1ea-d1286f193041,baseline_blood_dra_arm_1,0.92,No,Limitation of activities,Complete,,Baseline,95.0,...,55.0,22.0,,No,139.0,36.6,36.6,36.4,97.5,3
2,INCOV001,d372353b-edf7-4d01-adc9-567603c30bc7,acute_blood_draw_arm_1,5.92,No,Limitation of activities,Complete,,Acute,96.0,...,66.0,18.0,,No,140.0,36.3,36.6,36.3,97.5,3
3,INCOV001,0c738d29-36ec-4de2-8c0a-b0e4fa76ebd2,convalescent_blood_arm_1,105.21,Unknown,,Complete,,Convalescent,,...,,,,,,,,,,<=2
4,INCOV002,febb8188-16ae-48e5-8a73-e711dd811856,admittance_arm_1,-1.04,No,Limitation of activities,,,,94.0,...,97.0,20.0,Nasal cannula,No,137.0,37.8,38.5,37.2,90.7,4


In [3]:
# Distinct REDCap event names present in the clinical table.
pd.unique(df['incov_redcap_event_name'])

array(['admittance_arm_1', 'baseline_blood_dra_arm_1',
       'acute_blood_draw_arm_1', 'convalescent_blood_arm_1',
       'physician_discreti_arm_1'], dtype=object)

In [4]:
# Only T1 (baseline blood draw) and T2 (acute blood draw) are needed;
# every other REDCap event is dropped.
event_to_timepoint = {'baseline_blood_dra_arm_1': 'T1',
                      'acute_blood_draw_arm_1': 'T2'}
# .copy() detaches the filtered rows from the original frame so the column
# assignment below does not raise SettingWithCopyWarning.
df = df[df['incov_redcap_event_name'].isin(list(event_to_timepoint))].copy()
# Replace the long REDCap event names with the short timepoint labels.
df['incov_redcap_event_name'] = df['incov_redcap_event_name'].map(event_to_timepoint)

In [5]:
# Derive the patient number from source_label: drop the literal "INCOV" prefix,
# then strip leading zeros (e.g. "INCOV001" -> "1").
# `regex` is passed explicitly because the default of Series.str.replace changed
# to regex=False in pandas 2.0, which would leave the leading zeros in place.
df['patient_num'] = (
    df['source_label']
    .str.replace("INCOV", "", regex=False)
    .str.replace(r"^0+", "", regex=True)
)

In [6]:
# Append the timepoint digit (last character of the T1/T2 label) to the
# patient number, separated by a dash, e.g. "162" + "T1" -> "162-1".
df['patient_num'] = df['patient_num'].str.cat(
    df['incov_redcap_event_name'].map(lambda event: event[-1]),
    sep='-',
)

In [7]:
# Show the last rows to check the new patient_num column.
df.tail(n=5)

Unnamed: 0,source_label,observation_id,incov_redcap_event_name,observation_days_since_enrollment,abdominal_pain,ambulatory,blood_draw_status,blood_draw_status_bad,blood_draw_type,blood_oxygenation,...,respiratory_rate,respiratory_support,sputum,systolic_bp,temperature,temperature_max,temperature_min,weight,who_ordinal_scale,patient_num
535,INCOV162,96187e76-58a4-4bf5-afdc-014cd81b70b8,T1,0.0,No,Limitation of activities,Complete,,Baseline,99.0,...,24.0,,Unknown,122.0,36.7,37.2,36.4,106.5,3,162-1
536,INCOV162,d75aee66-b021-4181-9601-0b390172c442,T2,11.12,Unknown,No limitation of activities,Complete,,Acute,,...,,,Unknown,,,,,,1,162-2
537,INCOV163,c51b79f1-b200-44fd-b355-3bd685c7d5ef,T1,0.58,Unknown,Limitation of activities,Complete,,Baseline,95.0,...,16.0,Nasal cannula,Yes,103.0,36.6,36.6,36.4,82.1,4,163-1
538,INCOV163,1f9d562c-22ee-459c-9353-72c7ad23a76b,T2,2.58,No,Limitation of activities,Complete,,Acute,95.0,...,16.0,,Yes,97.0,36.4,36.7,36.4,82.7,3,163-2
540,INCOV165,5d2f8a85-dca3-4c2d-822e-af0130e80a08,T1,0.71,No,,Complete,,Baseline,99.0,...,20.0,High flow nasal cannula (HFNC),No,100.0,36.3,36.9,36.3,80.0,5,165-1


In [8]:
# Persist the cleaned clinical table without the index column.
# NOTE(review): to_csv's default separator is a comma, so this file is
# comma-separated despite its .tsv extension; the later read_csv call on the
# same path also uses the default separator, so the two stay consistent.
df.to_csv(
    f"{master_dir}data/observations_2020-09-04_1027-cleaned.tsv",
    index=False,
)

## Cleaning up drug treatment data table

In [9]:
# Load the raw drug-treatment export (tab-separated) and preview it.
# Note that this rebinds `df`, which previously held the clinical table.
df = pd.read_csv(
    f'{master_dir}data/validation_metabolites/drug_treatments_2020-09-04_1027.tsv',
    sep='\t',
)
df.head(n=5)

Unnamed: 0,source_label,observation_id,drug_treatment,treatment_type,start_treatment_days_since_enrollment,end_treatment_days_since_enrollment
0,INCOV001,4556f31a-2c98-47a3-b91d-6c83ecd306bc,vir_0,Remdesivir (RDV),1.67,6.0
1,INCOV002,6ca4bf39-8c44-455d-adfa-24c781b43016,adj_0,Tocilizumab,0.42,0.46
2,INCOV002,6ca4bf39-8c44-455d-adfa-24c781b43016,adj_1,Tocilizumab,3.0,3.04
3,INCOV002,6ca4bf39-8c44-455d-adfa-24c781b43016,adj_2,Tocilizumab,3.54,3.58
4,INCOV002,6ca4bf39-8c44-455d-adfa-24c781b43016,adj_3,Vitamin C,-0.12,4.88


In [10]:
# Distinct drug_treatment slot codes in the treatment table.
pd.unique(df['drug_treatment'])

array(['vir_0', 'adj_0', 'adj_1', 'adj_2', 'adj_3', 'adj_4', 'vir_1',
       'vir_2', 'adj_5'], dtype=object)

In [11]:
# Derive the patient number from source_label: drop the literal "INCOV" prefix,
# then strip leading zeros (e.g. "INCOV012" -> "12").
# `regex` is passed explicitly because the default of Series.str.replace changed
# to regex=False in pandas 2.0, which would leave the leading zeros in place.
df['patient_num'] = (
    df['source_label']
    .str.replace("INCOV", "", regex=False)
    .str.replace(r"^0+", "", regex=True)
)

In [12]:
# Reload the cleaned clinical table written earlier in this notebook.
# The file is comma-separated (despite the .tsv name), hence sep=','.
df_clinical = pd.read_csv(
    f"{master_dir}data/observations_2020-09-04_1027-cleaned.tsv",
    sep=',',
)
df_clinical.head(n=5)

Unnamed: 0,source_label,observation_id,incov_redcap_event_name,observation_days_since_enrollment,abdominal_pain,ambulatory,blood_draw_status,blood_draw_status_bad,blood_draw_type,blood_oxygenation,...,respiratory_rate,respiratory_support,sputum,systolic_bp,temperature,temperature_max,temperature_min,weight,who_ordinal_scale,patient_num
0,INCOV001,0136c6b6-5c01-42ca-a1ea-d1286f193041,T1,0.92,No,Limitation of activities,Complete,,Baseline,95.0,...,22.0,,No,139.0,36.6,36.6,36.4,97.5,3,1-1
1,INCOV001,d372353b-edf7-4d01-adc9-567603c30bc7,T2,5.92,No,Limitation of activities,Complete,,Acute,96.0,...,18.0,,No,140.0,36.3,36.6,36.3,97.5,3,1-2
2,INCOV002,7403b5ac-6810-431e-bc21-f39a81edbf27,T1,0.71,No,Limitation of activities,Complete,,Baseline,96.0,...,20.0,High flow nasal cannula (HFNC),Yes,96.0,37.3,38.5,36.3,90.7,5,2-1
3,INCOV002,b21d9bf4-07cf-4d0d-9638-0e02f739f989,T2,5.75,Unknown,Limitation of activities,Complete,,Acute,91.0,...,20.0,Other,No,125.0,36.8,36.8,36.4,94.8,7,2-2
4,INCOV003,04e24104-e195-4a32-85e7-bdc935800f4a,T1,0.54,Unknown,Limitation of activities,Complete,,Baseline,96.0,...,21.0,Other,Yes,89.0,37.2,37.2,36.7,77.6,7,3-1


In [13]:
# Distinct treatment types; each one gets its own time_wrt_* column later.
pd.unique(df['treatment_type'])

array(['Remdesivir (RDV)', 'Tocilizumab', 'Vitamin C', 'Zinc',
       'Hydroxychloroquine (HCQ)', 'HCQ + Azithromycin', 'Steroid',
       'Other', 'Plasma Transfusions'], dtype=object)

In [19]:
# Inspect all Remdesivir treatment records.
df.loc[df['treatment_type'].eq('Remdesivir (RDV)')]

Unnamed: 0,source_label,observation_id,drug_treatment,treatment_type,start_treatment_days_since_enrollment,end_treatment_days_since_enrollment,patient_num
0,INCOV001,4556f31a-2c98-47a3-b91d-6c83ecd306bc,vir_0,Remdesivir (RDV),1.67,6.0,1
7,INCOV002,e64110e8-ef0a-40ca-a668-b29895e878e1,vir_1,Remdesivir (RDV),1.29,4.25,2
9,INCOV003,5ca162aa-bf11-4fd9-9b9f-a2f56a4d0dd8,vir_1,Remdesivir (RDV),1.54,10.54,3
13,INCOV004,b3899e45-3ad7-4b5b-b727-f54256df3eb0,vir_0,Remdesivir (RDV),0.88,9.92,4
14,INCOV005,c6bf24de-b963-4cb7-ac9f-9467239573d2,vir_0,Remdesivir (RDV),0.12,9.17,5
21,INCOV006,456b2bac-b5c7-4449-bba3-a6ef38f36f10,vir_0,Remdesivir (RDV),0.17,2.17,6
24,INCOV007,7d265e46-9aba-42c6-81d4-b10b220931b2,vir_0,Remdesivir (RDV),1.0,4.0,7
31,INCOV009,4084c9e0-9d73-4280-9c23-7ef773664c1b,vir_0,Remdesivir (RDV),-6.38,2.62,9
34,INCOV012,d299b7bd-675d-4606-9db0-d48f4c7d8592,vir_0,Remdesivir (RDV),0.29,5.29,12
37,INCOV013,0fc445ff-776d-4662-bc30-79a3e4dcbe8c,vir_2,Remdesivir (RDV),1.92,10.92,13


In [21]:
# Inspect all Tocilizumab treatment records.
df.loc[df['treatment_type'].eq('Tocilizumab')]

Unnamed: 0,source_label,observation_id,drug_treatment,treatment_type,start_treatment_days_since_enrollment,end_treatment_days_since_enrollment,patient_num
1,INCOV002,6ca4bf39-8c44-455d-adfa-24c781b43016,adj_0,Tocilizumab,0.42,0.46,2
2,INCOV002,6ca4bf39-8c44-455d-adfa-24c781b43016,adj_1,Tocilizumab,3.0,3.04,2
3,INCOV002,6ca4bf39-8c44-455d-adfa-24c781b43016,adj_2,Tocilizumab,3.54,3.58,2
10,INCOV003,6175bcbe-9eb9-497e-84e6-17921ff0a98f,adj_0,Tocilizumab,0.71,0.75,3
11,INCOV003,6175bcbe-9eb9-497e-84e6-17921ff0a98f,adj_1,Tocilizumab,1.5,1.54,3
16,INCOV005,ffbffca8-3add-44cf-9901-37f3d6ba4c6b,adj_0,Tocilizumab,0.96,1.0,5
17,INCOV005,ffbffca8-3add-44cf-9901-37f3d6ba4c6b,adj_1,Tocilizumab,3.08,3.12,5
18,INCOV005,ffbffca8-3add-44cf-9901-37f3d6ba4c6b,adj_2,Tocilizumab,3.58,3.62,5
22,INCOV006,aec2ddc7-0c91-409e-8867-1f77f8710ec5,adj_0,Tocilizumab,3.12,3.17,6
26,INCOV008,b7c83eeb-85ac-4080-b9d7-c8910f04914f,adj_0,Tocilizumab,1.12,1.17,8


In [23]:
# Display the clinical table.
# NOTE(review): the saved output of this cell already contains time_wrt_*
# columns, which are only created by the loop in the next cell, and some of its
# labels (e.g. 'After' on a T1 row) differ from what that loop assigns. The
# output was presumably captured from an earlier kernel state - re-run the
# notebook top to bottom to refresh it.
df_clinical

Unnamed: 0,source_label,observation_id,incov_redcap_event_name,observation_days_since_enrollment,abdominal_pain,ambulatory,blood_draw_status,blood_draw_status_bad,blood_draw_type,blood_oxygenation,...,patient_num,time_wrt_Remdesivir (RDV),time_wrt_Tocilizumab,time_wrt_Vitamin C,time_wrt_Zinc,time_wrt_Hydroxychloroquine (HCQ),time_wrt_HCQ + Azithromycin,time_wrt_Steroid,time_wrt_Other,time_wrt_Plasma Transfusions
0,INCOV001,0136c6b6-5c01-42ca-a1ea-d1286f193041,T1,0.92,No,Limitation of activities,Complete,,Baseline,95.0,...,1-1,Before,No.T1,No.T1,No.T1,No.T1,No.T1,No.T1,No.T1,No.T1
1,INCOV001,d372353b-edf7-4d01-adc9-567603c30bc7,T2,5.92,No,Limitation of activities,Complete,,Acute,96.0,...,1-2,After,No.T2,No.T2,No.T2,No.T2,No.T2,No.T2,No.T2,No.T2
2,INCOV002,7403b5ac-6810-431e-bc21-f39a81edbf27,T1,0.71,No,Limitation of activities,Complete,,Baseline,96.0,...,2-1,Before,After,Before,Before,After,No.T1,No.T1,No.T1,No.T1
3,INCOV002,b21d9bf4-07cf-4d0d-9638-0e02f739f989,T2,5.75,Unknown,Limitation of activities,Complete,,Acute,91.0,...,2-2,After,After,After,Before,After,No.T2,No.T2,No.T2,No.T2
4,INCOV003,04e24104-e195-4a32-85e7-bdc935800f4a,T1,0.54,Unknown,Limitation of activities,Complete,,Baseline,96.0,...,3-1,Before,Before,Before,No.T1,After,No.T1,No.T1,No.T1,No.T1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,INCOV162,96187e76-58a4-4bf5-afdc-014cd81b70b8,T1,0.00,No,Limitation of activities,Complete,,Baseline,99.0,...,162-1,No.T1,No.T1,No.T1,No.T1,No.T1,No.T1,No.T1,No.T1,No.T1
297,INCOV162,d75aee66-b021-4181-9601-0b390172c442,T2,11.12,Unknown,No limitation of activities,Complete,,Acute,,...,162-2,No.T2,No.T2,No.T2,No.T2,No.T2,No.T2,No.T2,No.T2,No.T2
298,INCOV163,c51b79f1-b200-44fd-b355-3bd685c7d5ef,T1,0.58,Unknown,Limitation of activities,Complete,,Baseline,95.0,...,163-1,No.T1,No.T1,No.T1,No.T1,No.T1,No.T1,No.T1,No.T1,No.T1
299,INCOV163,1f9d562c-22ee-459c-9353-72c7ad23a76b,T2,2.58,No,Limitation of activities,Complete,,Acute,95.0,...,163-2,No.T2,No.T2,No.T2,No.T2,No.T2,No.T2,No.T2,No.T2,No.T2


In [24]:
# Label every clinical observation relative to each drug treatment.
# For each treatment type a column 'time_wrt_<treatment>' is added to df_clinical:
#   'Before' / 'After' - the patient has at least one record of this treatment
#                        and the observation is T1 / T2 respectively
#   'No.T1' / 'No.T2'  - the patient has no record of this treatment
# NOTE(review): start_treatment_days_since_enrollment is NOT compared with
# observation_days_since_enrollment here; T1 and T2 are taken as already
# defined (T1 = before, T2 = after) for any patient who received the treatment.
timepoint_to_label = {'T1': 'Before', 'T2': 'After'}

# Label used when the patient received the treatment (NaN for any other event name).
treated_label = df_clinical['incov_redcap_event_name'].map(timepoint_to_label)
# Label used when the patient did not: patient_num ends with the timepoint
# digit, e.g. '12-1' -> 'No.T1'.
untreated_label = 'No.T' + df_clinical['patient_num'].str[-1]

for tx in df['treatment_type'].unique():
    # Patients with at least one record of this treatment. A single combined
    # selection replaces the chained boolean indexing (which warned about
    # reindexing) and the bare `except:` that was used to detect "no record".
    treated_sources = df.loc[df['treatment_type'] == tx, 'source_label'].dropna().unique()
    received_tx = df_clinical['source_label'].isin(treated_sources)
    df_clinical['time_wrt_' + tx] = treated_label.where(received_tx, untreated_label)

df_clinical.head()

  df_clinical[colname] = pd.Series()
  temp = df[df['treatment_type']==tx][df['source_label']==source].iloc[0]


Unnamed: 0,source_label,observation_id,incov_redcap_event_name,observation_days_since_enrollment,abdominal_pain,ambulatory,blood_draw_status,blood_draw_status_bad,blood_draw_type,blood_oxygenation,...,patient_num,time_wrt_Remdesivir (RDV),time_wrt_Tocilizumab,time_wrt_Vitamin C,time_wrt_Zinc,time_wrt_Hydroxychloroquine (HCQ),time_wrt_HCQ + Azithromycin,time_wrt_Steroid,time_wrt_Other,time_wrt_Plasma Transfusions
0,INCOV001,0136c6b6-5c01-42ca-a1ea-d1286f193041,T1,0.92,No,Limitation of activities,Complete,,Baseline,95.0,...,1-1,Before,No.T1,No.T1,No.T1,No.T1,No.T1,No.T1,No.T1,No.T1
1,INCOV001,d372353b-edf7-4d01-adc9-567603c30bc7,T2,5.92,No,Limitation of activities,Complete,,Acute,96.0,...,1-2,After,No.T2,No.T2,No.T2,No.T2,No.T2,No.T2,No.T2,No.T2
2,INCOV002,7403b5ac-6810-431e-bc21-f39a81edbf27,T1,0.71,No,Limitation of activities,Complete,,Baseline,96.0,...,2-1,Before,Before,Before,Before,Before,No.T1,No.T1,No.T1,No.T1
3,INCOV002,b21d9bf4-07cf-4d0d-9638-0e02f739f989,T2,5.75,Unknown,Limitation of activities,Complete,,Acute,91.0,...,2-2,After,After,After,After,After,No.T2,No.T2,No.T2,No.T2
4,INCOV003,04e24104-e195-4a32-85e7-bdc935800f4a,T1,0.54,Unknown,Limitation of activities,Complete,,Baseline,96.0,...,3-1,Before,Before,Before,No.T1,Before,No.T1,No.T1,No.T1,No.T1


In [25]:
# Number of observations that are not labelled 'Before' for Remdesivir.
int(df_clinical['time_wrt_Remdesivir (RDV)'].isin(['No.T1', 'No.T2', 'After']).sum())

250

In [26]:
# Number of observations labelled 'Before' for Remdesivir.
int(df_clinical['time_wrt_Remdesivir (RDV)'].eq('Before').sum())

51

In [27]:
# Number of observations labelled 'After' for Remdesivir.
int(df_clinical['time_wrt_Remdesivir (RDV)'].eq('After').sum())

45

In [28]:
# Persist both cleaned tables without their index columns.
# NOTE(review): to_csv's default separator is a comma, so both files are
# comma-separated despite the .tsv extension - confirm downstream readers
# expect that before changing it.
df.to_csv(
    f'{master_dir}data/drug_treatments_2020-09-04_1027-cleaned.tsv',
    index=False,
)
df_clinical.to_csv(
    f"{master_dir}data/observations_2020-09-04_1027-cleaned-withTxInfo.tsv",
    index=False,
)