In [18]:
import re
import pandas as pd
import numpy as np

## Cleaning up clinical data table

In [19]:
# Project root relative to this notebook; later cells build their data paths from it.
master_dir = '../'
# Load the tab-separated clinical observations export.
# Column dtypes are left to pandas' default inference (no explicit `dtype` mapping).
df = pd.read_csv(f"{master_dir}data/observations_2020-08-19_0929.tsv", delimiter='\t')
df.head()

Unnamed: 0,source_label,observation_id,incov_redcap_event_name,observation_days_since_enrollment,abdominal_pain,ambulatory,blood_draw_status,blood_draw_status_bad,blood_draw_type,blood_oxygenation,...,pulse,respiratory_rate,respiratory_support,sputum,systolic_bp,temperature,temperature_max,temperature_min,weight,who_ordinal_scale
0,INCOV001,3e60de44-0d8e-4ec6-8589-c2871f69f748,admittance_arm_1,-8.42,No,Limitation of activities,,,,96.0,...,69.0,20.0,,No,138.0,36.8,37.1,36.8,97.5,3
1,INCOV001,ece28cf5-4484-4297-8c30-7d8cae8d4968,baseline_blood_dra_arm_1,0.92,No,Limitation of activities,Complete,,Baseline,95.0,...,55.0,22.0,,No,139.0,36.6,36.6,36.4,97.5,3
2,INCOV001,aa56f179-ad6f-4585-a75d-61bd6d994a19,acute_blood_draw_arm_1,5.92,No,Limitation of activities,Complete,,Acute,96.0,...,66.0,18.0,,No,140.0,36.3,36.6,36.3,97.5,3
3,INCOV001,489a3a0e-af35-4189-baf5-8a5b572c4e7f,convalescent_blood_arm_1,105.21,Unknown,,Complete,,Convalescent,,...,,,,,,,,,,<=2
4,INCOV002,6821e627-08bd-49fd-9abf-fe569182d069,admittance_arm_1,-1.04,No,Limitation of activities,,,,94.0,...,97.0,20.0,Nasal cannula,No,137.0,37.8,38.5,37.2,90.7,4


In [20]:
# List the distinct REDCap event names present in the observations table.
pd.unique(df['incov_redcap_event_name'])

array(['admittance_arm_1', 'baseline_blood_dra_arm_1',
       'acute_blood_draw_arm_1', 'convalescent_blood_arm_1',
       'physician_discreti_arm_1'], dtype=object)

In [21]:
# Only the baseline (T1) and acute (T2) blood draws are needed; every other
# event is dropped and the two remaining event names are relabelled.
timepoint_labels = {'baseline_blood_dra_arm_1': 'T1',
                    'acute_blood_draw_arm_1': 'T2'}
# .copy() detaches the filtered rows from the original frame, so the column
# assignment below is not a chained assignment on a slice
# (avoids SettingWithCopyWarning and ambiguous view/copy behaviour).
df = df[df['incov_redcap_event_name'].isin(list(timepoint_labels))].copy()
df['incov_redcap_event_name'] = df['incov_redcap_event_name'].map(timepoint_labels)

In [22]:
# Derive a short patient number from the source label, e.g. 'INCOV001' -> '1'.
# `regex` is passed explicitly on both calls: pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave "^0+" unmatched and keep the zeros.
df['patient_num'] = (
    df['source_label']
    .str.replace("INCOV", "", regex=False)   # literal prefix removal
    .str.replace(r"^0+", "", regex=True)     # strip leading zeros
)

In [25]:
# Append the timepoint digit to the patient number: '1' with 'T2' becomes '1-2'.
# The vectorized .str accessor replaces a per-row Python lambda; a missing
# timepoint label now propagates as NaN instead of raising a TypeError.
# NOTE: not idempotent - re-running this cell appends the suffix a second time.
df['patient_num'] = df['patient_num'] + '-' + df['incov_redcap_event_name'].str[-1]

In [26]:
# Preview the first rows, now including the relabelled event name and patient_num.
df.head(n=5)

Unnamed: 0,source_label,observation_id,incov_redcap_event_name,observation_days_since_enrollment,abdominal_pain,ambulatory,blood_draw_status,blood_draw_status_bad,blood_draw_type,blood_oxygenation,...,respiratory_rate,respiratory_support,sputum,systolic_bp,temperature,temperature_max,temperature_min,weight,who_ordinal_scale,patient_num
1,INCOV001,ece28cf5-4484-4297-8c30-7d8cae8d4968,T1,0.92,No,Limitation of activities,Complete,,Baseline,95.0,...,22.0,,No,139.0,36.6,36.6,36.4,97.5,3,1-1
2,INCOV001,aa56f179-ad6f-4585-a75d-61bd6d994a19,T2,5.92,No,Limitation of activities,Complete,,Acute,96.0,...,18.0,,No,140.0,36.3,36.6,36.3,97.5,3,1-2
5,INCOV002,61b14fe9-9b64-4378-bd35-a05924b02dcc,T1,0.71,No,Limitation of activities,Complete,,Baseline,96.0,...,20.0,High flow nasal cannula (HFNC),Yes,96.0,37.3,38.5,36.3,90.7,5,2-1
6,INCOV002,3ed235a5-d6b2-40dc-855f-5bdae4036d84,T2,5.75,Unknown,Limitation of activities,Complete,,Acute,91.0,...,20.0,Other,No,125.0,36.8,36.8,36.4,94.8,7,2-2
9,INCOV003,e1eb0bc7-0e40-4761-9945-dcd850c1c656,T1,0.54,Unknown,Limitation of activities,Complete,,Baseline,96.0,...,21.0,Other,Yes,89.0,37.2,37.2,36.7,77.6,7,3-1


In [27]:
# Write the cleaned clinical table to disk without the index column.
# NOTE(review): no separator is passed, so the file is comma-delimited despite its
# .tsv extension; the later pd.read_csv of this file (also without `sep`) relies on that.
df.to_csv(f"{master_dir}data/observations_2020-08-19_0929-cleaned.tsv", index=False)

## Cleaning up drug treatment data table

In [29]:
# Load the tab-separated drug treatment export.
# NOTE: this rebinds `df`, which held the clinical table in the cells above.
df = pd.read_csv(f"{master_dir}data/drug_treatments_2020-08-19_0929.tsv", delimiter='\t')
df.head()

Unnamed: 0,source_label,observation_id,drug_treatment,treatment_type,start_treatment_days_since_enrollment,end_treatment_days_since_enrollment
0,INCOV001,e928e085-46ca-4e51-a68f-c9352e76865b,vir_0,Remdesivir (RDV),1.67,6.0
1,INCOV002,9b6e7a7b-b948-4bf4-a83e-3ed901580fdf,adj_0,Tocilizumab,0.42,0.46
2,INCOV002,9b6e7a7b-b948-4bf4-a83e-3ed901580fdf,adj_1,Tocilizumab,3.0,3.04
3,INCOV002,9b6e7a7b-b948-4bf4-a83e-3ed901580fdf,adj_2,Tocilizumab,3.54,3.58
4,INCOV002,9b6e7a7b-b948-4bf4-a83e-3ed901580fdf,adj_3,Vitamin C,-0.12,4.88


In [30]:
# List the distinct treatment slot identifiers in the drug table.
pd.unique(df['drug_treatment'])

array(['vir_0', 'adj_0', 'adj_1', 'adj_2', 'adj_3', 'adj_4', 'vir_1',
       'vir_2', 'adj_5'], dtype=object)

In [31]:
# Derive a short patient number from the source label, e.g. 'INCOV002' -> '2'.
# `regex` is passed explicitly on both calls: pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave "^0+" unmatched and keep the zeros.
df['patient_num'] = (
    df['source_label']
    .str.replace("INCOV", "", regex=False)   # literal prefix removal
    .str.replace(r"^0+", "", regex=True)     # strip leading zeros
)

In [33]:
# Reload the cleaned clinical table written earlier in this notebook.
# It is read with the default comma separator, matching how it was written
# (to_csv without `sep`), even though the file name ends in .tsv.
df_clinical = pd.read_csv(f"{master_dir}data/observations_2020-08-19_0929-cleaned.tsv")
df_clinical.head()

Unnamed: 0,source_label,observation_id,incov_redcap_event_name,observation_days_since_enrollment,abdominal_pain,ambulatory,blood_draw_status,blood_draw_status_bad,blood_draw_type,blood_oxygenation,...,respiratory_rate,respiratory_support,sputum,systolic_bp,temperature,temperature_max,temperature_min,weight,who_ordinal_scale,patient_num
0,INCOV001,ece28cf5-4484-4297-8c30-7d8cae8d4968,T1,0.92,No,Limitation of activities,Complete,,Baseline,95.0,...,22.0,,No,139.0,36.6,36.6,36.4,97.5,3,1-1
1,INCOV001,aa56f179-ad6f-4585-a75d-61bd6d994a19,T2,5.92,No,Limitation of activities,Complete,,Acute,96.0,...,18.0,,No,140.0,36.3,36.6,36.3,97.5,3,1-2
2,INCOV002,61b14fe9-9b64-4378-bd35-a05924b02dcc,T1,0.71,No,Limitation of activities,Complete,,Baseline,96.0,...,20.0,High flow nasal cannula (HFNC),Yes,96.0,37.3,38.5,36.3,90.7,5,2-1
3,INCOV002,3ed235a5-d6b2-40dc-855f-5bdae4036d84,T2,5.75,Unknown,Limitation of activities,Complete,,Acute,91.0,...,20.0,Other,No,125.0,36.8,36.8,36.4,94.8,7,2-2
4,INCOV003,e1eb0bc7-0e40-4761-9945-dcd850c1c656,T1,0.54,Unknown,Limitation of activities,Complete,,Baseline,96.0,...,21.0,Other,Yes,89.0,37.2,37.2,36.7,77.6,7,3-1


In [39]:
# List the distinct treatment types; one timing column is created per type below.
pd.unique(df['treatment_type'])

array(['Remdesivir (RDV)', 'Tocilizumab', 'Vitamin C', 'Zinc',
       'Hydroxychloroquine (HCQ)', 'HCQ + Azithromycin', 'Steroid',
       'Other', 'Plasma Transfusions'], dtype=object)

In [45]:
# Determine whether each treatment started before or after each clinical timepoint.
# For every treatment type, compare df_clinical's observation_days_since_enrollment
# with the patient's start_treatment_days_since_enrollment and store, per observation,
# in a new column 'time_wrt_<treatment>':
#   'Before'  - the observation was taken before the treatment start
#   'After'   - the observation was taken at or after the treatment start
#   'No.T<n>' - the patient has no row for this treatment (n = last char of patient_num)
#
# Vectorized replacement for the previous row-by-row loop. It no longer relies on a
# bare `except:` to detect "patient not treated" (which would also hide unrelated
# errors), on chained boolean indexing with an unaligned mask, or on a dtype-less
# `pd.Series()` placeholder column.
obs_day = df_clinical['observation_days_since_enrollment']
no_tx_label = 'No.T' + df_clinical['patient_num'].str[-1]

for tx in df['treatment_type'].unique():
    colname = 'time_wrt_' + tx
    # Start day of the FIRST recorded row per patient for this treatment (file order),
    # matching the earlier `.iloc[0]` lookup.
    # NOTE(review): if a patient's rows are not sorted by start day this is not
    # necessarily the earliest start - confirm whether min() is intended.
    first_start = (
        df.loc[df['treatment_type'] == tx]
        .drop_duplicates(subset='source_label', keep='first')
        .set_index('source_label')['start_treatment_days_since_enrollment']
    )
    received_tx = df_clinical['source_label'].isin(first_start.index)
    tx_start = df_clinical['source_label'].map(first_start)
    # A NaN on either side compares False and is therefore labelled 'After',
    # as in the scalar comparison used before.
    timing = np.where(obs_day < tx_start, 'Before', 'After')
    df_clinical[colname] = np.where(received_tx, timing, no_tx_label)

df_clinical.head()

  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,source_label,observation_id,incov_redcap_event_name,observation_days_since_enrollment,abdominal_pain,ambulatory,blood_draw_status,blood_draw_status_bad,blood_draw_type,blood_oxygenation,...,patient_num,time_wrt_Remdesivir (RDV),time_wrt_Tocilizumab,time_wrt_Vitamin C,time_wrt_Zinc,time_wrt_Hydroxychloroquine (HCQ),time_wrt_HCQ + Azithromycin,time_wrt_Steroid,time_wrt_Other,time_wrt_Plasma Transfusions
0,INCOV001,ece28cf5-4484-4297-8c30-7d8cae8d4968,T1,0.92,No,Limitation of activities,Complete,,Baseline,95.0,...,1-1,Before,No.T1,No.T1,No.T1,No.T1,No.T1,No.T1,No.T1,No.T1
1,INCOV001,aa56f179-ad6f-4585-a75d-61bd6d994a19,T2,5.92,No,Limitation of activities,Complete,,Acute,96.0,...,1-2,After,No.T2,No.T2,No.T2,No.T2,No.T2,No.T2,No.T2,No.T2
2,INCOV002,61b14fe9-9b64-4378-bd35-a05924b02dcc,T1,0.71,No,Limitation of activities,Complete,,Baseline,96.0,...,2-1,Before,After,After,Before,After,No.T1,No.T1,No.T1,No.T1
3,INCOV002,3ed235a5-d6b2-40dc-855f-5bdae4036d84,T2,5.75,Unknown,Limitation of activities,Complete,,Acute,91.0,...,2-2,After,After,After,After,After,No.T2,No.T2,No.T2,No.T2
4,INCOV003,e1eb0bc7-0e40-4761-9945-dcd850c1c656,T1,0.54,Unknown,Limitation of activities,Complete,,Baseline,96.0,...,3-1,Before,Before,After,No.T1,After,No.T1,No.T1,No.T1,No.T1


In [46]:
# Persist both result tables without their index columns:
# the drug table with patient_num, and the clinical table with the time_wrt_* columns.
# NOTE(review): no separator is passed, so both files are comma-delimited despite
# their .tsv extension - confirm that downstream readers expect this.
df.to_csv(f"{master_dir}data/drug_treatments_2020-08-19_0929-cleaned.tsv", index=False)
df_clinical.to_csv(
    f"{master_dir}data/observations_2020-08-19_0929-cleaned-withTxInfo.tsv",
    index=False,
)