# Notebook to extract the clinical features of the COVID dataset 
- dataset: http://ictcf.biocuckoo.cn/index.php
- Author: Alexander Schwarz

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import os
import collections
pd.set_option('display.max_columns', None)

In [150]:
raw_path = "/Users/Alex/GIT/supergcn_2.0/data/covid/covid_download/CF_dataset.txt"
disease_path = "/Users/Alex/GIT/supergcn_2.0/data/covid/covid_download/underlying_disease.csv"
store_path = "/Users/Alex/GIT/supergcn_2.0/data/covid/covid_download/"

In [151]:
df=pd.read_csv(raw_path,sep='\t',header=None)
disease=pd.read_csv(disease_path,sep=';',header=None)

In [173]:
#all_diseases = []
#for col in disease.columns:
#    all_diseases.append(disease[col])###

#flat_list = []
#for sublist in all_diseases:
#    for item in sublist:
#        flat_list.append(item)    
##print(len(all_diseases))  
##print(set(flat_list))

#dic_disease = {}
#for dis in set(flat_list):
#    dic_disease[dis] = flat_list.count(dis)


In [174]:
#dict1 = dic_disease
#sorted_dict = {}
#sorted_keys = sorted(dict1, key=dict1.get)##

#for w in sorted_keys:
#    sorted_dict[w] = dict1[w]#

#print(sorted_dict)

In [5]:
# lists containing all the clinical features
BLOOD_FEATURES = ['MCHC', 'MCH', 'MCV', 'HCT', 'HGB', 'RBC', 'PDW', 'PLCT', 'MPV', 'PLT',
                      'BA', 'EO', 'MO', 'LY', 'NE', 'NE', 'BAP', 'EOP', 'MOP', 'LYP', 'NEP', 
                      'WBC', 'PLCR', 'RDWSD', 'RDWCV']

INFLAMMATION_FEATURES = ['ESR', 'CRP', 'PCT']

BLOOD_COALGUALATION_TEST_FEATURES = ['DD', 'TT', 'FIB', 'APTT', 'INR', 'PT']

BIOCHEMICAL_TEST_FEATURES = ['ALG', 'ALB', 'ALP', 'ALT', 'AST', 'BUN', 'CA', 'CL', 'CO2', 'CREA', 'GGT', 'GLB',
                             'K', 'MG', 'Na', 'PHOS', 'TBIL', 'TP', 'URIC', 'CHOL', 'CK', 'HDLC', 'LDH', 'TG', 'AnG',
                            'DBIL', 'GLU', 'LDLC', 'OSM', 'PA', 'TBA', 'HBDH', 'CysC', 'LAP', '5NT', 'HC', 'SAA', 'SdLDL']

IMMUNE_CELL_TYPING_FEATURES = ['CD3+', 'CD4+', 'CD8+', 'BC', 'NKC', 'CD4/CD8']

CYTOKINE_PROFILE_TEST_FEATURES = ['IL-2', 'IL-4', 'IL-6', 'IL-10', 'TNF', 'IFN']

concat_features = list(['ID']) + BLOOD_FEATURES + INFLAMMATION_FEATURES + BLOOD_COALGUALATION_TEST_FEATURES +\
                    BIOCHEMICAL_TEST_FEATURES + IMMUNE_CELL_TYPING_FEATURES + CYTOKINE_PROFILE_TEST_FEATURES

print('The following features will be extracted:')
print(concat_features)
print()

feature_df = pd.DataFrame(columns=concat_features)
## extract features features:

print('Extracting features...')

for patient in range(0, len(df)):  
    curr_string = df[1][patient]
    data_dict = {}
    
    split = curr_string.split('_')
    for feat in feature_df.columns:
        if feat in split:
            id = split.index(feat)
            
            # check if feature contains ">" or "<" sign, and remove if so
            try:
                data_dict['{}'.format(feat)] = float(split[id+1])
            except:
                #print('Patient: {}'.format(patient))
                #print(feat)
                #print(split[id+1])
                #print()
                data_dict['{}'.format(feat)] = float(split[id+1][1:])
        else:
            #print('feature {} missing for patient!'.format(feat))
            data_dict['{}'.format(feat)] = np.nan
    data_dict['ID'] = patient+1
    feature_df = feature_df.append(data_dict, ignore_index=True)

print('done.')    


The following features will be extracted:
['ID', 'MCHC', 'MCH', 'MCV', 'HCT', 'HGB', 'RBC', 'PDW', 'PLCT', 'MPV', 'PLT', 'BA', 'EO', 'MO', 'LY', 'NE', 'NE', 'BAP', 'EOP', 'MOP', 'LYP', 'NEP', 'WBC', 'PLCR', 'RDWSD', 'RDWCV', 'ESR', 'CRP', 'PCT', 'DD', 'TT', 'FIB', 'APTT', 'INR', 'PT', 'ALG', 'ALB', 'ALP', 'ALT', 'AST', 'BUN', 'CA', 'CL', 'CO2', 'CREA', 'GGT', 'GLB', 'K', 'MG', 'Na', 'PHOS', 'TBIL', 'TP', 'URIC', 'CHOL', 'CK', 'HDLC', 'LDH', 'TG', 'AnG', 'DBIL', 'GLU', 'LDLC', 'OSM', 'PA', 'TBA', 'HBDH', 'CysC', 'LAP', '5NT', 'HC', 'SAA', 'SdLDL', 'CD3+', 'CD4+', 'CD8+', 'BC', 'NKC', 'CD4/CD8', 'IL-2', 'IL-4', 'IL-6', 'IL-10', 'TNF', 'IFN']

Extracting features...
done.


In [6]:
#feature_df[feature_df['CRP'] < 8]['CRP']
feature_df

Unnamed: 0,ID,MCHC,MCH,MCV,HCT,HGB,RBC,PDW,PLCT,MPV,PLT,BA,EO,MO,LY,NE,NE.1,BAP,EOP,MOP,LYP,NEP,WBC,PLCR,RDWSD,RDWCV,ESR,CRP,PCT,DD,TT,FIB,APTT,INR,PT,ALG,ALB,ALP,ALT,AST,BUN,CA,CL,CO2,CREA,GGT,GLB,K,MG,Na,PHOS,TBIL,TP,URIC,CHOL,CK,HDLC,LDH,TG,AnG,DBIL,GLU,LDLC,OSM,PA,TBA,HBDH,CysC,LAP,5NT,HC,SAA,SdLDL,CD3+,CD4+,CD8+,BC,NKC,CD4/CD8,IL-2,IL-4,IL-6,IL-10,TNF,IFN
0,1.0,344.0,31.2,90.7,35.8,123.0,3.94,16.4,0.26,8.5,304.0,0.01,0.07,0.21,1.00,1.13,1.13,0.30,3.10,8.70,41.2,46.70,2.42,17.2,39.3,12.5,22.0,15.40,0.13,1.26,17.2,4.46,38.5,1.11,14.1,1.2,34.7,52.0,63.0,44.0,6.96,2.08,101.8,29.9,74.3,21.0,29.0,4.15,0.94,141.1,0.97,9.8,63.7,273.8,4.01,232.0,0.64,299.0,0.72,9.0,4.1,4.82,2.92,302.3,0.13,0.7,215.0,0.89,25.0,6.0,6.60,155.8,1.10,76.74,52.59,22.40,7.56,5.72,2.35,2.53,2.21,9.59,4.48,1.67,1.75
1,2.0,349.0,31.5,90.2,41.5,145.0,4.60,12.6,0.14,10.5,134.0,0.00,0.00,0.09,0.66,1.20,1.20,0.00,0.00,4.60,33.8,61.60,1.95,29.7,41.9,12.6,18.0,39.00,0.15,0.35,18.5,4.82,43.7,0.91,12.0,1.6,40.9,36.0,50.0,45.0,4.05,2.29,99.9,17.2,75.1,49.0,25.9,4.32,0.81,138.2,1.32,9.5,66.8,342.3,3.21,102.0,0.92,392.0,0.77,21.0,3.2,6.63,1.89,295.7,0.15,3.2,284.0,0.75,29.0,9.0,7.10,204.5,0.38,,,,,,,2.70,2.10,4.87,8.08,2.13,2.44
2,3.0,330.0,31.1,94.2,30.4,101.0,3.23,16.3,0.20,10.8,183.0,0.02,0.00,0.30,0.68,3.01,3.01,0.40,0.10,7.50,16.9,75.10,4.01,,,,,11.98,,,,,,,,0.9,30.0,44.0,59.0,63.0,6.07,1.77,105.0,23.3,68.0,23.0,32.1,3.63,0.88,141.6,1.02,4.7,62.1,296.6,4.63,78.0,0.77,297.0,2.89,13.0,1.7,9.36,2.64,305.9,0.14,3.0,238.0,0.89,,,7.51,,,,,,,,,,,,,,
3,4.0,337.0,30.7,91.3,40.9,138.0,4.48,16.2,0.23,8.3,272.0,0.03,0.00,0.30,0.73,4.19,4.19,0.60,0.00,5.80,13.9,79.70,5.25,16.3,40.0,12.7,19.0,66.40,0.13,1.70,19.1,4.77,38.8,0.94,12.4,1.2,31.7,66.0,35.0,47.0,6.63,2.05,100.2,21.9,81.8,65.0,26.4,4.66,0.81,137.2,0.73,9.6,58.1,242.3,3.57,83.0,1.04,546.0,1.30,15.0,3.7,5.80,1.94,296.1,0.09,4.4,378.0,0.83,28.0,8.0,5.70,662.0,0.57,52.71,38.98,12.57,27.43,13.64,3.10,2.50,2.26,60.28,6.58,2.20,3.07
4,5.0,329.0,30.4,92.4,41.5,136.0,4.49,16.4,0.20,9.1,215.0,0.01,0.00,0.43,1.64,5.23,5.23,0.10,0.00,5.90,22.4,71.60,7.31,20.9,40.9,12.8,,,0.13,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,6.0,348.0,31.4,90.5,35.1,122.0,3.88,16.4,0.28,12.3,230.0,0.02,0.03,0.57,1.35,4.39,4.39,0.30,0.50,9.00,21.2,69.00,6.36,42.0,40.3,12.3,102.0,136.00,0.13,0.83,,,,,,1.2,38.3,65.0,17.0,20.0,4.04,2.10,93.5,27.6,49.3,36.0,31.0,4.22,0.73,138.5,0.90,10.7,69.3,269.8,,40.0,,283.0,,17.0,4.5,14.08,,303.6,0.11,0.7,,,27.0,,,,,61.61,32.14,26.02,11.00,19.61,1.24,2.63,1.53,24.13,4.70,2.03,1.63
6,7.0,346.0,32.2,93.1,29.5,102.0,3.17,12.6,0.11,10.3,111.0,0.00,0.00,0.75,1.18,10.16,10.16,0.00,0.00,6.20,9.8,84.00,12.09,28.6,46.5,13.6,,63.30,,,17.2,3.14,53.1,1.01,13.1,0.9,22.6,110.0,17.0,39.0,2.27,2.01,105.9,19.7,54.3,11.0,24.7,3.84,0.58,133.7,1.60,13.8,47.3,350.5,,,,,,,,5.12,,,,,,,,,,,,,,,,,,,,,,,
7,8.0,332.0,30.2,90.8,39.4,131.0,4.34,15.8,0.17,12.3,140.0,0.01,0.00,0.34,1.21,1.36,1.36,0.30,0.00,11.60,41.4,46.70,2.92,43.6,41.5,12.4,6.0,,0.13,0.51,16.4,3.58,39.5,0.96,12.6,1.8,41.5,42.0,16.0,23.0,2.99,2.06,103.7,22.3,49.7,7.0,23.6,3.94,0.94,140.5,0.90,7.2,65.1,171.1,3.20,37.0,1.34,177.0,0.69,15.0,2.7,4.49,1.49,296.4,0.11,3.6,126.0,0.60,19.0,5.0,3.50,110.0,0.32,81.70,43.50,28.05,7.54,9.40,1.55,2.43,1.49,5.05,4.15,2.22,1.93
8,9.0,331.0,28.6,86.4,35.7,118.0,4.13,10.3,0.21,9.9,211.0,0.01,0.07,0.31,2.52,2.34,2.34,0.20,1.30,5.90,48.0,44.60,5.25,22.6,40.0,12.5,7.0,,0.13,0.22,17.8,3.05,34.8,1.08,13.8,1.7,33.2,45.0,7.0,11.0,3.51,1.95,104.8,24.4,55.6,6.0,19.5,3.77,0.80,141.4,1.24,5.6,52.7,157.3,3.59,30.0,0.67,139.0,1.11,12.0,1.6,4.89,2.31,298.7,0.15,3.5,113.0,0.64,15.0,4.0,7.40,15.7,0.94,79.47,45.67,27.69,16.54,3.01,1.65,3.09,2.89,3.22,2.23,2.89,2.18
9,10.0,329.0,32.6,99.0,37.3,123.0,3.76,18.0,0.17,11.9,145.0,0.01,0.05,0.20,1.22,2.30,2.30,0.30,1.30,5.30,32.4,60.70,3.78,40.4,43.9,12.8,,3.14,0.13,0.22,16.0,3.99,27.8,0.99,12.9,1.5,41.4,87.0,19.0,19.0,3.40,2.32,104.7,21.7,65.7,8.0,26.8,4.47,0.74,140.3,1.37,9.4,68.2,293.0,4.39,37.0,0.74,189.0,2.18,,3.4,,2.52,,,,,1.16,,,,,,79.30,48.95,27.80,13.85,3.94,1.76,2.22,1.58,2.94,2.59,1.64,1.46


In [83]:
#feature_df.to_csv(store_path+"CF_clinical_features.csv", sep=";", index=False)

## Test dataloader

In [112]:
meta_path = "../data/covid/covid_download/CF_meta.csv"
overview_path = "../data/covid/covid_download/CF_overview.csv"
feat_path = "../data/covid/covid_download/CF_clinical_features.csv"
label_path = "../data/covid/covid_download/CF_labels_all.csv"

meta_data = pd.read_csv(meta_path, delimiter=';', header=0)
overview_data = pd.read_csv(overview_path, delimiter=';', header=0)
feature_data = pd.read_csv(feat_path, delimiter=';', header=0) # loads clinical features only
label_data = pd.read_csv(label_path, delimiter=';', header=0)

# Pre-process meta information
meta_data = meta_data.drop(columns=['ID'])

meta_data.Gender = meta_data.Gender.replace('Male', 0)
meta_data.Gender = meta_data.Gender.replace('Female', 1)

meta_data.Hospital = meta_data.Hospital.replace('Union', 0)
meta_data.Hospital = meta_data.Hospital.replace('Liyuan', 1)

# Pre-process feature information
feature_data = feature_data.drop(columns=['ID'])

# add body temperature as feature information
#feature_data.insert(loc=0,
#                    column='Body_temperature',
#                    value=overview_data['Body_temperature']).astype(np.float32)




# Pre-process labels
label_data = label_data.drop(columns=['ID'])

label_data["SARS-CoV-2_nucleic_acids"] = label_data["SARS-CoV-2_nucleic_acids"].replace('Positive', 1)
label_data["SARS-CoV-2_nucleic_acids"] = label_data["SARS-CoV-2_nucleic_acids"].replace('Negative', 0)
label_data["SARS-CoV-2_nucleic_acids"] = label_data["SARS-CoV-2_nucleic_acids"].replace('Negative; Positive (Confirmed later)', 1) # a lot of missing data in there, which is good for evaluating MGMC's classification performance

label_data["Mortality_outcome"] = label_data["Mortality_outcome"].replace('Deceased', 1)
label_data["Mortality_outcome"] = label_data["Mortality_outcome"].replace('Cured', 0)
label_data["Mortality_outcome"] = label_data["Mortality_outcome"].replace('Unknown', np.nan)

label_data["Morbiditiy_outcome"] = label_data["Morbiditiy_outcome"].replace('Mild', 0)
label_data["Morbiditiy_outcome"] = label_data["Morbiditiy_outcome"].replace('Regular', 1)
label_data["Morbiditiy_outcome"] = label_data["Morbiditiy_outcome"].replace('Suspected', 2)
label_data["Morbiditiy_outcome"] = label_data["Morbiditiy_outcome"].replace('Suspected (COVID-19-confirmed later)', 3)
label_data["Morbiditiy_outcome"] = label_data["Morbiditiy_outcome"].replace('Control', 4)
label_data["Morbiditiy_outcome"] = label_data["Morbiditiy_outcome"].replace('Control (Community-acquired pneumonia)', 5)
label_data["Morbiditiy_outcome"] = label_data["Morbiditiy_outcome"].replace('Severe', 6)
label_data["Morbiditiy_outcome"] = label_data["Morbiditiy_outcome"].replace('Critically ill', 7)

# decide on which label to use for classification problem!
# we decided to use this as classification task
curr_label_data = label_data["SARS-CoV-2_nucleic_acids"]
curr_label_data = curr_label_data.squeeze()

#meta_data = np.array(meta_data).astype(np.float32)
#curr_label_data = np.array(curr_label_data).astype(np.float32)
#feature_data = np.array(feature_data).astype(np.float32)

print('Feature shape is {}'.format(feature_data.shape))
print('Class label shape is {}'.format(curr_label_data.shape))
print('Meta information shape is {}'.format(meta_data.shape))

print('Initializing COVID dataset...')

Feature shape is (1521, 84)
Class label shape is (1521,)
Meta information shape is (1521, 3)
Initializing COVID dataset...


In [113]:
feature_nan_threshold = 0.4
patient_nan_threshold = 0.4
drop_features = []
drop_patients = []

for col in feature_data.columns:
    if feature_data['{}'.format(col)].isnull().sum() / len(feature_data) > feature_nan_threshold:
        drop_features.append(col)

feature_data = feature_data.drop(drop_features, axis=1)


for patient in feature_data.index:
    if feature_data.iloc[patient].isnull().sum() / len(feature_data.columns) > patient_nan_threshold:
        drop_patients.append(patient)

feature_data = feature_data.drop(drop_patients, axis=0)
label_data = label_data.drop(drop_patients, axis=0)
meta_data = meta_data.drop(drop_patients, axis=0)
overview_data = overview_data.drop(drop_patients, axis=0)

print('Dropped {} features from dataset [too many missing values].'.format(drop_features))
print('Dropped {} patients from dataset [too many missing values].'.format(len(drop_patients)))

Dropped ['PLCR', 'RDWSD', 'ESR', 'CRP', 'PCT', 'ALG', 'BUN', 'GGT', 'CHOL', 'CK', 'HDLC', 'LDH', 'TG', 'AnG', 'DBIL', 'GLU', 'LDLC', 'OSM', 'PA', 'TBA', 'HBDH', 'CysC', 'LAP', '5NT', 'HC', 'SAA', 'SdLDL', 'CD3+', 'CD4+', 'CD8+', 'BC', 'NKC', 'CD4/CD8', 'IL-2', 'IL-4', 'IL-6', 'IL-10', 'TNF', 'IFN'] features from dataset [too many missing values].
Dropped 396 patients from dataset [too many missing values].


## Debug DataLoading

In [10]:
general_dir = "../data/covid/"
data_dir = "../data/covid/covid_download/"
image_dir = "..data/covid/images/"

In [18]:
img_paths = [image_dir+name+'/CT/' for name in os.listdir(general_dir + '/images')]


In [19]:
img_paths

['..data/covid/images/Patient 1/CT/',
 '..data/covid/images/Patient 4/CT/',
 '..data/covid/images/Patient 3/CT/',
 '..data/covid/images/Patient 2/CT/',
 '..data/covid/images/Patient 5/CT/']