In [1]:
import pandas as pd
import numpy as np
import datetime
import string
import os
from matplotlib import pyplot as plt
from pathlib import Path

def print_with_index(l):
    df = pd.DataFrame(l)
    print(df.rename(columns={0: 'discharged', 1: 'deceased'}))

In [2]:
DATA = os.path.join(Path.cwd().parents[0], 'Data/out.csv')
data = pd.read_csv(DATA, keep_default_na=False)
print(data.shape)

(2072, 65)


In [3]:
# Get outcome died
outcome_died = data['outcome_died']
# print(outcome_died)

# Number of severe/not severe cases
cases = outcome_died.value_counts()
print(cases)
print(cases / outcome_died.shape[0] * 100)

0    2042
1      30
Name: outcome_died, dtype: int64
0    98.552124
1     1.447876
Name: outcome_died, dtype: float64


In [4]:
# By year: 2017, 2018, 2019
by_year = np.zeros((3, 2))
by_year_percent = np.zeros((3, 2))

for year in ['2017', '2018', '2019']:
    for i, case in enumerate([0, 1]):
        by_year[int(year) - 2017, i] = data[(data['admission_date'].str.contains(year)) & (data['outcome_died'] == case)].shape[0]
        by_year_percent[int(year) - 2017, i] = by_year[int(year) - 2017, i] / cases[case] * 100

print_with_index(by_year)
print_with_index(by_year_percent)

   discharged  deceased
0       130.0       2.0
1       615.0       6.0
2      1297.0      22.0
   discharged   deceased
0    6.366308   6.666667
1   30.117532  20.000000
2   63.516161  73.333333


In [5]:
# By sex: Male or Female
by_sex = np.zeros((2, 2))
by_sex_percent = np.zeros((2, 2))

print(data[(data['Female'] == 0) & (data['outcome_died'] == 1)].shape[0])

for i, sex in enumerate((0, 1)):
    for j, case in enumerate((0, 1)):
        # print(f"At cell {i}, {j}")
        # print(f"sex = {sex}, case = {case} => {data[(data['Female'] == sex) & (data['Clinical classification'] == case)].shape[0]}")
        by_sex[i, j] = data[(data['Female'] == sex) & (data['outcome_died'] == case)].shape[0]
        by_sex_percent[i, j] = by_sex[i, j] / cases[case] * 100

print_with_index(by_sex)
print_with_index(by_sex_percent)

18
   discharged  deceased
0      1279.0      18.0
1       763.0      12.0
   discharged  deceased
0   62.634672      60.0
1   37.365328      40.0


In [6]:
# Calcuate mean age and std
severe_age = data[data['outcome_died'] == 1]['age_at_admission']
not_severe_age = data[data['outcome_died'] == 0]['age_at_admission']

print(round(severe_age.mean(), 1))
print(round(severe_age.std(), 1))
print(round(not_severe_age.mean(), 1))
print(round(not_severe_age.std(), 1))

32.6
43.1
19.7
28.7


In [7]:
# By age group
age_groups = {
    '0-<3': lambda x: x < 3,
    '3-<6': lambda x: x >= 3 and x < 6,
    '6-<9': lambda x: x >= 6 and x < 9,
    '9-<12': lambda x: x >= 9 and x < 12,
    '12-<24': lambda x: x >= 12 and x < 24,
    '24-<36': lambda x: x >= 24 and x < 36,
    '36-<48': lambda x: x >= 36 and x < 48,
    '48-<60': lambda x: x >= 48 and x < 60,
    '>=60': lambda x: x >= 60
} # in months

by_age_group = np.zeros((9, 2))
by_age_group_percent = np.zeros((9, 2))

for i, group in enumerate(age_groups):
    for j, case in enumerate((0, 1)):
        by_age_group[i, j] = data[(data['age_at_admission'].apply(age_groups[group])) & (data['outcome_died'] == case)].shape[0]
        by_age_group_percent[i, j] = round(by_age_group[i, j] / cases[case] * 100, 1)

print_with_index(by_age_group)
print_with_index(by_age_group_percent)

   discharged  deceased
0        87.0       0.0
1       286.0       6.0
2       517.0       6.0
3       450.0       6.0
4       312.0       3.0
5       105.0       1.0
6        71.0       1.0
7        34.0       1.0
8       180.0       6.0
   discharged  deceased
0         4.3       0.0
1        14.0      20.0
2        25.3      20.0
3        22.0      20.0
4        15.3      10.0
5         5.1       3.3
6         3.5       3.3
7         1.7       3.3
8         8.8      20.0


In [8]:
# By vaccination status
vaccination_status = sorted(data['Vaccination'].unique(), key=lambda x: 100 if pd.isna(x) else x)
print(vaccination_status)

by_vaccination_status = np.zeros((len(vaccination_status), 2))
by_vaccination_status_percent = np.zeros((len(vaccination_status), 2))

for i, status in enumerate(vaccination_status):
    for j, case in enumerate((0, 1)):
        if pd.isna(status):
            by_vaccination_status[i, j] = data[(pd.isna(data['Vaccination'])) & (data['outcome_died'] == case)].shape[0]
        else:
            by_vaccination_status[i, j] = data[(data['Vaccination'] == status) & (data['outcome_died'] == case)].shape[0]
        by_vaccination_status_percent[i, j] = round(by_vaccination_status[i, j] / cases[case] * 100, 1)

print_with_index(by_vaccination_status.astype(int))
print_with_index(by_vaccination_status_percent)

['0', '1', '2', '3', 'NA']
   discharged  deceased
0        1782        26
1         170         2
2          29         0
3           2         1
4          59         1
   discharged  deceased
0        87.3      86.7
1         8.3       6.7
2         1.4       0.0
3         0.1       3.3
4         2.9       3.3


In [9]:
place_of_exposure_groups = data['place_of_exposure'].unique()
print(place_of_exposure_groups)

by_place_of_exposure = np.zeros((len(place_of_exposure_groups), 2))
by_place_of_exposure_percent = np.zeros((len(place_of_exposure_groups), 2))

for i, place in enumerate(place_of_exposure_groups):
    for j, case in enumerate((0, 1)):
        if pd.isna(place):
            by_place_of_exposure[i, j] = data[(pd.isna(data['place_of_exposure'])) & (data['outcome_died'] == case)].shape[0]
        else:
            by_place_of_exposure[i, j] = data[(data['place_of_exposure'] == place) & (data['outcome_died'] == case)].shape[0]
        by_place_of_exposure_percent[i, j] = round(by_place_of_exposure[i, j] / cases[case] * 100, 1)

print_with_index(by_place_of_exposure.astype(int))
print_with_index(by_place_of_exposure_percent)

['NHP' 'Another hospital' 'community' '']
   discharged  deceased
0         535        12
1         232         9
2        1272         9
3           3         0
   discharged  deceased
0        26.2      40.0
1        11.4      30.0
2        62.3      30.0
3         0.1       0.0


In [10]:
# By distance
distance_groups = {
    '<20': lambda x: x < 20,
    '20-200': lambda x: x >= 20 and x < 200,
    '200-500': lambda x: x >= 200 and x < 500,
    '>=500': lambda x: x >= 500
}
print(list(distance_groups))

by_distance = np.zeros((4, 2))
by_distance_percent = np.zeros((4, 2))

for i, group in enumerate(distance_groups):
    for j, case in enumerate((0, 1)):
        by_distance[i, j] = data[(data['distance_to_nhp'].apply(distance_groups[group])) & (data['outcome_died'] == case)].shape[0]
        by_distance_percent[i, j] = round(by_distance[i, j] / cases[case] * 100, 1)

print_with_index(by_distance.astype(int))
print_with_index(by_distance_percent)

['<20', '20-200', '200-500', '>=500']
   discharged  deceased
0         466         2
1        1263        22
2         299         6
3          14         0
   discharged  deceased
0        22.8       6.7
1        61.9      73.3
2        14.6      20.0
3         0.7       0.0


In [11]:
# By region of residence
# region_of_residence = data['region_of_address'].unique()
region_of_residence = [
    'Ha Noi',
    'Northeastern',
    'Northwestern',
    'Red River Delta (except Hanoi)',
    'Central',
    'Southern',
]
print(region_of_residence)

by_region_of_residence = np.zeros((len(region_of_residence), 2))
by_region_of_residence_percent = np.zeros((len(region_of_residence), 2))

for i, region in enumerate(region_of_residence):
    for j, case in enumerate((0, 1)):
        by_region_of_residence[i, j] = data[(data['region_of_address'] == region) & (data['outcome_died'] == case)].shape[0]
        by_region_of_residence_percent[i, j] = round(by_region_of_residence[i, j] / cases[case] * 100, 1)

print_with_index(by_region_of_residence.astype(int))
print_with_index(by_region_of_residence_percent)

['Ha Noi', 'Northeastern', 'Northwestern', 'Red River Delta (except Hanoi)', 'Central', 'Southern']
   discharged  deceased
0         710         4
1         202         5
2          97         3
3         711        11
4         314         7
5           8         0
   discharged  deceased
0        34.8      13.3
1         9.9      16.7
2         4.8      10.0
3        34.8      36.7
4        15.4      23.3
5         0.4       0.0


In [12]:
# By severity
severity_groups = [0, 1]
print(severity_groups)

by_severity = np.zeros((len(severity_groups), 2))
by_severity_percent = np.zeros((len(severity_groups), 2))

for i, severity in enumerate(severity_groups):
    for j, case in enumerate((0, 1)):
        by_severity[i, j] = data[(data['Clinical classification'] == severity) & (data['outcome_died'] == case)].shape[0]
        by_severity_percent[i, j] = round(by_severity[i, j] / cases[case] * 100, 1)

print_with_index(by_severity.astype(int))
print_with_index(by_severity_percent)

[0, 1]
   discharged  deceased
0        1223         0
1         819        30
   discharged  deceased
0        59.9       0.0
1        40.1     100.0


In [13]:
# By duration between onset and admission
duration_groups = {
    '<0': lambda x: x < 0,
    '0-3': lambda x: x >= 0 and x < 3,
    '3-7': lambda x: x >= 3 and x < 7,
    '7-14': lambda x: x >= 7 and x < 14,
    '>=14': lambda x: x >= 14,
    'Unknown': lambda x: pd.isna(x)
}
print(list(duration_groups))

by_duration = np.zeros((len(duration_groups), 2))
by_duration_percent = np.zeros((len(duration_groups), 2))

for i, group in enumerate(duration_groups):
    for j, case in enumerate((0, 1)):
        if pd.isna(group):
            by_duration[i, j] = data[(pd.isna(data['duration time onset to admission'])) & (data['outcome_died'] == case)].shape[0]
        else:
            by_duration[i, j] = data[(data['duration time onset to admission'].apply(duration_groups[group])) & (data['outcome_died'] == case)].shape[0]
        by_duration_percent[i, j] = round(by_duration[i, j] / cases[case] * 100, 1)

print_with_index(by_duration.astype(int))
print_with_index(by_duration_percent)

['<0', '0-3', '3-7', '7-14', '>=14', 'Unknown']


TypeError: '<' not supported between instances of 'str' and 'int'

In [None]:
# By duration of stay
duration_stay_groups = {
    '0-7': lambda x: x >= 0 and x < 7,
    '7-21': lambda x: x >= 7 and x < 21,
    '>=21': lambda x: x >= 21,
    'Unkown': lambda x: pd.isna(x)
}
print(list(duration_stay_groups))

by_duration_stay = np.zeros((len(duration_stay_groups), 2))
by_duration_stay_percent = np.zeros((len(duration_stay_groups), 2))

for i, group in enumerate(duration_stay_groups):
    for j, case in enumerate((0, 1)):
        if pd.isna(group):
            by_duration_stay[i, j] = data[(pd.isna(data['duration_of_stay'])) & (data['outcome_died'] == case)].shape[0]
        else:
            by_duration_stay[i, j] = data[(data['duration_of_stay'].apply(duration_stay_groups[group])) & (data['outcome_died'] == case)].shape[0]
        by_duration_stay_percent[i, j] = round(by_duration_stay[i, j] / cases[case] * 100, 1)

print_with_index(by_duration_stay.astype(int))
print_with_index(by_duration_stay_percent)

In [None]:
# By underlying conditions
underlying_conditions = [key for key in data.keys() if 'Underlying condition' in key]

by_underlying_condition_total = np.zeros((len(underlying_conditions), 2))
by_underlying_condition_total_percent = np.zeros((len(underlying_conditions), 2))

for i, condition in enumerate(underlying_conditions):
    print(condition)
    by_underlying_condition = np.zeros((1, 2))
    by_underlying_condition_percent = np.zeros((1, 2))
    value = 1
    for j, case in enumerate((0, 1)):
        by_underlying_condition[0, j] = data[(data[condition] == value) & (data['outcome_died'] == case)].shape[0]
        by_underlying_condition_percent[0, j] = round(by_underlying_condition[0, j] / cases[case] * 100, 1)
    by_underlying_condition_total[i, :] = by_underlying_condition
    by_underlying_condition_total_percent[i, :] = by_underlying_condition_percent

print_with_index(by_underlying_condition_total.astype(int))
print_with_index(by_underlying_condition_total_percent)

In [None]:
# By maximal form of respiratory support used
respiratory_support = [
    'oxygen_cannula',
    'CPAP',
    'conventional_mechanical_ventilation',
    'hfo_ventilation',
    'ECMO',
]
print(respiratory_support)

by_respiratory_support = np.zeros((len(respiratory_support), 2))
by_respiratory_support_percent = np.zeros((len(respiratory_support), 2))

for i, support in enumerate(respiratory_support):
    for j, case in enumerate((0, 1)):
        by_respiratory_support[i, j] = data[(data['highest_ventilation_mode'] == support) & (data['outcome_died'] == case)].shape[0]
        by_respiratory_support_percent[i, j] = round(by_respiratory_support[i, j] / cases[case] * 100, 1)


print_with_index(by_respiratory_support.astype(int))
print_with_index(by_respiratory_support_percent)

In [None]:
# By duration between onset and test (hours)
duration_between_onset_and_test_groups = {
    '<0': lambda x: x < 0,
    '0-24': lambda x: x >= 0 and x < 24,
    '24-48': lambda x: x >= 24 and x < 48,
    '>=48': lambda x: x >= 48,
    'Unkown': lambda x: pd.isna(x)
}
print(list(duration_between_onset_and_test_groups))

by_duration_between_onset_and_test = np.zeros((len(duration_between_onset_and_test_groups), 2))
by_duration_between_onset_and_test_percent = np.zeros((len(duration_between_onset_and_test_groups), 2))

for i, group in enumerate(duration_between_onset_and_test_groups):
    for j, case in enumerate((0, 1)):
        if pd.isna(group):
            by_duration_between_onset_and_test[i, j] = data[(pd.isna(data['onset to test'])) & (data['outcome_died'] == case)].shape[0]
        else:
            by_duration_between_onset_and_test[i, j] = data[((data['onset to test'] * 24).apply(duration_between_onset_and_test_groups[group])) & (data['outcome_died'] == case)].shape[0]
        by_duration_between_onset_and_test_percent[i, j] = round(by_duration_between_onset_and_test[i, j] / cases[case] * 100, 1)

print_with_index(by_duration_between_onset_and_test.astype(int))
print_with_index(by_duration_between_onset_and_test_percent)

In [None]:
# By complications
complication_groups = [key for key in data.keys() if 'complication' in key]
print(complication_groups)

by_complications = np.zeros((len(complication_groups), 2))
by_complications_percent = np.zeros((len(complication_groups), 2))

for i, group in enumerate(complication_groups):
    for j, case in enumerate((0, 1)):
        by_complications[i, j] = data[(data[group] == 1) & (data['outcome_died'] == case)].shape[0]
        by_complications_percent[i, j] = round(by_complications[i, j] / cases[case] * 100, 1)

print_with_index(by_complications.astype(int))
print_with_index(by_complications_percent)

In [None]:
# By co-infections

co_infection_groups = [key for key in data.keys() if 'co-infection' in key]
print(co_infection_groups)

by_co_infections = np.zeros((len(co_infection_groups), 2))
by_co_infections_percent = np.zeros((len(co_infection_groups), 2))

for i, group in enumerate(co_infection_groups):
    for j, case in enumerate((0, 1)):
        by_co_infections[i, j] = data[(data[group] == 1) & (data['outcome_died'] == case)].shape[0]
        by_co_infections_percent[i, j] = round(by_co_infections[i, j] / cases[case] * 100, 1)

print_with_index(by_co_infections.astype(int))
print_with_index(by_co_infections_percent)

In [None]:
# By healthcare-associated infection

healthcare_associated_infection_groups = [
    'respiratory_syncytical_virus',
    'adenovirus',
    'pertussis',
    'healthcare_associated_infection',
]

by_healthcare_associated_infections = np.zeros((len(healthcare_associated_infection_groups), 2))
by_healthcare_associated_infections_percent = np.zeros((len(healthcare_associated_infection_groups), 2))

for i, group in enumerate(healthcare_associated_infection_groups):
    for j, case in enumerate((0, 1)):
        by_healthcare_associated_infections[i, j] = data[(data[group] == 1) & (data['outcome_died'] == case)].shape[0]
        by_healthcare_associated_infections_percent[i, j] = round(by_healthcare_associated_infections[i, j] / cases[case] * 100, 1)

print_with_index(by_healthcare_associated_infections.astype(int))
print_with_index(by_healthcare_associated_infections_percent)

In [None]:
# By region
region_groups = [
    'Ha Noi',
    'Northeastern',
    'Northwestern',
    'Red River Delta (except Hanoi)',
    'Central',
    'Southern',
]
print(data['region_of_address'].unique())
print(list(region_groups))

by_region = np.zeros((len(region_groups), 2))
by_region_percent = np.zeros((len(region_groups), 2))

for i, group in enumerate(region_groups):
    for j, case in enumerate((0, 1)):
        by_region[i, j] = data[(data['region_of_address'] == group) & (data['outcome_died'] == case)].shape[0]
        by_region_percent[i, j] = round(by_region[i, j] / cases[case] * 100, 1)

print_with_index(by_region.astype(int))
print_with_index(by_region_percent)

In [None]:
# By diagnosis of admission
diagnosis_groups = [
    'Measles',
    'Pneumonia',
    'Bronchopneumonia',
    'Other diagnosis',
]
print(diagnosis_groups)

by_diagnosis = np.zeros((len(diagnosis_groups), 2))
by_diagnosis_percent = np.zeros((len(diagnosis_groups), 2))

for i, group in enumerate(diagnosis_groups):
    for j, case in enumerate((0, 1)):
        by_diagnosis[i, j] = data[(data[group] == 1) & (data['outcome_died'] == case)].shape[0]
        by_diagnosis_percent[i, j] = round(by_diagnosis[i, j] / cases[case] * 100, 1)

print_with_index(by_diagnosis.astype(int))
print_with_index(by_diagnosis_percent)

In [None]:
cases

0    2042
1      30
Name: outcome_died, dtype: int64

In [14]:
total = np.array([30, 2042])
vac = ['0','>=1','NA']
table = np.zeros((3,2))

for i in range(3):
    for j in range(2):
        table[i][j] = len(data[ (data['outcome_died'] == 1 - j) & (data['New vaccination'] == vac[i])])

np.set_printoptions(suppress=True)
print(table)
print(np.round(100 * table/total,1))

[[  26. 1782.]
 [   3.  201.]
 [   1.   59.]]
[[86.7 87.3]
 [10.   9.8]
 [ 3.3  2.9]]


In [16]:
total = np.array([30, 2042])
age = ['0 - 9','9 - 60','>= 60']
table = np.zeros((3,2))

for i in range(3):
    for j in range(2):
        table[i][j] = len(data[ (data['outcome_died'] == 1 - j) & (data['New Age group (in months)'] == age[i])])

np.set_printoptions(suppress=True)
print(table)
print(np.round(100 * table/total,1))

[[ 12. 890.]
 [ 12. 972.]
 [  6. 180.]]
[[40.  43.6]
 [40.  47.6]
 [20.   8.8]]
