In [5]:
import pandas as pd
import numpy as np
import datetime
import string
import os
from matplotlib import pyplot as plt
from pathlib import Path

def print_with_index(l):
    print(pd.DataFrame(l))

In [6]:
DATA = os.path.join(Path.cwd().parents[0], 'Data/out.csv')
data = pd.read_csv(DATA, keep_default_na=False)
print(data.shape)

(2072, 64)


In [7]:
# Get clinical specification
clinical_spec = data['Clinical classification']
# print(clinical_spec)

# Number of severe/not severe cases
cases = clinical_spec.value_counts()
print(cases)
print(cases / clinical_spec.shape[0] * 100)

0    1223
1     849
Name: Clinical classification, dtype: int64
0    59.025097
1    40.974903
Name: Clinical classification, dtype: float64


In [8]:
# By year: 2017, 2018, 2019
by_year = np.zeros((3, 2))
by_year_percent = np.zeros((3, 2))

for year in ['2017', '2018', '2019']:
    for i, case in enumerate([0, 1]):
        by_year[int(year) - 2017, i] = data[(data['admission_date'].str.contains(year)) & (data['Clinical classification'] == case)].shape[0]
        by_year_percent[int(year) - 2017, i] = by_year[int(year) - 2017, i] / cases[case] * 100

print_with_index(by_year)
print_with_index(by_year_percent)

       0      1
0  109.0   23.0
1  375.0  246.0
2  739.0  580.0
           0          1
0   8.912510   2.709069
1  30.662306  28.975265
2  60.425184  68.315665


In [9]:
# By sex: Male or Female
by_sex = np.zeros((2, 2))
by_sex_percent = np.zeros((2, 2))

print(data[(data['Female'] == 0) & (data['Clinical classification'] == 1)].shape[0])

for i, sex in enumerate((0, 1)):
    for j, case in enumerate((0, 1)):
        # print(f"At cell {i}, {j}")
        # print(f"sex = {sex}, case = {case} => {data[(data['Female'] == sex) & (data['Clinical classification'] == case)].shape[0]}")
        by_sex[i, j] = data[(data['Female'] == sex) & (data['Clinical classification'] == case)].shape[0]
        by_sex_percent[i, j] = by_sex[i, j] / cases[case] * 100

print_with_index(by_sex)
print_with_index(by_sex_percent)

542
       0      1
0  755.0  542.0
1  468.0  307.0
           0          1
0  61.733442  63.839812
1  38.266558  36.160188


In [10]:
# Calcuate mean age and std
severe_age = data[data['Clinical classification'] == 1]['age_at_admission']
not_severe_age = data[data['Clinical classification'] == 0]['age_at_admission']

print(round(severe_age.mean(), 1))
print(round(severe_age.std(), 1))
print(round(not_severe_age.mean(), 1))
print(round(not_severe_age.std(), 1))

13.7
20.6
24.2
32.9


In [11]:
# By age group
age_groups = {
    '0-<3': lambda x: x < 3,
    '3-<6': lambda x: x >= 3 and x < 6,
    '6-<9': lambda x: x >= 6 and x < 9,
    '9-<12': lambda x: x >= 9 and x < 12,
    '12-<24': lambda x: x >= 12 and x < 24,
    '24-<36': lambda x: x >= 24 and x < 36,
    '36-<48': lambda x: x >= 36 and x < 48,
    '48-<60': lambda x: x >= 48 and x < 60,
    '>=60': lambda x: x >= 60
} # in months

by_age_group = np.zeros((9, 2))
by_age_group_percent = np.zeros((9, 2))

for i, group in enumerate(age_groups):
    for j, case in enumerate((0, 1)):
        by_age_group[i, j] = data[(data['age_at_admission'].apply(age_groups[group])) & (data['Clinical classification'] == case)].shape[0]
        by_age_group_percent[i, j] = round(by_age_group[i, j] / cases[case] * 100, 1)

print_with_index(by_age_group)
print_with_index(by_age_group_percent)
print(by_age_group_percent.sum(axis=0))

       0      1
0   23.0   64.0
1  126.0  166.0
2  303.0  220.0
3  279.0  177.0
4  187.0  128.0
5   78.0   28.0
6   51.0   21.0
7   26.0    9.0
8  150.0   36.0
      0     1
0   1.9   7.5
1  10.3  19.6
2  24.8  25.9
3  22.8  20.8
4  15.3  15.1
5   6.4   3.3
6   4.2   2.5
7   2.1   1.1
8  12.3   4.2
[100.1 100. ]


In [12]:
# By vaccination status
vaccination_status = data['Vaccination'].unique()
print(vaccination_status)

by_vaccination_status = np.zeros((len(vaccination_status), 2))
by_vaccination_status_percent = np.zeros((len(vaccination_status), 2))

for i, status in enumerate(vaccination_status):
    for j, case in enumerate((0, 1)):
        if pd.isna(status):
            by_vaccination_status[i, j] = data[(pd.isna(data['Vaccination'])) & (data['Clinical classification'] == case)].shape[0]
        else:
            by_vaccination_status[i, j] = data[(data['Vaccination'] == status) & (data['Clinical classification'] == case)].shape[0]
        by_vaccination_status_percent[i, j] = round(by_vaccination_status[i, j] / cases[case] * 100, 1)

print_with_index(by_vaccination_status.astype(int))
print_with_index(by_vaccination_status_percent)

['0' '1' 'NA' '2' '3']
      0    1
0  1021  787
1   143   29
2    34   26
3    24    5
4     1    2
      0     1
0  83.5  92.7
1  11.7   3.4
2   2.8   3.1
3   2.0   0.6
4   0.1   0.2


In [13]:
# By distance
distance_groups = {
    '<20': lambda x: x < 20,
    '20-200': lambda x: x >= 20 and x < 200,
    '200-500': lambda x: x >= 200 and x < 500,
    '>=500': lambda x: x >= 500
}
print(list(distance_groups))

by_distance = np.zeros((4, 2))
by_distance_percent = np.zeros((4, 2))

for i, group in enumerate(distance_groups):
    for j, case in enumerate((0, 1)):
        by_distance[i, j] = data[(data['distance_to_nhp'].apply(distance_groups[group])) & (data['Clinical classification'] == case)].shape[0]
        by_distance_percent[i, j] = round(by_distance[i, j] / cases[case] * 100, 1)

print(by_distance.astype(int))
print(by_distance_percent)

['<20', '20-200', '200-500', '>=500']
[[343 125]
 [718 567]
 [152 153]
 [ 10   4]]
[[28.  14.7]
 [58.7 66.8]
 [12.4 18. ]
 [ 0.8  0.5]]


In [14]:
# By region of residence
region_of_residence = data['region_of_address'].unique()
print(region_of_residence)

['Red River Delta (except Hanoi)' 'Ha Noi' 'Central' 'Northwestern'
 'Northeastern' 'Southern']


In [15]:
# By clinical outcome
clinical_outcome = [0, 1] # 0: survived, 1: dead
print(clinical_outcome)

by_clinical_outcome = np.zeros((2, 2))
by_clinical_outcome_percent = np.zeros((2, 2))

for i, outcome in enumerate(clinical_outcome):
    for j, case in enumerate((0, 1)):
        by_clinical_outcome[i, j] = data[(data['outcome_died'] == outcome) & (data['Clinical classification'] == case)].shape[0]
        by_clinical_outcome_percent[i, j] = round(by_clinical_outcome[i, j] / cases[case] * 100, 1)

print(by_clinical_outcome.astype(int))
print(by_clinical_outcome_percent)

[0, 1]
[[1223  819]
 [   0   30]]
[[100.   96.5]
 [  0.    3.5]]


In [16]:
# By duration between onset and admission
duration_groups = {
    '<0': lambda x: x < 0,
    '0-3': lambda x: x >= 0 and x < 3,
    '3-7': lambda x: x >= 3 and x < 7,
    '7-14': lambda x: x >= 7 and x < 14,
    '>=14': lambda x: x >= 14,
    'Unknown': lambda x: pd.isna(x)
}
print(list(duration_groups))

by_duration = np.zeros((len(duration_groups), 2))
by_duration_percent = np.zeros((len(duration_groups), 2))

for i, group in enumerate(duration_groups):
    for j, case in enumerate((0, 1)):
        if pd.isna(group):
            by_duration[i, j] = data[(pd.isna(data['duration time onset to admission'])) & (data['Clinical classification'] == case)].shape[0]
        else:
            by_duration[i, j] = data[(data['duration time onset to admission'].apply(duration_groups[group])) & (data['Clinical classification'] == case)].shape[0]
        by_duration_percent[i, j] = round(by_duration[i, j] / cases[case] * 100, 1)

print_with_index(by_duration.astype(int))
print_with_index(by_duration_percent)

['<0', '0-3', '3-7', '7-14', '>=14', 'Unknown']


TypeError: '<' not supported between instances of 'str' and 'int'

In [None]:
# By duration of stay
duration_stay_groups = {
    '0-7': lambda x: x >= 0 and x < 7,
    '7-21': lambda x: x >= 7 and x < 21,
    '>=21': lambda x: x >= 21,
    'Unkown': lambda x: pd.isna(x)
}
print(list(duration_stay_groups))

by_duration_stay = np.zeros((len(duration_stay_groups), 2))
by_duration_stay_percent = np.zeros((len(duration_stay_groups), 2))

for i, group in enumerate(duration_stay_groups):
    for j, case in enumerate((0, 1)):
        if pd.isna(group):
            by_duration_stay[i, j] = data[(pd.isna(data['duration_of_stay'])) & (data['Clinical classification'] == case)].shape[0]
        else:
            by_duration_stay[i, j] = data[(data['duration_of_stay'].apply(duration_stay_groups[group])) & (data['Clinical classification'] == case)].shape[0]
        by_duration_stay_percent[i, j] = round(by_duration_stay[i, j] / cases[case] * 100, 1)

print_with_index(by_duration_stay.astype(int))
print_with_index(by_duration_stay_percent)

['0-7', '7-21', '>=21', 'Unkown']
     0    1
0  816  226
1  348  391
2   59  232
3    0    0
      0     1
0  66.7  26.6
1  28.5  46.1
2   4.8  27.3
3   0.0   0.0


In [None]:
# By underlying conditions
underlying_conditions = [key for key in data.keys() if 'Underlying condition' in key]

by_underlying_condition_total = np.zeros((len(underlying_conditions), 2))
by_underlying_condition_total_percent = np.zeros((len(underlying_conditions), 2))

for i, condition in enumerate(underlying_conditions):
    print(condition)
    by_underlying_condition = np.zeros((1, 2))
    by_underlying_condition_percent = np.zeros((1, 2))
    value = 1
    for j, case in enumerate((0, 1)):
        by_underlying_condition[0, j] = data[(data[condition] == value) & (data['Clinical classification'] == case)].shape[0]
        by_underlying_condition_percent[0, j] = round(by_underlying_condition[0, j] / cases[case] * 100, 1)
    by_underlying_condition_total[i, :] = by_underlying_condition
    by_underlying_condition_total_percent[i, :] = by_underlying_condition_percent

print_with_index(by_underlying_condition_total.astype(int))
print_with_index(by_underlying_condition_total_percent)

Underlying conditions - Respiratory system
Underlying conditions - Cardiovascular system
Underlying condition - Gastrointestinal system
Underlying condition - Kidney and urology system
Underlying condition - Immunodeficiency
Underlying condition - Neurological system
Underlying condition - Inherited metabolic disorders
Underlying condition - No underlying diseases
Underlying condition - Other underlying conditions
      0    1
0    19   23
1    27   45
2    21   73
3    26   20
4     6    6
5    29   44
6     8    7
7  1031  554
8    69   90
      0     1
0   1.6   2.7
1   2.2   5.3
2   1.7   8.6
3   2.1   2.4
4   0.5   0.7
5   2.4   5.2
6   0.7   0.8
7  84.3  65.3
8   5.6  10.6


In [None]:
# By maximal form of respiratory support used
respiratory_support = [
    'oxygen_cannula',
    'CPAP',
    'conventional_mechanical_ventilation',
    'hfo_ventilation',
    'ECMO',
]
print(respiratory_support)

by_respiratory_support = np.zeros((len(respiratory_support), 2))
by_respiratory_support_percent = np.zeros((len(respiratory_support), 2))

for i, support in enumerate(respiratory_support):
    for j, case in enumerate((0, 1)):
        by_respiratory_support[i, j] = data[(data['highest_ventilation_mode'] == support) & (data['Clinical classification'] == case)].shape[0]
        by_respiratory_support_percent[i, j] = round(by_respiratory_support[i, j] / cases[case] * 100, 1)


print(by_respiratory_support.astype(int))
print(by_respiratory_support_percent)

['oxygen_cannula', 'CPAP', 'conventional_mechanical_ventilation', 'hfo_ventilation', 'ECMO']
[[  0 685]
 [  0   8]
 [  0 144]
 [  0  11]
 [  0   1]]
[[ 0.  80.7]
 [ 0.   0.9]
 [ 0.  17. ]
 [ 0.   1.3]
 [ 0.   0.1]]


In [17]:
# By duration between onset and test (hours)
duration_between_onset_and_test_groups = {
    '<0': lambda x: x < 0,
    '0-24': lambda x: x >= 0 and x < 24,
    '24-48': lambda x: x >= 24 and x < 48,
    '>=48': lambda x: x >= 48,
    'Unkown': lambda x: pd.isna(x)
}
print(list(duration_between_onset_and_test_groups))

by_duration_between_onset_and_test = np.zeros((len(duration_between_onset_and_test_groups), 2))
by_duration_between_onset_and_test_percent = np.zeros((len(duration_between_onset_and_test_groups), 2))

for i, group in enumerate(duration_between_onset_and_test_groups):
    for j, case in enumerate((0, 1)):
        if pd.isna(group):
            by_duration_between_onset_and_test[i, j] = data[(pd.isna(data['onset to test'])) & (data['Clinical classification'] == case)].shape[0]
        else:
            by_duration_between_onset_and_test[i, j] = data[((data['onset to test'] * 24).apply(duration_between_onset_and_test_groups[group])) & (data['Clinical classification'] == case)].shape[0]
        by_duration_between_onset_and_test_percent[i, j] = round(by_duration_between_onset_and_test[i, j] / cases[case] * 100, 1)

print_with_index(by_duration_between_onset_and_test.astype(int))
print_with_index(by_duration_between_onset_and_test_percent)

['<0', '0-24', '24-48', '>=48', 'Unkown']
     0    1
0   15    8
1  314  240
2  237  201
3  657  400
4    0    0
      0     1
0   1.2   0.9
1  25.7  28.3
2  19.4  23.7
3  53.7  47.1
4   0.0   0.0


In [None]:
# By complications
complication_groups = [key for key in data.keys() if 'complication' in key]
print(complication_groups)

by_complications = np.zeros((len(complication_groups), 2))
by_complications_percent = np.zeros((len(complication_groups), 2))

for i, group in enumerate(complication_groups):
    for j, case in enumerate((0, 1)):
        by_complications[i, j] = data[(data[group] == 1) & (data['Clinical classification'] == case)].shape[0]
        by_complications_percent[i, j] = round(by_complications[i, j] / cases[case] * 100, 1)

print_with_index(by_complications.astype(int))
print_with_index(by_complications_percent)

['complication-gastroentiritis', 'complication-middle-ear-infec', 'complication-conjunctivitis', 'complication-laryngitis', 'complication-pneumonia-bronchitis', 'complication-febrile-seizures', 'complication-septic-shock-sepsis']
     0    1
0   37   14
1   26    4
2    1    1
3    7    0
4  692  721
5    1    9
6    2    2
      0     1
0   3.0   1.6
1   2.1   0.5
2   0.1   0.1
3   0.6   0.0
4  56.6  84.9
5   0.1   1.1
6   0.2   0.2


In [None]:
# By co-infections

co_infection_groups = [key for key in data.keys() if 'co-infection' in key]
print(co_infection_groups)

by_co_infections = np.zeros((len(co_infection_groups), 2))
by_co_infections_percent = np.zeros((len(co_infection_groups), 2))

for i, group in enumerate(co_infection_groups):
    for j, case in enumerate((0, 1)):
        by_co_infections[i, j] = data[(data[group] == 1) & (data['Clinical classification'] == case)].shape[0]
        by_co_infections_percent[i, j] = round(by_co_infections[i, j] / cases[case] * 100, 1)

print_with_index(by_co_infections.astype(int))
print_with_index(by_co_infections_percent)

['co-infection-influenza-a', 'co-infection-influenza-b', 'co-infection-streptococus-aerius', 'co-infection-streptococus-pneumonia']
   0  1
0  8  8
1  1  3
2  2  1
3  0  2
     0    1
0  0.7  0.9
1  0.1  0.4
2  0.2  0.1
3  0.0  0.2


In [None]:
# By healthcare-associated infection

healthcare_associated_infection_groups = [
    'respiratory_syncytical_virus',
    'adenovirus',
    'pertussis',
    'healthcare_associated_infection',
]

by_healthcare_associated_infections = np.zeros((len(healthcare_associated_infection_groups), 2))
by_healthcare_associated_infections_percent = np.zeros((len(healthcare_associated_infection_groups), 2))

for i, group in enumerate(healthcare_associated_infection_groups):
    for j, case in enumerate((0, 1)):
        by_healthcare_associated_infections[i, j] = data[(data[group] == 1) & (data['Clinical classification'] == case)].shape[0]
        by_healthcare_associated_infections_percent[i, j] = round(by_healthcare_associated_infections[i, j] / cases[case] * 100, 1)

print_with_index(by_healthcare_associated_infections.astype(int))
print_with_index(by_healthcare_associated_infections_percent)

    0   1
0  12  18
1  22  73
2   5  12
3   3  15
     0    1
0  1.0  2.1
1  1.8  8.6
2  0.4  1.4
3  0.2  1.8


In [None]:
# By region
region_groups = [
    'Ha Noi',
    'Northeastern',
    'Northwestern',
    'Red River Delta (except Hanoi)',
    'Central',
    'Southern',
]
print(data['region_of_address'].unique())
print(list(region_groups))

by_region = np.zeros((len(region_groups), 2))
by_region_percent = np.zeros((len(region_groups), 2))

for i, group in enumerate(region_groups):
    for j, case in enumerate((0, 1)):
        by_region[i, j] = data[(data['region_of_address'] == group) & (data['Clinical classification'] == case)].shape[0]
        by_region_percent[i, j] = round(by_region[i, j] / cases[case] * 100, 1)

print_with_index(by_region.astype(int))
print_with_index(by_region_percent)

['Red River Delta (except Hanoi)' 'Ha Noi' 'Central' 'Northwestern'
 'Northeastern' 'Southern']
['Ha Noi', 'Northeastern', 'Northwestern', 'Red River Delta (except Hanoi)', 'Central', 'Southern']
     0    1
0  475  239
1  109   98
2   51   49
3  416  306
4  166  155
5    6    2
      0     1
0  38.8  28.2
1   8.9  11.5
2   4.2   5.8
3  34.0  36.0
4  13.6  18.3
5   0.5   0.2


In [None]:
# By diagnosis of admission
diagnosis_groups = [
    'Measles',
    'Pneumonia',
    'Bronchopneumonia',
    'Other diagnosis',
]
print(diagnosis_groups)

by_diagnosis = np.zeros((len(diagnosis_groups), 2))
by_diagnosis_percent = np.zeros((len(diagnosis_groups), 2))

for i, group in enumerate(diagnosis_groups):
    for j, case in enumerate((0, 1)):
        by_diagnosis[i, j] = data[(data[group] == 1) & (data['Clinical classification'] == case)].shape[0]
        by_diagnosis_percent[i, j] = round(by_diagnosis[i, j] / cases[case] * 100, 1)

print_with_index(by_diagnosis.astype(int))
print_with_index(by_diagnosis_percent)

['Measles', 'Pneumonia', 'Bronchopneumonia', 'Other diagnosis']
     0    1
0  576  272
1  300  361
2  406  343
3  214  177
      0     1
0  47.1  32.0
1  24.5  42.5
2  33.2  40.4
3  17.5  20.8
