In [2]:
import pandas as pd
import numpy as np
import datetime
import string
import os
from matplotlib import pyplot as plt
from pathlib import Path

from scipy.stats import chi2_contingency
from scipy.stats import f_oneway
from scipy.stats import ttest_ind

In [3]:
RAW_DATA_PATH = '/Users/vuhoangnguyen/Research/'
RAW_DATA = os.path.join(RAW_DATA_PATH, 'raw_data.csv')
raw_dataset = pd.read_csv(RAW_DATA, keep_default_na=False)
DATA = os.path.join(Path.cwd().parents[0], 'Data/out.csv')
dataset = pd.read_csv(DATA, keep_default_na=False)

In [4]:
place = ['community','Another hospital','NHP']
total = np.zeros(3)
for i in range(3):
    total[i] = len(dataset[dataset['place_of_exposure'] == place[i]])
total

array([1281.,  241.,  547.])

In [5]:
place = ['community','Another hospital','NHP']
table = np.zeros((3,3))

for i, year in enumerate(['2017', '2018', '2019']):
    for j in range(3):
        table[i][j] = len(dataset[ (dataset['place_of_exposure'] == place[j]) & (dataset['admission_date'].str.contains(year)) ])

print(table)
print(np.round(100 * table/total, 1))
print(np.sum(table, axis=1))

stat, pvalue, dof, expected_freq = chi2_contingency(table)
print(pvalue)

[[ 61.   8.  61.]
 [382.  67. 171.]
 [838. 166. 315.]]
[[ 4.8  3.3 11.2]
 [29.8 27.8 31.3]
 [65.4 68.9 57.6]]
[ 130.  620. 1319.]
6.692994327832307e-07


In [6]:
place = ['community','Another hospital','NHP']
table = np.zeros((2,3))

for i in range(2):
    for j in range(3):
        table[i][j] = len(dataset[ (dataset['place_of_exposure'] == place[j]) & (dataset['Female'] == i) ])

print(table)
print(np.round(100 * table/total, 1))
print(np.sum(table, axis=1))

stat, pvalue, dof, expected_freq = chi2_contingency(table)
print(pvalue)

[[787. 161. 348.]
 [494.  80. 199.]]
[[61.4 66.8 63.6]
 [38.6 33.2 36.4]]
[1296.  773.]
0.2461350496254834


In [29]:
place = ['community','Another hospital','NHP']
age_group = ['0 - 9','9 - 60','>= 60']
table = np.zeros((len(age_group),len(place)))

for i in range(len(age_group)):
    for j in range(len(place)):
        table[i][j] = len(dataset[ (dataset['place_of_exposure'] == place[j]) & (dataset['New Age group (in months)'] == age_group[i]) ])

print(table)
print(np.round(100 * table/total, 1))
print(np.sum(table, axis=1))

# Age in months for each place
agePerPlace = [[], [], []]
place = ['community','Another hospital','NHP']
for i in range(len(dataset)):
    x = pd.to_datetime(dataset.at[i,'admission_date'])
    y = pd.to_datetime(dataset.at[i,'dob'])
    decimal_month_at_admission = (x-y).days / 30
    place_exposure = dataset.at[i,'place_of_exposure']
    if place_exposure == '':        continue
    id =  place.index(place_exposure)
    agePerPlace[id].append(decimal_month_at_admission)

f_oneway(agePerPlace[0], agePerPlace[1], agePerPlace[2])

[[509. 108. 283.]
 [624. 125. 234.]
 [148.   8.  30.]]
[[39.7 44.8 51.7]
 [48.7 51.9 42.8]
 [11.6  3.3  5.5]]
[900. 983. 186.]


F_onewayResult(statistic=19.594341813242558, pvalue=3.715240652193271e-09)

In [22]:
place = ['community','Another hospital','NHP']
vac = ['0','>=1','NA']
table = np.zeros((len(vac),len(place)))

for i in range(len(vac)):
    for j in range(len(place)):
        table[i][j] = len(dataset[(dataset['place_of_exposure'] == place[j]) & (dataset['New vaccination'] == vac[i])])

np.set_printoptions(suppress=True)
print(table)
print(np.round(100 * table/total,1))
print(np.sum(table, axis=1))

stat, pvalue, dof, expected_freq = chi2_contingency(table)
print(pvalue)

[[1074.  222.  509.]
 [ 168.   13.   23.]
 [  39.    6.   15.]]
[[83.8 92.1 93.1]
 [13.1  5.4  4.2]
 [ 3.   2.5  2.7]]
[1805.  204.   60.]
2.5240050841646455e-08


In [30]:
place = ['community','Another hospital','NHP']
table = np.zeros((4,3))
distance = ['0 - <20','20 - <200','200 - <500','>= 500']

for i in range(4):
    for j in range(3):
        table[i][j] = len(dataset[(dataset['place_of_exposure'] == place[j]) & (dataset['Distance from the hospital (km)'] == distance[i])])

print(table)
print(np.round(100 * table/total,1))
print(np.sum(table, axis=1))

f_oneway(table[0], table[1], table[2], table[3])

[[347.  45.  76.]
 [764. 143. 375.]
 [164.  51.  90.]
 [  6.   2.   6.]]
[[27.1 18.7 13.9]
 [59.6 59.3 68.6]
 [12.8 21.2 16.5]
 [ 0.5  0.8  1.1]]
[ 468. 1282.  305.   14.]


F_onewayResult(statistic=3.0424810076570608, pvalue=0.09253932687496695)

In [31]:
place = ['community','Another hospital','NHP']
table = np.zeros((6,3))
part_of_vietnam = ['Ha Noi','Northeastern','Northwestern','Red River Delta (except Hanoi)','Central','Southern']

for i in range(6):
    for j in range(3):
        table[i][j] = len(dataset[(dataset['place_of_exposure'] == place[j]) & (dataset['region_of_address'] == part_of_vietnam[i])])

print(table)
print(np.round(100 * table/total,1))
print(np.sum(table, axis=1))

stat, pvalue, dof, expected_freq = chi2_contingency(table)
print(pvalue)

[[503.  67. 144.]
 [117.  29.  61.]
 [ 63.   9.  28.]
 [424.  81. 214.]
 [169.  54.  98.]
 [  5.   1.   2.]]
[[39.3 27.8 26.3]
 [ 9.1 12.  11.2]
 [ 4.9  3.7  5.1]
 [33.1 33.6 39.1]
 [13.2 22.4 17.9]
 [ 0.4  0.4  0.4]]
[714. 207. 100. 719. 321.   8.]
3.992796106388303e-06


In [32]:
place = ['community','Another hospital','NHP']
table = np.zeros((2,3))

for i in range(2):
    for j in range(3):
        table[i][j] = len(dataset[ (dataset['place_of_exposure'] == place[j]) & (dataset['outcome_died'] == i)])
print(table)
print(np.round(100 * table/total,1))
print(np.sum(table, axis=1))

stat, pvalue, dof, expected_freq = chi2_contingency(table)
print(pvalue)

[[1272.  232.  535.]
 [   9.    9.   12.]]
[[99.3 96.3 97.8]
 [ 0.7  3.7  2.2]]
[2039.   30.]
0.00034788832737828163


In [42]:
place = ['community','Another hospital','NHP']
table = np.zeros((5,3))
duration = ['<0','0 - <3','3 - <7','7 - <14', '>= 14']

for i in range(5):
    for j in range(3):
        table[i][j] = len(dataset[ (dataset['place_of_exposure'] == place[j]) & (dataset['Duration between onset and admission'] == duration[i])])

print(table)
print(np.round(100 * table/total,1))
print(np.sum(table, axis=1))

f_oneway(table[0], table[1], table[2], table[3], table[4])

[[259.  52. 298.]
 [652. 107. 154.]
 [303.  64.  85.]
 [ 29.  10.   8.]
 [ 36.   8.   2.]]
[[20.2 21.6 54.5]
 [50.9 44.4 28.2]
 [23.7 26.6 15.5]
 [ 2.3  4.1  1.5]
 [ 2.8  3.3  0.4]]
[609. 913. 452.  47.  46.]


F_onewayResult(statistic=1.8374574995526267, pvalue=0.1983433912208276)

In [43]:
place = ['community','Another hospital','NHP']
table = np.zeros((3,3))
duration = ['0 - <7','7 - <21', '>= 21']

for i in range(3):
    for j in range(3):
        table[i][j] = len(dataset[ (dataset['place_of_exposure'] == place[j]) & (dataset['Duration of stay within the hospital'] == duration[i])])

print(table)
print(np.round(100 * table/total,1))
print(np.sum(table, axis=1))

f_oneway(table[0], table[1], table[2])

[[817.  78. 146.]
 [404. 125. 209.]
 [ 60.  38. 192.]]
[[63.8 32.4 26.7]
 [31.5 51.9 38.2]
 [ 4.7 15.8 35.1]]
[1041.  738.  290.]


F_onewayResult(statistic=0.7348695702906481, pvalue=0.518247772858704)

In [33]:
place = ['community','Another hospital','NHP']
table = np.zeros((9,3))

underlying_condition = ['Underlying conditions - Respiratory system','Underlying conditions - Cardiovascular system',
    'Underlying condition - Gastrointestinal system','Underlying condition - Kidney and urology system',
    'Underlying condition - Immunodeficiency','Underlying condition - Neurological system','Underlying condition - Inherited metabolic disorders',
    'Underlying condition - No underlying diseases','Underlying condition - Other underlying conditions']

for i in range(9):
    for j in range(3):
        table[i][j] = len(dataset[(dataset['place_of_exposure'] == place[j]) & (dataset[underlying_condition[i]] == 1)])

print(table)
print(np.round(100 * table/total,1))
print(np.sum(table, axis=1))

stat, pvalue, dof, expected_freq = chi2_contingency(table)
print(pvalue)

[[  16.    3.   23.]
 [  37.    7.   28.]
 [  42.    2.   50.]
 [  11.    2.   33.]
 [   4.    1.    7.]
 [  22.    5.   46.]
 [   9.    1.    4.]
 [1092.  206.  285.]
 [  55.   14.   90.]]
[[ 1.2  1.2  4.2]
 [ 2.9  2.9  5.1]
 [ 3.3  0.8  9.1]
 [ 0.9  0.8  6. ]
 [ 0.3  0.4  1.3]
 [ 1.7  2.1  8.4]
 [ 0.7  0.4  0.7]
 [85.2 85.5 52.1]
 [ 4.3  5.8 16.5]]
[  42.   72.   94.   46.   12.   73.   14. 1583.  159.]
3.830949106314939e-53


In [34]:
place = ['community','Another hospital','NHP']
form = ['oxygen_cannula','CPAP','conventional_mechanical_ventilation','hfo_ventilation','ECMO']
table = np.zeros((5,3))

for i in range(5):
    for j in range(3):
        table[i][j] = len(dataset[(dataset['place_of_exposure'] == place[j]) & (dataset['highest_ventilation_mode'] == form[i])])

print(table)
print(np.round(100 * table/total,1))
print(np.sum(table, axis=1))

stat, pvalue, dof, expected_freq = chi2_contingency(table)
print(pvalue)

[[335.  93. 256.]
 [  2.   1.   5.]
 [ 46.  32.  66.]
 [  1.   5.   5.]
 [  1.   0.   0.]]
[[26.2 38.6 46.8]
 [ 0.2  0.4  0.9]
 [ 3.6 13.3 12.1]
 [ 0.1  2.1  0.9]
 [ 0.1  0.   0. ]]
[684.   8. 144.  11.   1.]
0.00039757778334743095


In [44]:
place = ['community','Another hospital','NHP']
duration = ['< 0', '0 - <24','24 - <48','>= 48']
table = np.zeros((4,3))

for i in range(4):
    for j in range(3):
        table[i][j] = len(dataset[(dataset['place_of_exposure'] == place[j]) & (dataset['Duration between onset and test (detection time) (in hours)'] == duration[i])])

print(table)
print(np.round(100 * table/total,1))
print(np.sum(table, axis=1))
print(np.sum(table))

f_oneway(table[0], table[1], table[2], table[3])

[[ 14.   3.   5.]
 [315.  45. 193.]
 [264.  47. 127.]
 [688. 146. 222.]]
[[ 1.1  1.2  0.9]
 [24.6 18.7 35.3]
 [20.6 19.5 23.2]
 [53.7 60.6 40.6]]
[  22.  553.  438. 1056.]
2069.0


F_onewayResult(statistic=2.0722970755681738, pvalue=0.18236102913006402)

In [40]:
place = ['community','Another hospital','NHP']
diagnosis = ['Measles','Pneumonia','Bronchopneumonia','Other diagnosis']
table = np.zeros((4,3))

for i in range(4):
    for j in range(3):
        table[i][j] = len(dataset[(dataset['place_of_exposure'] == place[j]) & (dataset[diagnosis[i]] == 1)])

print(table)
print(np.round(100 * table/total,1))
print(np.sum(table, axis=1))

stat, pvalue, dof, expected_freq = chi2_contingency(table)
print(pvalue)

[[612.  85. 150.]
 [374. 109. 178.]
 [440. 100. 209.]
 [185.  35. 169.]]
[[47.8 35.3 27.4]
 [29.2 45.2 32.5]
 [34.3 41.5 38.2]
 [14.4 14.5 30.9]]
[847. 661. 749. 389.]
1.2832922667767798e-22


In [36]:
place = ['community','Another hospital','NHP']
prefix = 'complication-'
column_name = ['gastroentiritis','middle-ear-infec','conjunctivitis','laryngitis','pneumonia-bronchitis',
'febrile-seizures','septic-shock-sepsis']
table = np.zeros((7,3))

for i in range(7):
    for j in range(3):
        table[i][j] = len(dataset[(dataset['place_of_exposure'] == place[j]) & (dataset[prefix+column_name[i]] == 1)])

print(table)
print(np.round(100 * table/total,1))
print(np.sum(table, axis=1))

stat, pvalue, dof, expected_freq = chi2_contingency(table)
print(pvalue)

[[ 34.   4.  12.]
 [ 19.   3.   8.]
 [  2.   0.   0.]
 [  5.   0.   2.]
 [823. 196. 393.]
 [  2.   1.   7.]
 [  4.   0.   0.]]
[[ 2.7  1.7  2.2]
 [ 1.5  1.2  1.5]
 [ 0.2  0.   0. ]
 [ 0.4  0.   0.4]
 [64.2 81.3 71.8]
 [ 0.2  0.4  1.3]
 [ 0.3  0.   0. ]]
[  50.   30.    2.    7. 1412.   10.    4.]
0.14563525626399523


In [37]:
place = ['community','Another hospital','NHP']
prefix = 'co-infection-'
column_name = ['influenza-a','influenza-b','streptococus-aerius','streptococus-pneumonia']
table = np.zeros((4,3))

for i in range(4):
    for j in range(3):
        table[i][j] = len(dataset[(dataset['place_of_exposure'] == place[j]) & (dataset[prefix+column_name[i]] == 1)])

print(table)
print(np.round(100 * table/total,1))
print(np.sum(table, axis=1))

stat, pvalue, dof, expected_freq = chi2_contingency(table)
print(pvalue)

[[10.  1.  5.]
 [ 2.  0.  2.]
 [ 1.  0.  2.]
 [ 0.  1.  1.]]
[[0.8 0.4 0.9]
 [0.2 0.  0.4]
 [0.1 0.  0.4]
 [0.  0.4 0.2]]
[16.  4.  3.  2.]
0.2685603500299233


In [38]:
place = ['community','Another hospital','NHP']
column_name = ['respiratory_syncytical_virus', 'adenovirus', 'pertussis', 'healthcare_associated_infection']
table = np.zeros((4,3))

for i in range(4):
    for j in range(3):
        table[i][j] = len(dataset[(dataset['place_of_exposure'] == place[j]) & (dataset[column_name[i]] == 1)])

print(table)
print(np.round(100 * table/total,1))
print(np.sum(table, axis=1))

stat, pvalue, dof, expected_freq = chi2_contingency(table)
print(pvalue)

[[ 9.  1. 20.]
 [37. 19. 38.]
 [ 5.  3.  9.]
 [ 1.  4. 13.]]
[[0.7 0.4 3.7]
 [2.9 7.9 6.9]
 [0.4 1.2 1.6]
 [0.1 1.7 2.4]]
[30. 94. 17. 18.]
0.022038927613224197


In [39]:
place = ['community','Another hospital','NHP']
table = np.zeros((2,3))

for i in range(2):
    for j in range(3):
        table[i][j] = len(dataset[(dataset['place_of_exposure'] == place[j]) & (dataset['Clinical classification'] == 1-i)])

print(table)
print(np.round(100 * table/total,1))
print(np.sum(table, axis=1))

stat, pvalue, dof, expected_freq = chi2_contingency(table)
print(pvalue)

[[385. 131. 332.]
 [896. 110. 215.]]
[[30.1 54.4 60.7]
 [69.9 45.6 39.3]]
[ 848. 1221.]
2.0627038367635563e-37
