In [483]:
import pandas as pd
import numpy as np
from statsmodels.stats.proportion import proportion_confint

In [484]:
# Load the visit-level table and derive age-group and vaccination-timing columns.
# NOTE(review): the finished frame is bound to `np`, which shadows the
# `import numpy as np` alias from the first cell. Later cells read `np` as this
# DataFrame, so the name is kept here; renaming it notebook-wide would be cleaner.
# NOTE(review): absolute local path — only resolves on the original machine.
visits = pd.read_csv('/Users/ASUS/Desktop/Pertussis data analysis/np.csv')

# Age groups in days, left-closed: [0, 29) / [29, 60) / [60, inf).
# The open-ended last edge keeps the oldest visit inside '2+ months'; using the
# column maximum as the edge with right=False would exclude it (NaN group) and
# would also fail if the maximum were below 60.
visits['age_group'] = pd.cut(
    visits['infant_age'],
    bins=[0, 29, 60, float('inf')],
    labels=['Neonates', '1-2 months', '2+ months'],
    right=False,
)

# Non-numeric ct_mean entries become NaN.
visits['ct_mean'] = pd.to_numeric(visits['ct_mean'], errors='coerce')

# Keep the rows where parent is False and only the columns used downstream.
# Sort by subject and age BEFORE any per-subject shift/cumulative step so that
# "previous visit" means the previous visit in age order, not in file order.
keep_cols = ['subject_id', 'parent', 'ct.detect', 'ct_mean', 'infant_age', 'age_group', 'wp']
visits = (
    visits.loc[visits['parent'] == False, keep_cols]
    .sort_values(['subject_id', 'infant_age'])
    .copy()
)

# dose_prev: wp flag of the subject's previous visit (False on the first visit).
visits['dose_prev'] = visits.groupby('subject_id')['wp'].shift(1).fillna(0).astype(bool)
# nshot_prev: running count of wp=True visits strictly before the current one.
visits['nshot_prev'] = visits.groupby('subject_id')['dose_prev'].cumsum()
print(visits.columns.tolist())

# NOTE(review): astype(bool) turns a missing wp into True — confirm wp has no NaN.
visits['wp'] = visits['wp'].astype(bool)

# Age at each subject's first wp=True visit; subjects with no such visit map to NaN.
first_wp_age = visits.loc[visits['wp']].groupby('subject_id')['infant_age'].min()
ref_age = visits['subject_id'].map(first_wp_age)

# True from the first wp=True visit onward within each subject.
after_first_wp = visits.groupby('subject_id')['wp'].cummax()

# Days elapsed since the first wp visit; NaN for visits before it.
visits['days_since_first_wp'] = (visits['infant_age'] - ref_age).where(after_first_wp)

# Downstream cells expect the frame under the name `np`.
np = visits

['subject_id', 'parent', 'ct.detect', 'ct_mean', 'infant_age', 'age_group', 'wp', 'dose_prev', 'nshot_prev']


In [485]:
# Per-visit vaccination status flags derived from the number of earlier doses
# and the days elapsed since the first wp visit.
# NOTE(review): a visit with nshot_prev >= 2 and days_since_first_wp < 14
# satisfies none of the three conditions below.
doses_before = np["nshot_prev"]
days_since = np["days_since_first_wp"]
has_prior_dose = doses_before.ge(1)

# No earlier dose, or exactly one earlier dose given fewer than 14 days ago.
np["Unvaccinated"] = doses_before.eq(0) | (doses_before.eq(1) & days_since.lt(14))
# At least one earlier dose and 14-28 days (inclusive) since the first wp visit.
np["Recently Vaccinated"] = has_prior_dose & days_since.between(14, 28)
# At least one earlier dose and more than 28 days since the first wp visit.
np["Vaccinated"] = has_prior_dose & days_since.gt(28)
np.head(n=10)

Unnamed: 0,subject_id,parent,ct.detect,ct_mean,infant_age,age_group,wp,dose_prev,nshot_prev,days_since_first_wp,Unvaccinated,Recently Vaccinated,Vaccinated
0,4,False,False,,7,Neonates,False,False,0,,True,False,False
1,4,False,False,,21,Neonates,False,False,0,,True,False,False
2,4,False,False,,42,1-2 months,True,False,0,0.0,True,False,False
3,4,False,False,,56,1-2 months,False,True,1,14.0,False,True,False
4,4,False,False,,63,2+ months,False,False,1,21.0,False,True,False
5,4,False,False,,72,2+ months,True,False,1,30.0,False,False,True
6,4,False,False,,90,2+ months,False,True,2,48.0,False,False,True
7,4,False,False,,96,2+ months,False,False,2,54.0,False,False,True
8,4,False,False,,110,2+ months,True,False,2,68.0,False,False,True
9,6,False,False,,7,Neonates,False,False,0,,True,False,False


In [486]:
# Subject-level table (columns include subject_id, parent and nshot).
# NOTE(review): absolute local path — only resolves on the original machine.
subject_df = pd.read_csv(filepath_or_buffer='/Users/ASUS/Downloads/subject.csv')
subject_df.head(n=10)

Unnamed: 0,subject_id,parent,n.visit,n.unsched,age.min,begin,end,duration,nshot
0,3,False,2,0,4,2015-03-02,2015-03-17,15,0.0
1,3,True,2,0,4,2015-03-02,2015-03-17,15,
2,4,False,9,2,7,2015-03-03,2015-06-14,103,3.0
3,4,True,9,2,7,2015-03-03,2015-06-14,103,
4,5,False,1,0,10,2015-03-02,2015-03-02,0,0.0
5,5,True,1,0,10,2015-03-02,2015-03-02,0,
6,6,False,8,1,7,2015-03-02,2015-06-03,93,2.0
7,6,True,8,1,7,2015-03-02,2015-06-03,93,
8,7,False,7,0,7,2015-03-06,2015-06-12,98,2.0
9,7,True,7,0,7,2015-03-06,2015-06-12,98,


In [487]:
# Keep the rows where parent is False, then reduce to the id and the nshot column.
filtered_df = subject_df.loc[subject_df['parent'].eq(False)]

final_df = filtered_df.loc[:, ['subject_id', 'nshot']]
final_df.head(n=5)

Unnamed: 0,subject_id,nshot
0,3,0.0
2,4,3.0
4,5,0.0
6,6,2.0
8,7,2.0


In [488]:
# Per-subject EFI table (columns include fsymptom, efi.strength and efi).
# NOTE(review): absolute local path — only resolves on the original machine.
efi = pd.read_csv(filepath_or_buffer='/Users/ASUS/Downloads/efi.csv')
efi.head(n=5)


Unnamed: 0,subject_id,parent,n.sample,n.detect,n.nose.cough,n.sick,n.abx,fsymptom,efi.strength,efi
0,4,False,9,0,3,2,2,Moderate/Severe,,0.0
1,6,False,8,3,2,1,2,Moderate/Severe,Strong,0.73009
2,7,False,7,0,2,1,1,Moderate/Severe,,0.0
3,11,False,7,0,1,0,0,Minimal,,0.0
4,12,False,7,0,0,0,0,,,0.0


In [489]:
# Spot check: all EFI rows recorded for subject 26.
efi.loc[efi["subject_id"].eq(26)]

Unnamed: 0,subject_id,parent,n.sample,n.detect,n.nose.cough,n.sick,n.abx,fsymptom,efi.strength,efi
16,26,False,7,1,1,0,0,Minimal,Weak,0.49911
1336,26,True,7,0,0,0,0,,,0.0


In [490]:
# Rows where parent is False, as an independent copy so the fills below do not
# touch `efi`.
efi_df = efi.loc[efi["parent"].eq(False)].copy()

# Replace missing labels with the explicit category "None".
# NOTE(review): the literal string "None" is treated as a missing-value marker
# by pandas' readers in recent versions, so these labels may come back as NaN
# if the data is written out and re-read — confirm.
for label_col in ("fsymptom", "efi.strength"):
    efi_df.loc[:, label_col] = efi_df[label_col].fillna("None")

efi_df.head(n=5)


Unnamed: 0,subject_id,parent,n.sample,n.detect,n.nose.cough,n.sick,n.abx,fsymptom,efi.strength,efi
0,4,False,9,0,3,2,2,Moderate/Severe,,0.0
1,6,False,8,3,2,1,2,Moderate/Severe,Strong,0.73009
2,7,False,7,0,2,1,1,Moderate/Severe,,0.0
3,11,False,7,0,1,0,0,Minimal,,0.0
4,12,False,7,0,0,0,0,,,0.0


In [491]:
# Attach each subject's nshot to the EFI rows; subjects missing from final_df
# keep their row with nshot = NaN (left join).
merged_efi = pd.merge(efi_df, final_df, how="left", on="subject_id")
merged_efi.head(n=5)

Unnamed: 0,subject_id,parent,n.sample,n.detect,n.nose.cough,n.sick,n.abx,fsymptom,efi.strength,efi,nshot
0,4,False,9,0,3,2,2,Moderate/Severe,,0.0,3.0
1,6,False,8,3,2,1,2,Moderate/Severe,Strong,0.73009,2.0
2,7,False,7,0,2,1,1,Moderate/Severe,,0.0,2.0
3,11,False,7,0,1,0,0,Minimal,,0.0,3.0
4,12,False,7,0,0,0,0,,,0.0,3.0


In [492]:
# Subjects whose efi.strength is either 'Strong' or 'Weak'.
is_strong_or_weak = merged_efi['efi.strength'].isin(['Strong', 'Weak'])
efi_strongweak = merged_efi.loc[is_strong_or_weak, ['subject_id', 'efi.strength']]
efi_strongweak.shape

(454, 2)

In [493]:
# Subjects whose efi.strength is 'Strong'.
is_strong = merged_efi['efi.strength'].isin(['Strong'])
efi_strong = merged_efi.loc[is_strong, ['subject_id', 'efi.strength']]
efi_strong.shape

(177, 2)

In [494]:
# Plain Python list of the subject ids with a 'Strong' efi.strength.
list_efistrong = efi_strong.loc[:, 'subject_id'].tolist()
print(list_efistrong)

[6, 15, 20, 70, 72, 95, 96, 112, 115, 126, 130, 132, 140, 152, 166, 170, 197, 198, 199, 222, 233, 235, 242, 243, 246, 249, 251, 255, 260, 268, 269, 273, 279, 298, 341, 342, 344, 346, 347, 349, 350, 353, 355, 357, 358, 365, 367, 369, 375, 378, 384, 386, 387, 389, 398, 403, 405, 414, 419, 424, 425, 428, 429, 434, 436, 445, 447, 449, 456, 462, 463, 466, 468, 472, 474, 476, 479, 480, 482, 484, 488, 498, 501, 504, 514, 516, 518, 527, 534, 535, 537, 542, 544, 553, 556, 563, 565, 570, 572, 573, 575, 576, 579, 589, 599, 603, 610, 627, 639, 643, 644, 646, 649, 652, 660, 665, 674, 677, 683, 685, 687, 689, 691, 692, 697, 700, 714, 716, 720, 727, 728, 729, 733, 735, 738, 739, 741, 742, 745, 748, 752, 762, 785, 801, 808, 818, 828, 868, 877, 882, 890, 898, 910, 912, 920, 928, 949, 962, 995, 1004, 1047, 1089, 1126, 1131, 1161, 1162, 1219, 1265, 1295, 1332, 1355, 1374, 1430, 1560, 1605, 1852, 1908]


In [495]:
# One row per (subject, visit): the subject-level EFI columns are repeated on
# every visit row of that subject. `parent` exists in both inputs, so the merge
# suffixes it as parent_x / parent_y.
final = pd.merge(efi_df, np, how='left', on='subject_id')
final.head(n=5)

Unnamed: 0,subject_id,parent_x,n.sample,n.detect,n.nose.cough,n.sick,n.abx,fsymptom,efi.strength,efi,...,ct_mean,infant_age,age_group,wp,dose_prev,nshot_prev,days_since_first_wp,Unvaccinated,Recently Vaccinated,Vaccinated
0,4,False,9,0,3,2,2,Moderate/Severe,,0.0,...,,7,Neonates,False,False,0,,True,False,False
1,4,False,9,0,3,2,2,Moderate/Severe,,0.0,...,,21,Neonates,False,False,0,,True,False,False
2,4,False,9,0,3,2,2,Moderate/Severe,,0.0,...,,42,1-2 months,True,False,0,0.0,True,False,False
3,4,False,9,0,3,2,2,Moderate/Severe,,0.0,...,,56,1-2 months,False,True,1,14.0,False,True,False
4,4,False,9,0,3,2,2,Moderate/Severe,,0.0,...,,63,2+ months,False,False,1,21.0,False,True,False


In [498]:
# Remove the count/helper columns that are not used further down.
# This cell rebinds `final`, so running it a second time raises KeyError
# because the columns are already gone.
unused_cols = [
    'n.sample', 'n.detect', 'n.nose.cough', 'n.sick', 'n.abx',
    'dose_prev', 'nshot_prev', 'ct.detect',
    'parent_x', "parent_y", "efi", "wp",
]
final = final.drop(labels=unused_cols, axis="columns")
final.head(n=20)

Unnamed: 0,subject_id,fsymptom,efi.strength,ct_mean,infant_age,age_group,days_since_first_wp,Unvaccinated,Recently Vaccinated,Vaccinated
0,4,Moderate/Severe,,,7,Neonates,,True,False,False
1,4,Moderate/Severe,,,21,Neonates,,True,False,False
2,4,Moderate/Severe,,,42,1-2 months,0.0,True,False,False
3,4,Moderate/Severe,,,56,1-2 months,14.0,False,True,False
4,4,Moderate/Severe,,,63,2+ months,21.0,False,True,False
5,4,Moderate/Severe,,,72,2+ months,30.0,False,False,True
6,4,Moderate/Severe,,,90,2+ months,48.0,False,False,True
7,4,Moderate/Severe,,,96,2+ months,54.0,False,False,True
8,4,Moderate/Severe,,,110,2+ months,68.0,False,False,True
9,6,Moderate/Severe,Strong,,7,Neonates,,True,False,False


In [499]:
# Write the merged visit-level table to an Excel file in the working directory,
# without the index column.
final.to_excel(excel_writer='efi_np_final.xlsx', index=False)

In [500]:
# Number of visit rows per value of the Vaccinated flag.
final.loc[:, 'Vaccinated'].value_counts()

Vaccinated
False    5653
True     3070
Name: count, dtype: int64

In [501]:
# Number of visit rows per value of the Unvaccinated flag.
final.loc[:, 'Unvaccinated'].value_counts()

Unvaccinated
True     4405
False    4318
Name: count, dtype: int64

In [502]:
# Number of visit rows per value of the Recently Vaccinated flag.
final.loc[:, 'Recently Vaccinated'].value_counts()

Recently Vaccinated
False    7475
True     1248
Name: count, dtype: int64

In [503]:
# Distinct values present in the Vaccinated flag.
final.loc[:, 'Vaccinated'].unique()

array([False,  True])

In [504]:
# Make fsymptom an ordered categorical so tables list the levels in this order
# rather than alphabetically; values outside these levels become NaN.
symptom_levels = ['None', 'Minimal', 'Moderate/Severe']
final['fsymptom'] = pd.Categorical(final['fsymptom'], categories=symptom_levels, ordered=True)

In [505]:
# One row per subject: the subject's first visit flagged as Vaccinated.
# GroupBy.first() is deliberately avoided: it returns the first NON-NULL value
# of each column independently, so a row could combine values (e.g. ct_mean)
# from different visits. Dropping duplicates keeps a genuine single visit row.
# The stable sort reproduces groupby's subject_id ordering while preserving the
# existing visit order within each subject.
vaccinated_df = (
    final.loc[final["Vaccinated"] == True]
    .sort_values("subject_id", kind="stable")
    .drop_duplicates(subset="subject_id", keep="first")
    .reset_index(drop=True)
)

In [518]:
# (rows, columns) of the per-subject vaccinated table.
vaccinated_df.shape

(1176, 10)

In [519]:
# Symptom level (rows) by EFI strength (columns) for the vaccinated subjects,
# with row/column totals labelled 'Sum'.
table_vax = pd.crosstab(
    index=vaccinated_df["fsymptom"],
    columns=vaccinated_df["efi.strength"],
    margins=True,
    margins_name='Sum',
)
print(table_vax)

efi.strength     None  Strong  Weak   Sum
fsymptom                                 
None              365      36    89   490
Minimal           283      92   125   500
Moderate/Severe    93      40    53   186
Sum               741     168   267  1176


In [521]:
# Visit rows that are flagged neither Vaccinated nor Unvaccinated.
not_vaccinated = final['Vaccinated'].eq(False)
not_unvaccinated = final['Unvaccinated'].eq(False)
excluded_df = final.loc[not_vaccinated & not_unvaccinated]
excluded_df.shape

(1248, 10)

In [531]:
# Independent copy of every visit row whose Vaccinated flag is False.
base_df = final.loc[final["Vaccinated"].eq(False)].copy()
base_df.shape

(5653, 10)

In [532]:

# One row per subject: the subject's first Recently Vaccinated visit among the
# rows in base_df (Vaccinated == False).
# GroupBy.first() is deliberately avoided: it takes the first NON-NULL value of
# each column independently and can therefore mix values from several visits.
# The stable sort reproduces groupby's subject_id ordering while keeping the
# existing visit order within each subject.
recently_df = (
    base_df.loc[base_df["Recently Vaccinated"] == True]
    .sort_values("subject_id", kind="stable")
    .drop_duplicates(subset="subject_id", keep="first")
    .reset_index(drop=True)
)

recently_df.shape

(1129, 10)

In [524]:
# Preview the first five excluded visit rows.
excluded_df.head(n=5)

Unnamed: 0,subject_id,fsymptom,efi.strength,ct_mean,infant_age,age_group,days_since_first_wp,Unvaccinated,Recently Vaccinated,Vaccinated
3,4,Moderate/Severe,,,56,1-2 months,14.0,False,True,False
4,4,Moderate/Severe,,,63,2+ months,21.0,False,True,False
12,6,Moderate/Severe,Strong,41.659,56,1-2 months,14.0,False,True,False
13,6,Moderate/Severe,Strong,44.209,70,2+ months,28.0,False,True,False
20,7,Moderate/Severe,,,57,1-2 months,14.0,False,True,False


In [255]:
# Visit rows of subjects that appear in neither vaccinated_df nor excluded_df.
in_other_group = (
    final['subject_id'].isin(vaccinated_df['subject_id'])
    | final['subject_id'].isin(excluded_df['subject_id'])
)
unvaccinated_df = final.loc[~in_other_group].copy()

In [256]:
# (rows, columns) of the unvaccinated table at this point in the notebook.
unvaccinated_df.shape

(1320, 12)

In [265]:
# Preview the first five rows of the unvaccinated table.
unvaccinated_df.head(n=5)

Unnamed: 0,subject_id,fsymptom,efi.strength
0,57,Moderate/Severe,
1,58,Minimal,
2,71,Moderate/Severe,
3,72,Minimal,Strong
4,238,Minimal,


In [259]:
# Collapse to one row per subject (first occurrence), keeping only the id and
# the two label columns, with a fresh 0..n-1 index.
subject_label_cols = ['subject_id', 'fsymptom', 'efi.strength']
unvaccinated_df = unvaccinated_df.loc[:, subject_label_cols].drop_duplicates(
    subset='subject_id', keep='first', ignore_index=True
)

In [261]:
# (rows, columns) of the unvaccinated table at this point in the notebook.
unvaccinated_df.shape

(58, 3)

In [268]:
# Preview the first five rows of the unvaccinated table.
unvaccinated_df.head(n=5)

Unnamed: 0,subject_id,fsymptom,efi.strength
0,57,Moderate/Severe,
1,58,Minimal,
2,71,Moderate/Severe,
3,72,Minimal,Strong
4,238,Minimal,


In [263]:
# Replace missing labels with the explicit "None" level in both label columns.
for col in ('fsymptom', 'efi.strength'):
    unvaccinated_df[col] = unvaccinated_df[col].fillna('None')

In [264]:
# Ordered categorical so the crosstab lists symptom levels in this order.
symptom_levels = ['None', 'Minimal', 'Moderate/Severe']
unvaccinated_df['fsymptom'] = pd.Categorical(
    unvaccinated_df['fsymptom'], categories=symptom_levels, ordered=True
)

In [267]:
# Symptom level (rows) by EFI strength (columns) for the unvaccinated subjects,
# with row/column totals labelled 'Sum'.
table_unvax = pd.crosstab(
    index=unvaccinated_df["fsymptom"],
    columns=unvaccinated_df["efi.strength"],
    margins=True,
    margins_name='Sum',
)
print(table_unvax)

efi.strength     None  Strong  Weak  Sum
fsymptom                                
None               28       1     1   30
Minimal            14       3     2   19
Moderate/Severe     7       1     1    9
Sum                49       5     4   58


In [276]:
# Subjects that have excluded visit rows but appear in neither vaccinated_df
# nor unvaccinated_df, reduced to one row per subject.
already_counted = (
    excluded_df['subject_id'].isin(vaccinated_df['subject_id'])
    | excluded_df['subject_id'].isin(unvaccinated_df['subject_id'])
)
unique_excluded = (
    excluded_df.loc[~already_counted, ['subject_id', 'fsymptom', 'efi.strength']]
    .drop_duplicates(subset='subject_id')
    .reset_index(drop=True)
)
unique_excluded.shape

(84, 3)

In [280]:
# Preview the first twenty excluded-only subjects.
unique_excluded.head(n=20)


Unnamed: 0,subject_id,fsymptom,efi.strength
0,24,,
1,61,,
2,76,,
3,123,,
4,213,,
5,222,,Strong
6,247,,
7,415,Minimal,Weak
8,416,Minimal,
9,432,Moderate/Severe,Weak


In [284]:
# Replace missing labels with the explicit "None" level in both label columns.
for col in ('fsymptom', 'efi.strength'):
    unique_excluded[col] = unique_excluded[col].fillna('None')

In [283]:
# Ordered categorical so the crosstab lists symptom levels in this order.
symptom_levels = ['None', 'Minimal', 'Moderate/Severe']
unique_excluded['fsymptom'] = pd.Categorical(
    unique_excluded['fsymptom'], categories=symptom_levels, ordered=True
)

In [297]:
# Give efi.strength the same ordered levels in all three per-subject tables so
# crosstab columns come out as None < Weak < Strong. Crosstabs built before
# this cell keep their earlier column order until they are recomputed.
strength_levels = ['None', 'Weak', 'Strong']
for df in (vaccinated_df, unvaccinated_df, unique_excluded):
    df['efi.strength'] = pd.Categorical(
        df['efi.strength'], categories=strength_levels, ordered=True
    )


In [298]:
# Symptom level (rows) by EFI strength (columns) for the excluded-only subjects,
# with row/column totals labelled 'Sum'.
table_excluded = pd.crosstab(
    index=unique_excluded["fsymptom"],
    columns=unique_excluded["efi.strength"],
    margins=True,
    margins_name='Sum',
)
print(table_excluded)

efi.strength     None  Weak  Strong  Sum
fsymptom                                
None               53     3       3   59
Minimal            15     1       0   16
Moderate/Severe     8     1       0    9
Sum                76     5       3   84


In [299]:
# Show the vaccinated and unvaccinated tables one after the other. These are
# the objects built in earlier cells; they are not recomputed here.
for summary_table in (table_vax, table_unvax):
    print(summary_table)

efi.strength     None  Strong  Weak   Sum
fsymptom                                 
None              365      37    89   491
Minimal           283      92   125   500
Moderate/Severe    93      40    54   187
Sum               741     169   268  1178
efi.strength     None  Strong  Weak  Sum
fsymptom                                
None               28       1     1   30
Minimal            14       3     2   19
Moderate/Severe     7       1     1    9
Sum                49       5     4   58
