Import libraries

In [1]:
import pandas as pd

Data Loading & Cleaning

data1 : coverage-data

In [2]:
coverage_data = pd.read_excel("coverage-data.xlsx")

In [3]:
coverage_data.head()

Unnamed: 0,GROUP,CODE,NAME,YEAR,ANTIGEN,ANTIGEN_DESCRIPTION,COVERAGE_CATEGORY,COVERAGE_CATEGORY_DESCRIPTION,TARGET_NUMBER,DOSES,COVERAGE
0,COUNTRIES,ABW,Aruba,2023.0,BCG,BCG,ADMIN,Administrative coverage,,,
1,COUNTRIES,ABW,Aruba,2023.0,BCG,BCG,OFFICIAL,Official coverage,,,
2,COUNTRIES,ABW,Aruba,2023.0,DIPHCV4,"Diphtheria-containing vaccine, 4th dose (1st b...",ADMIN,Administrative coverage,1044.0,945.0,90.52
3,COUNTRIES,ABW,Aruba,2023.0,DIPHCV4,"Diphtheria-containing vaccine, 4th dose (1st b...",OFFICIAL,Official coverage,,,90.52
4,COUNTRIES,ABW,Aruba,2023.0,DIPHCV5,"Diphtheria-containing vaccine, 5th dose (2nd b...",ADMIN,Administrative coverage,1219.0,1008.0,82.69


In [4]:
coverage_data.shape

(399859, 11)

In [5]:
coverage_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 399859 entries, 0 to 399858
Data columns (total 11 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   GROUP                          399859 non-null  object 
 1   CODE                           399858 non-null  object 
 2   NAME                           398584 non-null  object 
 3   YEAR                           399858 non-null  float64
 4   ANTIGEN                        399858 non-null  object 
 5   ANTIGEN_DESCRIPTION            399858 non-null  object 
 6   COVERAGE_CATEGORY              399858 non-null  object 
 7   COVERAGE_CATEGORY_DESCRIPTION  399858 non-null  object 
 8   TARGET_NUMBER                  79030 non-null   float64
 9   DOSES                          79327 non-null   float64
 10  COVERAGE                       230477 non-null  float64
dtypes: float64(4), object(7)
memory usage: 33.6+ MB


In [6]:
coverage_data.isnull().sum()

GROUP                                 0
CODE                                  1
NAME                               1275
YEAR                                  1
ANTIGEN                               1
ANTIGEN_DESCRIPTION                   1
COVERAGE_CATEGORY                     1
COVERAGE_CATEGORY_DESCRIPTION         1
TARGET_NUMBER                    320829
DOSES                            320532
COVERAGE                         169382
dtype: int64

Handle Missing Data

In [7]:
# Count NaNs per row
nan_counts = coverage_data.isna().sum(axis=1)

# Get the maximum number of NaNs in any row
max_nans = nan_counts.max()

# Get row(s) where number of NaNs equals the maximum
rows_with_most_nans = coverage_data[nan_counts == max_nans]
rows_with_most_nans

Unnamed: 0,GROUP,CODE,NAME,YEAR,ANTIGEN,ANTIGEN_DESCRIPTION,COVERAGE_CATEGORY,COVERAGE_CATEGORY_DESCRIPTION,TARGET_NUMBER,DOSES,COVERAGE
399858,Created: 2025-02-01 16:00 UTC,,,,,,,,,,


In [8]:
# dropping the row where all the values are NaN
coverage_data = coverage_data.drop(index=399858)

In [9]:
coverage_data.update(coverage_data['NAME'].fillna("WB_NA"))

In [10]:
coverage_data['TARGET_NUMBER'] = coverage_data.groupby(['ANTIGEN'])['TARGET_NUMBER'].transform(lambda x: x.fillna(x.median()))

In [11]:
coverage_data['DOSES'] = coverage_data.groupby(['ANTIGEN'])['DOSES'].transform(lambda x: x.fillna(x.median()))

In [12]:
# Fill coverage where both DOSES and TARGET_NUMBER are non-null and TARGET_NUMBER is not 0
mask_valid = (
    coverage_data["COVERAGE"].isna() &
    coverage_data["DOSES"].notna() &
    coverage_data["TARGET_NUMBER"].notna() &
    (coverage_data["TARGET_NUMBER"] != 0)
)

coverage_data.loc[mask_valid, "COVERAGE"] = (
    (coverage_data.loc[mask_valid, "DOSES"] / coverage_data.loc[mask_valid, "TARGET_NUMBER"]) * 100
)

# Handle cases where DOSES or TARGET_NUMBER is 0 → set coverage to 0
mask_zero = (
    coverage_data["COVERAGE"].isna() &
    ((coverage_data["DOSES"] == 0) | (coverage_data["TARGET_NUMBER"] == 0))
)

coverage_data.loc[mask_zero, "COVERAGE"] = 0

# Round coverage to 2 decimal places
coverage_data["COVERAGE"] = coverage_data["COVERAGE"].round(2)

In [13]:
coverage_data.isnull().sum()

GROUP                            0
CODE                             0
NAME                             0
YEAR                             0
ANTIGEN                          0
ANTIGEN_DESCRIPTION              0
COVERAGE_CATEGORY                0
COVERAGE_CATEGORY_DESCRIPTION    0
TARGET_NUMBER                    0
DOSES                            0
COVERAGE                         0
dtype: int64

Normalize Units & Date Consistency

In [14]:
# Normalize coverage where it's too high (likely raw count)
mask_high = coverage_data["COVERAGE"] > 100

# Only normalize if doses and target number are valid
valid = coverage_data["DOSES"].notna() & coverage_data["TARGET_NUMBER"].notna() & (coverage_data["TARGET_NUMBER"] != 0)

# Combined mask
mask = mask_high & valid

# Normalize
coverage_data.loc[mask, "COVERAGE"] = (coverage_data.loc[mask, "DOSES"] / coverage_data.loc[mask, "TARGET_NUMBER"]) * 100

# Optional: round to 2 decimal places
coverage_data["COVERAGE"] = coverage_data["COVERAGE"].round(2)

#Some rows may still exceed 100 due to incorrect data. cap values at 100
coverage_data["COVERAGE"] = coverage_data["COVERAGE"].clip(upper=100)


In [15]:
# Converting  Year, TARGET_NUMBER, DOSES columns to integer
coverage_data['YEAR'] = coverage_data['YEAR'].astype(int)
coverage_data['TARGET_NUMBER'] = coverage_data['TARGET_NUMBER'].astype(int)
coverage_data['DOSES'] = coverage_data['DOSES'].astype(int)

In [16]:
# Date consistency
#coverage_data["YEAR"] = pd.to_datetime(coverage_data["YEAR"], format="%Y", errors="coerce")


In [17]:
coverage_data

Unnamed: 0,GROUP,CODE,NAME,YEAR,ANTIGEN,ANTIGEN_DESCRIPTION,COVERAGE_CATEGORY,COVERAGE_CATEGORY_DESCRIPTION,TARGET_NUMBER,DOSES,COVERAGE
0,COUNTRIES,ABW,Aruba,2023,BCG,BCG,ADMIN,Administrative coverage,429586,376769,87.71
1,COUNTRIES,ABW,Aruba,2023,BCG,BCG,OFFICIAL,Official coverage,429586,376769,87.71
2,COUNTRIES,ABW,Aruba,2023,DIPHCV4,"Diphtheria-containing vaccine, 4th dose (1st b...",ADMIN,Administrative coverage,1044,945,90.52
3,COUNTRIES,ABW,Aruba,2023,DIPHCV4,"Diphtheria-containing vaccine, 4th dose (1st b...",OFFICIAL,Official coverage,65501,56755,90.52
4,COUNTRIES,ABW,Aruba,2023,DIPHCV5,"Diphtheria-containing vaccine, 5th dose (2nd b...",ADMIN,Administrative coverage,1219,1008,82.69
...,...,...,...,...,...,...,...,...,...,...,...
399853,WHO_REGIONS,WPR,Western Pacific Region,1980,DTPCV3,"DTP-containing vaccine, 3rd dose",WUENIC,WHO/UNICEF Estimates of National Immunization ...,27939588,2273390,8.00
399854,WHO_REGIONS,WPR,Western Pacific Region,1980,MCV1,"Measles-containing vaccine, 1st dose",WUENIC,WHO/UNICEF Estimates of National Immunization ...,27939588,1209026,4.00
399855,WHO_REGIONS,WPR,Western Pacific Region,1980,PAB,Protection at birth (PAB) against neonatal tet...,PAB,PAB Estimates,4494513,276306,6.00
399856,WHO_REGIONS,WPR,Western Pacific Region,1980,POL3,"Polio, 3rd dose",WUENIC,WHO/UNICEF Estimates of National Immunization ...,27939588,1296611,5.00


In [18]:
coverage_data.describe()

Unnamed: 0,YEAR,TARGET_NUMBER,DOSES,COVERAGE
count,399858.0,399858.0,399858.0,399858.0
mean,2009.207489,55386990.0,847887.9,64.979455
std,11.72053,24074850000.0,5181967.0,30.561151
min,1980.0,0.0,-222288200.0,-7.41
25%,2002.0,195053.0,86218.0,47.0
50%,2012.0,351192.0,180258.0,73.0
75%,2019.0,461232.0,308689.0,92.0
max,2023.0,11700000000000.0,126605200.0,100.0


data2 : incidence-rate-data

In [19]:
incidence_rate_data = pd.read_excel("incidence-rate-data.xlsx")

In [20]:
incidence_rate_data.head()

Unnamed: 0,GROUP,CODE,NAME,YEAR,DISEASE,DISEASE_DESCRIPTION,DENOMINATOR,INCIDENCE_RATE
0,COUNTRIES,ABW,Aruba,2023.0,CRS,Congenital rubella syndrome,"per 10,000 live births",0.0
1,COUNTRIES,ABW,Aruba,2023.0,DIPHTHERIA,Diphtheria,"per 1,000,000 total population",0.0
2,COUNTRIES,ABW,Aruba,2023.0,INVASIVE_MENING,Invasive meningococcal disease,"per 1,000,000 total population",9.3
3,COUNTRIES,ABW,Aruba,2023.0,MEASLES,Measles,"per 1,000,000 total population",
4,COUNTRIES,ABW,Aruba,2023.0,MUMPS,Mumps,"per 1,000,000 total population",0.0


In [21]:
incidence_rate_data.shape

(84946, 8)

In [22]:
incidence_rate_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84946 entries, 0 to 84945
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   GROUP                84946 non-null  object 
 1   CODE                 84945 non-null  object 
 2   NAME                 84945 non-null  object 
 3   YEAR                 84945 non-null  float64
 4   DISEASE              84945 non-null  object 
 5   DISEASE_DESCRIPTION  84945 non-null  object 
 6   DENOMINATOR          84945 non-null  object 
 7   INCIDENCE_RATE       61584 non-null  float64
dtypes: float64(2), object(6)
memory usage: 5.2+ MB


In [23]:
incidence_rate_data.isnull().sum()

GROUP                      0
CODE                       1
NAME                       1
YEAR                       1
DISEASE                    1
DISEASE_DESCRIPTION        1
DENOMINATOR                1
INCIDENCE_RATE         23362
dtype: int64

Handle Missing Data

In [24]:
# Count NaNs per row
nan_counts = incidence_rate_data.isna().sum(axis=1)

# Get the maximum number of NaNs in any row
max_nans = nan_counts.max()

# Get row(s) where number of NaNs equals the maximum
rows_with_most_nans = incidence_rate_data[nan_counts == max_nans]
rows_with_most_nans

Unnamed: 0,GROUP,CODE,NAME,YEAR,DISEASE,DISEASE_DESCRIPTION,DENOMINATOR,INCIDENCE_RATE
84945,Created: 2025-02-01 16:03 UTC,,,,,,,


In [25]:
# dropping the row where all the values are NaN
incidence_rate_data = incidence_rate_data.drop(index=84945)

In [26]:
# impute missing values using group median
incidence_rate_data['INCIDENCE_RATE'] = incidence_rate_data.groupby(['DISEASE'])['INCIDENCE_RATE'].transform(lambda x: x.fillna(x.median()))

In [27]:
incidence_rate_data.isnull().sum()

GROUP                  0
CODE                   0
NAME                   0
YEAR                   0
DISEASE                0
DISEASE_DESCRIPTION    0
DENOMINATOR            0
INCIDENCE_RATE         0
dtype: int64

Normalize Units & Date Consistency

In [28]:
# Convert YEAR to integer
incidence_rate_data['YEAR'] = incidence_rate_data['YEAR'].astype(int)

Standardizing the text format

In [29]:
incidence_rate_data["DENOMINATOR"].unique()

array(['per 10,000 live births', 'per 1,000,000 total population',
       'per 1,000 live births', 'per 1,000,000 <15 population',
       'per 1000 live births'], dtype=object)

In [30]:
# Standardize the text
incidence_rate_data['DENOMINATOR'] = incidence_rate_data['DENOMINATOR'].replace({'per 1000 live births': 'per 1,000 live births'})

In [31]:
incidence_rate_data

Unnamed: 0,GROUP,CODE,NAME,YEAR,DISEASE,DISEASE_DESCRIPTION,DENOMINATOR,INCIDENCE_RATE
0,COUNTRIES,ABW,Aruba,2023,CRS,Congenital rubella syndrome,"per 10,000 live births",0.0
1,COUNTRIES,ABW,Aruba,2023,DIPHTHERIA,Diphtheria,"per 1,000,000 total population",0.0
2,COUNTRIES,ABW,Aruba,2023,INVASIVE_MENING,Invasive meningococcal disease,"per 1,000,000 total population",9.3
3,COUNTRIES,ABW,Aruba,2023,MEASLES,Measles,"per 1,000,000 total population",16.5
4,COUNTRIES,ABW,Aruba,2023,MUMPS,Mumps,"per 1,000,000 total population",0.0
...,...,...,...,...,...,...,...,...
84940,COUNTRIES,ZWE,Zimbabwe,1980,NTETANUS,Neonatal tetanus,"per 1,000 live births",0.4
84941,COUNTRIES,ZWE,Zimbabwe,1980,PERTUSSIS,Pertussis,"per 1,000,000 total population",893.3
84942,COUNTRIES,ZWE,Zimbabwe,1980,POLIO,Poliomyelitis,"per 1,000,000 <15 population",0.0
84943,COUNTRIES,ZWE,Zimbabwe,1980,TTETANUS,Total tetanus,"per 1,000,000 total population",19.7


In [32]:
incidence_rate_data['INCIDENCE_RATE'].describe()

count    84945.000000
mean        79.828795
std        846.253958
min          0.000000
25%          0.000000
50%          0.000000
75%          2.800000
max      69101.300000
Name: INCIDENCE_RATE, dtype: float64

data3 : reported-cases-data

In [33]:
reported_cases_data = pd.read_excel("reported-cases-data.xlsx")

In [34]:
reported_cases_data.head()

Unnamed: 0,GROUP,CODE,NAME,YEAR,DISEASE,DISEASE_DESCRIPTION,CASES
0,COUNTRIES,ABW,Aruba,2023.0,CRS,Congenital rubella syndrome,0.0
1,COUNTRIES,ABW,Aruba,2023.0,DIPHTHERIA,Diphtheria,0.0
2,COUNTRIES,ABW,Aruba,2023.0,INVASIVE_MENING,Invasive meningococcal disease,1.0
3,COUNTRIES,ABW,Aruba,2023.0,MEASLES,Measles,
4,COUNTRIES,ABW,Aruba,2023.0,MUMPS,Mumps,0.0


In [35]:
reported_cases_data.shape

(84870, 7)

In [36]:
reported_cases_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84870 entries, 0 to 84869
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   GROUP                84870 non-null  object 
 1   CODE                 84869 non-null  object 
 2   NAME                 84869 non-null  object 
 3   YEAR                 84869 non-null  float64
 4   DISEASE              84869 non-null  object 
 5   DISEASE_DESCRIPTION  84869 non-null  object 
 6   CASES                65470 non-null  float64
dtypes: float64(2), object(5)
memory usage: 4.5+ MB


In [37]:
reported_cases_data.isnull().sum()

GROUP                      0
CODE                       1
NAME                       1
YEAR                       1
DISEASE                    1
DISEASE_DESCRIPTION        1
CASES                  19400
dtype: int64

Handle Missing Data

In [38]:
# Count NaNs per row
nan_counts = reported_cases_data.isna().sum(axis=1)

# Get the maximum number of NaNs in any row
max_nans = nan_counts.max()

# Get row(s) where number of NaNs equals the maximum
rows_with_most_nans = reported_cases_data[nan_counts == max_nans]
rows_with_most_nans

Unnamed: 0,GROUP,CODE,NAME,YEAR,DISEASE,DISEASE_DESCRIPTION,CASES
84869,Created: 2025-02-01 16:02 UTC,,,,,,


In [39]:
# dropping the row where all the values are NaN
reported_cases_data = reported_cases_data.drop(index=84869)

In [40]:
# impute missing values using group median
reported_cases_data['CASES'] = reported_cases_data.groupby(['DISEASE'])['CASES'].transform(lambda x: x.fillna(x.median()))

In [41]:
reported_cases_data.isnull().sum()

GROUP                  0
CODE                   0
NAME                   0
YEAR                   0
DISEASE                0
DISEASE_DESCRIPTION    0
CASES                  0
dtype: int64

Normalize Units & Date Consistency

In [42]:
# Convert YEAR,CASES to integer
reported_cases_data['YEAR'] = reported_cases_data['YEAR'].astype(int)
reported_cases_data['CASES'] = reported_cases_data['CASES'].astype(int)

In [43]:
reported_cases_data

Unnamed: 0,GROUP,CODE,NAME,YEAR,DISEASE,DISEASE_DESCRIPTION,CASES
0,COUNTRIES,ABW,Aruba,2023,CRS,Congenital rubella syndrome,0
1,COUNTRIES,ABW,Aruba,2023,DIPHTHERIA,Diphtheria,0
2,COUNTRIES,ABW,Aruba,2023,INVASIVE_MENING,Invasive meningococcal disease,1
3,COUNTRIES,ABW,Aruba,2023,MEASLES,Measles,104
4,COUNTRIES,ABW,Aruba,2023,MUMPS,Mumps,0
...,...,...,...,...,...,...,...
84864,COUNTRIES,ZWE,Zimbabwe,1980,NTETANUS,Neonatal tetanus,134
84865,COUNTRIES,ZWE,Zimbabwe,1980,PERTUSSIS,Pertussis,6290
84866,COUNTRIES,ZWE,Zimbabwe,1980,POLIO,Poliomyelitis,32
84867,COUNTRIES,ZWE,Zimbabwe,1980,TTETANUS,Total tetanus,139


In [44]:
reported_cases_data['CASES'].describe()

count    8.486900e+04
mean     3.452942e+03
std      5.373628e+04
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      4.500000e+01
max      4.583555e+06
Name: CASES, dtype: float64

data4 : vaccine-introduction-data

In [45]:
vaccine_introduction_data=pd.read_excel("vaccine-introduction-data.xlsx")

In [46]:
vaccine_introduction_data.head()

Unnamed: 0,ISO_3_CODE,COUNTRYNAME,WHO_REGION,YEAR,DESCRIPTION,INTRO
0,AFG,Afghanistan,EMRO,2023.0,aP (acellular pertussis) vaccine,No
1,AFG,Afghanistan,EMRO,2023.0,Hepatitis A vaccine,No
2,AFG,Afghanistan,EMRO,2023.0,Hepatitis B vaccine,Yes
3,AFG,Afghanistan,EMRO,2023.0,HepB birth dose,Yes
4,AFG,Afghanistan,EMRO,2023.0,Hib (Haemophilus influenzae type B) vaccine,Yes


In [47]:
vaccine_introduction_data.shape

(138321, 6)

In [48]:
vaccine_introduction_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138321 entries, 0 to 138320
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   ISO_3_CODE   138321 non-null  object 
 1   COUNTRYNAME  138320 non-null  object 
 2   WHO_REGION   138320 non-null  object 
 3   YEAR         138320 non-null  float64
 4   DESCRIPTION  138320 non-null  object 
 5   INTRO        138320 non-null  object 
dtypes: float64(1), object(5)
memory usage: 6.3+ MB


In [49]:
vaccine_introduction_data.isnull().sum()

ISO_3_CODE     0
COUNTRYNAME    1
WHO_REGION     1
YEAR           1
DESCRIPTION    1
INTRO          1
dtype: int64

Handle Missing Data

In [50]:
# Count NaNs per row
nan_counts = vaccine_introduction_data.isna().sum(axis=1)

# Get the maximum number of NaNs in any row
max_nans = nan_counts.max()

# Get row(s) where number of NaNs equals the maximum
rows_with_most_nans = vaccine_introduction_data[nan_counts == max_nans]
rows_with_most_nans

Unnamed: 0,ISO_3_CODE,COUNTRYNAME,WHO_REGION,YEAR,DESCRIPTION,INTRO
138320,Created: 2025-02-01 07:09 UTC,,,,,


In [51]:
# dropping the row where all the values are NaN
vaccine_introduction_data=vaccine_introduction_data.drop(index=138320)

In [52]:
vaccine_introduction_data.isnull().sum()

ISO_3_CODE     0
COUNTRYNAME    0
WHO_REGION     0
YEAR           0
DESCRIPTION    0
INTRO          0
dtype: int64

Normalize Units & Date consistency

In [53]:
# Convert YEAR to integer
vaccine_introduction_data['YEAR'] = vaccine_introduction_data['YEAR'].astype(int)

In [54]:
vaccine_introduction_data

Unnamed: 0,ISO_3_CODE,COUNTRYNAME,WHO_REGION,YEAR,DESCRIPTION,INTRO
0,AFG,Afghanistan,EMRO,2023,aP (acellular pertussis) vaccine,No
1,AFG,Afghanistan,EMRO,2023,Hepatitis A vaccine,No
2,AFG,Afghanistan,EMRO,2023,Hepatitis B vaccine,Yes
3,AFG,Afghanistan,EMRO,2023,HepB birth dose,Yes
4,AFG,Afghanistan,EMRO,2023,Hib (Haemophilus influenzae type B) vaccine,Yes
...,...,...,...,...,...,...
138315,ZWE,Zimbabwe,AFRO,1944,Seasonal Influenza vaccine,No
138316,ZWE,Zimbabwe,AFRO,1943,Seasonal Influenza vaccine,No
138317,ZWE,Zimbabwe,AFRO,1942,Seasonal Influenza vaccine,No
138318,ZWE,Zimbabwe,AFRO,1941,Seasonal Influenza vaccine,No


data5 : vaccine-schedule-data

In [55]:
vaccine_schedule_data=pd.read_excel("vaccine-schedule-data.xlsx")

In [56]:
vaccine_schedule_data.head()

Unnamed: 0,ISO_3_CODE,COUNTRYNAME,WHO_REGION,YEAR,VACCINECODE,VACCINE_DESCRIPTION,SCHEDULEROUNDS,TARGETPOP,TARGETPOP_DESCRIPTION,GEOAREA,AGEADMINISTERED,SOURCECOMMENT
0,ABW,Aruba,AMRO,2023.0,DTAPHIBIPV,DTaP-Hib-IPV (acellular) vaccine,1.0,,General/routine,NATIONAL,M2,
1,ABW,Aruba,AMRO,2023.0,DTAPHIBIPV,DTaP-Hib-IPV (acellular) vaccine,2.0,,General/routine,NATIONAL,M4,
2,ABW,Aruba,AMRO,2023.0,DTAPHIBIPV,DTaP-Hib-IPV (acellular) vaccine,3.0,,General/routine,NATIONAL,M6,
3,ABW,Aruba,AMRO,2023.0,DTAPHIBIPV,DTaP-Hib-IPV (acellular) vaccine,4.0,B_2YL_W,General/routine,NATIONAL,M15,
4,ABW,Aruba,AMRO,2023.0,DTAPIPV,DTaP-IPV (acellular) vaccine,5.0,B_CHILD_W,General/routine,NATIONAL,Y4,


In [57]:
vaccine_schedule_data.shape

(8053, 12)

In [58]:
vaccine_schedule_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8053 entries, 0 to 8052
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ISO_3_CODE             8053 non-null   object 
 1   COUNTRYNAME            8052 non-null   object 
 2   WHO_REGION             8052 non-null   object 
 3   YEAR                   8052 non-null   float64
 4   VACCINECODE            8052 non-null   object 
 5   VACCINE_DESCRIPTION    8052 non-null   object 
 6   SCHEDULEROUNDS         8052 non-null   float64
 7   TARGETPOP              3795 non-null   object 
 8   TARGETPOP_DESCRIPTION  8052 non-null   object 
 9   GEOAREA                8022 non-null   object 
 10  AGEADMINISTERED        7007 non-null   object 
 11  SOURCECOMMENT          5139 non-null   object 
dtypes: float64(2), object(10)
memory usage: 755.1+ KB


In [59]:
vaccine_schedule_data.isnull().sum()

ISO_3_CODE                  0
COUNTRYNAME                 1
WHO_REGION                  1
YEAR                        1
VACCINECODE                 1
VACCINE_DESCRIPTION         1
SCHEDULEROUNDS              1
TARGETPOP                4258
TARGETPOP_DESCRIPTION       1
GEOAREA                    31
AGEADMINISTERED          1046
SOURCECOMMENT            2914
dtype: int64

Handle Missing Data

In [60]:
# Count NaNs per row
nan_counts = vaccine_schedule_data.isna().sum(axis=1)

# Get the maximum number of NaNs in any row
max_nans = nan_counts.max()

# Get row(s) where number of NaNs equals the maximum
rows_with_most_nans = vaccine_schedule_data[nan_counts == max_nans]
rows_with_most_nans

Unnamed: 0,ISO_3_CODE,COUNTRYNAME,WHO_REGION,YEAR,VACCINECODE,VACCINE_DESCRIPTION,SCHEDULEROUNDS,TARGETPOP,TARGETPOP_DESCRIPTION,GEOAREA,AGEADMINISTERED,SOURCECOMMENT
8052,Created: 2025-02-01 16:10 UTC,,,,,,,,,,,


In [61]:
# dropping the row where all the values are NaN
vaccine_schedule_data=vaccine_schedule_data.drop(index=8052)

In [62]:
vaccine_schedule_data['TARGETPOP'] = vaccine_schedule_data['TARGETPOP'].fillna(vaccine_schedule_data['TARGETPOP'].mode()[0])
vaccine_schedule_data['AGEADMINISTERED'] = vaccine_schedule_data['AGEADMINISTERED'].fillna(vaccine_schedule_data['AGEADMINISTERED'].mode()[0])
vaccine_schedule_data.update(vaccine_schedule_data['GEOAREA'].fillna("NATIONAL"))
vaccine_schedule_data.update(vaccine_schedule_data['SOURCECOMMENT'].fillna('No comment provided'))

In [63]:
vaccine_schedule_data.isnull().sum()

ISO_3_CODE               0
COUNTRYNAME              0
WHO_REGION               0
YEAR                     0
VACCINECODE              0
VACCINE_DESCRIPTION      0
SCHEDULEROUNDS           0
TARGETPOP                0
TARGETPOP_DESCRIPTION    0
GEOAREA                  0
AGEADMINISTERED          0
SOURCECOMMENT            0
dtype: int64

Normalize Units & Date Consistency

In [64]:
# Convert YEAR, SCHEDULEROUNDS to integer
vaccine_schedule_data['YEAR'] = vaccine_schedule_data['YEAR'].astype(int)
vaccine_schedule_data['SCHEDULEROUNDS'] = vaccine_schedule_data['SCHEDULEROUNDS'].astype(int)

In [65]:
vaccine_schedule_data

Unnamed: 0,ISO_3_CODE,COUNTRYNAME,WHO_REGION,YEAR,VACCINECODE,VACCINE_DESCRIPTION,SCHEDULEROUNDS,TARGETPOP,TARGETPOP_DESCRIPTION,GEOAREA,AGEADMINISTERED,SOURCECOMMENT
0,ABW,Aruba,AMRO,2023,DTAPHIBIPV,DTaP-Hib-IPV (acellular) vaccine,1,RISKGROUPS,General/routine,NATIONAL,M2,No comment provided
1,ABW,Aruba,AMRO,2023,DTAPHIBIPV,DTaP-Hib-IPV (acellular) vaccine,2,RISKGROUPS,General/routine,NATIONAL,M4,No comment provided
2,ABW,Aruba,AMRO,2023,DTAPHIBIPV,DTaP-Hib-IPV (acellular) vaccine,3,RISKGROUPS,General/routine,NATIONAL,M6,No comment provided
3,ABW,Aruba,AMRO,2023,DTAPHIBIPV,DTaP-Hib-IPV (acellular) vaccine,4,B_2YL_W,General/routine,NATIONAL,M15,No comment provided
4,ABW,Aruba,AMRO,2023,DTAPIPV,DTaP-IPV (acellular) vaccine,5,B_CHILD_W,General/routine,NATIONAL,Y4,No comment provided
...,...,...,...,...,...,...,...,...,...,...,...,...
8047,ZWE,Zimbabwe,AFRO,2023,VITAMINA,Vitamin A supplements,2,RISKGROUPS,General/routine,NATIONAL,M12,M6-M59
8048,ZWE,Zimbabwe,AFRO,2023,VITAMINA,Vitamin A supplements,3,RISKGROUPS,General/routine,NATIONAL,M18,M6-M59
8049,ZWE,Zimbabwe,AFRO,2023,VITAMINA,Vitamin A supplements,4,RISKGROUPS,General/routine,NATIONAL,M24,M6-M59
8050,ZWE,Zimbabwe,AFRO,2023,VITAMINA,Vitamin A supplements,5,RISKGROUPS,General/routine,NATIONAL,M30,M6-M59


In [66]:
vaccine_schedule_data['SCHEDULEROUNDS'].describe()

count    8052.000000
mean        2.053403
std         1.316560
min         1.000000
25%         1.000000
50%         2.000000
75%         3.000000
max         7.000000
Name: SCHEDULEROUNDS, dtype: float64

In [67]:
coverage_data.to_csv("coverage_dataN", index=False)
incidence_rate_data.to_csv("incidence_rate_dataN",index=False)
reported_cases_data.to_csv("reported_cases_dataN",index=False)
vaccine_introduction_data.to_csv("vaccine_introduction_dataN",index=False)
vaccine_schedule_data.to_csv("vaccine_schedule_dataN",index=False)