#### Imports

In [3]:
import pandas as pd
import numpy as np

#### Load Dataset

The dataset is collected from Centers of Disease control and prevention [data.cdc.gov](https://www.cdc.gov/) under the category of [Vaccination Coverage among Pregnancy Women](https://data.cdc.gov/Pregnancy-Vaccination/Vaccination-Coverage-among-Pregnant-Women/h7pm-wmjc/data)

In [4]:
# Read the data
data = pd.read_csv('Vaccination_Coverage_among_Pregnant_Women.csv')

In [5]:
# Display Features
data.columns

Index(['Vaccine', 'Geography Type', 'Geography',
       'Survey Year/Influenza Season', 'Dimension Type', 'Dimension',
       'Estimate (%)', '95% CI (%)', 'Sample Size'],
      dtype='object')

In [6]:
# Display Values
data

Unnamed: 0,Vaccine,Geography Type,Geography,Survey Year/Influenza Season,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size
0,Influenza,States,Alaska,2012,Age,≥18 Years,49.2,45.3 to 53.1,852.0
1,Influenza,States,Arkansas,2012,Age,≥18 Years,46.6,40.7 to 52.5,756.0
2,Influenza,States,Colorado,2012,Age,≥18 Years,56.1,52.1 to 60.0,1170.0
3,Influenza,States,Delaware,2012,Age,≥18 Years,41.6,38.4 to 44.8,981.0
4,Influenza,States,Georgia,2012,Age,≥18 Years,33.6,29.6 to 37.7,1007.0
...,...,...,...,...,...,...,...,...,...
4132,Tdap,States,Utah,2020,Race/Ethnicity,"White, Non-Hispanic",80.1,77.0 to 83.0,979.0
4133,Tdap,States,Vermont,2020,Race/Ethnicity,"White, Non-Hispanic",86.4,83.6 to 88.9,696.0
4134,Tdap,States,Virginia,2020,Race/Ethnicity,"White, Non-Hispanic",83.1,76.9 to 88.2,503.0
4135,Tdap,States,Washington,2020,Race/Ethnicity,"White, Non-Hispanic",80.9,76.2 to 85.0,352.0


In [7]:
# Display Dimension
data.shape

(4137, 9)

In [8]:
# Display Information
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4137 entries, 0 to 4136
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Vaccine                       4137 non-null   object 
 1   Geography Type                4137 non-null   object 
 2   Geography                     4137 non-null   object 
 3   Survey Year/Influenza Season  4137 non-null   int64  
 4   Dimension Type                4137 non-null   object 
 5   Dimension                     4137 non-null   object 
 6   Estimate (%)                  4137 non-null   object 
 7   95% CI (%)                    4137 non-null   object 
 8   Sample Size                   3933 non-null   float64
dtypes: float64(1), int64(1), object(7)
memory usage: 291.0+ KB


In [9]:
# Display the Unique values
data['Vaccine'].value_counts()

Influenza    2891
Tdap         1246
Name: Vaccine, dtype: int64

In [10]:
# Display top 3 values
data.head(3)

Unnamed: 0,Vaccine,Geography Type,Geography,Survey Year/Influenza Season,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size
0,Influenza,States,Alaska,2012,Age,≥18 Years,49.2,45.3 to 53.1,852.0
1,Influenza,States,Arkansas,2012,Age,≥18 Years,46.6,40.7 to 52.5,756.0
2,Influenza,States,Colorado,2012,Age,≥18 Years,56.1,52.1 to 60.0,1170.0


In [11]:
data['Survey Year/Influenza Season'].unique()

array([2012, 2020, 2013, 2014, 2015, 2016, 2017, 2018, 2019])

In [12]:
# Make a new copy
new_data = data.copy()

In [13]:
# Display the presence of missing values
new_data.isnull().sum()

Vaccine                           0
Geography Type                    0
Geography                         0
Survey Year/Influenza Season      0
Dimension Type                    0
Dimension                         0
Estimate (%)                      0
95% CI (%)                        0
Sample Size                     204
dtype: int64

In [14]:
new_data.describe()

Unnamed: 0,Survey Year/Influenza Season,Sample Size
count,4137.0,3933.0
mean,2016.230602,805.390796
std,2.663058,2658.305251
min,2012.0,30.0
25%,2014.0,169.0
50%,2017.0,325.0
75%,2019.0,659.0
max,2020.0,43737.0


In [15]:
type(new_data['Estimate (%)'][0])

str

In [16]:
new_data['Estimate (%)'].unique()

array(['49.2', '46.6', '56.1', '41.6', '33.6', '42.0', '49.1', '53.0',
       '47.9', '66.1', '42.8', '66.0', '45.5', '58.8', '38.6', '37.8',
       '39.5', '43.6', '54.3', '47.3', '47.4', '60.4', '44.1', '48.9',
       '57.5', '60.9', '43.4', '58.6', '39.1', '49.6', '64.0', '57.3',
       '58.2', '74.7', '73.5', '62.9', '73.1', '41.3', '40.9', '68.7',
       '65.1', '77.1', '71.2', '53.7', '72.4', '70.5', '79.8', '58.4',
       '68.2', '48.1', '64.8', '68.5', '76.0', '75.0', '60.8', '67.6',
       '65.0', '74.1', '67.1', '64.5', '34.8', '61.1', '76.1', '54.9',
       '67.2', '64.4', '74.0', '64.7', '70.7', '63.3', '39.0', '51.3',
       '33.4', '27.6', '38.5', '42.5', '49.0', '41.0', '51.0', '37.0',
       '59.4', '32.6', '51.6', '30.7', '50.6', '29.5', '50.0', '38.1',
       '39.8', '56.7', '38.9', '41.9', '51.7', '53.2', '39.7', '52.4',
       '32.0', '40.5', '66.8', '56.4', '67.8', '63.5', '51.1', '49.7',
       '25.9', '29.9', '72.5', '50.3', '73.3', '43.9', '40.4', '69.5',
      

In [18]:
# There is some unwanted values so replacing it with NaN for easy cleaning of data
new_data['Estimate (%)'] = new_data['Estimate (%)'].replace('NR*', 'NaN')

In [19]:
new_data['Estimate (%)'].unique()

array(['49.2', '46.6', '56.1', '41.6', '33.6', '42.0', '49.1', '53.0',
       '47.9', '66.1', '42.8', '66.0', '45.5', '58.8', '38.6', '37.8',
       '39.5', '43.6', '54.3', '47.3', '47.4', '60.4', '44.1', '48.9',
       '57.5', '60.9', '43.4', '58.6', '39.1', '49.6', '64.0', '57.3',
       '58.2', '74.7', '73.5', '62.9', '73.1', '41.3', '40.9', '68.7',
       '65.1', '77.1', '71.2', '53.7', '72.4', '70.5', '79.8', '58.4',
       '68.2', '48.1', '64.8', '68.5', '76.0', '75.0', '60.8', '67.6',
       '65.0', '74.1', '67.1', '64.5', '34.8', '61.1', '76.1', '54.9',
       '67.2', '64.4', '74.0', '64.7', '70.7', '63.3', '39.0', '51.3',
       '33.4', '27.6', '38.5', '42.5', '49.0', '41.0', '51.0', '37.0',
       '59.4', '32.6', '51.6', '30.7', '50.6', '29.5', '50.0', '38.1',
       '39.8', '56.7', '38.9', '41.9', '51.7', '53.2', '39.7', '52.4',
       '32.0', '40.5', '66.8', '56.4', '67.8', '63.5', '51.1', '49.7',
       '25.9', '29.9', '72.5', '50.3', '73.3', '43.9', '40.4', '69.5',
      

In [20]:
new_data['Estimate (%)'] = new_data['Estimate (%)'].dropna()

In [21]:
# The column is in str so typecasting it to float for numerical computations
new_data['Estimate (%)'] = new_data['Estimate (%)'].astype(float)

In [22]:
new_data['Estimate (%)'][0]

49.2

In [23]:
new_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4137 entries, 0 to 4136
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Vaccine                       4137 non-null   object 
 1   Geography Type                4137 non-null   object 
 2   Geography                     4137 non-null   object 
 3   Survey Year/Influenza Season  4137 non-null   int64  
 4   Dimension Type                4137 non-null   object 
 5   Dimension                     4137 non-null   object 
 6   Estimate (%)                  3715 non-null   float64
 7   95% CI (%)                    4137 non-null   object 
 8   Sample Size                   3933 non-null   float64
dtypes: float64(2), int64(1), object(6)
memory usage: 291.0+ KB


In [74]:
# Dropping all the null values
new_data = new_data.dropna()

In [25]:
new_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3715 entries, 0 to 4136
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Vaccine                       3715 non-null   object 
 1   Geography Type                3715 non-null   object 
 2   Geography                     3715 non-null   object 
 3   Survey Year/Influenza Season  3715 non-null   int64  
 4   Dimension Type                3715 non-null   object 
 5   Dimension                     3715 non-null   object 
 6   Estimate (%)                  3715 non-null   float64
 7   95% CI (%)                    3715 non-null   object 
 8   Sample Size                   3715 non-null   float64
dtypes: float64(2), int64(1), object(6)
memory usage: 290.2+ KB


In [26]:
new_data.describe()

Unnamed: 0,Survey Year/Influenza Season,Estimate (%),Sample Size
count,3715.0,3715.0,3715.0
mean,2016.241184,61.275074,849.202153
std,2.648391,15.169773,2728.862515
min,2012.0,5.2,32.0
25%,2014.0,52.0,191.0
50%,2017.0,62.7,346.0
75%,2019.0,72.1,685.5
max,2020.0,98.2,43737.0


#### Central Tendency Measures

In [27]:
# Calculate Mean
new_data['Estimate (%)'].mean()

61.275074024226114

In [28]:
# Calculate Median
new_data['Estimate (%)'].median()

62.7

In [29]:
# Calculate Mode
new_data['Estimate (%)'].mode()

0    57.8
Name: Estimate (%), dtype: float64

In [30]:
# Calculate Quantile
new_data['Estimate (%)'].quantile()

62.7

#### Measure of dispersion

In [31]:
# Calculate Variance
new_data['Estimate (%)'].var()

230.12201180792763

In [32]:
# Calculate Standard Deviation
new_data['Estimate (%)'].std()

15.169772964943398

In [33]:
# Correlation
# Calculate Variance
new_data['Estimate (%)'].corr(new_data['Sample Size'])

-0.019988084872987543

In [35]:
new_data.sample(2)

Unnamed: 0,Vaccine,Geography Type,Geography,Survey Year/Influenza Season,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size
3513,Tdap,National,United States,2020,Age,≥35 Years,78.7,76.8 to 80.5,4108.0
1258,Influenza,States,Arkansas,2015,Age,25-34 Years,66.9,60.6 to 72.8,428.0


In [37]:
new_data['Vaccine'].value_counts()

Influenza    2573
Tdap         1142
Name: Vaccine, dtype: int64

In [38]:
# Create Frequency Table
pd.crosstab(index=new_data['Vaccine'], columns='count')

col_0,count
Vaccine,Unnamed: 1_level_1
Influenza,2573
Tdap,1142


In [39]:
# Create two-way table
pd.crosstab(index=new_data['Geography Type'], columns=new_data['Vaccine'])

Vaccine,Influenza,Tdap
Geography Type,Unnamed: 1_level_1,Unnamed: 2_level_1
National,80,72
States,2493,1070


In [40]:
# Two-way table probability
pd.crosstab(index=new_data['Geography Type'], columns=new_data['Vaccine'], normalize=True)

Vaccine,Influenza,Tdap
Geography Type,Unnamed: 1_level_1,Unnamed: 2_level_1
National,0.021534,0.019381
States,0.671063,0.288022


In [41]:
# Two-way table conditional probability
pd.crosstab(index=new_data['Geography Type'], columns=new_data['Vaccine'], normalize='index', margins=True)

Vaccine,Influenza,Tdap
Geography Type,Unnamed: 1_level_1,Unnamed: 2_level_1
National,0.526316,0.473684
States,0.699691,0.300309
All,0.692598,0.307402


In [42]:
# Two-way table conditional probability
pd.crosstab(index=new_data['Geography Type'], columns=new_data['Vaccine'], normalize='columns', margins=True)

Vaccine,Influenza,Tdap,All
Geography Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
National,0.031092,0.063047,0.040915
States,0.968908,0.936953,0.959085


In [43]:
num_data = new_data.select_dtypes(exclude=[object])

In [44]:
num_data.columns

Index(['Survey Year/Influenza Season', 'Estimate (%)', 'Sample Size'], dtype='object')

In [45]:
# correlation matrix
corr_data = num_data.corr()
corr_data

Unnamed: 0,Survey Year/Influenza Season,Estimate (%),Sample Size
Survey Year/Influenza Season,1.0,0.53468,-0.01362
Estimate (%),0.53468,1.0,-0.019988
Sample Size,-0.01362,-0.019988,1.0


#### Additional Tasks

In [51]:
# kurtosis
kurt_data = new_data['Estimate (%)'].kurt(axis=0)
kurt_data

0.31221625194292413

In [63]:
# Skewness of the data
skew_data = new_data['Sample Size'].skew(axis=0)
skew_data

9.487462004182909

In [54]:
new_data.describe()

Unnamed: 0,Survey Year/Influenza Season,Estimate (%),Sample Size
count,3715.0,3715.0,3715.0
mean,2016.241184,61.275074,849.202153
std,2.648391,15.169773,2728.862515
min,2012.0,5.2,32.0
25%,2014.0,52.0,191.0
50%,2017.0,62.7,346.0
75%,2019.0,72.1,685.5
max,2020.0,98.2,43737.0


0 quartile = 0 quantile = 0 percentile

1 quartile = 0.25 quantile = 25 percentile

2 quartile = .5 quantile = 50 percentile (median)

3 quartile = .75 quantile = 75 percentile

4 quartile = 1 quantile = 100 percentile

---

Percentiles go from 0 to 100.

Quartiles go from 1 to 4(or 0 to 4).

Quantiles can go from anything to anything.

Percentiles and quartiles are examples of quantiles.

In [56]:
# Quartile
new_data.quantile(1) 

  new_data.quantile(1)


Survey Year/Influenza Season     2020.0
Estimate (%)                       98.2
Sample Size                     43737.0
Name: 1.0, dtype: float64

In [73]:
# Percentile
new_data.quantile(0.75)

  new_data.quantile(0.75)


Survey Year/Influenza Season    2019.0
Estimate (%)                      72.1
Sample Size                      685.5
Name: 0.75, dtype: float64

In [72]:
np.percentile(new_data['Sample Size'], 75)

685.5

In [62]:
# Quantile
new_data.quantile()

  new_data.quantile()


Survey Year/Influenza Season    2017.0
Estimate (%)                      62.7
Sample Size                      346.0
Name: 0.5, dtype: float64