In [1]:
import pandas as pd
import numpy as np

# Question #1

### Goal:
return the proportion of children in the dataset who had a mother with the education levels equal to:
- less than high school
- high school
- more than high school but not a college graduate
- college degree

In [2]:
df = pd.read_csv('NISPUF17.csv')

In [3]:
df['EDUC1'].unique()

array([4, 3, 1, 2], dtype=int64)

CATEGORIES:
- **4** = College
- **3** = More than high school but not college
- **2** = High School
- **1** = Less than High School

In [4]:
df['EDUC1'].value_counts()

4    13656
3     6999
2     4906
1     2904
Name: EDUC1, dtype: int64

To get the proportion of children in the dataset whose mother's education level falls into each of the 4 categories,
we need to divide each category's count by the total number of rows in `'EDUC1'`.

In [5]:
def proportion_of_educ(data=None):
    """Return the share of children per mother's education level.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame with an ``'EDUC1'`` column holding the education codes 1-4.
        Defaults to the notebook-level ``df``.

    Returns
    -------
    dict
        Maps each education label to the fraction of rows carrying the
        matching ``'EDUC1'`` code. A code that never occurs maps to 0.0.
    """
    if data is None:
        data = df

    # Explicit code -> label mapping. Pairing labels with value_counts()
    # positionally is only right while the counts happen to be ordered
    # 4 > 3 > 2 > 1, because value_counts() sorts by frequency, not by code.
    labels_by_code = {
        4: 'College',
        3: 'more than high school but not college',
        2: 'high school',
        1: 'less than high school',
    }

    educ = data['EDUC1']
    # proportion = category count / total number of rows
    shares = educ.value_counts() / educ.shape[0]

    return {label: float(shares.get(code, 0.0))
            for code, label in labels_by_code.items()}

In [6]:
proportion_of_educ()

{'College': 0.47974705779026877,
 'more than high school but not college': 0.24588090637625154,
 'high school': 0.172352011241876,
 'less than high school': 0.10202002459160373}

# Question #2
### Goal:
Return a tuple of the average number of influenza vaccines for those children we know received breastmilk as a child and those we know did not.

In [7]:
df = pd.read_csv('NISPUF17.csv')

In [8]:
df['CBF_01'].head()

0    1
1    2
2    2
3    2
4    1
Name: CBF_01, dtype: int64

We only need to return a tuple of the average number of influenza vaccines for the children we know **received breastmilk** and for those we know **did not**,
so we only need to keep these two values:

- **1** - Yes
- **2** - No

for `'CBF_01'`.

In [9]:
print("These values are the number of influenza doses")
df['P_NUMFLU'].unique()

These values are the number of influenza doses


array([nan,  3.,  0.,  2.,  1.,  4.,  5.,  6.])

In [10]:
def average_influenza_doses(data=None):
    """Return the mean number of influenza doses by breastfeeding status.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame with the ``'CBF_01'`` (1 = received breastmilk, 2 = did not)
        and ``'P_NUMFLU'`` (number of influenza doses) columns.
        Defaults to the notebook-level ``df``.

    Returns
    -------
    tuple
        ``(mean doses for CBF_01 == 1, mean doses for CBF_01 == 2)``.
        Rows with any other ``'CBF_01'`` value are left out, and missing
        ``'P_NUMFLU'`` values are skipped by ``Series.mean``.
    """
    if data is None:
        data = df

    breastfed = data['CBF_01']
    flu_doses = data['P_NUMFLU']

    # Select only the dose column instead of copying every column twice.
    mean_received = flu_doses[breastfed == 1].mean()
    mean_not_received = flu_doses[breastfed == 2].mean()

    return mean_received, mean_not_received

In [11]:
average_influenza_doses()

(1.8799187420058687, 1.5963945918878317)

# Question #3
### Goal:
See if there is any evidence of a link between vaccine effectiveness and sex of the child. Calculate the ratio of the number of children who contracted chickenpox but were vaccinated against it (at least one varicella dose) versus those who were vaccinated but did not contract chicken pox. Return results by sex.

In [12]:
df = pd.read_csv('NISPUF17.csv')

In [13]:
# 'SEX', sex of the child
df['SEX'].unique()

array([1, 2], dtype=int64)

- **1** - MALE
- **2** - FEMALE

In [14]:
# 'HAD_CPOX', had chickenpox
df['HAD_CPOX'].head()

0    2
1    2
2    2
3    2
4    2
Name: HAD_CPOX, dtype: int64

We only need to get
- **1** - who contracted chickenpox
- **2** - did not contract chicken pox

In [15]:
# 'P_NUMVRC', total number of varicella doses
df['P_NUMVRC'].unique()

array([nan,  1.,  0.,  2.,  3.])

In [16]:
def chickenpox_by_sex(data=None):
    """Return, per sex, the chickenpox ratio among vaccinated children.

    For each sex the ratio is

        (vaccinated children who contracted chickenpox)
        / (vaccinated children who did not contract chickenpox)

    where "vaccinated" means at least one varicella dose
    (``'P_NUMVRC' >= 1``).

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame with the ``'SEX'`` (1 = male, 2 = female), ``'HAD_CPOX'``
        (1 = contracted chickenpox, 2 = did not) and ``'P_NUMVRC'``
        (number of varicella doses) columns.
        Defaults to the notebook-level ``df``.

    Returns
    -------
    dict
        ``{'male': ratio, 'female': ratio}``.

    Raises
    ------
    ZeroDivisionError
        If a sex has no vaccinated children who avoided chickenpox.
    """
    if data is None:
        data = df

    # Both the numerator and the denominator only count vaccinated children.
    # Rows with a missing dose count compare False and drop out here.
    vaccinated = data[data['P_NUMVRC'] >= 1]

    ratios = {}
    for label, sex_code in (('male', 1), ('female', 2)):
        had_cpox = vaccinated.loc[vaccinated['SEX'] == sex_code, 'HAD_CPOX']
        # vaccinated and contracted chickenpox
        contracted = int((had_cpox == 1).sum())
        # vaccinated and did not contract chickenpox
        not_contracted = int((had_cpox == 2).sum())
        ratios[label] = contracted / not_contracted

    return ratios

In [17]:
chickenpox_by_sex()

{'male': 0.009675583380762664, 'female': 0.0077918259335489565}

# Question 4
### Goal:
See if there is a correlation between having had chickenpox and the number of chickenpox vaccine doses given (varicella).

In [18]:
import scipy.stats as stats

In [19]:
df = pd.read_csv('NISPUF17.csv')

- `had_chickenpox_column` - is either 1 (for yes) or 2 (for no)
- `num_chickenpox_vaccine_column` - is the number of doses a child has been given of the varicella vaccine.

In [20]:
def corr_chickenpox(data=None):
    """Return the Pearson correlation between chickenpox status and doses.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame with the ``'HAD_CPOX'`` (1 = yes, 2 = no) and ``'P_NUMVRC'``
        (number of varicella doses) columns.
        Defaults to the notebook-level ``df``.

    Returns
    -------
    float
        Pearson correlation coefficient between ``'HAD_CPOX'`` and
        ``'P_NUMVRC'``, computed over rows whose ``'HAD_CPOX'`` is 1 or 2
        and whose ``'P_NUMVRC'`` is not missing. Because "no" is coded
        higher than "yes", a positive value means more doses go together
        with *not* having had chickenpox.
    """
    if data is None:
        data = df

    # Keep only a definite yes/no answer; any other code is not an answer
    # on the yes/no scale and would distort the coefficient.
    known_status = data['HAD_CPOX'].isin([1, 2])
    # pearsonr cannot handle missing values, so drop unknown dose counts.
    has_dose_count = data['P_NUMVRC'].notna()

    usable = data.loc[known_status & has_dose_count, ['HAD_CPOX', 'P_NUMVRC']]

    # pearsonr also returns the p-value of the non-correlation test;
    # only the coefficient is part of this function's result.
    corr, _pval = stats.pearsonr(usable['HAD_CPOX'], usable['P_NUMVRC'])

    return corr

In [21]:
corr_chickenpox()

0.07044873460148