In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [2]:
df = pd.read_csv("assets/NISPUF17.csv", index_col=0)

In [3]:
df.shape

(28465, 453)

In [4]:
##### Question 1 #####
# returns the proportion of children in the dataset who had a mother with 
# the education levels equal to 
# - less than high school (<12)                             (1)
# - high school (12)                                        (2)
# - more than high school but not a college graduate (>12)  (3)
# - and college degree.                                     (4)
"""
    {"less than high school":0.2,
     "high school":0.4,
     "more than high school but not college":0.2,
     "college":0.2}
"""

'\n    {"less than high school":0.2,\n     "high school":0.4,\n     "more than high school but not college":0.2,\n     "college":0.2}\n'

In [5]:
df.EDUC1.unique()

array([4, 3, 1, 2])

In [6]:
def proportion_of_education():
  n = len(df.EDUC1)
  less_than_hs = (df.EDUC1 < 2).sum()/n
  hs = (df.EDUC1 == 2).sum()/n
  more_than_hs_less_than_college = (df.EDUC1 == 3).sum()/n
  college = (df.EDUC1 == 4).sum()/n
  
  return {
    'less than high school': less_than_hs,
    'high school': hs,
    'more than high school but not college': more_than_hs_less_than_college,
    'college': college
  }

In [7]:
proportion_of_education()

{'less than high school': 0.10202002459160373,
 'high school': 0.172352011241876,
 'more than high school but not college': 0.24588090637625154,
 'college': 0.47974705779026877}

In [8]:
assert type(proportion_of_education())==type({}), "You must return a dictionary."
assert len(proportion_of_education()) == 4, "You have not returned a dictionary with four items in it."
assert "less than high school" in proportion_of_education().keys(), "You have not returned a dictionary with the correct keys."
assert "high school" in proportion_of_education().keys(), "You have not returned a dictionary with the correct keys."
assert "more than high school but not college" in proportion_of_education().keys(), "You have not returned a dictionary with the correct keys."
assert "college" in proportion_of_education().keys(), "You have not returned a dictionary with the correct keys."

In [9]:
##### Question 2 #####
# explore the relationship between
# - being fed breastmilk as a child                                         (1)
# - and getting a seasonal influenza vaccine from a healthcare provider.

# Return a tuple of  -- (2.5, 0.1)
# - the average number of influenza vaccines for those children we know received breastmilk as a child
# - and those who know did not.

In [10]:
def average_influenza_doses():
  breastmilk = df[(df.CBF_01 == 1)]
  avg_vaccine_breast_milk = breastmilk.P_NUMFLU.dropna().mean()

  no_breastmilk = df[~(df.CBF_01 == 1)]
  avg_vaccine_no_breast_milk = no_breastmilk.P_NUMFLU.dropna().mean()

  return avg_vaccine_breast_milk, avg_vaccine_no_breast_milk

In [11]:
average_influenza_doses()

(1.8799187420058687, 1.591087169441724)

In [12]:
assert len(average_influenza_doses())==2, "Return two values in a tuple, the first for yes and the second for no."

In [13]:
#### Question 3 ####
# see if there is any evidence of a link between:
# - vaccine effectiveness
# - and sex of the child
# - Calculate:
#    - the ratio of the number of children who contracted chickenpox but were vaccinated against it (at least one varicella dose) 
#    - versus those who were vaccinated but did not contract chicken pox. Return results by sex.
"""
{"male":0.2,
 "female":0.4}
"""

'\n{"male":0.2,\n "female":0.4}\n'

In [14]:
def chickenpox_by_sex():
  var = ['SEX', 'HAD_CPOX', 'P_NUMVRC']
  tmp = df[var].dropna()

  male_cp_vax = tmp[(tmp.P_NUMVRC >= 1) & (tmp.HAD_CPOX == 1) & (tmp.SEX == 1)]
  female_cp_vax = tmp[(tmp.P_NUMVRC >= 1) & (tmp.HAD_CPOX == 1) & (tmp.SEX == 2)]

  male_no_cp_vax = tmp[(tmp.P_NUMVRC >= 1) & (tmp.HAD_CPOX == 2) & (tmp.SEX == 1)]
  female_no_cp_vax = tmp[(tmp.P_NUMVRC >= 1) & (tmp.HAD_CPOX == 2) & (tmp.SEX == 2)]
  
  return {
    'female': len(female_cp_vax)/len(female_no_cp_vax),
    'male':len(male_cp_vax)/len(male_no_cp_vax)
  }

In [15]:
chickenpox_by_sex()

{'female': 0.0077918259335489565, 'male': 0.009675583380762664}

In [16]:
assert len(chickenpox_by_sex())==2, "Return a dictionary with two items, the first for males and the second for females."

In [17]:
##### Question 4 #####
# correlation between:
# - having had the chicken pox
# - and the number of chickenpox vaccine doses given

In [18]:
def corr_chickenpox():
  var = ['HAD_CPOX', 'P_NUMVRC']
  tmp = df[var]
  tmp = tmp[(tmp.HAD_CPOX < 3) & ~(tmp.P_NUMVRC.isna())]
  corr, pval = stats.pearsonr(tmp.HAD_CPOX, tmp.P_NUMVRC)
  return corr

In [19]:
corr_chickenpox()

0.07044873460148016

In [20]:
assert -1<=corr_chickenpox()<=1, "You must return a float number between -1.0 and 1.0."