In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats

In [2]:
# Load the NISPUF17 CSV into a DataFrame; the path is relative to the
# notebook's working directory.
df_nis = pd.read_csv("../Data/NISPUF17.csv")

In [3]:
# Display each column's dtype as a quick overview of the loaded frame.
df_nis.dtypes

Unnamed: 0       int64
SEQNUMC          int64
SEQNUMHH         int64
PDAT             int64
PROVWT_D       float64
                ...   
XVRCTY7        float64
XVRCTY8        float64
XVRCTY9        float64
INS_STAT2_I    float64
INS_BREAK_I    float64
Length: 454, dtype: object

### Q1. Identify the type of variables and list them.

<table>
  <tr>
    <th>Feature Name</th>
    <th>Levels of Measurement </th>
  </tr>
  <tr>
    <td>SEQNUMC – unique child ID variable</td>
    <td>Nominal</td>
  </tr>
  <tr>
    <td>SEQNUMHH – unique household ID variable</td>
    <td>Nominal</td>
  </tr>
   <tr>
    <td>ESTIAP17</td>
    <td>Nominal</td>
  </tr>
    <tr>
    <td>STATE</td>
    <td>Nominal</td>
  </tr>
    
   <tr>
    <td>CEN_REG</td>
    <td>Nominal</td>
  </tr>
  <tr>
    <td>AGEGRP – age category of child</td>
    <td>Ordinal</td>
  </tr>
  <tr>
    <td>RACEETHK</td>
    <td>Nominal</td>
  </tr>
   <tr>
    <td>SEX</td>
    <td>Nominal</td>
  </tr>
    <tr>
    <td>FRSTBRN</td>
    <td>Nominal</td>
  </tr>
  <tr>
    <td>EDUC1</td>
    <td>Ordinal</td>
  </tr>
  <tr>
    <td>MARITAL2</td>
    <td>Nominal</td>
  </tr>
   <tr>
    <td>M_AGEGRP2</td>
    <td>Nominal</td>
  </tr>
    <tr>
    <td>STATE</td>
    <td>Nominal</td>
  </tr>
  
</table>

### Q2. Write a function called `proportion_of_education` which returns the proportion of children in the dataset who had a mother with the education levels equal to less than high school (<12), high school (12), more than high school but not a college graduate (>12) and college degree.

In [6]:
def proportion_of_education(df):
    """Return the share of rows falling in each mother's-education category.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``EDUC1`` column. Codes 1, 2, 3 and 4 are mapped to
        the four labels used as dictionary keys below.

    Returns
    -------
    dict
        Maps each education label to ``count / len(df)``. The denominator is
        the total number of rows, so rows with any other (or missing)
        ``EDUC1`` value lower every proportion.

    Raises
    ------
    ZeroDivisionError
        If ``df`` has no rows.
    """
    code_labels = (
        (1, "less than high school"),
        (2, "high school"),
        (3, "more than high school but not college"),
        (4, "college"),
    )
    educ = df['EDUC1']
    total_rows = len(df)
    # int(...) keeps the division in plain Python so the values are built-in
    # floats and a zero-row frame raises instead of yielding NaN.
    return {
        label: int((educ == code).sum()) / total_rows
        for code, label in code_labels
    }

proportion_of_education(df_nis)

{'less than high school': 0.10202002459160373,
 'high school': 0.172352011241876,
 'more than high school but not college': 0.24588090637625154,
 'college': 0.47974705779026877}

### 3. Return a tuple of the average number of influenza vaccines for those children we know received nutritious food in childhood and those we know did not.

In [7]:
def average_influenza_doses(df):
    """Return the mean ``P_NUMFLU`` for the two known ``CBF_01`` groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``CBF_01`` and ``P_NUMFLU`` columns.

    Returns
    -------
    tuple
        ``(mean for rows with CBF_01 == 1, mean for rows with CBF_01 == 2)``.
        Rows with any other ``CBF_01`` value are ignored.
        NOTE(review): codes 1/2 presumably mean "yes"/"no" -- confirm against
        the survey codebook.
    """
    group_code = df["CBF_01"]
    flu_doses = df["P_NUMFLU"]
    # One mean per known group, in the order (code 1, code 2).
    return tuple(flu_doses[group_code == code].mean() for code in (1, 2))

average_influenza_doses(df_nis)

(1.8799187420058687, 1.5963945918878317)

### 4. Calculate the ratio of the number of children who contracted chickenpox but were vaccinated against it (at least one varicella dose) versus those who were vaccinated but did not contract chickenpox. Return results by gender.

In [14]:
def chickenpox_by_sex(mb):
    """Ratio of vaccinated children with ``HAD_CPOX == 1`` to vaccinated
    children with ``HAD_CPOX == 2``, reported per sex.

    Parameters
    ----------
    mb : pandas.DataFrame
        Must contain the ``P_NUMVRC``, ``HAD_CPOX`` and ``SEX`` columns.
        "Vaccinated" means ``P_NUMVRC >= 1``; ``SEX`` code 1 is reported as
        ``'male'`` and code 2 as ``'female'``.

    Returns
    -------
    dict
        ``{'male': ratio, 'female': ratio}``.

    Raises
    ------
    ZeroDivisionError
        If a sex has no vaccinated rows with ``HAD_CPOX == 2``.
    """
    vaccinated = mb['P_NUMVRC'] >= 1
    had_cpox = mb['HAD_CPOX']
    sex = mb['SEX']

    def _count(cpox_code, sex_code):
        # Number of vaccinated rows with the given chickenpox and sex codes.
        selected = vaccinated & (had_cpox == cpox_code) & (sex == sex_code)
        return int(selected.sum())

    return {
        label: _count(1, sex_code) / _count(2, sex_code)
        for label, sex_code in (('male', 1), ('female', 2))
    }
chickenpox_by_sex(df_nis)

{'male': 0.009675583380762664, 'female': 0.0077918259335489565}

### 5. Is there a correlation between having had chickenpox and the number of chickenpox (varicella) vaccine doses given?

In [12]:
def corr_chickenpox(df):
    """Pearson correlation between ``HAD_CPOX`` and ``P_NUMVRC``.

    Only rows with ``HAD_CPOX <= 2`` and non-missing values in both columns
    are used.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``HAD_CPOX`` and ``P_NUMVRC`` columns.

    Returns
    -------
    float
        The correlation coefficient reported by ``scipy.stats.pearsonr``;
        the accompanying p-value is discarded.
    """
    had_cpox = df["HAD_CPOX"]
    doses = df["P_NUMVRC"]

    # Keep rows with HAD_CPOX of at most 2 where neither column is missing.
    usable = (had_cpox <= 2) & doses.notna() & had_cpox.notna()

    coefficient, _ = stats.pearsonr(had_cpox[usable], doses[usable])
    return coefficient

corr_chickenpox(df_nis)

0.07044873460147867