<a href="https://colab.research.google.com/github/jayanthbagare/allendowney/blob/main/ThinkStats/Chapter1_TS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Exploratory data analysis for Think Stats, Chapter 1
# Download the dataset from the National Survey of Family Growth (NSFG)

In [3]:
from os.path import basename, exists

def download(url):
  """Download `url` into the current directory unless it is already there.

  The local file name is the last path component of `url`. When a file with
  that name already exists, nothing is fetched and nothing is printed.

  The payload is first written to `<filename>.part` and only renamed to its
  final name after the transfer finished, so an interrupted download cannot
  leave a truncated file behind that later runs would mistake for a complete
  cached copy.

  Args:
    url: address of the file to fetch.

  Returns:
    None. Prints "Download <filename>" after a successful fetch.

  Raises:
    Whatever `urllib.request.urlretrieve` raises on a failed transfer; the
    partial file is removed before the exception propagates.
  """
  filename = basename(url)
  if exists(filename):
    return

  # Imported lazily: only needed on a cache miss.
  import os
  from urllib.request import urlretrieve

  partial = filename + ".part"
  try:
    urlretrieve(url, partial)
    # Rename only after a complete transfer so `filename` is never truncated.
    os.replace(partial, filename)
  finally:
    # Still present only if the transfer or the rename failed.
    if exists(partial):
      os.remove(partial)
  print("Download " + filename)

# Fetch the book's helper module so that `from thinkstats import decorate`
# in a later cell can be resolved from the working directory.
download("https://github.com/AllenDowney/ThinkStats/raw/v3/nb/thinkstats.py")

Download thinkstats.py


In [4]:
# Install empiricaldist only when it cannot be imported, so environments that
# already provide it skip the pip step.
# NOTE(review): the install is unpinned; the run recorded below resolved
# version 0.9.0 — consider pinning for reproducibility.
try:
  import empiricaldist
except ImportError:
  %pip install empiricaldist


Collecting empiricaldist
  Downloading empiricaldist-0.9.0.tar.gz (14 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: empiricaldist
  Building wheel for empiricaldist (pyproject.toml) ... [?25l[?25hdone
  Created wheel for empiricaldist: filename=empiricaldist-0.9.0-py3-none-any.whl size=14296 sha256=97ecd94f6b61bbb91bd777c0039ef91fcbdcab5561545dfc4ac079635595e93d
  Stored in directory: /root/.cache/pip/wheels/96/04/f1/a934ef1f5f3db621dc39566b2c7de45b8c6916044be6a9c9ac
Successfully built empiricaldist
Installing collected packages: empiricaldist
Successfully installed empiricaldist-0.9.0


In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from IPython.display import HTML
from thinkstats import decorate

In [6]:
# Fetch the two NSFG pregnancy files: the Stata dictionary (.dct) and the
# gzip-compressed data file (.dat.gz). Files already present locally are skipped.
download("https://github.com/AllenDowney/ThinkStats/raw/v3/data/2002FemPreg.dct")
download("https://github.com/AllenDowney/ThinkStats/raw/v3/data/2002FemPreg.dat.gz")

Download 2002FemPreg.dct
Download 2002FemPreg.dat.gz


In [7]:
# Install statadict only when it cannot be imported; it provides
# `parse_stata_dict`, used below to read the .dct file.
# NOTE(review): the install is unpinned; the run recorded below resolved
# version 1.1.0 — consider pinning for reproducibility.
try:
  import statadict
except ImportError:
  %pip install statadict

Collecting statadict
  Downloading statadict-1.1.0-py3-none-any.whl.metadata (1.7 kB)
Downloading statadict-1.1.0-py3-none-any.whl (9.4 kB)
Installing collected packages: statadict
Successfully installed statadict-1.1.0


In [8]:
# Local names of the downloaded Stata dictionary and the gzip-compressed data file.
dct_file, dat_file = "2002FemPreg.dct", "2002FemPreg.dat.gz"

In [9]:
from statadict import parse_stata_dict

def read_stata(dct_file, dat_file, compression="gzip"):
  """Read a fixed-width data file using a Stata dictionary for its layout.

  Args:
    dct_file: path to the Stata dictionary (.dct). It is handed to
      `parse_stata_dict`, whose `names` and `colspecs` attributes supply
      the column names and column positions.
    dat_file: path to the fixed-width data file.
    compression: forwarded to `pd.read_fwf`. Defaults to "gzip", which is
      the value that used to be hard-coded; pass None for an uncompressed
      file or "infer" to let pandas decide from the file extension.

  Returns:
    pandas.DataFrame with one column per name in the dictionary.
  """
  stata_dict = parse_stata_dict(dct_file)
  return pd.read_fwf(
      dat_file,
      names=stata_dict.names,
      colspecs=stata_dict.colspecs,
      compression=compression,
  )

In [10]:
# Load the fixed-width pregnancy file into a DataFrame, using the dictionary
# file for column names and positions.
preg = read_stata(dct_file,dat_file)

In [11]:
# Sanity check on the load: (number of rows, number of columns).
preg.shape

(13593, 243)

In [12]:
# Peek at the first rows to confirm the columns were parsed into sensible values.
preg.head()

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,poverty_i,laborfor_i,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw
0,1,1,,,,,6.0,,1.0,,...,0,0,0,0,3410.389399,3869.349602,6448.271112,2,9,1231
1,1,2,,,,,6.0,,1.0,,...,0,0,0,0,3410.389399,3869.349602,6448.271112,2,9,1231
2,2,1,,,,,5.0,,3.0,5.0,...,0,0,0,0,7226.30174,8567.54911,12999.542264,2,12,1231
3,2,2,,,,,6.0,,1.0,,...,0,0,0,0,7226.30174,8567.54911,12999.542264,2,12,1231
4,2,3,,,,,6.0,,1.0,,...,0,0,0,0,7226.30174,8567.54911,12999.542264,2,12,1231


In [13]:
# List the column names taken from the Stata dictionary.
preg.columns

Index(['caseid', 'pregordr', 'howpreg_n', 'howpreg_p', 'moscurrp', 'nowprgdk',
       'pregend1', 'pregend2', 'nbrnaliv', 'multbrth',
       ...
       'poverty_i', 'laborfor_i', 'religion_i', 'metro_i', 'basewgt',
       'adj_mod_basewgt', 'finalwgt', 'secu_p', 'sest', 'cmintvw'],
      dtype='object', length=243)

In [14]:
# Frequency of each outcome code, ordered by code rather than by frequency.
preg["outcome"].value_counts().sort_index()

Unnamed: 0_level_0,count
outcome,Unnamed: 1_level_1
1,9148
2,1862
3,120
4,1921
5,190
6,352


In [15]:
# Frequency table of the pounds part of birth weight. dropna=False also counts
# missing values, and sort_index orders the table by weight instead of by count.
counts = preg["birthwgt_lb"].value_counts(dropna=False).sort_index()
counts

Unnamed: 0_level_0,count
birthwgt_lb,Unnamed: 1_level_1
0.0,8
1.0,40
2.0,53
3.0,98
4.0,229
5.0,697
6.0,2223
7.0,3049
8.0,1889
9.0,623


In [16]:
# .loc slices by label and includes both endpoints, so this selects 0 through 5.
counts.loc[0:5]

Unnamed: 0_level_0,count
birthwgt_lb,Unnamed: 1_level_1
0.0,8
1.0,40
2.0,53
3.0,98
4.0,229
5.0,697


In [17]:
# Total number of records whose pounds value is between 0 and 5 inclusive.
counts.loc[0:5].sum()

np.int64(1125)

In [18]:
# Recode the listed values as missing (NaN) so they are excluded from later statistics.
# NOTE(review): presumably 97/98/99 are survey special codes and 51 is an
# implausible entry — confirm against the NSFG codebook.
preg["birthwgt_lb"] = preg["birthwgt_lb"].replace([51, 97, 98, 99], np.nan)

In [20]:
# Rescale agepreg by 1/100 (the mean of about 24.7 shown below suggests the
# result is in years — confirm against the NSFG codebook).
# The untouched values are kept in `agepreg_raw` and the scaled column is always
# derived from them, so re-running this cell does not divide by 100 a second time.
if "agepreg_raw" not in preg.columns:
  preg["agepreg_raw"] = preg["agepreg"]
preg["agepreg"] = preg["agepreg_raw"] / 100
preg["agepreg"].mean()

np.float64(24.6881511970395)

In [21]:
# Frequency table of the ounces part of birth weight, including missing values,
# ordered by value.
preg["birthwgt_oz"].value_counts(dropna=False).sort_index()

Unnamed: 0_level_0,count
birthwgt_oz,Unnamed: 1_level_1
0.0,1037
1.0,408
2.0,603
3.0,533
4.0,525
5.0,535
6.0,709
7.0,501
8.0,756
9.0,505


In [22]:
# Recode the listed values as missing (NaN) so they are excluded from later statistics.
# NOTE(review): presumably 97/98/99 are survey special codes — confirm against
# the NSFG codebook.
preg["birthwgt_oz"] = preg["birthwgt_oz"].replace([97, 98, 99], np.nan)

In [23]:
# Combine the two weight columns into one value; the ounces column is divided
# by 16 (ounces per pound) before being added to the pounds column.
# A row with NaN in either part yields NaN, which .mean() skips by default.
preg["totalwgt_lb"] = preg["birthwgt_lb"] + preg["birthwgt_oz"] / 16.0
preg["totalwgt_lb"].mean()

np.float64(7.265628457623368)