# Examples and Exercises from Think Stats, 2nd Edition

http://thinkstats2.com

Copyright 2016 Allen B. Downey

MIT License: https://opensource.org/licenses/MIT


In [26]:
from __future__ import print_function, division

import nsfg

## Examples from Chapter 1

Read NSFG data into a Pandas DataFrame.

In [2]:
preg = nsfg.ReadFemPreg()
preg.head()

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,laborfor_i,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb
0,1,1,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,8.8125
1,1,2,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,7.875
2,2,1,,,,,5.0,,3.0,5.0,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,9.125
3,2,2,,,,,6.0,,1.0,,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,7.0
4,2,3,,,,,6.0,,1.0,,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,6.1875


Print the column names.

In [3]:
preg.columns

Index([         u'caseid',        u'pregordr',       u'howpreg_n',
             u'howpreg_p',        u'moscurrp',        u'nowprgdk',
              u'pregend1',        u'pregend2',        u'nbrnaliv',
              u'multbrth',
       ...
            u'laborfor_i',      u'religion_i',         u'metro_i',
               u'basewgt', u'adj_mod_basewgt',        u'finalwgt',
                u'secu_p',            u'sest',         u'cmintvw',
           u'totalwgt_lb'],
      dtype='object', length=244)

Select a single column name.

In [4]:
preg.columns[1]

u'pregordr'

Select a column and check what type it is.

In [5]:
pregordr = preg['pregordr']
type(pregordr)

pandas.core.series.Series

Print a column.

In [6]:
pregordr

0        1
1        2
2        1
3        2
4        3
5        1
6        2
7        3
8        1
9        2
10       1
11       1
12       2
13       3
14       1
15       2
16       3
17       1
18       2
19       1
20       2
21       1
22       2
23       1
24       2
25       3
26       1
27       1
28       2
29       3
        ..
13563    2
13564    3
13565    1
13566    1
13567    1
13568    2
13569    1
13570    2
13571    3
13572    4
13573    1
13574    2
13575    1
13576    1
13577    2
13578    1
13579    2
13580    1
13581    2
13582    3
13583    1
13584    2
13585    1
13586    2
13587    3
13588    1
13589    2
13590    3
13591    4
13592    5
Name: pregordr, dtype: int64

Select a single element from a column.

In [7]:
pregordr[0]

1

Select a slice from a column.

In [8]:
pregordr[2:5]

2    1
3    2
4    3
Name: pregordr, dtype: int64

Select a column using dot notation.

In [9]:
pregordr = preg.pregordr

Count the number of times each value occurs.

In [10]:
preg.outcome.value_counts().sort_index()

1    9148
2    1862
3     120
4    1921
5     190
6     352
Name: outcome, dtype: int64

Check the values of another variable.

In [11]:
preg.birthwgt_lb.value_counts().sort_index()

0.0        8
1.0       40
2.0       53
3.0       98
4.0      229
5.0      697
6.0     2223
7.0     3049
8.0     1889
9.0      623
10.0     132
11.0      26
12.0      10
13.0       3
14.0       3
15.0       1
Name: birthwgt_lb, dtype: int64

Make a dictionary that maps from each respondent's `caseid` to a list of indices into the pregnancy `DataFrame`.  Use it to select the pregnancy outcomes for a single respondent.

In [12]:
caseid = 10229
preg_map = nsfg.MakePregMap(preg)
indices = preg_map[caseid]
preg.outcome[indices].values

array([4, 4, 4, 4, 4, 4, 1])

## Exercises

Select the `birthord` column, print the value counts, and compare to results published in the [codebook](http://www.icpsr.umich.edu/nsfg6/Controller?displayPage=labelDetails&fileCode=PREG&section=A&subSec=8016&srtLabel=611933)

In [7]:
# Solution goes here
import nsfg
preg = nsfg.ReadFemPreg()
preg.birthord.value_counts().sort_index()


1.0     4413
2.0     2874
3.0     1234
4.0      421
5.0      126
6.0       50
7.0       20
8.0        7
9.0        2
10.0       1
Name: birthord, dtype: int64

We can also use `isnull` to count the number of nans.

In [14]:
preg.birthord.isnull().sum()

4445

Select the `prglngth` column, print the value counts, and compare to results published in the [codebook](http://www.icpsr.umich.edu/nsfg6/Controller?displayPage=labelDetails&fileCode=PREG&section=A&subSec=8016&srtLabel=611931)

In [8]:
# Solution goes here
preg.prglngth.value_counts()


39    4744
40    1120
38     609
9      594
41     591
6      543
37     457
13     446
4      412
8      409
35     357
36     329
42     328
17     253
11     202
30     198
5      181
7      175
12     170
3      151
43     148
22     147
10     137
32     122
26     117
2       78
34      60
33      50
44      46
16      44
15      39
28      38
21      37
19      34
24      31
31      29
14      29
29      23
20      18
18      17
0       15
25      15
23      12
45      10
1        9
27       8
48       7
50       2
46       1
47       1
Name: prglngth, dtype: int64

To compute the mean of a column, you can invoke the `mean` method on a Series.  For example, here is the mean birthweight in pounds:

In [16]:
preg.totalwgt_lb.mean()

7.265628457623368

In [16]:
# Solution goes here
preg['totalwgt_kg'] = preg.birthwgt_lb*0.453
preg.totalwgt_kg

0        3.624
1        3.171
2        4.077
3        3.171
4        2.718
5        3.624
6        4.077
7        3.624
8        3.171
9        2.718
10       3.171
11       3.171
12       1.812
13         NaN
14         NaN
15       3.171
16       3.171
17       2.718
18         NaN
19       3.624
20       3.624
21       2.265
22         NaN
23       2.718
24       3.171
25       2.718
26       3.624
27       3.171
28       2.718
29       3.171
         ...  
13563    3.171
13564    3.171
13565    3.624
13566    3.171
13567      NaN
13568      NaN
13569    2.265
13570    2.718
13571    2.718
13572    2.265
13573    2.718
13574    2.718
13575      NaN
13576    2.718
13577      NaN
13578    2.718
13579    3.171
13580      NaN
13581    2.718
13582      NaN
13583      NaN
13584    2.718
13585      NaN
13586      NaN
13587      NaN
13588    2.718
13589      NaN
13590      NaN
13591    3.171
13592    3.171
Name: totalwgt_kg, Length: 13593, dtype: float64

`nsfg.py` also provides `ReadFemResp`, which reads the female respondents file and returns a `DataFrame`:

In [18]:
resp = nsfg.ReadFemResp()

`DataFrame` provides a method `head` that displays the first five rows:

In [19]:
resp.head()

Unnamed: 0,caseid,rscrinf,rdormres,rostscrn,rscreenhisp,rscreenrace,age_a,age_r,cmbirth,agescrn,...,pubassis_i,basewgt,adj_mod_basewgt,finalwgt,secu_r,sest,cmintvw,cmlstyr,screentime,intvlngth
0,2298,1,5,5,1,5.0,27,27,902,27,...,0,3247.916977,5123.759559,5556.717241,2,18,1234,1222,18:26:36,110.492667
1,5012,1,5,1,5,5.0,42,42,718,42,...,0,2335.279149,2846.79949,4744.19135,2,18,1233,1221,16:30:59,64.294
2,11586,1,5,1,5,5.0,43,43,708,43,...,0,2335.279149,2846.79949,4744.19135,2,18,1234,1222,18:19:09,75.149167
3,6794,5,5,4,1,5.0,15,15,1042,15,...,0,3783.152221,5071.464231,5923.977368,2,18,1234,1222,15:54:43,28.642833
4,616,1,5,4,1,5.0,20,20,991,20,...,0,5341.329968,6437.335772,7229.128072,2,18,1233,1221,14:19:44,69.502667


Select the `age_r` column from `resp` and print the value counts.  How old are the youngest and oldest respondents?

In [52]:
# Solution goes here
#resp = nsfg.ReadFemResp()
resp.age_r.value_counts().sort_index()

15    217
16    223
17    234
18    235
19    241
20    258
     ... 
39    215
40    256
41    250
42    215
43    253
44    235
Name: age_r, Length: 30, dtype: int64

We can use the `caseid` to match up rows from `resp` and `preg`.  For example, we can select the row from `resp` for `caseid` 2298 like this:

In [37]:
import pandas as pd
pd.set_option('max_columns', 500)

resp[resp.caseid==2298]

Unnamed: 0,caseid,rscrinf,rdormres,rostscrn,rscreenhisp,rscreenrace,age_a,age_r,cmbirth,agescrn,marstat,fmarstat,fmarit,evrmarry,hisp,hispgrp,numrace,roscnt,hplocale,manrel,fl_rage,fl_rrace,fl_rhisp,goschol,vaca,higrade,compgrd,havedip,dipged,cmhsgrad,havedeg,degrees,wthparnw,onown,intact,parmarr,lvsit14f,lvsit14m,womrasdu,momdegre,momworkd,momchild,momfstch,mom18,manrasdu,daddegre,bothbiol,intact18,onown18,numbabes,totplacd,nplaced,ndied,nadoptv,hasbabes,cmlastlb,cmfstprg,cmlstprg,menarche,pregnowq,maybpreg,numpregs,everpreg,currpreg,moscurrp,giveadpt,ngivenad,otherkid,nothrkid,sexothkd,relothkd,adptotkd,tryadopt,tryeithr,stilhere,cmokdcam,othkdfos,cmokddob,othkdspn,othkdrac1,othkdrac2,kdbstrac,okbornus,okdisabl1,sexothkd2,relothkd2,adptotkd2,tryadopt2,tryeithr2,stilhere2,cmokdcam2,othkdfos2,cmokddob2,othkdspn2,othkdrac6,okbornus2,okdisabl5,sexothkd3,relothkd3,adptotkd3,tryadopt3,tryeithr3,stilhere3,cmokdcam3,othkdfos3,cmokddob3,othkdspn3,othkdrac11,okbornus3,okdisabl9,sexothkd4,relothkd4,adptotkd4,tryadopt4,tryeithr4,stilhere4,cmokdcam4,othkdfos4,cmokddob4,othkdspn4,othkdrac16,okbornus4,okdisabl13,sexothkd5,relothkd5,adptotkd5,tryadopt5,tryeithr5,stilhere5,cmokdcam5,othkdfos5,cmokddob5,othkdspn5,othkdrac21,okbornus5,okdisabl17,sexothkd6,relothkd6,adptotkd6,tryadopt6,tryeithr6,stilhere6,cmokdcam6,othkdfos6,cmokddob6,othkdspn6,othkdrac26,okbornus6,okdisabl21,sexothkd7,relothkd7,adptotkd7,tryadopt7,tryeithr7,stilhere7,cmokdcam7,othkdfos7,cmokddob7,othkdspn7,othkdrac31,okbornus7,okdisabl25,sexothkd8,relothkd8,adptotkd8,tryeithr8,stilhere8,cmokdcam8,othkdfos8,cmokddob8,othkdspn8,othkdrac36,othkdrac37,kdbstrac8,okbornus8,okdisabl29,sexothkd9,relothkd9,adptotkd9,tryeithr9,stilhere9,sexothkd10,relothkd10,adptotkd10,tryeithr10,stilhere10,sexothkd11,relothkd11,adptotkd11,tryeithr11,stilhere11,sexothkd12,relothkd12,adptotkd12,tryeithr12,stilhere12,sexothkd13,relothkd13,adptotkd13,tryeithr13,stilhere13,sexothkd14,relothkd14,adptotkd14,tryeithr14,stilhere14,sexothkd15,relothkd15,adptotkd15,tryeithr15,stilhere15,sexothkd16,relothkd16,adptotkd16,tryeithr16,stilhere16,sexothkd17,relothkd17,adptotkd17,tryeithr17,stilhere17,everadpt,seekadpt,contagem,trylong,knowadpt,chosesex,typesexf,typesexm,chosrace,typracbk,typracwh,typracot,choseage,typage2m,typage5m,typag12m,typag13m,chosdisb,typdisbn,typdisbm,typdisbs,chosenum,typnum1m,typnum2m,evwntano,evcontag,turndown,yquittry,aprocess1,...,oldwp06,oldwp07,oldwp08,oldwp09,oldwp10,oldwp11,oldwp12,oldwp13,oldwp14,oldwp15,oldwp16,oldwp17,oldwp18,oldwp19,oldwr01,oldwr02,oldwr03,oldwr04,oldwr05,oldwr06,oldwr07,oldwr08,oldwr09,oldwr10,oldwr11,oldwr12,oldwr13,oldwr14,oldwr15,oldwr16,oldwr17,oldwr18,oldwr19,wantrp01,wantrp02,wantrp03,wantrp04,wantrp05,wantrp06,wantrp07,wantrp08,wantrp09,wantrp10,wantrp11,wantrp12,wantrp13,wantrp14,wantrp15,wantrp16,wantrp17,wantrp18,wantrp19,wantp01,wantp02,wantp03,wantp04,wantp05,wantp06,wantp07,wantp08,wantp09,wantp10,wantp11,wantp12,wantp13,wantp14,wantp15,wantp16,wantp17,wantp18,wantp19,wantp5,strloper_i,fecund_i,infert_i,anymthd_i,nosex12_i,sexp3mo_i,sex3mo_i,constat1_i,constat2_i,constat3_i,constat4_i,pillr_i,condomr_i,sex1mthd1_i,sex1mthd2_i,sex1mthd3_i,sex1mthd4_i,mthuse12_i,meth12m1_i,meth12m2_i,meth12m3_i,meth12m4_i,mthuse3_i,meth3m1_i,meth3m2_i,meth3m3_i,meth3m4_i,nump3mos_i,fmethod1_i,fmethod2_i,fmethod3_i,fmethod4_i,dateuse1_i,sourcem1_i,sourcem2_i,sourcem3_i,sourcem4_i,oldwp01_i,oldwp02_i,oldwp03_i,oldwp04_i,oldwp05_i,oldwp06_i,oldwp07_i,oldwp08_i,oldwp09_i,oldwp10_i,oldwp11_i,oldwp12_i,oldwp13_i,oldwp14_i,oldwp15_i,oldwp16_i,oldwp17_i,oldwp18_i,oldwp19_i,oldwr01_i,oldwr02_i,oldwr03_i,oldwr04_i,oldwr05_i,oldwr06_i,oldwr07_i,oldwr08_i,oldwr09_i,oldwr10_i,oldwr11_i,oldwr12_i,oldwr13_i,oldwr14_i,oldwr15_i,oldwr16_i,oldwr17_i,oldwr18_i,oldwr19_i,wantrp01_i,wantrp02_i,wantrp03_i,wantrp04_i,wantrp05_i,wantrp06_i,wantrp07_i,wantrp08_i,wantrp09_i,wantrp10_i,wantrp11_i,wantrp12_i,wantrp13_i,wantrp14_i,wantrp15_i,wantrp16_i,wantrp17_i,wantrp18_i,wantrp19_i,wantp01_i,wantp02_i,wantp03_i,wantp04_i,wantp05_i,wantp06_i,wantp07_i,wantp08_i,wantp09_i,wantp10_i,wantp11_i,wantp12_i,wantp13_i,wantp14_i,wantp15_i,wantp16_i,wantp17_i,wantp18_i,wantp19_i,wantp5_i,fptit12,fptitmed,fpregfp,fpregmed,fptit12_i,fptitmed_i,fpregfp_i,fpregmed_i,r_stclin,intent,addexp,intent_i,addexp_i,anyprghp,anymschp,infever,ovulate,tubes,infertr,inferth,advice,insem,invitro,endomet,fibroids,pidtreat,evhivtst,anyprghp_i,anymschp_i,infever_i,ovulate_i,tubes_i,infertr_i,inferth_i,advice_i,insem_i,invitro_i,endomet_i,fibroids_i,pidtreat_i,evhivtst_i,insuranc,metro,religion,laborfor,insuranc_i,metro_i,religion_i,laborfor_i,poverty,totincr,pubassis,poverty_i,totincr_i,pubassis_i,basewgt,adj_mod_basewgt,finalwgt,secu_r,sest,cmintvw,cmlstyr,screentime,intvlngth
0,2298,1,5,5,1,5.0,27,27,902,27,2,6.0,5,0,1,1.0,1,5,1.0,2.0,0,0,0,5,,10,1.0,5.0,,,,,2,5.0,1.0,1,,,,1.0,1.0,6.0,2.0,,,1.0,0,1,5,4,0.0,0,0,0,1,1198.0,1119.0,1198.0,12,5.0,,4.0,1,5,,5.0,,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.0,5.0,,,,,,,,,,,,,,,,,,,,,,,5.0,,,,,...,,,,,,,,,,,,,,,5.0,2.0,3.0,2.0,,,,,,,,,,,,,,,,5.0,2.0,3.0,2.0,,,,,,,,,,,,,,,,2.0,2.0,3.0,2.0,,,,,,,,,,,,,,,,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.0,2.0,,,0,0,0,0,0,2,0,0,0,2.0,2.0,2.0,,,,,,,,,,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,2,7,0,0,0,0,156,9,2,0,0,0,3247.916977,5123.759559,5556.717241,2,18,1234,1222,18:26:36,110.492667


And we can get the corresponding rows from `preg` like this:

In [22]:
preg[preg.caseid==2298]

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb,totalwgt_kg
2610,2298,1,,,,,6.0,,1.0,,...,0,0,3247.916977,5123.759559,5556.717241,2,18,,6.875,3.125
2611,2298,2,,,,,6.0,,1.0,,...,0,0,3247.916977,5123.759559,5556.717241,2,18,,5.5,2.5
2612,2298,3,,,,,6.0,,1.0,,...,0,0,3247.916977,5123.759559,5556.717241,2,18,,4.1875,1.903409
2613,2298,4,,,,,6.0,,1.0,,...,0,0,3247.916977,5123.759559,5556.717241,2,18,,6.875,3.125


How old is the respondent with `caseid` 1?

In [29]:
# Solution goes here
resp[resp.caseid == 1].age_r

1069    44
Name: age_r, dtype: int64

What are the pregnancy lengths for the respondent with `caseid` 2298?

In [51]:
# Solution goes here
preg[preg.caseid==2298].prglngth

2610    40
2611    36
2612    30
2613    40
Name: prglngth, dtype: int64

In [48]:
# Solution goes here
preg[preg.caseid == 5012].birthwgt_lb

5515    6.0
Name: birthwgt_lb, dtype: float64