# Weekly Sequences for Hope's Project

## Sample Selection

In [1]:
#setup
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time
import datetime
from collections import defaultdict
from dateutil.relativedelta import relativedelta

# set some nicer defaults for matplotlib
from matplotlib import rcParams

# Plot defaults, assigned one key at a time in the order listed.
_plot_defaults = (
    ('figure.figsize', (10, 6)),
    ('figure.dpi', 150),
    ('lines.linewidth', 2),
    ('axes.grid', False),
    ('axes.facecolor', 'white'),
    ('font.size', 14),
    ('patch.edgecolor', 'none'),
)
for _key, _value in _plot_defaults:
    rcParams[_key] = _value

def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecessary plot borders and axis ticks.

    Parameters
    ----------
    axes : matplotlib Axes, optional
        Axes to modify. When None, the current axes (``plt.gca()``) are used.
    top, right, left, bottom : bool
        Toggle whether the corresponding plot border (spine) is drawn.
        Ticks are re-enabled only on the sides whose border is drawn.

    Returns
    -------
    None
        The axes object is modified in place.
    """
    ax = axes if axes is not None else plt.gca()

    for side, visible in (('top', top), ('right', right),
                          ('left', left), ('bottom', bottom)):
        ax.spines[side].set_visible(visible)

    # turn off all ticks
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')

    # now re-enable the visible sides.
    # tick_top()/tick_bottom() (and tick_left()/tick_right()) each move the
    # ticks to a single side, so calling both in turn would leave ticks only
    # on the side requested last; use 'both' when both sides are wanted.
    if top and bottom:
        ax.xaxis.set_ticks_position('both')
    elif top:
        ax.xaxis.tick_top()
    elif bottom:
        ax.xaxis.tick_bottom()

    if left and right:
        ax.yaxis.set_ticks_position('both')
    elif left:
        ax.yaxis.tick_left()
    elif right:
        ax.yaxis.tick_right()

## Start from the entire NLSY79 sample

In [2]:
# Read the childbirth data into dob_df.
dob_df = pd.read_csv('../data/childbirth.csv')
# Single-argument print(...) emits the same text on Python 2 and Python 3;
# the bare `print x` statement is a SyntaxError on Python 3.
print(dob_df.shape)
# Last expression of the cell: show the first rows.
dob_df.head()

(12686, 26)


Unnamed: 0,caseid_1979,sample_id_1979,sample_race_1979,sample_sex_1979,c1dob_m_xrnd,c1dob_y_xrnd,c2dob_m_xrnd,c2dob_y_xrnd,c3dob_m_xrnd,c3dob_y_xrnd,...,c7dob_m_xrnd,c7dob_y_xrnd,c8dob_m_xrnd,c8dob_y_xrnd,c9dob_m_xrnd,c9dob_y_xrnd,c10dob_m_xrnd,c10dob_y_xrnd,c11dob_m_xrnd,c11dob_y_xrnd
0,1,5,3,2,-4,-4,-4,-4,-4,-4,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
1,2,5,3,2,3,1993,11,1994,-4,-4,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
2,3,5,3,2,6,1981,10,1983,4,1986,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
3,4,5,3,2,8,1980,3,1997,-4,-4,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
4,5,1,3,1,5,1989,-4,-4,-4,-4,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4


In [3]:
# Drop males: keep only rows whose sample_sex_1979 is the female code.
# sample_sex_1979 coding: 1 = male, 2 = female
FEMALE_CODE = 2
sample_df = dob_df[dob_df['sample_sex_1979'] == FEMALE_CODE]
# Single-argument print(...) emits the same text on Python 2 and Python 3.
print(sample_df.shape)

(6283, 26)


In [4]:
# Drop subsamples that were not followed over the entire survey period.
DROPPED_SUBSAMPLE_IDS = [9, 12, 15, 16, 17, 18, 19, 20]

# Each message is formatted into one string so that print(...) produces the
# same "label: values" text on Python 2 and Python 3.
print('subsamples before dropping: %s' % (sample_df.sample_id_1979.unique(),))
sample_df = sample_df[~sample_df['sample_id_1979'].isin(DROPPED_SUBSAMPLE_IDS)]
print('subsamples after dropping: %s' % (sample_df.sample_id_1979.unique(),))
print(sample_df.shape)

subsamples before dropping: [ 5  6  8  7 14 12 13 18 19 20]
subsamples after dropping: [ 5  6  8  7 14 13]
(4926, 26)


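After dropping males and the subsamples that were not followed over the entire survey period, 4,926 of the 6,283 women in NLSY79 remain in the sample.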

In [6]:
# Write the selected sample to CSV, leaving the row index out of the file.
sample_hope_path = '../data/sample_hope.csv'
sample_df.to_csv(sample_hope_path, index=False)