# Construct IVs for 14-year-sequence Sample

In [1]:
#setup
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from collections import defaultdict
from operator import itemgetter
from datetime import datetime

# set some nicer defaults for matplotlib
from matplotlib import rcParams

#these colors come from colorbrewer2.org. Each is an RGB triplet
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
                (0.4, 0.4, 0.4)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = False
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'

def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecessary plot borders and axis ticks
    
    The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn
    """
    ax = axes or plt.gca()
    ax.spines['top'].set_visible(top)
    ax.spines['right'].set_visible(right)
    ax.spines['left'].set_visible(left)
    ax.spines['bottom'].set_visible(bottom)
    
    #turn off all ticks
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')
    
    #now re-enable visibles
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()



In [2]:
# helper functions

def diff_month(d1, d2):
    """
    Return the number of months between the two dates
    """
    
    return (d1.year - d2.year) * 12 + d1.month - d2.month


def print_full(x):
    """display full table in this notebook"""
    
    pd.set_option('display.max_rows', len(x))
    pd.set_option('display.max_columns', x.shape[1])
    return x
    pd.reset_option('display.max_rows')
    pd.reset_option('display.max_columns')
    

## Convert between calendar date and continuous weeks

In [3]:
#this is the file to switch between week number in NLSY79 and calendar year, month, and day
xl_file = pd.ExcelFile('../data/continuous_week_crosswalk_2012.xlsx') 
weekdf = xl_file.parse('weekdates1')
weekdf.head()

Unnamed: 0,Week Start: Month,Week Start: Day,Week Start: Year,Calendar Year Week Number,Continuous Week Number
0,1,1,1978,1,1
1,1,8,1978,2,2
2,1,15,1978,3,3
3,1,22,1978,4,4
4,1,29,1978,5,5


In [4]:
def get_week_num(year, month, which, df=weekdf):
    """
    Given a calendar year and month, return the corresponding week number in NLSY79.
    
    If which=='first', return the first week in that month.
    If which=='last', return the last week in that month. 
    """
    
    the_month = weekdf[(weekdf['Week Start:\nMonth']==month)&(weekdf['Week Start: \nYear']==year)]
    
    if which == 'first':
        return int(the_month.iloc[0]['Continuous \nWeek Number'])
    elif which == 'last':
        return int(the_month.iloc[len(the_month)-1]['Continuous \nWeek Number'])

In [5]:
def get_year_month(weeknum, df=weekdf):
    """
    Given a week number, return the corresponding calendar month and year (yyyy, m).
    """
    
    the_week = weekdf[weekdf['Continuous \nWeek Number']==weeknum]
    return int(the_week.iloc[0]['Week Start: \nYear']), int(the_week.iloc[0]['Week Start:\nMonth'])

## Get sample

In [6]:
# read sample data files
iv_df = pd.read_csv("../data/analytic_df_seq14.csv")
print iv_df.columns

Index([u'caseid_1979', u'sample_id_1979', u'sample_race_1979',
       u'sample_sex_1979', u'c1dob_m_xrnd', u'c1dob_y_xrnd', u'c2dob_m_xrnd',
       u'c2dob_y_xrnd', u'c3dob_m_xrnd', u'c3dob_y_xrnd', u'c4dob_m_xrnd',
       u'c4dob_y_xrnd', u'c5dob_m_xrnd', u'c5dob_y_xrnd', u'c6dob_m_xrnd',
       u'c6dob_y_xrnd', u'c7dob_m_xrnd', u'c7dob_y_xrnd', u'c8dob_m_xrnd',
       u'c8dob_y_xrnd', u'c9dob_m_xrnd', u'c9dob_y_xrnd', u'c10dob_m_xrnd',
       u'c10dob_y_xrnd', u'c11dob_m_xrnd', u'c11dob_y_xrnd', u'y_dob',
       u'm_dob', u'y_age35', u'y_deceased', u'last_yr', u'attr_before35',
       u'deceased_before35', u'y_child18', u'deceased_child18', u'y_child14',
       u'attr_child14'],
      dtype='object')


In [7]:
iv_df = iv_df[['caseid_1979', 'c1dob_y_xrnd', 'c1dob_m_xrnd', 'y_dob', 'm_dob']]
iv_df = iv_df.rename(index=str, columns={"c1dob_y_xrnd": "childbirth_year", "c1dob_m_xrnd":"childbirth_month"})

In [8]:
# read main (18-year-seq) sample - use the original size-3377 sample (before dropping the cases with invalid marstat)
main_df = pd.read_csv("../data/analytic_df.csv")
print main_df.shape
main_df.head()

(3434, 36)


Unnamed: 0,caseid_1979,sample_id_1979,sample_race_1979,sample_sex_1979,c1dob_m_xrnd,c1dob_y_xrnd,c2dob_m_xrnd,c2dob_y_xrnd,c3dob_m_xrnd,c3dob_y_xrnd,...,y_dob,m_dob,y_age35,y_deceased,last_yr,attr_before35,deceased_before35,y_child18,deceased_child18,attr_child18
0,2,5,3,2,3,1993,11,1994,-4,-4,...,1959,1,1994,,2014,False,False,2011,False,False
1,3,5,3,2,6,1981,10,1983,4,1986,...,1961,8,1996,,2014,False,False,1999,False,False
2,4,5,3,2,8,1980,3,1997,-4,-4,...,1962,8,1997,,1998,False,False,1998,False,False
3,8,6,3,2,3,1976,5,1979,9,1982,...,1958,7,1993,,2014,False,False,1994,False,False
4,16,5,3,2,2,1990,8,1993,9,1996,...,1958,10,1993,,2014,False,False,2008,False,False


In [9]:
# construct IVs for only the newly added cases that are not present in the main sapmle
print iv_df.shape
iv_df = iv_df[(~iv_df.caseid_1979.isin(main_df.caseid_1979.values.tolist()))]
print iv_df.shape

sample = iv_df.caseid_1979.values.tolist()

(3682, 5)
(248, 5)


In [10]:
iv_df.childbirth_year.value_counts()

1996    43
1997    39
1998    28
1999    22
1985    14
1984    10
1982     9
1986     9
1983     7
1989     7
1991     6
1977     6
1992     6
1980     6
1981     6
1979     4
1995     4
1994     4
1993     4
1990     3
1988     3
1987     2
1978     2
1976     2
1975     1
1973     1
Name: childbirth_year, dtype: int64

In [11]:
def get_start_month(row):
    """
    Find the month after birth month
    """
    
    if row['childbirth_month'] == 12:
        return 1
    else:
        return row['childbirth_month']+1
 
def get_start_year(row):
    """
    Find the month after birth month
    """
    
    if row['childbirth_month'] == 12:
        return row['childbirth_year']+1
    else:
        return row['childbirth_year']
    
iv_df['start_y'] = iv_df.apply(get_start_year, axis=1)
iv_df['start_m'] = iv_df.apply(get_start_month, axis=1)
iv_df.head()

Unnamed: 0,caseid_1979,childbirth_year,childbirth_month,y_dob,m_dob,start_y,start_m
13,46,1994,4,1962,11,1994,5
19,62,1996,8,1964,7,1996,9
22,78,1989,3,1957,4,1989,4
36,118,1981,12,1961,10,1982,1
51,159,1985,9,1960,4,1985,10


## Age at first birth 

Use respondent's birth year reported in 1979

In [12]:
def get_age(row):
    """Use both year and month to determine age"""

    return row["childbirth_year"] - row["y_dob"] - (row["childbirth_month"] < row['m_dob'])

iv_df["age"] = iv_df.apply(get_age, axis=1)
iv_df.head()

Unnamed: 0,caseid_1979,childbirth_year,childbirth_month,y_dob,m_dob,start_y,start_m,age
13,46,1994,4,1962,11,1994,5,31
19,62,1996,8,1964,7,1996,9,32
22,78,1989,3,1957,4,1989,4,31
36,118,1981,12,1961,10,1982,1,20
51,159,1985,9,1960,4,1985,10,25


## Education

* Type 1: mother gave birth before <= 1979 (so that we lack yearly education variable)

At the time of pulling, if less than age 18 (strictly), assign no hs. Otherwise, use education reported in 1979. 


* Type 2: mother gave birth after > 1979

Use highest grade completed (revised) by the time one year prior to birth --- These variables represent the highest grade completed by the respondent as of May 1 survey year. These are created with some adjustments to keep people from regressing in years of schooling when they take such actions as returning to school in a new field.

In [13]:
hi_grade_df = pd.read_csv('../data/hi_grade_new.csv')
hi_grade_df.head()

Unnamed: 0,caseid_1979,q3_4_1979,hgc_1979,hgcrev79_1979,q3_4_1980,hgc_1980,hgcrev80_1980,q3_4_1981,hgc_1981,hgcrev81_1981,...,q3_4_2008,hgc_2008,hgcrev08_2008,q3_4_2010,hgc_2010,hgcrev10_2010,q3_4_2012,hgc_2012,q3_4_2014,hgc_2014
0,1,12,12,12,-5,-5,-5,-4,12,12,...,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5
1,2,9,9,9,-4,9,9,-4,9,9,...,13,12,12,13,12,12,13,12,12,12
2,3,10,10,10,10,10,10,-5,-5,-5,...,12,12,12,-5,-5,-5,10,12,10,12
3,4,9,9,9,-5,-5,-5,-4,9,9,...,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5
4,5,13,13,13,14,14,14,15,15,15,...,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5


In [14]:
# check different scenarios. merge data to access variables easily 
merged = iv_df.merge(hi_grade_df, on='caseid_1979')

In [15]:
# type 1: mother gave birth before <= 1979
# at the time of pulling, if less than age 18 (strictly), assign no hs
# if >= 18, use education in 1979
age_prebirth = merged[merged.childbirth_year <= 1979].age - 1
age_prebirth.value_counts()

17    4
16    4
15    3
21    2
14    1
19    1
18    1
Name: age, dtype: int64

In [16]:
def get_hi_grade(row):
    """
    Given a row of a respondent's data, return the highest grade completed by one year prebirth.
    
    If mother gave birth before <= 1979:
        # At the time of pulling, if less than age 18 (strictly), assign no hs
        # Otherwise, use education in 1979
        
    If mother gave birth after > 1979:
        Use the highest grade completed by one year prior to birth
    """
    
    if row['childbirth_year'] <= 1979:
    
        if row['age'] - 1 < 18: # age at one year prebirth < 18
            return -1 # assume less than high school
        else:
            if row['hgcrev79_1979'] >= 0: # valid grade
                return row['hgcrev79_1979']
            else:
                print 'missing pre-1979, respondent %s, grade %s'%(row['caseid_1979'], row['hgcrev79_1979'])
                return None
            
    else:
        
        # up to 2010, revised highest grade is not provided in 2012 or 2014
        years = range(1979, 1995) + range(1996, 2011, 2)

        # do not include grade 95 (which stands for ungraded study)
        # only count until the year prior to birth 
        vals = [row['hgcrev%s_%s'%(str(y)[2:], y)] for y in years 
            if row['hgcrev%s_%s'%(str(y)[2:], y)]<95 and row['hgcrev%s_%s'%(str(y)[2:], y)]>=0 and y<row['childbirth_year']]

        # record the corresponding years in the same order
        #years = [y for y in years 
        #    if row['hgcrev%s_%s'%(str(y)[2:], y)]<95 and row['hgcrev%s_%s'%(str(y)[2:], y)]>=0] 

        #return the highest degree and the year in which it was completed (the earliest year the highest grade appeared)
        if len(vals) > 0:
            return max(vals)
        else:
            print 'missing post-1979, respondent %s, grade %s'%(row['caseid_1979'], row['hgcrev79_1979'])
            return None

        
def recode_educ(g):
    """
    Convert highest grade completed to educational category
    """
    
    if g<12: # less than high school
        return 1
        
    elif g==12: # high school
        return 2
        
    elif (g>12 and g<16): # some college
        return 3
        
    elif g==16: # college graduate
        return 4
        
    elif g>16: # advanced degree
        return 5


merged["hi_deg"] = merged.apply(get_hi_grade, axis=1)
merged["educ"] = merged.hi_deg.map(recode_educ)
merged.head()

Unnamed: 0,caseid_1979,childbirth_year,childbirth_month,y_dob,m_dob,start_y,start_m,age,q3_4_1979,hgc_1979,...,hgcrev08_2008,q3_4_2010,hgc_2010,hgcrev10_2010,q3_4_2012,hgc_2012,q3_4_2014,hgc_2014,hi_deg,educ
0,46,1994,4,1962,11,1994,5,31,9,9,...,19,-5,-5,-5,-5,-5,-5,-5,19,5
1,62,1996,8,1964,7,1996,9,32,8,8,...,13,13,13,13,14,14,16,16,12,2
2,78,1989,3,1957,4,1989,4,31,15,15,...,-5,-5,-5,-5,-5,-5,-5,-5,16,4
3,118,1981,12,1961,10,1982,1,20,8,8,...,-5,-5,-5,-5,-5,-5,-5,-5,9,1
4,159,1985,9,1960,4,1985,10,25,12,12,...,-5,-5,-5,-5,-5,-5,-5,-5,16,4


In [17]:
# how many missing education? NO MISSING :)
sum(merged.educ.isnull())

0

In [18]:
merged.educ.value_counts()

2    79
3    59
4    43
5    37
1    30
Name: educ, dtype: int64

## Pre-birth work experience

In [19]:
wkarray_df = pd.read_csv('../data/weeklyarrays.csv')
print wkarray_df.shape
print wkarray_df.columns[:10]

(12686, 3969)
Index([u'caseid_1979', u'hrs_worked_wk_num0000_xrnd',
       u'hrs_worked_wk_num0001_xrnd', u'hrs_worked_wk_num0002_xrnd',
       u'hrs_worked_wk_num0003_xrnd', u'hrs_worked_wk_num0004_xrnd',
       u'hrs_worked_wk_num0005_xrnd', u'hrs_worked_wk_num0006_xrnd',
       u'hrs_worked_wk_num0007_xrnd', u'hrs_worked_wk_num0008_xrnd'],
      dtype='object')


In [20]:
merged = iv_df.merge(wkarray_df, on='caseid_1979')

In [21]:
def get_unadj_exp(row): 
    """
    Return unadjusted work experience - count all work hours reported before one year pre-birth 
    """

    
    if row["childbirth_year"]-1 < 1978:
        if row['hrs_worked_wk_num0000_xrnd'] == 0: # if we know you never worked pre-1978
            return 0
        else:
            return None

    
    # assume child birth happened in the last week of the month
    end_wk = get_week_num(row["childbirth_year"]-1, row["childbirth_month"], 'last')
    
    # put all valid weekly hours into a list
    hours = [row['hrs_worked_wk_num%04d_xrnd'%w] for w in range(end_wk+1) if row['hrs_worked_wk_num%04d_xrnd'%w]>=0]    

    # count 
    total = end_wk + 1 # total number of weekly hours examined
    total_valid = len(hours) # number of valid weekly hours
        
    
    pct_valid = float(total_valid)/total
    if pct_valid == 1: # if no missing weeks, return sum of hours
        return int(sum(hours))

    elif pct_valid >= 0.9: # if missing <= 10%, expand 
        return int(round(sum(hours)/pct_valid))

    else:
        return None
    

merged["exp"] = merged.apply(get_unadj_exp, axis=1)
iv_df = iv_df.merge(merged[['caseid_1979', 'exp']], on='caseid_1979')
iv_df.head()

Unnamed: 0,caseid_1979,childbirth_year,childbirth_month,y_dob,m_dob,start_y,start_m,age,exp
0,46,1994,4,1962,11,1994,5,31,19092.0
1,62,1996,8,1964,7,1996,9,32,31103.0
2,78,1989,3,1957,4,1989,4,31,19073.0
3,118,1981,12,1961,10,1982,1,20,3660.0
4,159,1985,9,1960,4,1985,10,25,12699.0


In [22]:
# how many and what proportion of respondents have missing unadjusted experience?
x = sum(iv_df.exp.isnull())
print x, float(x)/iv_df.shape[0]

5 0.0201612903226


In [23]:
# how many missing unadj experience come from mothers who gave first birth before 1978?
print iv_df[(iv_df.exp.isnull())&(iv_df.childbirth_year<=1978)].shape[0]

1


In [24]:
# how many Rs have 0 prebirth experience? 
print iv_df[iv_df.exp==0].shape[0]

#how many Rs had first birth before 1978 and have 0 prebirth experience
print iv_df[((iv_df.childbirth_year<=1978)&(iv_df.exp==0))].shape[0]

15
11


In [25]:
# correlation between exp and age at first birth (after removing cases with missing exp)
nomissing_df = iv_df[~iv_df.exp.isnull()]
np.corrcoef(nomissing_df.exp, nomissing_df.age)

array([[ 1.        ,  0.85108151],
       [ 0.85108151,  1.        ]])

## Two dummy variables: (1) whether working at one year pre-birth and (2) whether ever held job at one year pre-birth

In [26]:
# read job dates data
jobdates_df = pd.read_csv("../data/jobdates.csv")
jobdates_df.head()

Unnamed: 0,caseid_1979,start_wk_1979_job01_xrnd,start_wk_1979_job02_xrnd,start_wk_1979_job03_xrnd,start_wk_1979_job04_xrnd,start_wk_1979_job05_xrnd,stop_wk_1979_job01_xrnd,stop_wk_1979_job02_xrnd,stop_wk_1979_job03_xrnd,stop_wk_1979_job04_xrnd,...,start_wk_2012_job01_xrnd,start_wk_2012_job02_xrnd,start_wk_2012_job03_xrnd,start_wk_2012_job04_xrnd,start_wk_2012_job05_xrnd,stop_wk_2012_job01_xrnd,stop_wk_2012_job02_xrnd,stop_wk_2012_job03_xrnd,stop_wk_2012_job04_xrnd,stop_wk_2012_job05_xrnd
0,1,21,0,-4,-4,-4,61,20,-4,-4,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
1,2,19,-4,-4,-4,-4,60,-4,-4,-4,...,1680,-4,-4,-4,-4,1822,-4,-4,-4,-4
2,3,37,20,5,-4,-4,57,30,15,-4,...,1620,1610,-4,-4,-4,1837,1618,-4,-4,-4
3,4,-4,-4,-4,-4,-4,-4,-4,-4,-4,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
4,5,5,23,-4,-4,-4,67,35,-4,-4,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4


In [27]:
# how many respondents had birth prior to 1978 and had positive or missing hours prior to 1978?
iv_df[(iv_df.childbirth_year<=1978)&(iv_df.exp!=0)].shape[0]

1

In [28]:
# for the respondents who had birth prior to 1978 and had positive or missing hours prior to 1978,
# do they report any jobs with start/end date prior to 1/1/1978?

sample = iv_df[(iv_df.childbirth_year<=1978)&(iv_df.exp!=0)].caseid_1979.values.tolist()
years = range(1979, 1995) + range(1996, 2013, 2)

for i in sample:
    job_row = jobdates_df[jobdates_df.caseid_1979==i].iloc[0]
    
    for year in years:   
        for j in range(1, 6):
            start = int(job_row['start_wk_%i_job%02d_xrnd'%(year, j)])
            end = int(job_row['stop_wk_%i_job%02d_xrnd'%(year, j)])

            if start == 0 or end == 0:
                print i, year, j, start, end

694 1979 2 0 9


In [29]:
# read job dates reported in calendar Y/M/D in 1979
jobdates79_df = pd.read_csv('../data/jobdates79.csv')
jobdates79_df.head()

Unnamed: 0,caseid_1979,employer_startdate_01_m_1979,employer_startdate_01_d_1979,employer_startdate_01_y_1979,employer_startdate_02_m_1979,employer_startdate_02_d_1979,employer_startdate_02_y_1979,employer_startdate_03_m_1979,employer_startdate_03_d_1979,employer_startdate_03_y_1979,...,employer_stopdate_02_y_1979,employer_stopdate_03_m_1979,employer_stopdate_03_d_1979,employer_stopdate_03_y_1979,employer_stopdate_04_m_1979,employer_stopdate_04_d_1979,employer_stopdate_04_y_1979,employer_stopdate_05_m_1979,employer_stopdate_05_d_1979,employer_stopdate_05_y_1979
0,1,5,22,78,9,20,75,-4,-4,-4,...,78,-4,-4,-4,-4,-4,-4,-4,-4,-4
1,2,5,8,78,-4,-4,-4,-4,-4,-4,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
2,3,9,11,78,5,15,78,1,30,78,...,78,4,15,78,-4,-4,-4,-4,-4,-4
3,4,-4,-4,-4,-4,-4,-4,-4,-4,-4,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
4,5,2,1,78,6,5,78,-4,-4,-4,...,78,-4,-4,-4,-4,-4,-4,-4,-4,-4


In [30]:
# ever held job can be determined by work hours is zero - but this would not handle respondents w/ missing work hours
# let's use job starting and ending dates (in terms of continuous weeks)

count_missing = defaultdict(list) # count how many jobs have missing start or end date

def get_work_dummies(row):
    """
    Determine two dummy variables: (working at one year pre-brith, ever held job at one year pre-birth)
    
    3 scenarios:
    
    1. Pre-birth work experience is zero:
    Set both dummies to zero
    
    2. One year pre-birth is before 1978 and we don't have job starting/ending date information:
    If had positive work hours, assume YES on both variables 
    since we know that they worked at some point in this relatively narrow period
    *** DOES NOT APPLY IN THIS SUBSAMPLE ***
    
    3. One year pre-birth is after 1978, use job dates to determine whether any job overlap with one year prebirth 
    
    """
    ##################
    ### scenario 1 ###
    ##################
    if row['exp'] == 0: 
        return (0, 0)
    
    
    # find birth year and month
    dob_y = int(row['childbirth_year'])
    dob_m = int(row['childbirth_month'])
    
    jobs = []
    
    years = range(1979, 1995) + range(1996, 2013, 2)
    job_row = jobdates_df[jobdates_df.caseid_1979==row['caseid_1979']].iloc[0]
    
    ##################
    ### scenario 2: DOES NOT APPLY, THE IF STATEMENT BELOW JUMPS OVER THIS BLOCK OF CODES ###
    ##################   
    if dob_y - 1 < 1978:
        
        # look for jobs with a start week 0, i.e. started before 1978
        for year in years:   
            for j in range(1, 6):
                start = int(job_row['start_wk_%i_job%02d_xrnd'%(year, j)])
                end = int(job_row['stop_wk_%i_job%02d_xrnd'%(year, j)])

                if start == 0 or end == 0: # then use job start/stop date reported in calendar date
                    job79_row = jobdates79_df[jobdates79_df.caseid_1979==row['caseid_1979']].iloc[0]
                    start_y = int(job79_row['employer_startdate_%02d_y_%s'%(j, year)])
                    start_m = int(job79_row['employer_startdate_%02d_m_%s'%(j, year)])
                    stop_y = int(job79_row['employer_stopdate_%02d_y_%s'%(j, year)])
                    stop_m = int(job79_row['employer_stopdate_%02d_m_%s'%(j, year)])
                    
                    # three respondents had missing start month
                    # one person started the job one year before prebirth year, so it doesn't matter
                    # the other two's job started the same year as prebirth year. assume June
                    if start_m < 0:
                        start_m = 6
                        print 'missing start month: respondent id', row['caseid_1979']
                    
                    if datetime(start_y, start_m, 1) <= datetime(dob_y-1, dob_m, 1):
                        jobs.append(0) # assign 0 to mark that we found a job that started <= the month one year prebirth
                        
                        if stop_y == -4: # held the job until 1979
                            jobs.append(1) # assign 1 to mark that we found a job that overlaps with the month one year prebirth
                        elif datetime(stop_y, stop_m, 1) >= datetime(dob_y-1, dob_m, 1):
                            jobs.append(1)
    
    
    ##################
    ### scenario 3 ###
    ##################
    
    else:
    
        # find all weeks in the month that is one year pre-birth
        # DO NOT assume birth took place in the last week of the month
        birth_weeks = range(get_week_num(dob_y-1, dob_m, 'first'), get_week_num(dob_y-1, dob_m, 'last')+1)

        # first, find all jobs that overlap with birth weeks
        for year in years:   
            for j in range(1, 6):
                start = int(job_row['start_wk_%i_job%02d_xrnd'%(year, j)])
                end = int(job_row['stop_wk_%i_job%02d_xrnd'%(year, j)])

                # only consider jobs with valid starting/ending week number 
                if start >= 0 and end >= 0 and start <= end:
                    if start <= birth_weeks[-1]: # if the job started before one year pre-birth
                        job_weeks = range(start, end+1)
                        if len(set(job_weeks)&set(birth_weeks)) > 0:
                            jobs.append(1) # this job overlaps with one year pre-birth
                        else:
                            jobs.append(0) # this job only started before one year pre-birth, but no overlap
                elif start == -4 and end == -4:
                    pass # valid skip
                else:
                    count_missing[row['caseid_1979']].append((year, j, start, end))
                
    
    # outcome variables to return
    working = 0
    held_job = 0
    
    if 1 in jobs:
        working = 1
    if len(jobs) > 0:
        held_job = 1
        
    return (working, held_job)


dummies = iv_df.apply(get_work_dummies, axis=1)
iv_df['working'] = map(itemgetter(0), dummies)
iv_df['had_job'] = map(itemgetter(1), dummies)
iv_df.head()

Unnamed: 0,caseid_1979,childbirth_year,childbirth_month,y_dob,m_dob,start_y,start_m,age,exp,working,had_job
0,46,1994,4,1962,11,1994,5,31,19092.0,1,1
1,62,1996,8,1964,7,1996,9,32,31103.0,1,1
2,78,1989,3,1957,4,1989,4,31,19073.0,1,1
3,118,1981,12,1961,10,1982,1,20,3660.0,1,1
4,159,1985,9,1960,4,1985,10,25,12699.0,1,1


In [31]:
# how many Rs had missing job dates (reported in continuous week number)?
len(count_missing)

27

In [32]:
# print the first five respondents with missing job dates
# elements in each tuple: (survey year, job number, start week, stop week)

for i in range(len(count_missing)):
    print count_missing.items()[i]
    if i > 5:
        break

(7822.0, [(1987, 2, 427, -3)])
(5138.0, [(1980, 2, 60, 59)])
(1948.0, [(1994, 1, 869, 868)])
(5533.0, [(1992, 4, 388, -3)])
(9891.0, [(1979, 1, 90, 63)])
(167.0, [(1991, 4, 654, -3)])
(1564.0, [(1989, 2, 600, 551)])


## Marital status at one year pre-birth

In [33]:
marstat_df = pd.read_csv('../data/marstat.csv')
marstat_df.head()

Unnamed: 0,caseid_1979,marstat_1979,marstat_1980,marstat_1981,marstat_1982,marstat_1983,marstat_1984,marstat_1985,marstat_1986,marstat_1987,...,mobg2m_xrnd,yrbg2m_xrnd,moen2m_xrnd,yren2m_xrnd,mobg3m_xrnd,yrbg3m_xrnd,marstat_2006,marstat_2008,marstat_2010,marstat_2012
0,1,0,-5,0,-5,-5,-5,-5,-5,-5,...,-999,-999,-999,-999,-999,-999,-5,-5,-5,-5
1,2,0,0,0,0,0,0,0,0,0,...,-998,-998,-998,-998,-998,-998,1,1,1,1
2,3,0,0,-5,1,1,1,1,1,1,...,-998,-998,-998,-998,-998,-998,1,1,-5,1
3,4,0,-5,1,1,2,2,2,3,2,...,5,1995,-996,-996,-996,-996,-5,-5,-5,-5
4,5,0,0,0,0,0,0,-5,1,-5,...,-998,-998,-998,-998,-998,-998,-5,-5,-5,-5


In [34]:
# read interview dates
intdate_df = pd.read_csv("../data/int_date.csv") 
intdate_df.head()

Unnamed: 0,caseid_1979,date_m_1979,date_d_1979,date_m_1980,date_d_1980,date_m_1981,date_d_1981,date_m_1982,date_d_1982,date_m_1983,...,date_y_2006,date_d_2008,date_m_2008,date_y_2008,date_d_2010,date_m_2010,date_y_2010,date_d_2012,date_m_2012,date_y_2012
0,1,3,3,-5,-5,6,25,-5,-5,-5,...,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5
1,2,2,28,3,3,2,23,2,6,2,...,2006,4,3,2008,10,3,2010,3,12,2012
2,3,2,8,5,20,-5,-5,3,11,3,...,2006,3,11,2008,-5,-5,-5,19,3,2013
3,4,2,8,-5,-5,3,24,3,3,1,...,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5
4,5,4,19,4,3,3,12,3,1,4,...,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5


In [35]:
#how many respondents have missing marriage dates?

missing = []
sample = iv_df.caseid_1979.values.tolist()

for i in sample:
    mar_row = marstat_df[marstat_df.caseid_1979==i].iloc[0]
    
    for j in range(1, 4):
        try: #there is no 'moen3m_xrnd'
            if mar_row['mobg%sm_xrnd'%j] in [-1, -2, -3, -5] \
                or mar_row['yrbg%sm_xrnd'%j] in [-1, -2, -3, -5] \
                or mar_row['moen%sm_xrnd'%j] in [-1, -2, -3, -5] \
                or mar_row['yren%sm_xrnd'%j] in [-1, -2, -3, -5]:
                missing.append(i)
        except KeyError:
            pass
            
print len(missing), len(set(missing))

11 11


In [36]:
# marital status
# how many marriage start/end time overlaps with the month that is one year prior to birth

def overlap(row):
    """
    Return True if a marriage start/end time overlaps with the month that is one year prior to birth
    """
    
    # prebirth
    y = int(row['childbirth_year']) - 1
    m = int(row['childbirth_month'])
    
    mar_row = marstat_df[marstat_df.caseid_1979==row['caseid_1979']].iloc[0]
    
    #Documentation:
    # -999: NEVER MARRIED
    # -998: 1ST MARRIAGE NEVER ENDED
    # -997: 1ST MARRIAGE ENDED NO 2ND MARRIAGE
    # -996: 2ND MARRIAGE NEVER ENDED
    # -995: 2ND MARRIAGE ENDED NO 3RD MARRIAGE
    
    return ((y == mar_row['yrbg1m_xrnd'] and m == mar_row['mobg1m_xrnd']) \
            or (y == mar_row['yren1m_xrnd'] and m == mar_row['moen1m_xrnd']) \
            or (y == mar_row['yrbg2m_xrnd'] and m == mar_row['mobg2m_xrnd']) \
            or (y == mar_row['yren2m_xrnd'] and m == mar_row['moen2m_xrnd']) \
            or (y == mar_row['yrbg3m_xrnd'] and m == mar_row['mobg3m_xrnd']))
    
res = iv_df.apply(overlap, axis=1)
sum(res)

3

In [37]:
# marital status
# 0: never married before
# 1: married
# 2: divorced/separated/widowed

# ASSUME marraige starts/ends in the first week of the month

#NLSY79 marital status key:
#0 NEVER MARRIED
#1 MARRIED
#2 SEPARATED
#3 DIVORCED
#5 REMARRIED
#6 WIDOWED
marstat_dict = {0:0, 1:1, 2:2, 3:2, 6:2} # there is no status 5 in the data
     
years = range(1979, 1995) + range(1996, 2013, 2)
    
    
# helper function
def all_valid(l):
    """
    Return True if no item in the list is a missing value, i.e. -1, -2, -3, -5.
    """
    
    missing = [item<0 for item in l]
    return sum(missing) == 0

   
def correct_marstat(row):
    """
    Sometimes yearly marital status after being married for a while became 0 (single). For example, respondent 9434.
    It should be 3 (divorced). Correct it to 3. 
    """
    
    married_before = False
    for y in years:
        if row['marstat_%s'%y] in [1, 2]:
            married_before = True
            
        if row['marstat_%s'%y] == 0 and married_before:
            row['marstat_%s'%y] = 3
        
    return row
    
    
def get_marstat(row):
    """
    This function returns the marital status at one year prior to first birth, 
    determined using marriage dates (and yearly marriage status if marriage dates are insufficient).
    """
    
    # prebirth
    y = int(row['childbirth_year']) - 1
    m = int(row['childbirth_month'])
        
    #Documentation:
        # -999: NEVER MARRIED
        # -998: 1ST MARRIAGE NEVER ENDED
        # -997: 1ST MARRIAGE ENDED NO 2ND MARRIAGE
        # -996: 2ND MARRIAGE NEVER ENDED
        # -995: 2ND MARRIAGE ENDED NO 3RD MARRIAGE
        # 0 TO 1979: <=1979
        # 1980, etc.
    
    ####################################
    ### Consider various situations: ###
    ####################################


    ### overlap cases ###
    # if overlap with marriage start date, assign married (value 1)
    if ((y == row['yrbg1m_xrnd'] and m == row['mobg1m_xrnd']) \
        or (y == row['yrbg2m_xrnd'] and m == row['mobg2m_xrnd']) \
        or (y == row['yrbg3m_xrnd'] and m == row['mobg3m_xrnd'])):
        return 1
    
    # if overlap with marriage end date, assign previously married (value 2)
    if ((y == row['yren1m_xrnd'] and m == row['moen1m_xrnd']) \
        or (y == row['yren2m_xrnd'] and m == row['moen2m_xrnd'])):
        return 0
    
    
    # never married
    if row['yrbg1m_xrnd'] == -999: 
        return 0
    
    
    # first marriage started after prebirth month 
    if all_valid([row['yrbg1m_xrnd'], row['mobg1m_xrnd']]): # first check marriage start date is valid
        if datetime(y, m, 1) < datetime(int(row['yrbg1m_xrnd']), int(row['mobg1m_xrnd']), 1):
            return 0
    
    
    # prebirth is before 1979 and never married in 1979
    if y < 1979 and row['marstat_1979'] == 0: 
        return 0
    
    
    # first marriage never ended
    if row['yren1m_xrnd'] == -998:
        if all_valid([row['yrbg1m_xrnd'], row['mobg1m_xrnd']]): 
            if datetime(y, m, 1) < datetime(int(row['yrbg1m_xrnd']), int(row['mobg1m_xrnd']), 1): 
                print 'this case has been covered. should not appear'
                return 0
            else:
                return 1
        
        
    # first marriage lasted a finite period of time, and prebirth is in this period
    if all_valid([row['yrbg1m_xrnd'], row['mobg1m_xrnd'], row['yren1m_xrnd'], row['moen1m_xrnd']]):
        if datetime(y, m, 1) < datetime(int(row['yrbg1m_xrnd']), int(row['mobg1m_xrnd']), 1):
            print 'this case has been covered. should not appear'
            return 0
        elif datetime(y, m, 1) >= datetime(int(row['yrbg1m_xrnd']), int(row['mobg1m_xrnd']), 1) and \
        datetime(y, m, 1) < datetime(int(row['yren1m_xrnd']), int(row['moen1m_xrnd']), 1):
            return 1

    
    # first marriage ended, no second marriage
    if row['yrbg2m_xrnd'] == -997:
        if all_valid([row['yren1m_xrnd'], row['moen1m_xrnd']]):
            if datetime(y, m, 1) >= datetime(int(row['yren1m_xrnd']), int(row['moen1m_xrnd']), 1):
                return 2
    
            
    # prebirth is between first marriage end date and second marriage start date, then R is previously married
    if all_valid([row['yren1m_xrnd'], row['moen1m_xrnd'], row['yrbg2m_xrnd'], row['mobg2m_xrnd']]):
        if datetime(y, m, 1) >= datetime(int(row['yren1m_xrnd']), int(row['moen1m_xrnd']), 1) and \
        datetime(y, m, 1) < datetime(int(row['yrbg2m_xrnd']), int(row['mobg2m_xrnd']), 1):
            return 2
    
    
    # second marriage never ended
    if row['yren2m_xrnd'] == -996:
        if all_valid([row['yrbg2m_xrnd'], row['mobg2m_xrnd']]):
            if datetime(y, m, 1) >= datetime(int(row['yrbg2m_xrnd']), int(row['mobg2m_xrnd']), 1): 
                return 1

        
    # second marriage lasted a finite period of time, and prebirth is in this period
    if all_valid([row['yrbg2m_xrnd'], row['mobg2m_xrnd'], row['yren2m_xrnd'], row['moen2m_xrnd']]):
        if datetime(y, m, 1) >= datetime(int(row['yrbg2m_xrnd']), int(row['mobg2m_xrnd']), 1) and \
        datetime(y, m, 1) < datetime(int(row['yren2m_xrnd']), int(row['moen2m_xrnd']), 1):
            return 1
     
    
    #2ND MARRIAGE ENDED NO 3RD MARRIAGE                
    if row['yrbg3m_xrnd'] == -995:
        if all_valid([row['yren2m_xrnd'], row['moen2m_xrnd']]):
            if datetime(y, m, 1) >= datetime(int(row['yren2m_xrnd']), int(row['moen2m_xrnd']), 1):
                return 2
            
            
    # prebirth is between second marriage end date and third marriage start date, then R is previously married
    if all_valid([row['yren2m_xrnd'], row['moen2m_xrnd'], row['yrbg3m_xrnd'], row['mobg3m_xrnd']]):
        if datetime(y, m, 1) >= datetime(int(row['yren2m_xrnd']), int(row['moen2m_xrnd']), 1) and \
        datetime(y, m, 1) < datetime(int(row['yrbg3m_xrnd']), int(row['mobg3m_xrnd']), 1):
            return 2
    
    
    ########################################################################################
    ### if marriage dates are insufficient, use yearly marital status and interview date ###
    
    row = correct_marstat(row)
    
    if y in years and all_valid([row['date_m_%i'%y]]):
        
        # the month one year prebirth overlaps with interview date
        if m == row['date_m_%i'%y]: 
            return marstat_dict[row['marstat_%i'%y]]

        
        # the month one year prebirth is before interview date, check marital status in y-1 and y
        elif m < row['date_m_%i'%y]:
            if y >= 1980 and y <= 1994:
                # if marital status in the two surrounding years are the same, use it as prebirth marital status
                if all_valid([row['marstat_%i'%(y-1)], row['marstat_%i'%y]]) and \
                marstat_dict[row['marstat_%i'%(y-1)]] == marstat_dict[row['marstat_%i'%y]]: 
                    return marstat_dict[row['marstat_%i'%y]]
         
        # the month one year prebirth is after interview date, check marital status in y and y+1
        elif m > row['date_m_%i'%y]:
            if y >= 1979 and y <= 1993:
                # if marital status in the two surrounding years are the same, use it as prebirth marital status
                if all_valid([row['marstat_%i'%y], row['marstat_%i'%(y+1)]]) and \
                marstat_dict[row['marstat_%i'%y]] == marstat_dict[row['marstat_%i'%(y+1)]]:
                    return marstat_dict[row['marstat_%i'%y]]
                
            elif y == 1994:
                if all_valid([row['marstat_1994'], row['marstat_1996']]) and \
                marstat_dict[row['marstat_1994']] == marstat_dict[row['marstat_1996']]:
                    return marstat_dict[row['marstat_1994']]
    
    elif y == 1995:
        if all_valid([row['marstat_1994'], row['marstat_1996']]) and \
        marstat_dict[row['marstat_1994']] == marstat_dict[row['marstat_1996']]:
            return marstat_dict[row['marstat_1994']]
    
    
    return None


merged = iv_df.merge(marstat_df, on='caseid_1979')
merged = merged.merge(intdate_df, on='caseid_1979')
marstat = merged.apply(get_marstat, axis=1)
marstat.value_counts(dropna=False)

 1.0    145
 0.0     84
 2.0     15
NaN       4
dtype: int64

In [38]:
# distribution of prebirth year in the None cases
iv_df['marstat'] = marstat
iv_df[iv_df.marstat.isnull()].childbirth_year.value_counts()

1998    2
1999    1
1997    1
Name: childbirth_year, dtype: int64

In [39]:
# get relevant information to manually examine the missing cases

partial_df = iv_df[iv_df.marstat.isnull()][['caseid_1979', 'childbirth_year', 'childbirth_month']]
partial_df['prebirth_y'] = partial_df.childbirth_year.map(lambda x: x-1)
partial_df['prebirth_m'] = partial_df.childbirth_month


# drop birth date columns
partial_df = partial_df[['caseid_1979', 'prebirth_y', 'prebirth_m']]


# add interview date in prebirth year (if applicable)
def get_int_date(row):
    if row['prebirth_y'] in years: # is a survey year
        return intdate_df[intdate_df.caseid_1979==row['caseid_1979']].iloc[0]['date_m_%i'%row['prebirth_y']]
    return None
partial_df['int_m_prebirth'] = partial_df.apply(get_int_date, axis=1)

    
# merge with marital status variables
partial_df = partial_df.merge(marstat_df, on='caseid_1979')    
print_full(partial_df)

Unnamed: 0,caseid_1979,prebirth_y,prebirth_m,int_m_prebirth,marstat_1979,marstat_1980,marstat_1981,marstat_1982,marstat_1983,marstat_1984,marstat_1985,marstat_1986,marstat_1987,marstat_1988,marstat_1989,marstat_1990,marstat_1991,marstat_1992,marstat_1993,marstat_1994,marstat_1996,marstat_1998,marstat_2000,marstat_2002,marstat_2004,mobg1m_xrnd,yrbg1m_xrnd,moen1m_xrnd,yren1m_xrnd,mobg2m_xrnd,yrbg2m_xrnd,moen2m_xrnd,yren2m_xrnd,mobg3m_xrnd,yrbg3m_xrnd,marstat_2006,marstat_2008,marstat_2010,marstat_2012
0,1366,1996,10,5.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,8,1996,-3,-3,-997,-997,-997,-997,-997,-997,1,1,3,3
1,4613,1998,1,4.0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,-5,1,-3,-3,6,2001,4,2002,-996,-996,-996,-996,1,1,1,-5
2,6214,1997,8,,0,0,0,0,0,0,0,0,0,0,0,0,-5,0,1,1,1,-5,-5,-5,-5,-3,-3,-998,-998,-998,-998,-998,-998,-998,-998,-5,1,1,1
3,7959,1997,7,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,-3,-3,-998,-998,-998,-998,-998,-998,-998,-998,1,1,1,1


### manually determine the missing cases

* caseid 1366, prebirth 1996-10, first marriage in 1996-8, end date missing, married in 1998-2008, divorced since 2010, so married

* caseid 4613, prebirth 1998-1, first marriage start date missing, ended in 2001-6, married again in 2002-4, married in 1987-2000, so married

* caseid 6214, prebirth 1997-8, first marriage date missing, first marriage never ended, married/noninterviewed since 1993, so married

* caseid 7959, prebirth 1997-7, first marriage date missing, first marriage never ended, married since 1996, so married

In [40]:
# put all manual decisions together

decisions = {1366: 1, 
             4613: 1, 
             6214: 1, 
             7959: 1
            }
            

def update_marstat(row):
    if row['caseid_1979'] in decisions.keys():
        return decisions[row['caseid_1979']]
    else:
        return row['marstat']
    
iv_df.marstat = iv_df.apply(update_marstat, axis=1)
iv_df.marstat.value_counts(dropna=False)

1.0    149
0.0     84
2.0     15
Name: marstat, dtype: int64

## Mothers' Education

In [41]:
parents_df = pd.read_csv('../data/parents.csv')

In [42]:
# highest grade achieved by R's mother
parents_df[parents_df.caseid_1979.isin(sample)].hgc_mother_1979.value_counts()

12    112
16     22
     ... 
2       2
18      1
Name: hgc_mother_1979, dtype: int64

In [43]:
def get_ma_educ(i):
    """
    Given respondent i, return highest grade achieved by R's mother. 
    If R never knew mother, code ma educ to -4, but not missing. 
    If R knew mother and invalid ma educ, set to missing.
    
    Then recode mother's education into categories
    only two respondents had -4, code them to category 1
    """
        
    g = parents_df[parents_df.caseid_1979==i].iloc[0]['hgc_mother_1979']    
        
    if g >= 0 and g<12:
        return 1
        
    elif g==12:
        return 2
        
    elif (g>12 and g<16):
        return 3
        
    elif g==16:
        return 4
        
    elif (g>16 and g<95):
        return 5

    elif g==-4: # never knew mother 
        return 1
    
    else:
        return None


iv_df['ma_educ'] = iv_df.caseid_1979.map(get_ma_educ)
iv_df.head()

Unnamed: 0,caseid_1979,childbirth_year,childbirth_month,y_dob,m_dob,start_y,start_m,age,exp,working,had_job,marstat,ma_educ
0,46,1994,4,1962,11,1994,5,31,19092.0,1,1,1.0,2.0
1,62,1996,8,1964,7,1996,9,32,31103.0,1,1,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3,118,1981,12,1961,10,1982,1,20,3660.0,1,1,0.0,
4,159,1985,9,1960,4,1985,10,25,12699.0,1,1,1.0,2.0


In [44]:
sum(iv_df.ma_educ.isnull())

10

## Mothers' employment status

Documentations:

* fam_19_1979: DID MOTHER/STEPMOTHER WORK FOR PAY ALL OF 1978, PART, OR NOT AT ALL?
    * 1 ALL YEAR
    * 2 PART OF YEAR
    * 3 NOT AT ALL
* fam_19c_1979: DID MOTHER/STEPMOTHER WORK > 35 HOURS PER WEEK IN 1978?
    * 1 35 HOURS OR MORE
    * 2 LESS THAN 35 HOURS

In [45]:
def get_ma_ft(i):
    """
    Given respondent id i, return whether mother worked full-time (1), no full-time (0), or no mother figure (2)
    """

    ma_ft = None
    row = parents_df[parents_df.caseid_1979==i].iloc[0]
    
    if row['fam_19_1979']==-4: #no mother figure
        ma_ft = 2
    elif row['fam_19_1979']==3: #mother didn't work
        ma_ft = 0
    elif row['fam_19_1979'] in [1, 2] and row['fam_19c_1979']==2: #mother worked part-time
        ma_ft = 0
    elif row['fam_19_1979'] in [1, 2] and row['fam_19c_1979']==1: #mother worked full-time
        ma_ft = 1
    
    return ma_ft


iv_df['ma_ft'] = iv_df.caseid_1979.map(get_ma_ft)
iv_df.head()

Unnamed: 0,caseid_1979,childbirth_year,childbirth_month,y_dob,m_dob,start_y,start_m,age,exp,working,had_job,marstat,ma_educ,ma_ft
0,46,1994,4,1962,11,1994,5,31,19092.0,1,1,1.0,2.0,1.0
1,62,1996,8,1964,7,1996,9,32,31103.0,1,1,1.0,2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3,118,1981,12,1961,10,1982,1,20,3660.0,1,1,0.0,,1.0
4,159,1985,9,1960,4,1985,10,25,12699.0,1,1,1.0,2.0,1.0


In [46]:
iv_df.ma_ft.value_counts(dropna=False)

 0.0    125
 1.0    109
NaN      11
 2.0      3
Name: ma_ft, dtype: int64

## Whether to work at age 35

In [47]:
# read attitude variables
att_df = pd.read_csv('../data/attitude.csv')
att_df.head()

Unnamed: 0,caseid_1979,fer_3_1979,womens_roles_000001_1979,womens_roles_000002_1979,womens_roles_000003_1979,womens_roles_000004_1979,womens_roles_000005_1979,womens_roles_000006_1979,womens_roles_000007_1979,womens_roles_000008_1979,exp_1_1979,exp_2_1979,sample_race_78scrn,sample_sex_1979,exp_1_1980,exp_2_1980,exp_1_1981,exp_2_1981,womens_roles_000001_1982,womens_roles_000002_1982,womens_roles_000003_1982,womens_roles_000004_1982,womens_roles_000005_1982,womens_roles_000006_1982,womens_roles_000007_1982,womens_roles_000008_1982
0,1,2,2,2,1,1,4,2,1,4,3,1,3,2,-5,-5,1,-4,-5,-5,-5,-5,-5,-5,-5,-5
1,2,3,4,3,2,2,3,4,3,4,3,1,3,2,2,1,2,1,3,3,2,2,4,4,3,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3,4,2,1,2,3,4,1,2,4,2,3,0,3,2,-5,-5,1,-4,1,1,4,2,4,2,4,2
4,5,3,2,2,2,1,3,3,3,2,2,-4,3,1,1,-4,1,-4,2,2,2,1,3,3,3,2


Documentations:
    
exp_1_1979: WHAT WOULD R LIKE TO BE DOING AT AGE 35?

* 1 PRESENT JOB
* 2 SOME OCCUPATION
* 3 MARRIED, FAMILY
* 4 OTHER (SPECIFY)

exp_2_1979: DOES R WANT TO WORK OUTSIDE HOME AT AGE 35?
* 1 YES
* 0 NO

In [48]:
att_df.exp_1_1979.unique()

array([ 3,  2,  1, -2,  4, -3])

In [49]:
att_df.exp_2_1979.unique()

array([ 1,  0, -4, -3, -2])

In [50]:
# crosstab
merged = iv_df.merge(att_df, on='caseid_1979')
pd.crosstab(merged.exp_1_1979, merged.exp_2_1979)

exp_2_1979,-4,0,1
exp_1_1979,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-2,10,0,0
1,1,0,0
...,...,...,...
3,0,15,42
4,8,0,0


In [51]:
def working35(i):
    """Given respondent id i, return whether respondent would like to work at age 35"""
    
    row = att_df[att_df.caseid_1979==i].iloc[0]
    work35 = None
    
    if row['exp_1_1979'] in [1, 2]:
        work35 = 1
    elif row['exp_1_1979']==3 and row['exp_2_1979']==1:
        work35 = 1
    elif row['exp_1_1979']==3 and row['exp_2_1979']==0:
        work35 = 0
    elif row['exp_1_1979']==4:
        work35 = 0
    
    return work35


iv_df['work35'] = iv_df.caseid_1979.map(working35)
iv_df.head()

Unnamed: 0,caseid_1979,childbirth_year,childbirth_month,y_dob,m_dob,start_y,start_m,age,exp,working,had_job,marstat,ma_educ,ma_ft,work35
0,46,1994,4,1962,11,1994,5,31,19092.0,1,1,1.0,2.0,1.0,1.0
1,62,1996,8,1964,7,1996,9,32,31103.0,1,1,1.0,2.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3,118,1981,12,1961,10,1982,1,20,3660.0,1,1,0.0,,1.0,1.0
4,159,1985,9,1960,4,1985,10,25,12699.0,1,1,1.0,2.0,1.0,1.0


In [52]:
iv_df.work35.value_counts(dropna=False)

 1.0    215
 0.0     23
NaN      10
Name: work35, dtype: int64

## Women's role

In [53]:
#what are the possible answers for each role question?

for i in [1, 2, 4, 6, 8]:
    print 'question', i
    print att_df['womens_roles_00000%s_1979'%i].unique()

question 1
[ 2  4  1  3 -2 -3]
question 2
[ 2  3  4  1 -2 -3 -1]
question 4
[ 1  2  3  4 -2 -3]
question 6
[ 2  4  3  1 -3 -2]
question 8
[ 4  3  2  1 -2 -3 -1]


In [54]:
count_missing_only_one = []

def get_wm_roles(i):
    """
    Given a respondent id i, return the summary index of women's roles. 
    
    If one variable has missing value, expand its value.
    If more than one missing, return None.
    """
    
    row = att_df[att_df.caseid_1979==i].iloc[0]
    
    num_missing = sum([row['womens_roles_00000%s_1979'%i] < 0 for i in [1, 2, 4, 6, 8]])
    
    if num_missing == 0: # no missing
        return sum([int(row['womens_roles_00000%s_1979'%i]) for i in [1, 2, 4, 6, 8]])
    
    elif num_missing == 1:
        count_missing_only_one.append(i)
        vals = [int(row['womens_roles_00000%s_1979'%i]) for i in [1, 2, 4, 6, 8] if int(row['womens_roles_00000%s_1979'%i]) > 0]
        return round(sum(vals)/4.0*5.0, 0)
    
    else:
        return None
        

iv_df['womensroles'] = iv_df.caseid_1979.map(get_wm_roles)
iv_df.head()

Unnamed: 0,caseid_1979,childbirth_year,childbirth_month,y_dob,m_dob,start_y,start_m,age,exp,working,had_job,marstat,ma_educ,ma_ft,work35,womensroles
0,46,1994,4,1962,11,1994,5,31,19092.0,1,1,1.0,2.0,1.0,1.0,11.0
1,62,1996,8,1964,7,1996,9,32,31103.0,1,1,1.0,2.0,1.0,1.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3,118,1981,12,1961,10,1982,1,20,3660.0,1,1,0.0,,1.0,1.0,8.0
4,159,1985,9,1960,4,1985,10,25,12699.0,1,1,1.0,2.0,1.0,1.0,9.0


In [55]:
iv_df.womensroles.value_counts(dropna=False)

10.0    45
11.0    34
        ..
19.0     1
18.0     1
Name: womensroles, dtype: int64

In [56]:
len(count_missing_only_one)

6

## Ideal number of children

In [57]:
att_df.fer_3_1979.unique()

array([ 2,  3,  4,  5,  1, 12,  0,  6, -2, -3, 10,  8,  7,  9, -1, 11, 15,
       20, 14])

In [58]:
iv_df = iv_df.merge(att_df[['caseid_1979', 'fer_3_1979']], on='caseid_1979')
iv_df.rename(columns={'fer_3_1979': 'ideal'}, inplace=True)#rename columns

def recode_ideal(x):
    """recode the variable ideal to integers or None."""
    if x < 0:
        return None
    else:
        return int(x)
    
iv_df.ideal = iv_df.ideal.map(recode_ideal)
iv_df.head()

Unnamed: 0,caseid_1979,childbirth_year,childbirth_month,y_dob,m_dob,start_y,start_m,age,exp,working,had_job,marstat,ma_educ,ma_ft,work35,womensroles,ideal
0,46,1994,4,1962,11,1994,5,31,19092.0,1,1,1.0,2.0,1.0,1.0,11.0,2.0
1,62,1996,8,1964,7,1996,9,32,31103.0,1,1,1.0,2.0,1.0,1.0,8.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3,118,1981,12,1961,10,1982,1,20,3660.0,1,1,0.0,,1.0,1.0,8.0,2.0
4,159,1985,9,1960,4,1985,10,25,12699.0,1,1,1.0,2.0,1.0,1.0,9.0,4.0


## Race

In [59]:
analytic_df14 = pd.read_csv("../data/analytic_df_seq14.csv") #includes race data
print analytic_df14.columns[:5]

Index([u'caseid_1979', u'sample_id_1979', u'sample_race_1979',
       u'sample_sex_1979', u'c1dob_m_xrnd'],
      dtype='object')


In [60]:
iv_df = iv_df.merge(analytic_df14[['caseid_1979', 'sample_race_1979']], on='caseid_1979')
iv_df = iv_df.rename(index=str, columns={"sample_race_1979": "race"})
iv_df.head()

Unnamed: 0,caseid_1979,childbirth_year,childbirth_month,y_dob,m_dob,start_y,start_m,age,exp,working,had_job,marstat,ma_educ,ma_ft,work35,womensroles,ideal,race
0,46,1994,4,1962,11,1994,5,31,19092.0,1,1,1.0,2.0,1.0,1.0,11.0,2.0,3
1,62,1996,8,1964,7,1996,9,32,31103.0,1,1,1.0,2.0,1.0,1.0,8.0,4.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3,118,1981,12,1961,10,1982,1,20,3660.0,1,1,0.0,,1.0,1.0,8.0,2.0,3
4,159,1985,9,1960,4,1985,10,25,12699.0,1,1,1.0,2.0,1.0,1.0,9.0,4.0,3


# Output dataset

In [61]:
iv_df_main = pd.read_csv('../data/iv_df.csv')

print iv_df.shape
iv_df = pd.concat([iv_df_main, iv_df])
print iv_df.shape

(248, 18)
(3682, 25)


In [62]:
# export to csv
iv_df.to_csv('../data/iv_df_seq14.csv', index=False)