# OptOut Project: Construct age 20-40 sequences for mothers and for childless women

In [1]:
#setup
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from collections import defaultdict
from operator import itemgetter
from datetime import datetime, timedelta

# set some nicer defaults for matplotlib
from matplotlib import rcParams

# Notebook-wide plot styling: figure size and resolution, line width,
# no grid, white axes background, larger font, no outline around patches.
rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = False
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'

def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Strip plot borders (spines) and tick marks that are not wanted.

    axes is the axes object to modify; the current axes (plt.gca()) is used
    when it is omitted. The top/right/left/bottom keywords say whether the
    border on that side stays visible. Ticks are switched off on both axes
    and then re-enabled only on the sides whose border is kept.
    """
    target = axes or plt.gca()

    # show or hide every spine as requested
    for side, keep in (('top', top), ('right', right),
                       ('left', left), ('bottom', bottom)):
        target.spines[side].set_visible(keep)

    # start from no ticks at all ...
    target.yaxis.set_ticks_position('none')
    target.xaxis.set_ticks_position('none')

    # ... and bring them back only next to the borders that remain
    if top:
        target.xaxis.tick_top()
    if bottom:
        target.xaxis.tick_bottom()
    if left:
        target.yaxis.tick_left()
    if right:
        target.yaxis.tick_right()

## Week-to-week crosswalk

Create two functions to convert between week number and calendar year and month

In [2]:
#this is the file to switch between week number in NLSY79 and calendar year, month, and day
# NOTE: the lookup functions below address the sheet's columns with embedded
# newlines in the header text (e.g. 'Week Start:\nMonth'), even though the
# preview printed here shows the headers on one line.
xl_file = pd.ExcelFile('../data/continuous_week_crosswalk_r26.xlsx') 
weekdf = xl_file.parse('weekdates1')
weekdf.head()

Unnamed: 0,Week Start: Month,Week Start: Day,Week Start: Year,Calendar Year Week Number,Continuous Week Number
0,1,1,1978,1,1
1,1,8,1978,2,2
2,1,15,1978,3,3
3,1,22,1978,4,4
4,1,29,1978,5,5


In [3]:
def get_week_num(year, month, which, df=None):
    """
    Given a calendar year and month, return the corresponding week number in NLSY79.

    If which=='first', return the first crosswalk week starting in that month.
    If which=='last', return the last crosswalk week starting in that month.
    ("first"/"last" follow the row order of the crosswalk table.)

    df is the week crosswalk table. When it is omitted the notebook-level
    `weekdf` is looked up at call time, so reloading the crosswalk takes
    effect without redefining this function.

    Raises ValueError if which is neither 'first' nor 'last', and IndexError
    if no crosswalk week starts in the requested month.
    """
    if which not in ('first', 'last'):
        raise ValueError("which must be 'first' or 'last', got %r" % (which,))

    if df is None:
        df = weekdf

    the_month = df[(df['Week Start:\nMonth'] == month) & (df['Week Start: \nYear'] == year)]
    if len(the_month) == 0:
        raise IndexError('no week starting in %s-%s found in the crosswalk' % (year, month))

    position = 0 if which == 'first' else -1
    return int(the_month.iloc[position]['Continuous \nWeek Number'])

    
def get_year_month(weeknum, df=None):
    """
    Given a week number, return the calendar year and month in which that
    week starts, as a tuple (yyyy, m).

    df is the week crosswalk table. When it is omitted the notebook-level
    `weekdf` is looked up at call time.

    Raises IndexError if the week number is not present in the crosswalk.
    """
    if df is None:
        df = weekdf

    the_week = df[df['Continuous \nWeek Number'] == weeknum]
    if len(the_week) == 0:
        raise IndexError('week number %s not found in the crosswalk' % (weeknum,))

    first_match = the_week.iloc[0]
    return int(first_match['Week Start: \nYear']), int(first_match['Week Start:\nMonth'])

## Retrieve Extended Sample

In [4]:
# analytic sample, keyed by caseid_1979; it is merged with the gap data and
# used for the date-of-birth lookup when building the mothers' sequences below
analytic_df = pd.read_csv('../data/analytic_df.csv') #includes fertility data
print analytic_df.shape
analytic_df.head()

(3465, 100)


Unnamed: 0,caseid_1979,sample_id_1979,sample_race_1979,sample_sex_1979,c1dob_m_xrnd,c1dob_y_xrnd,c2dob_m_xrnd,c2dob_y_xrnd,c3dob_m_xrnd,c3dob_y_xrnd,...,date_m_2014,date_y_2014,y_deceased,last_yr,attr_before40,deceased_before40,deceased_child14,deceased_child18,too_late,attr_child18
0,2,5,3,2,3,1993,11,1994,-4,-4,...,12,2014,,2014,False,False,False,False,False,False
1,3,5,3,2,6,1981,10,1983,4,1986,...,5,2015,,2014,False,False,False,False,False,False
2,8,6,3,2,3,1976,5,1979,9,1982,...,1,2015,,2014,False,False,False,False,False,False
3,16,5,3,2,2,1990,8,1993,9,1996,...,1,2015,,2014,False,False,False,False,False,False
4,19,5,3,2,11,1987,-4,-4,-4,-4,...,-5,-5,,2010,False,False,False,False,False,False


## Read weekly arrays and within-job gap dates

In [5]:
#read weekly array data
# one column per week and measure; the code below reads
# hrs_worked_wk_num<NNNN>_xrnd and status_wk_num<NNNN>_xrnd
wkarray_df = pd.read_csv('../data/weeklyarrays.csv')
print wkarray_df.shape
print wkarray_df.columns[:20]

(12686, 3969)
Index([u'caseid_1979', u'hrs_worked_wk_num0000_xrnd',
       u'hrs_worked_wk_num0001_xrnd', u'hrs_worked_wk_num0002_xrnd',
       u'hrs_worked_wk_num0003_xrnd', u'hrs_worked_wk_num0004_xrnd',
       u'hrs_worked_wk_num0005_xrnd', u'hrs_worked_wk_num0006_xrnd',
       u'hrs_worked_wk_num0007_xrnd', u'hrs_worked_wk_num0008_xrnd',
       u'hrs_worked_wk_num0009_xrnd', u'hrs_worked_wk_num0010_xrnd',
       u'hrs_worked_wk_num0011_xrnd', u'hrs_worked_wk_num0012_xrnd',
       u'hrs_worked_wk_num0013_xrnd', u'hrs_worked_wk_num0014_xrnd',
       u'hrs_worked_wk_num0015_xrnd', u'hrs_worked_wk_num0016_xrnd',
       u'hrs_worked_wk_num0017_xrnd', u'hrs_worked_wk_num0018_xrnd'],
      dtype='object')


In [6]:
#read within-job-gap data
# columns follow per<gap #>_{start,stop}_<survey year>_job<job #>_xrnd and are
# compared against week numbers by get_gaps below
gap_df = pd.read_csv("../data/withingaps.csv")
print gap_df.shape
print gap_df.columns[:20]

(12686, 839)
Index([u'caseid_1979', u'per1_start_1979_job01_xrnd',
       u'per1_start_1979_job02_xrnd', u'per1_start_1979_job03_xrnd',
       u'per1_start_1979_job04_xrnd', u'per1_start_1979_job05_xrnd',
       u'per1_stop_1979_job01_xrnd', u'per1_stop_1979_job02_xrnd',
       u'per1_stop_1979_job03_xrnd', u'per1_stop_1979_job04_xrnd',
       u'per1_stop_1979_job05_xrnd', u'per2_start_1979_job01_xrnd',
       u'per2_start_1979_job02_xrnd', u'per2_start_1979_job03_xrnd',
       u'per2_start_1979_job04_xrnd', u'per2_start_1979_job05_xrnd',
       u'per2_stop_1979_job01_xrnd', u'per2_stop_1979_job02_xrnd',
       u'per2_stop_1979_job03_xrnd', u'per2_stop_1979_job04_xrnd'],
      dtype='object')


## Identify maternity leaves

Maternity leave could cover the birth, start after the birth, or end before the birth. 

We treat any within-job gap that starts or ends within 4 months (16 weeks) of a birth as maternity leave. The exception is gaps that ended within the 16 weeks before the birth: there are relatively few of these, so they are not counted as maternity leave.

In [7]:
# attach the within-job gap columns to each respondent in the analytic sample
# (merge defaults to an inner join, so only ids present in both tables remain)
merged = analytic_df.merge(gap_df, on='caseid_1979') # merge datasets
merged.columns

Index([u'caseid_1979', u'sample_id_1979', u'sample_race_1979',
       u'sample_sex_1979', u'c1dob_m_xrnd', u'c1dob_y_xrnd', u'c2dob_m_xrnd',
       u'c2dob_y_xrnd', u'c3dob_m_xrnd', u'c3dob_y_xrnd',
       ...
       u'per3_start_2014_job03_xrnd', u'per3_start_2014_job04_xrnd',
       u'per3_stop_2014_job01_xrnd', u'per3_stop_2014_job02_xrnd',
       u'per3_stop_2014_job03_xrnd', u'per3_stop_2014_job04_xrnd',
       u'per4_start_2014_job01_xrnd', u'per4_start_2014_job02_xrnd',
       u'per4_stop_2014_job01_xrnd', u'per4_stop_2014_job02_xrnd'],
      dtype='object', length=938)

In [8]:
###modify to handle months prior to 1978


def get_gaps(row):
    """
    Given a row of respondent data, return a flat list of the week numbers
    that fall inside the respondent's within-job gaps.

    Every (start week, stop week) pair whose two values are both >= 0
    contributes all weeks from start through stop, inclusive. Pairs with a
    negative value (e.g. the -4 skip code) are ignored, and so are
    period/year/job combinations that have no column in the row. Weeks are
    listed in the order the gaps are scanned and may repeat when gaps overlap.

    row only needs to support row[column_name] and raise KeyError for a
    missing column (a pandas Series or a plain dict both work).
    """

    # survey years to scan: every year 1979-1994, then every second year 1996-2014
    years = list(range(1979, 1995)) + list(range(1996, 2015, 2))

    gaps = []

    #look for within-job gaps in all years. sometimes the gaps were recorded in later years
    for y in years:
        for j in range(1, 5): #4 possible gap periods
            for k in range(1, 6): #5 possible jobs
                try:
                    gap_start = row['per%s_start_%s_job0%s_xrnd' % (j, y, k)]
                    gap_stop = row['per%s_stop_%s_job0%s_xrnd' % (j, y, k)]
                except KeyError: # not every year has all period/job columns
                    continue

                if gap_start >= 0 and gap_stop >= 0: # valid, no missing
                    gaps.extend(range(int(gap_start), int(gap_stop) + 1))

    return gaps


## Transform weekly sequences


NLSY79 employment status:
* 100 TO 2615: ACTUAL SURVEY ROUND/JOB NUMBER
* 0: NO INFO REPORTED FOR WEEK
* 2: NOT WORKING (UNEMP V. OLF NOT DETERMINED)
* 3: ASSOC. WITH EMP, GAP DATES MISSING, ALL TIME NOT ACCTD FOR
* 4: UNEMPLOYED
* 5: OUT OF LABOR FORCE
* 7: ACTIVE MILITARY SERVICE


My status classification scheme:

* 0 - missing
* 1 - full-time employment - hours >= 35
* 2 - part-time 1 - hours [20, 35)
* 3 - part-time 2 - hours < 20
* 4 - working with missing hours
* 5 - unemployed
* 6 - OOLF
* 7 - nonworking, but DK whether OOLF or unemployed
* 8 - maternity leave


NOTE: Working with 0 hours is assigned to part-time 2 (status 3).

## Age 20-40 sequences for mothers

In [9]:
def find_modal(l):
    """
    Given a list l, return a list holding the most common element of l.

    If several elements are tied for most common, all of them are returned,
    in order of first appearance in l. An empty input gives an empty list.
    """

    counts = {}
    first_seen = [] # distinct items, in the order they first appear

    for item in l:
        if item not in counts:
            counts[item] = 0
            first_seen.append(item)
        counts[item] += 1

    if not first_seen:
        return []

    highest = max(counts.values())
    return [item for item in first_seen if counts[item] == highest]

#for example: a single most common value, then a two-way tie
print find_modal([1, 1, 2, 1, 5])
print find_modal([1, 1, 2, 1, 5, 2, 2, 33])

[1]
[1, 2]


In [10]:
# weekly status table read by the mothers' loop below (it uses the columns
# caseid_1979 and week<N>); presumably produced by an earlier notebook using
# the classification scheme above -- TODO confirm
full_df = pd.read_csv('../data/full_wk_df.csv')

In [11]:
res_data = defaultdict(list) #create an empty dictionary to store data

count_ties = [] #how many we get ties of modal, store a tuple (respondent id, month # where a tie occured) 
status_order = [1, 6, 2, 3, 5, 8, 4, 7, 0]  


for i, row in full_df.iterrows():
    
    # keep track of progress
    if i%100 == 0:
        print i, 
    
    # keep basic info: id, sequence start year and month
    res_data['caseid_1979'].append(row['caseid_1979'])
    
    dob_row = analytic_df[analytic_df.caseid_1979==row['caseid_1979']].iloc[0]
    res_data['start_y'].append(dob_row['y_dob']+20)
    res_data['start_m'].append(dob_row['m_dob'])

    y = dob_row['y_dob']+20
    m = dob_row['m_dob']
    
    
    # look at weekly statuses
    for j in range(1, 241): # from month 1 to month 240 = 20 * 12
        
        if y < 1978: #prior to 1978
            res_data['month%s'%j].append(0)
            
        elif y > 2015: # post-2015, assign missing
            res_data['month%s'%j].append(0)
            
        else:
            start = get_week_num(y, m, 'first')
            end = get_week_num(y, m, 'last')
            wk_statuses = [row['week%i'%k] for k in range(start, end+1)]
            
            
            #find unique items in the given month 
            distinct = list(set(wk_statuses))
            distinct.sort()

            if len(distinct) == 1:
                res_data['month%s'%j].append(distinct[0])

            elif len(distinct) >= 2:
                modal = find_modal(wk_statuses)
                if len(modal) == 1:
                    res_data['month%s'%j].append(modal[0])
                else:
                    
                    modal_order = [(x, status_order.index(x)) for x in modal]
                    modal_order = sorted(modal_order, key=itemgetter(1))
                    res_data['month%s'%j].append(modal_order[0][0])
                    count_ties.append((row['caseid_1979'], j))

         
        #move to next month
        if m < 12:
            m += 1
        elif m == 12:
            m = 1
            y += 1

0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400


In [12]:
# fix the column order: id, sequence start year/month, then month1..month240
cols = ['caseid_1979', 'start_y', 'start_m'] + ['month%s'%i for i in range(1, 241)]
res_df = pd.DataFrame(res_data, columns=cols) #restricted dataframe

In [13]:
# preview the monthly sequences before the pre-1978 months are filled in
res_df.head()

Unnamed: 0,caseid_1979,start_y,start_m,month1,month2,month3,month4,month5,month6,month7,...,month231,month232,month233,month234,month235,month236,month237,month238,month239,month240
0,2,1979,1,1,1,1,1,1,1,1,...,6,6,5,5,5,5,5,5,5,5
1,3,1981,8,6,6,6,6,6,6,6,...,1,1,6,6,6,6,6,6,6,6
2,8,1978,7,6,6,6,6,6,8,2,...,1,1,1,1,1,1,1,1,1,1
3,16,1978,10,1,1,1,1,1,1,1,...,3,3,3,3,3,3,2,2,2,2
4,19,1977,12,0,1,1,1,1,1,1,...,6,6,6,6,6,6,6,6,6,6


## Update pre-1978 monthly status

Coding rules:
* If FT, then FT
* If working less than FT, treat as working, missing hours
* otherwise, assign missing

In [14]:
# 1979 work-history records; update_pre1978_months below reads the employer
# start/stop dates and hours per week of up to five jobs from this table
df79 = pd.read_csv('../data/work_history79.csv')
df79.head()

Unnamed: 0,caseid_1979,employer_startdate_01_m_1979,employer_startdate_01_d_1979,employer_startdate_01_y_1979,employer_startdate_02_m_1979,employer_startdate_02_d_1979,employer_startdate_02_y_1979,employer_startdate_03_m_1979,employer_startdate_03_d_1979,employer_startdate_03_y_1979,...,nonemployed_prior_int_01_1979,nonemployed_prior_int_02_1979,nonemployed_prior_int_03_1979,nonemployed_prior_int_04_1979,nonemployed_prior_int_05_1979,nonemployed_since_int_01_1979,nonemployed_since_int_02_1979,nonemployed_since_int_03_1979,nonemployed_since_int_04_1979,nonemployed_since_int_05_1979
0,1,5,22,78,9,20,75,-4,-4,-4,...,-4,-4,-4,-4,-4,0,-4,-4,-4,-4
1,2,5,8,78,-4,-4,-4,-4,-4,-4,...,-4,-4,-4,-4,-4,0,-4,-4,-4,-4
2,3,9,11,78,5,15,78,1,30,78,...,-4,-4,-4,-4,-4,0,0,0,-4,-4
3,4,-4,-4,-4,-4,-4,-4,-4,-4,-4,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
4,5,2,1,78,6,5,78,-4,-4,-4,...,-4,-4,-4,-4,-4,1,0,-4,-4,-4


In [15]:
# Decision rules:
#If your job started on the 15th or earlier, you worked that month.
#If your job ended on the 16th or later, you worked that month.
#If we don’t know the day that you started, count the start month as NOT working and the end month as working.


def _job_continues_through(stop_y, stop_m, stop_d, y, m):
    """Return whether a job that began before month (y, m) still counts as held in it."""
    if stop_y == -4 and stop_m == -4 and stop_d == -4:
        # no stop date recorded at all: the job is treated as still held
        return True

    if stop_y > 0 and stop_m > 0:
        if (stop_y > y) or (stop_y == y and stop_m > m):
            return True # job stopped after the given month
        # job stopped in the given month: counts only if the stop day is known
        # and falls on the 16th or later
        return stop_d > 0 and stop_y == y and stop_m == m and stop_d >= 16

    return False


def _job_held_in_month(start, stop, y, m):
    """Return whether a job with (year, month, day) start/stop dates counts as held in month (y, m)."""
    start_y, start_m, start_d = start
    stop_y, stop_m, stop_d = stop

    if start_y > 0 and start_m > 0 and (start_d > 0 or start_d < 0):
        if start_y == y and start_m == m:
            # start month: worked only if the start day is known and is the
            # 15th or earlier (an unknown start day counts as NOT working)
            return start_d > 0 and start_d <= 15
        began_earlier = start_y < y or (start_y == y and start_m < m)
    else:
        # start month not usable: only the start year can be compared
        began_earlier = start_y > 0 and start_y < y

    return began_earlier and _job_continues_through(stop_y, stop_m, stop_d, y, m)


def update_pre1978_months(row, work_df=None):
    """
    Fill in the monthly statuses of one respondent for months before 1978,
    using the 1979 work-history records.

    row holds caseid_1979, start_y, start_m and the month<j> statuses, where
    month1 is the calendar month (start_y, start_m). For every month from
    there up to December 1977 that overlaps at least one job started before
    1978, month<j> is set to 1 (full-time) when the known weekly hours of the
    overlapping jobs sum to 35 or more, and to 4 (working, missing hours)
    otherwise. Months without an overlapping job are left as they are. Rows
    whose sequence starts in 1978 or later are returned unchanged.

    Decision rules for overlap:
      * a job started on the 15th or earlier counts for its start month;
      * a job ended on the 16th or later counts for its end month;
      * with an unknown start day the start month counts as NOT working.

    work_df is the work-history table (one row per caseid_1979); when omitted
    the notebook-level `df79` is used. The row is modified in place and
    returned. Raises IndexError if the respondent has no work-history row.
    """
    if work_df is None:
        work_df = df79

    # month1 of the sequence is (start_y, start_m), so the walk starts there
    y = row['start_y']
    m = row['start_m']

    if y >= 1978:
        return row

    work79_row = work_df[work_df.caseid_1979 == row['caseid_1979']].iloc[0]

    # collect the dates of the (up to five) jobs once; they do not depend on the month
    jobs = []
    for i in range(1, 6):
        start_y = work79_row['employer_startdate_%02d_y_1979' % i]
        start_m = work79_row['employer_startdate_%02d_m_1979' % i]
        start_d = work79_row['employer_startdate_%02d_d_1979' % i]

        stop_y = work79_row['employer_stopdate_%02d_y_1979' % i]
        stop_m = work79_row['employer_stopdate_%02d_m_1979' % i]
        stop_d = work79_row['employer_stopdate_%02d_d_1979' % i]

        # years are stored with two digits (e.g. 78); expand them to four
        if start_y > 0 and start_y < 1900:
            start_y += 1900
        if stop_y > 0 and stop_y < 1900:
            stop_y += 1900

        # jobs started in 1978 or later cannot cover a pre-1978 month
        if start_y >= 1978:
            continue

        jobs.append((i, (start_y, start_m, start_d), (stop_y, stop_m, stop_d)))

    j = 1
    while y < 1978:
        overlap_jobs = [i for i, start, stop in jobs
                        if _job_held_in_month(start, stop, y, m)]

        if len(overlap_jobs) > 0:
            # negative hours are missing-value codes and are left out of the sum
            hours = [work79_row['hours_per_week_job_%02d_1979' % k] for k in overlap_jobs
                     if work79_row['hours_per_week_job_%02d_1979' % k] >= 0]

            if sum(hours) >= 35:
                row['month%s' % j] = 1 # full-time
            else:
                row['month%s' % j] = 4 # working, hours missing or below full-time

        # move to next month
        j += 1
        if m < 12:
            m += 1
        else:
            m = 1
            y += 1

    return row
   

In [16]:
# apply the pre-1978 update to every respondent (axis=1 passes one row at a time)
res_df = res_df.apply(update_pre1978_months, axis=1)
res_df.head()

Unnamed: 0,caseid_1979,start_y,start_m,month1,month2,month3,month4,month5,month6,month7,...,month231,month232,month233,month234,month235,month236,month237,month238,month239,month240
0,2,1979,1,1,1,1,1,1,1,1,...,6,6,5,5,5,5,5,5,5,5
1,3,1981,8,6,6,6,6,6,6,6,...,1,1,6,6,6,6,6,6,6,6
2,8,1978,7,6,6,6,6,6,8,2,...,1,1,1,1,1,1,1,1,1,1
3,16,1978,10,1,1,1,1,1,1,1,...,3,3,3,3,3,3,2,2,2,2
4,19,1977,12,0,1,1,1,1,1,1,...,6,6,6,6,6,6,6,6,6,6


In [17]:
# save the mothers' monthly sequences, without the DataFrame index
res_df.to_csv('../data/age_seq_mothers.csv', index=False)

## Age 20-40 sequences for childless women

In [18]:
# sample of childless women, keyed by caseid_1979; merged with the gap data
# and the weekly arrays in the next cell
childless_df = pd.read_csv('../data/childless_df.csv')
childless_df.head()

Unnamed: 0,caseid_1979,sample_id_1979,sample_race_1979,sample_sex_1979,c1dob_m_xrnd,c1dob_y_xrnd,c2dob_m_xrnd,c2dob_y_xrnd,c3dob_m_xrnd,c3dob_y_xrnd,...,date_d_2012,date_m_2012,date_y_2012,date_d_2014,date_m_2014,date_y_2014,y_deceased,last_yr,attr_before40,deceased_before40
0,14,5,3,2,-4,-4,-4,-4,-4,-4,...,21,11,2012,5,1,2015,,2014,False,False
1,21,5,3,2,-4,-4,-4,-4,-4,-4,...,27,10,2012,21,1,2015,,2014,False,False
2,22,5,3,2,-4,-4,-4,-4,-4,-4,...,29,10,2012,24,1,2015,,2014,False,False
3,28,5,3,2,-4,-4,-4,-4,-4,-4,...,-5,-5,-5,-5,-5,-5,,2010,False,False
4,29,5,3,2,-4,-4,-4,-4,-4,-4,...,1,11,2012,23,4,2015,,2014,False,False


In [19]:
# repeat for childless women, starting from full_df

# merge datasets
merged = childless_df.merge(gap_df, on='caseid_1979')
merged = merged.merge(wkarray_df, on='caseid_1979')


# Create a dictionary to store data
data = defaultdict(list)


for i, row in merged.iterrows(): #iterate through weekly array rows

    #keep track of progress
    if i%100 == 0: 
        print i,
    
    
    #keep three columns: id, start_y, start_m
    data['caseid_1979'].append(row['caseid_1979'])
    data['start_y'].append(row['y_dob']+20)
    data['start_m'].append(row['m_dob'])

    y = row['y_dob']+20
    m = row['m_dob']
    
        
    #maternal leaves
    gaps = get_gaps(row)
        
        
    for week_num in range(1, 1984): #iterate through all weekly work variables
         
        #if the week is maternity leave
        if week_num in gaps:
            data['week%s'%week_num].append(8) 
            continue
        
        #otherwise:
        my_status = None
        
        #get the hours worked this week 
        hours = row['hrs_worked_wk_num%04d_xrnd'%week_num]
    
        #get the employment status this week
        status = row['status_wk_num%04d_xrnd'%week_num]

        #if (status >= 100 and status <= 2615):
        if status >= 100:
            status = 'w' #working
        
        
        ### define employment status according to my classification scheme ###
#         0 - misisng
#         1 - full-time employment - hours >= 35
#         2 - part-time 1 - hours [20, 35)
#         3 - part-time 2 - hours < 20
#         4 - working with missing hours
#         5 - unemployed
#         6 - out of labor force
#         7 - nonworking, but DK whether OOLF or unemployed
#         8 - maternity leave
        
        
        # missing
        if hours == 0 and status == 0:
            my_status = 0
            
        # full time
        elif hours >= 35 and status in ['w', 3]:
            my_status = 1
        elif hours == 0 and status == 7:
            my_status = 1
     
    
        # part time 1
        elif hours >= 20 and hours < 35 and status in ['w', 3]:
            my_status = 2    

        # part time 2
        elif hours >= 0 and hours < 20 and status in ['w', 3]:
            my_status = 3    
    
    
        # working, missing hours
        elif hours < 0 and status in ['w', 3]: 
            my_status = 4

            
        ### NLSY codes:
        # 2: NOT WORKING (UNEMP V. OLF NOT DETERMINED)
        # 4: UNEMPLOYED
        # 5: OUT OF LABOR FORCE
            
            
        # unemployed
        elif status == 4:
            my_status = 5
            
            if hours > 0:
                print row['caseid_1979'], week_num, hours, status
            
        # OOLF
        elif status == 5:
            my_status = 6
            
            if hours > 0:
                print row['caseid_1979'], week_num, hours, status
            
        # nonworking
        elif status == 2:
            my_status = 7
            
            if hours > 0:
                print row['caseid_1979'], week_num, hours, status            

        else:
            # (hours 3, status 3) should also be part-time, 
            # but we didn't encounter any
            raise Exception, 'unknown hours and status: (%s, %s)'%(hours, status)
        
        
        data['week%s'%week_num].append(my_status) #make the week number start from 0, easier for indexing in Python

        
cols = ['caseid_1979', 'start_y', 'start_m'] + ['week%s'%n for n in range(1, 1984)]
full_df_childless = pd.DataFrame(data, columns=cols)

0 100 200 300 400 500 600


In [20]:
res_data = defaultdict(list) #create an empty dictionary to store data

count_ties = [] #how many we get ties of modal, store a tuple (respondent id, month # where a tie occured) 
status_order = [1, 6, 2, 3, 5, 8, 4, 7, 0] 


for i, row in full_df_childless.iterrows():
    
    # keep track of progress
    if i%100 == 0:
        print i, 
    
    # keep basic info: id, sequence start year and month
    res_data['caseid_1979'].append(row['caseid_1979'])
    res_data['start_y'].append(row['start_y'])
    res_data['start_m'].append(row['start_m'])
    
    y = row['start_y']
    m = row['start_m']    
    
    
    # look at weekly statuses
    for j in range(1, 241): # from month 1 to month 240 = 20 * 12
        
        if y < 1978: #prior to 1978
            res_data['month%s'%j].append(0)
            
        elif y > 2015: # post-2015
            res_data['month%s'%j].append(0)
            
        else:
            start = get_week_num(y, m, 'first')
            end = get_week_num(y, m, 'last')
            wk_statuses = [row['week%i'%k] for k in range(start, end+1)]
            
            
            #find unique items in the given month 
            distinct = list(set(wk_statuses))
            distinct.sort()

            if len(distinct) == 1:
                res_data['month%s'%j].append(distinct[0])

            elif len(distinct) >= 2:
                modal = find_modal(wk_statuses)
                if len(modal) == 1:
                    res_data['month%s'%j].append(modal[0])
                else:
                    
                    modal_order = [(x, status_order.index(x)) for x in modal]
                    modal_order = sorted(modal_order, key=itemgetter(1))
                    res_data['month%s'%j].append(modal_order[0][0])
                    count_ties.append((row['caseid_1979'], j))

         
        #move to next month
        if m < 12:
            m += 1
        elif m == 12:
            m = 1
            y += 1

cols = ['caseid_1979', 'start_y', 'start_m'] + ['month%s'%i for i in range(1, 241)]
res_df_childless = pd.DataFrame(res_data, columns=cols) #restricted dataframe

0 100 200 300 400 500 600


In [21]:
# apply the same pre-1978 update as for mothers, one row at a time
res_df_childless = res_df_childless.apply(update_pre1978_months, axis=1)
res_df_childless.head()

Unnamed: 0,caseid_1979,start_y,start_m,month1,month2,month3,month4,month5,month6,month7,...,month231,month232,month233,month234,month235,month236,month237,month238,month239,month240
0,14,1983,10,3,3,3,3,3,3,3,...,1,1,1,1,1,1,1,1,1,1
1,21,1981,6,2,2,1,3,3,3,3,...,1,1,1,1,1,1,1,1,1,1
2,22,1983,1,1,1,1,1,1,8,8,...,1,1,1,1,1,1,1,1,1,1
3,28,1984,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,29,1980,10,1,1,1,1,1,1,1,...,3,3,3,3,3,3,3,3,1,1


In [22]:
# save the childless women's monthly sequences, without the DataFrame index
res_df_childless.to_csv('../data/age_seq_childless.csv', index=False)