# OptOut Project: Sequence Construction for Plotting Purpose

* Include an earlier year to show in state distribution plot
* Construct monthly sequences use weekly sequences for post-1978 part and work history raw variables for pre-1978 part

In [29]:
#setup
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
from collections import defaultdict
from operator import itemgetter
from datetime import datetime, timedelta

# set some nicer defaults for matplotlib
from matplotlib import rcParams

#these colors come from colorbrewer2.org. Each is an RGB triplet
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
                (0.4, 0.4, 0.4)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = False
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'

def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecessary plot borders and axis ticks
    
    The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn
    """
    ax = axes or plt.gca()
    ax.spines['top'].set_visible(top)
    ax.spines['right'].set_visible(right)
    ax.spines['left'].set_visible(left)
    ax.spines['bottom'].set_visible(bottom)
    
    #turn off all ticks
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')
    
    #now re-enable visibles
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()

# Week-to-week crosswalk

* Extend the crosswalk to cover pre-1978 weeks
* Create two functions to convert between week number and calendar year and month

In [2]:
#this is the file to switch between week number in NLSY79 and calendar year, month, and day
xl_file = pd.ExcelFile('../data/continuous_week_crosswalk_2012.xlsx') 

weekdf = xl_file.parse('weekdates1')
weekdf.head()

Unnamed: 0,Week Start: Month,Week Start: Day,Week Start: Year,Calendar Year Week Number,Continuous Week Number
0,1,1,1978,1,1
1,1,8,1978,2,2
2,1,15,1978,3,3
3,1,22,1978,4,4
4,1,29,1978,5,5


In [3]:
# the earliest date when the 18-year seq starts is 1971/6/1
# so one year before that is 1970/6/1

# construct new crosswalk pre-1978, use negative week number

earliest = datetime(1970, 6, 1)

data = defaultdict(list)

i = -1
week_start = datetime(1978, 1, 1)

while week_start > earliest:

    prev_week_start = week_start - timedelta(days=7) # count back 7 days
    
    data['Week Start:\nMonth'].append(prev_week_start.month)
    data['Week Start: \nDay'].append(prev_week_start.day)
    data['Week Start: \nYear'].append(prev_week_start.year)
    
    data['Calendar Year \nWeek Number '].append(None) # leave out the Calendar Year Week Number column. Don't need it
    data['Continuous \nWeek Number'].append(i)
    
    i -= 1 # update index
    week_start = prev_week_start # update week_start
    
    
weekdf_pre1978 = pd.DataFrame(data, columns=weekdf.columns)
weekdf_pre1978.head() # print first five rows

Unnamed: 0,Week Start: Month,Week Start: Day,Week Start: Year,Calendar Year Week Number,Continuous Week Number
0,12,25,1977,,-1
1,12,18,1977,,-2
2,12,11,1977,,-3
3,12,4,1977,,-4
4,11,27,1977,,-5


In [4]:
weekdf_pre1978.tail() # print last five rows

Unnamed: 0,Week Start: Month,Week Start: Day,Week Start: Year,Calendar Year Week Number,Continuous Week Number
391,6,28,1970,,-392
392,6,21,1970,,-393
393,6,14,1970,,-394
394,6,7,1970,,-395
395,5,31,1970,,-396


In [5]:
# reverse row orders - put the earliest week as the first row
weekdf_pre1978 = weekdf_pre1978.reindex(index=weekdf_pre1978.index[::-1])
weekdf_pre1978.head()

Unnamed: 0,Week Start: Month,Week Start: Day,Week Start: Year,Calendar Year Week Number,Continuous Week Number
395,5,31,1970,,-396
394,6,7,1970,,-395
393,6,14,1970,,-394
392,6,21,1970,,-393
391,6,28,1970,,-392


In [6]:
# row bind with our original weekdf
print weekdf.shape
weekdf = pd.concat([weekdf_pre1978, weekdf])
print weekdf.shape
weekdf.head()

(1879, 5)
(2275, 5)


Unnamed: 0,Week Start: Month,Week Start: Day,Week Start: Year,Calendar Year Week Number,Continuous Week Number
395,5,31,1970,,-396
394,6,7,1970,,-395
393,6,14,1970,,-394
392,6,21,1970,,-393
391,6,28,1970,,-392


In [7]:
# test our new crosswalk

def get_week_num(year, month, which, df=weekdf):
    """
    Given a calendar year and month, return the corresponding week number in NLSY79.
    
    If which=='first', return the first week in that month.
    If which=='last', return the last week in that month. 
    """
    
    the_month = df[(df['Week Start:\nMonth']==month)&(df['Week Start: \nYear']==year)]
    
    if which == 'first':
        return int(the_month.iloc[0]['Continuous \nWeek Number'])
    elif which == 'last':
        return int(the_month.iloc[len(the_month)-1]['Continuous \nWeek Number'])
    

def get_year_month(weeknum, df=weekdf):
    """
    Given a week number, return the corresponding calendar month and year (yyyy, m).
    """
    
    the_week = df[df['Continuous \nWeek Number']==weeknum]
    return int(the_week.iloc[0]['Week Start: \nYear']), int(the_week.iloc[0]['Week Start:\nMonth'])


print get_week_num(1976, 2, 'first')
print get_year_month(-100)
print get_week_num(1976, 2, 'last')
print get_year_month(-96)

-100
(1976, 2)
-96
(1976, 2)


# Read datasets


* our customized weekly status dataframe
* NLSY79 weekly statuses
* within job gaps to identify maternity leaves
* work history raw variables reported in 1979
* child birth dates

My status classification scheme:

* 1 - full-time employment
* 2 - working with missing hours
* 3 - part-time employent
* 4 - nonwokring 
* 5 - maternity leave

In [8]:
full_df = pd.read_csv('../data/full_wk_df.csv')
print full_df.shape
full_df.head()

(3434, 1882)


Unnamed: 0,id,start_y,start_m,week1,week2,week3,week4,week5,week6,week7,...,week1870,week1871,week1872,week1873,week1874,week1875,week1876,week1877,week1878,week1879
0,2,1993,4,4,4,4,4,4,4,4,...,1,1,1,1,1,1,1,1,1,1
1,3,1981,7,4,4,4,4,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,4,1980,9,4,4,4,4,4,4,4,...,0,0,0,0,0,0,0,0,0,0
3,8,1976,4,4,4,4,4,4,4,4,...,1,1,1,1,1,1,1,1,1,1
4,16,1990,3,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [9]:
#read weekly array data
wkarray_df = pd.read_csv('../data/weeklyarrays.csv')
print wkarray_df.shape
print wkarray_df.columns[:20]

(12686, 3969)
Index([u'caseid_1979', u'hrs_worked_wk_num0000_xrnd',
       u'hrs_worked_wk_num0001_xrnd', u'hrs_worked_wk_num0002_xrnd',
       u'hrs_worked_wk_num0003_xrnd', u'hrs_worked_wk_num0004_xrnd',
       u'hrs_worked_wk_num0005_xrnd', u'hrs_worked_wk_num0006_xrnd',
       u'hrs_worked_wk_num0007_xrnd', u'hrs_worked_wk_num0008_xrnd',
       u'hrs_worked_wk_num0009_xrnd', u'hrs_worked_wk_num0010_xrnd',
       u'hrs_worked_wk_num0011_xrnd', u'hrs_worked_wk_num0012_xrnd',
       u'hrs_worked_wk_num0013_xrnd', u'hrs_worked_wk_num0014_xrnd',
       u'hrs_worked_wk_num0015_xrnd', u'hrs_worked_wk_num0016_xrnd',
       u'hrs_worked_wk_num0017_xrnd', u'hrs_worked_wk_num0018_xrnd'],
      dtype='object')


In [57]:
#read within-job-gap data
gap_df = pd.read_csv("../data/withingaps.csv")
print gap_df.shape
gap_df.head()

(12686, 839)


Unnamed: 0,caseid_1979,per1_start_1979_job01_xrnd,per1_start_1979_job02_xrnd,per1_start_1979_job03_xrnd,per1_start_1979_job04_xrnd,per1_start_1979_job05_xrnd,per1_stop_1979_job01_xrnd,per1_stop_1979_job02_xrnd,per1_stop_1979_job03_xrnd,per1_stop_1979_job04_xrnd,...,per3_start_2014_job03_xrnd,per3_start_2014_job04_xrnd,per3_stop_2014_job01_xrnd,per3_stop_2014_job02_xrnd,per3_stop_2014_job03_xrnd,per3_stop_2014_job04_xrnd,per4_start_2014_job01_xrnd,per4_start_2014_job02_xrnd,per4_stop_2014_job01_xrnd,per4_stop_2014_job02_xrnd
0,1,-4,-4,-4,-4,-4,-4,-4,-4,-4,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
1,2,-4,-4,-4,-4,-4,-4,-4,-4,-4,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
2,3,-4,-4,-4,-4,-4,-4,-4,-4,-4,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
3,4,-4,-4,-4,-4,-4,-4,-4,-4,-4,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
4,5,64,-4,-4,-4,-4,65,-4,-4,-4,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4


In [10]:
df79 = pd.read_csv('../data/work_history79.csv')
df79.head()

Unnamed: 0,caseid_1979,employer_startdate_01_m_1979,employer_startdate_01_d_1979,employer_startdate_01_y_1979,employer_startdate_02_m_1979,employer_startdate_02_d_1979,employer_startdate_02_y_1979,employer_startdate_03_m_1979,employer_startdate_03_d_1979,employer_startdate_03_y_1979,...,nonemployed_prior_int_01_1979,nonemployed_prior_int_02_1979,nonemployed_prior_int_03_1979,nonemployed_prior_int_04_1979,nonemployed_prior_int_05_1979,nonemployed_since_int_01_1979,nonemployed_since_int_02_1979,nonemployed_since_int_03_1979,nonemployed_since_int_04_1979,nonemployed_since_int_05_1979
0,1,5,22,78,9,20,75,-4,-4,-4,...,-4,-4,-4,-4,-4,0,-4,-4,-4,-4
1,2,5,8,78,-4,-4,-4,-4,-4,-4,...,-4,-4,-4,-4,-4,0,-4,-4,-4,-4
2,3,9,11,78,5,15,78,1,30,78,...,-4,-4,-4,-4,-4,0,0,0,-4,-4
3,4,-4,-4,-4,-4,-4,-4,-4,-4,-4,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
4,5,2,1,78,6,5,78,-4,-4,-4,...,-4,-4,-4,-4,-4,1,0,-4,-4,-4


In [11]:
# read dataframe with child births
analytic_df = pd.read_csv('../data/analytic_df.csv')
analytic_df.head()

Unnamed: 0,caseid_1979,sample_id_1979,sample_race_1979,sample_sex_1979,c1dob_m_xrnd,c1dob_y_xrnd,c2dob_m_xrnd,c2dob_y_xrnd,c3dob_m_xrnd,c3dob_y_xrnd,...,y_dob,m_dob,y_age35,y_deceased,last_yr,attr_before35,deceased_before35,y_child18,deceased_child18,attr_child18
0,2,5,3,2,3,1993,11,1994,-4,-4,...,1959,1,1994,,2014,False,False,2011,False,False
1,3,5,3,2,6,1981,10,1983,4,1986,...,1961,8,1996,,2014,False,False,1999,False,False
2,4,5,3,2,8,1980,3,1997,-4,-4,...,1962,8,1997,,1998,False,False,1998,False,False
3,8,6,3,2,3,1976,5,1979,9,1982,...,1958,7,1993,,2014,False,False,1994,False,False
4,16,5,3,2,2,1990,8,1993,9,1996,...,1958,10,1993,,2014,False,False,2008,False,False


## Explore job date missingness

* missing job start day: 34 cases
* missing job start month: 3 cases
* missing job stop day: 14 cases

In [12]:
sample_df = full_df[['id', 'start_y', 'start_m']]
sample_df.head()

Unnamed: 0,id,start_y,start_m
0,2,1993,4
1,3,1981,7
2,4,1980,9
3,8,1976,4
4,16,1990,3


In [13]:
# get only respondents whose prebirth sequence proceeds 1978 - 664 cases

pre1978_df = sample_df[sample_df.start_y<1979]
print pre1978_df.shape

(664, 3)


In [14]:
missing = defaultdict(int)

def find_job_dates(Id):
    """
    Given job id, return a list of job dates
    """
    
    row = df79[df79.caseid_1979==Id].iloc[0] # with extra variables
    
    job_dates = [] 
    
    for i in range(1, 6):
        start_y = row['employer_startdate_%02d_y_1979'%i]
        start_m = row['employer_startdate_%02d_m_1979'%i]
        start_d = row['employer_startdate_%02d_d_1979'%i]
        
        stop_y = row['employer_stopdate_%02d_y_1979'%i]
        stop_m = row['employer_stopdate_%02d_m_1979'%i]
        stop_d = row['employer_stopdate_%02d_d_1979'%i]
        
        # edit years - years are often reported in two digit format
        if start_y < 1900 and start_y > 0:
            start_y += 1900
        if stop_y < 1900 and stop_y > 0:
            stop_y += 1900
        
        if start_m < 0 and start_m != -4:
            print 'missing job start month:', row['caseid_1979'], (start_y, start_m, start_d)
            missing['missing job start month'] += 1
            start_m = 6

        if stop_m < 0 and stop_m != -4:
            print 'missing job start month:', row['caseid_1979'], (stop_y, stop_m, stop_d)
            missing['missing job stop month'] += 1   
            stop_m = 6
    
        if start_d < 0 and start_d != -4: # assume job started on the 15th
            print 'missing job start day:', row['caseid_1979'], (start_y, start_m, start_d)
            missing['missing job start day'] += 1
            start_d = 15
            
        if stop_d < 0 and stop_d != -4: # assume job started on the 15th
            print 'missing job stop day:', row['caseid_1979'], (stop_y, stop_m, stop_d)
            missing['missing job stop day'] += 1
            stop_d = 15            
            
    return job_dates


res = pre1978_df.id.map(find_job_dates)

missing job start day: 205 (1977, 9, -2)
missing job start day: 244 (1977, 11, -2)
missing job start day: 244 (1978, 2, -2)
missing job start day: 244 (1978, 2, -2)
missing job start day: 244 (1977, 2, -2)
missing job stop day: 244 (1979, 1, -2)
missing job start day: 382 (1977, 10, -2)
missing job stop day: 382 (1978, 2, -2)
missing job start day: 619 (1978, 11, -2)
missing job stop day: 619 (1978, 11, -2)
missing job start day: 1111 (1976, 4, -2)
missing job start day: 1225 (1977, 3, -3)
missing job start day: 1469 (1978, 6, -3)
missing job stop day: 1469 (1978, 9, -3)
missing job start day: 4179 (1977, 8, -2)
missing job start day: 4199 (1978, 1, -3)
missing job stop day: 4199 (1978, 2, -3)
missing job start day: 4280 (1977, 2, -2)
missing job start day: 5064 (1976, 6, -2)
missing job start day: 5121 (1978, 10, -3)
missing job stop day: 5121 (1978, 10, -2)
missing job start month: 5425 (1974, -2, -2)
missing job start day: 5425 (1974, 6, -2)
missing job start day: 5615 (1977, 11, -2

In [15]:
missing

defaultdict(int,
            {'missing job start day': 34,
             'missing job start month': 3,
             'missing job stop day': 14})

# Construct one-year-prebirth monthly sequences

My status classification scheme:

* 1 - full-time employment
* 2 - working with missing hours
* 3 - part-time employent [**NEW: treat working with hour 0 as part-time**]
* 4 - nonwokring 
* 5 - maternity leave

In [16]:
# helper functions

def find_job_dates(Id):
    """
    Given job id, return a list of job dates
    """
    
    row = df79[df79.caseid_1979==Id].iloc[0] # with extra variables
    
    job_dates = [] 
    
    for i in range(1, 6):
        start_y = row['employer_startdate_%02d_y_1979'%i]
        start_m = row['employer_startdate_%02d_m_1979'%i]
        start_d = row['employer_startdate_%02d_d_1979'%i]
        
        stop_y = row['employer_stopdate_%02d_y_1979'%i]
        stop_m = row['employer_stopdate_%02d_m_1979'%i]
        stop_d = row['employer_stopdate_%02d_d_1979'%i]
        
        # edit years - years are often reported in two digit format
        if start_y < 1900 and start_y > 0:
            start_y += 1900
        if stop_y < 1900 and stop_y > 0:
            stop_y += 1900
        
        if start_m < 0 and start_m != -4:
            # print 'missing job start month:', row['caseid_1979'], (start_y, start_m, start_d)
            start_m = 6

        if stop_m < 0 and stop_m != -4:
            # print 'missing job start month:', row['caseid_1979'], (stop_y, stop_m, stop_d)
            stop_m = 6
    
        if start_d < 0 and start_d != -4: # assume job started on the 15th
            # print 'missing job start day:', row['caseid_1979'], (start_y, start_m, start_d)
            start_d = 15
            
        if stop_d < 0 and stop_d != -4: # assume job started on the 15th
            # print 'missing job stop day:', row['caseid_1979'], (stop_y, stop_m, stop_d)
            stop_d = 15            
        
        if start_y > 0 and start_m > 0 and start_d > 0:
            if stop_y == -4 and stop_m == -4 and stop_d == -4: # job still current at 1979 interview
                job_dates.append((i, start_y, start_m, start_d, 1979, 1, 1))
            elif stop_y > 0 and stop_m > 0 and stop_d > 0:
                job_dates.append((i, start_y, start_m, start_d, stop_y, stop_m, stop_d))
            else:
                raise Exception, 'should be no more missing stop date'
        elif start_y == -4 and start_m == -4 and start_d == -4: # valid skip
            pass 
        else:
            raise Exception, 'should be no more missing start date'
            
    return job_dates


In [17]:
def is_in_job_period(week_num, job_date):
    """
    Given a week number and the start and stop dates of a job, 
    return True if the given week overlaps the job duration
    """
    i, start_y, start_m, start_d, stop_y, stop_m, stop_d = job_date
    
    the_week = weekdf[weekdf['Continuous \nWeek Number']==week_num].iloc[0]
    week_start = datetime(int(the_week['Week Start: \nYear']), int(the_week['Week Start:\nMonth']), int(the_week['Week Start: \nDay']))
    week_stop = week_start + timedelta(days=6)
    
    if week_stop < datetime(start_y, start_m, start_d): # week ended before job started
        return False
    elif week_start > datetime(stop_y, stop_m, stop_d): # week started after job ended
        return False
    else: # otherwise, there is at leats one day overlap
        return True
    
    
# test it
print is_in_job_period(week_num=-252, job_date=(1, 1975, 9, 20, 1979, 1, 1)) # week -252 starts on 1973/3/4
print is_in_job_period(week_num=-17, job_date=(1, 1975, 9, 20, 1979, 1, 1)) # week -17 starts on 1977/9/4
print is_in_job_period(week_num=200, job_date=(1, 1975, 9, 20, 1979, 1, 1)) # week 200 starts on 1981/10/25

False
True
False


In [18]:
def get_weekly_status(week_num, job_dates, row):
    """
    Given a week number, find all jobs that overlap with the week.
    
    If no overlap, weekly status is 4 (nonemployed)
    
    If there is overlap(s), add up hours per week from all overlapped jobs. 
    - If hours >= 35, weekly status is 1 (full-time)
    - If hours is between [0, 35), weekly status is 3 (part-time)
    - If hours is missing, weekly status is 2 (working with missing hours)
    """
    
    overlap_jobs = [job_date for job_date in job_dates if is_in_job_period(week_num, job_date)]
        
    if len(overlap_jobs) == 0: # this month does not overlap with any job
        return 4 # nonemployed

    else:    
        overlap_job_nums = map(itemgetter(0), overlap_jobs)
        hours = [row['hours_per_week_job_%02d_1979'%j] for j in overlap_job_nums]
        missing_hours = [h < 0 for h in hours]

        if sum(missing_hours) > 0: # has missing hours
            # print 'working with missing hours:', row['caseid_1979'], 'week #:', week_num
            return 2 # working with missing hours
        else:
            if sum(hours) >= 35:
                return 1 # full-time
            else:
                return 3 # part-time

In [19]:
def find_modal(l):
    """
    Given a list l, return the most common element in the list. 
    if there is a tie, return all elements.
    """
    
    #first, count frequency of each unique item in the list
    counter = defaultdict(int)
    for item in l:
        counter[item] += 1

    #rank items from most frequent to least frequent
    popular_items = sorted(counter, key=counter.get, reverse = True)
    
    most_common = [popular_items[0]]
    
    for other in popular_items[1:]:
        if counter[other] < counter[most_common[0]]:
            break
        elif counter[other] == counter[most_common[0]]: #find an equally frequent item
            most_common.append(other)
        else:
            print 'impossible'

    return most_common

#for example
print find_modal([1, 1, 2, 1, 5])
print find_modal([1, 1, 2, 1, 5, 2, 2, 33])

[1]
[1, 2]


### Helper functions for pre-1978 within-job gaps

**Decision rules:** If has nonemployed period prior to interview and working up to 1 month before the birth and 4 months after the birth, assign these months to status 5 - maternity leave

In [31]:
def diff_month(d1, d2):
    """
    Return the number of months between the two dates
    """
    
    return (d1.year - d2.year) * 12 + d1.month - d2.month


def has_nonemployed_period(row):
    """
    Return True if respondent reported any period nonemployed
    """
    
    vals = [row['nonemployed_prior_int_%02d_1979'%i] for i in range(1, 6)]
    # vals += [row['nonemployed_since_int_%02d_1979'%i] for i in range(1, 6)]
    
    return 1 in vals


def is_close_to_birth(y, m, row):
    """
    Return True if the given month is up to 1 month before any birth and 4 months after any birth.
    """
    
    for i in range(1, 12):
        if row['c%idob_y_xrnd'%i] > 0: # there is a birth
            
            if row['c%idob_m_xrnd'%i] > 0:
                d = diff_month(datetime(int(row['c%idob_y_xrnd'%i]), int(row['c%idob_m_xrnd'%i]), 1), datetime(y, m, 1))
                return (d >= -4 and d <= 1)
            
            else: # if birth month missing, assume June
                d = diff_month(datetime(int(row['c%idob_y_xrnd'%i]), 6, 1), datetime(y, m, 1))
                return (d >= -4 and d <= 1)
    




In [27]:
# merge datasets into one dataframe

full_df['caseid_1979'] = full_df.id # make a common id column, eaiser for merging
merged = full_df.merge(df79, on='caseid_1979')
merged = merged.merge(wkarray_df[['caseid_1979', 'hrs_worked_wk_num0000_xrnd']], on='caseid_1979')
merged = merged.merge(analytic_df, on='caseid_1979')

In [41]:
# Now, construct prebirth monthly sequences

random.seed(7222014) # set random seed

res_data = defaultdict(list) #create an empty dictionary to store data
count_ties = []

for i, row in merged.iterrows():
    
    # keep basic info: id, sequence start year and month
    res_data['id'].append(row['id'])
    
    y = row['start_y']
    m = row['start_m']
    
    # look at weekly statuses    
    for j in range(-1, -13, -1): # from month -12 to month -1

        #first, move to previous month (we are starting y, m from post-birth)
        if m == 1:
            m = 12
            y -= 1
        else:
            m -= 1
        

        status = None
        
        ### modification: prior to 1978 ###
        if y < 1978: 
            hrs_pre1978 = wkarray_df[wkarray_df.caseid_1979==row['id']].iloc[0]['hrs_worked_wk_num0000_xrnd']
            
            if hrs_pre1978 == 0: # never worked before 1978
                status = 4 # assign 4 - nonemployed
                
            else:
                weeks = range(get_week_num(y, m, 'first'), get_week_num(y, m, 'last')+1)
                wk_statuses = [get_weekly_status(w, job_dates, row) for w in weeks]

                #find unique items in the given month 
                distinct = list(set(wk_statuses))
                distinct.sort()

                if len(distinct) == 1:
                    status = distinct[0]

                elif len(distinct) >= 2:
                    print 'there are multiple statuses:', row['caseid_1979'], wk_statuses
                    modal = find_modal(wk_statuses)
                    if len(modal) == 1:
                        status = modal[0]
                    else:
                        print 'there is a pre-1978 tie:', row['caseid_1979']
                        count_ties.append(row['caseid_1979'])
                        status = random.choice(modal)
                        
                    
                # pre-1978/1/1 maternity leaves
                # only for pre-1978 sequence, if R has some nonemployed period, 
                # given R is working that month, and that month is within 4 months after birth or 1 month prior to birth,
                # we assign maternity leave to that month
                if has_nonemployed_period(row) and status in [1, 2, 3] and is_close_to_birth(y, m, row):
                    print 'changed to maternity leave:', row['caseid_1979'], y, m
                    status = 5 

        
        else: 
            
            # post-1978/1/1 maternity leaves are already identified in weekly statuses in full_df
            
            start = get_week_num(y, m, 'first')
            end = get_week_num(y, m, 'last')
            wk_statuses = [row['week%i'%k] for k in range(start, end+1)]
            
            #### replace status 99 (working, w/ hour 0) as status 3 (part-time) #### 
            wk_statuses = [v if v != 99 else 3 for v in wk_statuses]
            ##########################################################

            #find unique items in the given month 
            distinct = list(set(wk_statuses))
            distinct.sort()

            if len(distinct) == 1:
                status = distinct[0]

            elif len(distinct) >= 2:
                modal = find_modal(wk_statuses)
                if len(modal) == 1:
                    status = modal[0]
                else:
                    status = random.choice(modal)
                    count_ties.append(row['caseid_1979'])
                    

        res_data['month%s'%j].append(status)



cols = ['id'] + ['month%s'%i for i in range(-12, 0)]
res_df = pd.DataFrame(res_data, columns=cols) #restricted dataframe
res_df.head()

there are multiple statuses: 205 [4, 1, 1, 1]
there are multiple statuses: 1238 [4, 3, 3, 3]
there are multiple statuses: 1406 [4, 1, 1, 1]
changed to maternity leave: 1495 1977 10
there are multiple statuses: 1495 [4, 2, 2, 2]
changed to maternity leave: 1495 1977 9
there are multiple statuses: 2447 [4, 2, 2, 2]
there are multiple statuses: 2742 [4, 2, 2, 2]
there are multiple statuses: 3445 [4, 1, 1, 1]
changed to maternity leave: 4105 1977 11
changed to maternity leave: 4105 1977 10
there are multiple statuses: 4105 [4, 2, 2, 2]
there are multiple statuses: 4325 [4, 1, 1, 1]
there are multiple statuses: 4400 [4, 1, 1, 1]
changed to maternity leave: 4400 1977 9
there are multiple statuses: 4476 [4, 1, 1, 1]
there are multiple statuses: 4538 [4, 2, 2, 2]
there are multiple statuses: 4780 [4, 2, 2, 2]
there are multiple statuses: 4974 [4, 3, 3, 3]
there are multiple statuses: 5782 [4, 1, 1, 1]
there are multiple statuses: 6177 [4, 2, 2, 2]
changed to maternity leave: 6177 1977 9
there 

Unnamed: 0,id,month-12,month-11,month-10,month-9,month-8,month-7,month-6,month-5,month-4,month-3,month-2,month-1
0,2,1,1,1,1,1,1,1,1,1,4,4,5
1,3,1,1,1,1,1,1,1,1,1,1,4,4
2,4,4,4,4,4,4,4,4,4,4,4,4,4
3,8,4,4,4,4,4,4,4,4,4,4,4,4
4,16,1,1,1,1,1,1,1,1,1,1,1,1


In [42]:
# how many ties?
len(count_ties)

505

In [43]:
# export to csv
res_df.to_csv("../data/monthly_df_prebirth.csv", index=False)

## Explore descriptive statistics of the prebirth sequence

In [44]:
def count_status(row, status):
    """
    Count how many months have the given status
    """
    
    has_status = [row['month%s'%i]==status for i in range(-12, 0)]
    return sum(has_status)

In [51]:
# status 0 - completely missing
counts = res_df.apply(lambda row: count_status(row, 0), axis=1)
counts.value_counts()

0     3345
1       15
3       11
2       11
12      10
9        9
7        7
4        7
5        6
8        4
6        4
11       3
10       2
dtype: int64

In [52]:
# The respondents with status 0 in prebirth sequence all started their sequences after 1978/1/1
# There should be no missing pre-1978
tmp = full_df[counts>0]
pd.crosstab(tmp.start_y, tmp.start_m)

start_m,1,2,3,4,5,6,7,8,9,10,11,12
start_y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1978,0,1,0,0,1,1,1,2,0,4,1,1
1979,4,2,5,3,4,3,1,3,5,1,3,7
1980,8,4,2,3,2,0,1,3,0,3,3,0
1981,1,1,1,0,0,0,0,0,0,0,0,0
1986,0,0,0,0,0,0,0,0,0,0,0,1
1989,0,0,0,0,0,0,0,0,0,0,1,0
1990,0,0,0,0,1,0,0,0,0,0,0,0
1994,0,0,0,0,0,1,0,0,0,0,0,0


In [53]:
# status 1 - full-time
counts = res_df.apply(lambda row: count_status(row, 1), axis=1)
counts.value_counts()

0     1663
12     513
11     255
10     155
2      117
1      104
3      100
9       96
8       93
4       90
5       87
6       87
7       74
dtype: int64

In [54]:
# status 2 - work with missing hours
counts = res_df.apply(lambda row: count_status(row, 2), axis=1)
counts.value_counts()

0     3399
4        7
1        6
2        5
3        4
6        4
5        3
7        2
8        2
11       1
9        1
dtype: int64

In [55]:
# status 3 - part-time
counts = res_df.apply(lambda row: count_status(row, 3), axis=1)
counts.value_counts()

0     2626
2      112
1      106
3       98
4       86
5       68
7       58
6       54
11      48
9       48
10      46
12      43
8       41
dtype: int64

In [56]:
# status 4 - nonemployed
counts = res_df.apply(lambda row: count_status(row, 4), axis=1)
counts.value_counts()

12    1130
0     1038
1      156
2      141
3      126
9      118
10     116
8      113
7      109
4      107
5       97
11      96
6       87
dtype: int64