# Test Stata and Python Output of Maternity Leaves

In [1]:
#setup
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from collections import defaultdict
from operator import itemgetter
from datetime import datetime

# set some nicer defaults for matplotlib
from matplotlib import rcParams

#these colors come from colorbrewer2.org. Each is an RGB triplet
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
                (0.4, 0.4, 0.4)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
try:
    # 'axes.color_cycle' was deprecated in matplotlib 1.5 in favour of
    # 'axes.prop_cycle' and is rejected by later releases, so prefer the new key.
    from matplotlib import cycler
    rcParams['axes.prop_cycle'] = cycler('color', dark2_colors)
except ImportError:
    # matplotlib too old to provide cycler: fall back to the legacy key
    rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = False
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'

def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Strip plot borders (spines) and axis ticks that are not wanted.

    axes is the Axes to modify; when it is omitted (or falsy) the current
    pyplot axes are used. Each of top/right/left/bottom says whether the
    border on that side stays visible; ticks are re-enabled only on the
    sides that stay visible.
    """
    target = axes if axes else plt.gca()

    # show or hide each of the four spines
    for side, shown in (('top', top), ('right', right),
                        ('left', left), ('bottom', bottom)):
        target.spines[side].set_visible(shown)

    # start from no ticks on either axis
    for axis in (target.yaxis, target.xaxis):
        axis.set_ticks_position('none')

    # then switch ticks back on for the sides that are kept
    # (x axis first, then y axis, same order as the borders are checked)
    tick_enablers = ((top, target.xaxis.tick_top),
                     (bottom, target.xaxis.tick_bottom),
                     (left, target.yaxis.tick_left),
                     (right, target.yaxis.tick_right))
    for wanted, enable in tick_enablers:
        if wanted:
            enable()



## Week-to-week crosswalk

Define two helper functions that convert between NLSY79 continuous week numbers and calendar year/month.

In [3]:
#this is the file to switch between week number in NLSY79 and calendar year, month, and day
# The 'weekdates1' sheet is parsed into weekdf, which the week/date helper
# functions in this notebook use as their lookup table.
xl_file = pd.ExcelFile('../data/continuous_week_crosswalk_2012.xlsx') 
weekdf = xl_file.parse('weekdates1')
weekdf.head()

Unnamed: 0,Week Start: Month,Week Start: Day,Week Start: Year,Calendar Year Week Number,Continuous Week Number
0,1,1,1978,1,1
1,1,8,1978,2,2
2,1,15,1978,3,3
3,1,22,1978,4,4
4,1,29,1978,5,5


In [4]:
def get_week_num(year, month, which, df=weekdf):
    """
    Given a calendar year and month, return the corresponding week number in NLSY79.

    Parameters
    ----------
    year, month : int
        Calendar year and month, matched against the crosswalk's week-start
        year and month columns.
    which : {'first', 'last'}
        'first' returns the first crosswalk week starting in that month,
        'last' returns the last one (in the row order of df).
    df : DataFrame, optional
        Crosswalk table to search. Defaults to the weekdf that existed when
        this function was defined.

    Returns
    -------
    int
        The continuous week number.

    Raises
    ------
    ValueError
        If which is neither 'first' nor 'last' (previously this silently
        returned None).
    IndexError
        If no crosswalk week starts in the requested year/month.
    """
    if which not in ('first', 'last'):
        raise ValueError("which must be 'first' or 'last', got %r" % (which,))

    # use the df argument (the original ignored it and always read the global weekdf)
    the_month = df[(df['Week Start:\nMonth']==month)&(df['Week Start: \nYear']==year)]

    position = 0 if which == 'first' else -1
    return int(the_month.iloc[position]['Continuous \nWeek Number'])
    
    
def get_year_month(weeknum, df=weekdf):
    """
    Given a week number, return the corresponding calendar year and month (yyyy, m).

    Parameters
    ----------
    weeknum : int
        Continuous week number to look up.
    df : DataFrame, optional
        Crosswalk table to search. Defaults to the weekdf that existed when
        this function was defined.

    Returns
    -------
    (int, int)
        Year and month in which the matching week starts.

    Raises
    ------
    IndexError
        If the week number is not present in the crosswalk.
    """
    # use the df argument (the original ignored it and always read the global weekdf)
    the_week = df[df['Continuous \nWeek Number']==weeknum].iloc[0]
    return int(the_week['Week Start: \nYear']), int(the_week['Week Start:\nMonth'])

## Retrieve analytic sample

In [5]:
# Load the analytic sample; the caseid_1979 column is the respondent key
# used for the merges and lookups in later cells.
analytic_df = pd.read_csv('../data/analytic_df.csv') #includes fertility data
print analytic_df.shape
analytic_df.head()

(3434, 36)


Unnamed: 0,caseid_1979,sample_id_1979,sample_race_1979,sample_sex_1979,c1dob_m_xrnd,c1dob_y_xrnd,c2dob_m_xrnd,c2dob_y_xrnd,c3dob_m_xrnd,c3dob_y_xrnd,...,y_dob,m_dob,y_age35,y_deceased,last_yr,attr_before35,deceased_before35,y_child18,deceased_child18,attr_child18
0,2,5,3,2,3,1993,11,1994,-4,-4,...,1959,1,1994,,2014,False,False,2011,False,False
1,3,5,3,2,6,1981,10,1983,4,1986,...,1961,8,1996,,2014,False,False,1999,False,False
2,4,5,3,2,8,1980,3,1997,-4,-4,...,1962,8,1997,,1998,False,False,1998,False,False
3,8,6,3,2,3,1976,5,1979,9,1982,...,1958,7,1993,,2014,False,False,1994,False,False
4,16,5,3,2,2,1990,8,1993,9,1996,...,1958,10,1993,,2014,False,False,2008,False,False


## Read weekly arrays and within-job gap dates

In [6]:
#read weekly array data
# Only the shape and the first 20 column names are printed; the table is very
# wide, so the full frame is never displayed.
wkarray_df = pd.read_csv('../data/weeklyarrays.csv')
print wkarray_df.shape
print wkarray_df.columns[:20]

(12686, 3969)
Index([u'caseid_1979', u'hrs_worked_wk_num0000_xrnd',
       u'hrs_worked_wk_num0001_xrnd', u'hrs_worked_wk_num0002_xrnd',
       u'hrs_worked_wk_num0003_xrnd', u'hrs_worked_wk_num0004_xrnd',
       u'hrs_worked_wk_num0005_xrnd', u'hrs_worked_wk_num0006_xrnd',
       u'hrs_worked_wk_num0007_xrnd', u'hrs_worked_wk_num0008_xrnd',
       u'hrs_worked_wk_num0009_xrnd', u'hrs_worked_wk_num0010_xrnd',
       u'hrs_worked_wk_num0011_xrnd', u'hrs_worked_wk_num0012_xrnd',
       u'hrs_worked_wk_num0013_xrnd', u'hrs_worked_wk_num0014_xrnd',
       u'hrs_worked_wk_num0015_xrnd', u'hrs_worked_wk_num0016_xrnd',
       u'hrs_worked_wk_num0017_xrnd', u'hrs_worked_wk_num0018_xrnd'],
      dtype='object')


In [7]:
#read within-job-gap data
# Columns follow the pattern per<gap#>_<start|stop>_<survey year>_job0<job#>_xrnd,
# as the printed column names show; get_gaps builds lookups from that pattern.
gap_df = pd.read_csv("../data/withingaps.csv")
print gap_df.shape
print gap_df.columns[:20]

(12686, 839)
Index([u'caseid_1979', u'per1_start_1979_job01_xrnd',
       u'per1_start_1979_job02_xrnd', u'per1_start_1979_job03_xrnd',
       u'per1_start_1979_job04_xrnd', u'per1_start_1979_job05_xrnd',
       u'per1_stop_1979_job01_xrnd', u'per1_stop_1979_job02_xrnd',
       u'per1_stop_1979_job03_xrnd', u'per1_stop_1979_job04_xrnd',
       u'per1_stop_1979_job05_xrnd', u'per2_start_1979_job01_xrnd',
       u'per2_start_1979_job02_xrnd', u'per2_start_1979_job03_xrnd',
       u'per2_start_1979_job04_xrnd', u'per2_start_1979_job05_xrnd',
       u'per2_stop_1979_job01_xrnd', u'per2_stop_1979_job02_xrnd',
       u'per2_stop_1979_job03_xrnd', u'per2_stop_1979_job04_xrnd'],
      dtype='object')


### Write a function to determine weeks of maternity leave - DISREGARD ALL MISSING GAPS

In [9]:
# TODO: modify to handle months prior to 1978


def get_gaps(row, d=None):
    """
    Given a row of respondent data,
    return a list of tuples (start week, end week) of within-job work gaps
    for that respondent.

    Gaps with missing information are disregarded: a (start, stop) pair is
    kept only when both values are >= 0. Negative values are presumably NLSY
    missing/skip codes (e.g. -4 for a valid skip) -- confirm against the codebook.

    Parameters
    ----------
    row : mapping or Series
        Must provide row['caseid_1979'], which is looked up in the global gap_df.
    d : optional
        Unused; kept only so existing calls keep working. It used to default
        to a module-level `missing_data` object, which made this definition
        fail with NameError on a fresh kernel where that name does not exist.

    Returns
    -------
    list of (start, stop) tuples, ordered by survey year, then gap period,
    then job number.

    Raises
    ------
    IndexError
        If the respondent's caseid is not present in gap_df.
    """

    gap_row = gap_df[gap_df.caseid_1979==row['caseid_1979']].iloc[0]
    gaps = []

    # annual survey years 1979-1994, then every second year 1996-2014
    # (list(...) keeps the concatenation valid on Python 3 as well as Python 2)
    years = list(range(1979, 1995)) + list(range(1996, 2015, 2))

    #look for within-job gaps in all years. sometimes the gaps were recorded in later years
    for y in years:
        for j in range(1, 5): #4 possible gap periods
            for k in range(1, 6): #5 possible jobs
                start_col = 'per%s_start_%s_job0%s_xrnd'%(j, y, k)
                stop_col = 'per%s_stop_%s_job0%s_xrnd'%(j, y, k)

                # some years do not report 5 jobs, so the column may not exist
                if start_col not in gap_row.index or stop_col not in gap_row.index:
                    continue

                gap_start = gap_row[start_col]
                gap_stop = gap_row[stop_col]

                if gap_start >= 0 and gap_stop >= 0: # valid, no missing
                    gaps.append((gap_start, gap_stop))

    return gaps



In [10]:
#select only week array data for our selected sample
# merge() is called without `how`, so pandas' default inner join applies:
# only caseids present in both frames are kept.
merged_wkarray_df = analytic_df.merge(wkarray_df, on='caseid_1979')
print merged_wkarray_df.shape

(3434, 4004)


In [11]:
ml_data = defaultdict(list) # store maternity leaves data


for i, row in merged_wkarray_df.iterrows():
    
    if i % 100 == 0:
        print i, 
    
    gaps = get_gaps(row)
    
    for j in range(1, 12): # all births
        dob_y = row['c%idob_y_xrnd'%j]
        dob_m = row['c%idob_m_xrnd'%j]
        
        if dob_y>0 and dob_m>0:
            
            # note: job gap dates do not go prior to 1978 
            # (we do not count the three cases with dates prior to 1/1/1978, as explained above)
            
            if dob_y >= 1978: 
                
                for (gap_start, gap_stop) in gaps:
                    birth_wks = range(get_week_num(dob_y, dob_m, 'first'), get_week_num(dob_y, dob_m, 'last')+1)
                    gap_wks = range(gap_start, gap_stop+1)
                    is_ma_leave = False

                    if set(birth_wks)&set(gap_wks): #the gap overlaps with the birth
                        is_ma_leave = True

                    elif gap_start>birth_wks[-1] and gap_start-birth_wks[-1]<=16: #gap started within 16 weeks after birth
                        is_ma_leave = True

                    if is_ma_leave:
                        ml_data['caseid_1979'].append(row['caseid_1979'])
                        ml_data['start_gap'].append(gap_start)
                        ml_data['stop_gap'].append(gap_stop)



0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400


In [40]:
# Turn the collected lists into a DataFrame: one row per gap that was appended
# to ml_data, with columns caseid_1979, start_gap and stop_gap.
ml_df = pd.DataFrame(ml_data)
print ml_df.shape
ml_df.head()

(1718, 3)


Unnamed: 0,caseid_1979,start_gap,stop_gap
0,2,792,800
1,4,1002,1013
2,8,67,92
3,8,218,256
4,16,645,650


## Compare with Stata output

In [57]:
# Load the Stata-side result to compare against ml_df
# (presumably produced by the Stata version of this analysis -- confirm the source of the .dta file).
ml_df_stata = pd.read_stata('../data/matleaves_long.dta')
print ml_df_stata.shape
ml_df_stata.head()

(2792, 4)


Unnamed: 0,caseid_1979,start_gap,stop_gap,matleave
0,2,792.0,800.0,1.0
1,4,1002.0,1013.0,1.0
2,8,67.0,92.0,1.0
3,8,218.0,256.0,2.0
4,10,271.0,272.0,1.0


In [58]:
# Keep only the three columns that ml_df also has, and cast the week numbers
# (shown as floats in the preview of the Stata file) to int.
# .copy() plus bracket assignment makes it explicit that a new frame is being
# modified, avoiding pandas' SettingWithCopyWarning on the column subset.
ml_df_stata = ml_df_stata[['caseid_1979', 'start_gap', 'stop_gap']].copy()
ml_df_stata['start_gap'] = ml_df_stata['start_gap'].astype(int)
ml_df_stata['stop_gap'] = ml_df_stata['stop_gap'].astype(int)
ml_df_stata.head()

Unnamed: 0,caseid_1979,start_gap,stop_gap
0,2,792,800
1,4,1002,1013
2,8,67,92
3,8,218,256
4,10,271,272


In [78]:
# Restrict the Stata output to respondents that are in the analytic sample,
# so both outputs cover the same set of caseids.
sample = analytic_df.caseid_1979.unique().tolist()
ml_df_stata = ml_df_stata[ml_df_stata.caseid_1979.isin(sample)]
print ml_df_stata.shape

(1691, 3)


In [74]:
df = pd.concat([ml_df, ml_df_stata])
# df = df.reset_index(drop=True)
left_df = df.drop_duplicates(keep=False)
print left_df.shape

(16, 3)


In [75]:
# Inspect the rows left after removing duplicated rows from the combined Python/Stata frame.
left_df

Unnamed: 0,caseid_1979,start_gap,stop_gap
482,2710,1363,1365
733,4050,920,921
768,4286,704,704
787,4424,860,860
828,4589,428,430
988,5406,320,321
1002,5442,759,760
1083,5873,602,602
1139,6334,707,708
1558,9858,76,76


In [84]:
# Show the analytic-sample record (including child birth dates) for caseid 4589.
analytic_df[analytic_df.caseid_1979==4589].iloc[0]

caseid_1979           4589
sample_id_1979           5
sample_race_1979         3
sample_sex_1979          2
c1dob_m_xrnd            11
c1dob_y_xrnd          1980
c2dob_m_xrnd             3
c2dob_y_xrnd          1986
c3dob_m_xrnd            -4
c3dob_y_xrnd            -4
c4dob_m_xrnd            -4
c4dob_y_xrnd            -4
c5dob_m_xrnd            -4
c5dob_y_xrnd            -4
c6dob_m_xrnd            -4
c6dob_y_xrnd            -4
c7dob_m_xrnd            -4
c7dob_y_xrnd            -4
c8dob_m_xrnd            -4
c8dob_y_xrnd            -4
c9dob_m_xrnd            -4
c9dob_y_xrnd            -4
c10dob_m_xrnd           -4
c10dob_y_xrnd           -4
c11dob_m_xrnd           -4
c11dob_y_xrnd           -4
y_dob                 1959
m_dob                   10
y_age35               1994
y_deceased             NaN
last_yr               2014
attr_before35        False
deceased_before35    False
y_child18             1998
deceased_child18     False
attr_child18         False
Name: 1542, dtype: object

In [83]:
for i, row in left_df.iterrows():
    python_df = ml_df[ml_df.caseid_1979==row['caseid_1979']]
    stata_df = ml_df_stata[ml_df_stata.caseid_1979==row['caseid_1979']]
    
    if python_df.shape[0] != 0:
        print 'python'
        print python_df
    if stata_df.shape[0] != 0:
        print 'stata'
        print stata_df
        
    print 
    

python
     caseid_1979  start_gap  stop_gap
482         2710       1363      1365

python
     caseid_1979  start_gap  stop_gap
733         4050        920       921
734         4050        920       923
stata
      caseid_1979  start_gap  stop_gap
1048         4050        920       923

python
     caseid_1979  start_gap  stop_gap
768         4286        704       704
769         4286        721       722
stata
      caseid_1979  start_gap  stop_gap
1093         4286        721       722

python
     caseid_1979  start_gap  stop_gap
787         4424        860       860
788         4424        861       863
789         4424        864       864
790         4424        864       866
stata
      caseid_1979  start_gap  stop_gap
1122         4424        861       863
1123         4424        864       864
1124         4424        864       866

python
     caseid_1979  start_gap  stop_gap
828         4589        428       430

python
     caseid_1979  start_gap  stop_gap
987         540