# Weekly Sequences for Hope's Project

## Sequence Construction

In [1]:
#setup
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from collections import defaultdict
from operator import itemgetter
from datetime import datetime, timedelta

# set some nicer defaults for matplotlib
from matplotlib import rcParams

# Apply every plotting default in one update: a 10x6 figure at 150 dpi,
# thicker lines, no grid, a white axes background, a larger font, and
# patches drawn without an edge color.
rcParams.update({
    'figure.figsize': (10, 6),
    'figure.dpi': 150,
    'lines.linewidth': 2,
    'axes.grid': False,
    'axes.facecolor': 'white',
    'font.size': 14,
    'patch.edgecolor': 'none',
})

def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Strip plot borders (spines) and tick marks that are not wanted.

    axes falls back to the current matplotlib axes when it is omitted.
    Each of the top/right/left/bottom flags says whether that side keeps
    its border and its tick marks.
    """
    target = axes or plt.gca()

    # show or hide each spine, in the order top, right, left, bottom
    spine_flags = (('top', top), ('right', right), ('left', left), ('bottom', bottom))
    for side, keep in spine_flags:
        target.spines[side].set_visible(keep)

    # clear every tick first ...
    for axis in (target.yaxis, target.xaxis):
        axis.set_ticks_position('none')

    # ... then switch ticks back on only for the sides that stay visible
    tick_flags = (
        (top, target.xaxis, 'tick_top'),
        (bottom, target.xaxis, 'tick_bottom'),
        (left, target.yaxis, 'tick_left'),
        (right, target.yaxis, 'tick_right'),
    )
    for keep, axis, enabler in tick_flags:
        if keep:
            getattr(axis, enabler)()

## Week-to-week crosswalk

Create two functions to convert between week number and calendar year and month

In [2]:
# Crosswalk workbook used to switch between the NLSY79 week number and the
# calendar year, month, and day.
CROSSWALK_PATH = '../data/continuous_week_crosswalk_r26.xlsx'
CROSSWALK_SHEET = 'weekdates1'

xl_file = pd.ExcelFile(CROSSWALK_PATH)
weekdf = xl_file.parse(CROSSWALK_SHEET)
weekdf.head()

Unnamed: 0,Week Start: Month,Week Start: Day,Week Start: Year,Calendar Year Week Number,Continuous Week Number
0,1,1,1978,1,1
1,1,8,1978,2,2
2,1,15,1978,3,3
3,1,22,1978,4,4
4,1,29,1978,5,5


In [3]:
def get_week_num(year, month, which, df=None):
    """
    Given a calendar year and month, return the corresponding week number in NLSY79.

    Parameters
    ----------
    year, month : int
        Calendar year and month to look up.
    which : str
        'first' returns the week number of the first crosswalk row whose
        week starts in that month; 'last' returns that of the last such row.
        (The crosswalk is assumed to be ordered by week.)
    df : DataFrame, optional
        Week crosswalk with the week-start month, week-start year, and
        continuous week number columns. When omitted, the module-level
        `weekdf` is used.

    Returns
    -------
    int
        The continuous week number.

    Raises
    ------
    ValueError
        If `which` is neither 'first' nor 'last'.
    IndexError
        If the crosswalk has no week starting in the requested month.
    """
    # Reject bad selectors up front instead of silently returning None.
    if which not in ('first', 'last'):
        raise ValueError("which must be 'first' or 'last', got %r" % (which,))

    # Look the crosswalk up at call time so that re-running the cell that
    # loads `weekdf` is picked up without redefining this function.
    if df is None:
        df = weekdf

    in_month = (df['Week Start:\nMonth'] == month) & (df['Week Start: \nYear'] == year)
    week_numbers = df.loc[in_month, 'Continuous \nWeek Number']

    if len(week_numbers) == 0:
        raise IndexError('no weeks found for year=%r, month=%r' % (year, month))

    position = 0 if which == 'first' else -1
    return int(week_numbers.iloc[position])

    
def get_year_month(weeknum, df=None):
    """
    Given a week number, return the corresponding calendar year and month (yyyy, m).

    Parameters
    ----------
    weeknum : int
        Continuous week number to look up.
    df : DataFrame, optional
        Week crosswalk with the week-start month, week-start year, and
        continuous week number columns. When omitted, the module-level
        `weekdf` is used.

    Returns
    -------
    (int, int)
        The year and month in which the week starts, taken from the first
        matching crosswalk row.

    Raises
    ------
    IndexError
        If the week number is not present in the crosswalk.
    """
    # Look the crosswalk up at call time so that re-running the cell that
    # loads `weekdf` is picked up without redefining this function.
    if df is None:
        df = weekdf

    matches = df[df['Continuous \nWeek Number'] == weeknum]
    if len(matches) == 0:
        raise IndexError('week number %r not found in the crosswalk' % (weeknum,))

    first_match = matches.iloc[0]
    return int(first_match['Week Start: \nYear']), int(first_match['Week Start:\nMonth'])

## Retrieve Extended Sample

In [4]:
# Load the analytic sample (includes fertility data)
analytic_df = pd.read_csv('../data/sample_hope.csv')

# Single-argument print in call form runs unchanged on Python 2 and 3
print(analytic_df.shape)
analytic_df.head()

(4926, 26)


Unnamed: 0,caseid_1979,sample_id_1979,sample_race_1979,sample_sex_1979,c1dob_m_xrnd,c1dob_y_xrnd,c2dob_m_xrnd,c2dob_y_xrnd,c3dob_m_xrnd,c3dob_y_xrnd,...,c7dob_m_xrnd,c7dob_y_xrnd,c8dob_m_xrnd,c8dob_y_xrnd,c9dob_m_xrnd,c9dob_y_xrnd,c10dob_m_xrnd,c10dob_y_xrnd,c11dob_m_xrnd,c11dob_y_xrnd
0,1,5,3,2,-4,-4,-4,-4,-4,-4,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
1,2,5,3,2,3,1993,11,1994,-4,-4,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
2,3,5,3,2,6,1981,10,1983,4,1986,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
3,4,5,3,2,8,1980,3,1997,-4,-4,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
4,8,6,3,2,3,1976,5,1979,9,1982,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4


## Read weekly arrays and within-job gap dates

In [5]:
# Read the weekly array data
wkarray_df = pd.read_csv('../data/weeklyarrays.csv')

# Report the size and the first 20 column names as a quick sanity check.
# Single-argument print in call form runs unchanged on Python 2 and 3.
print(wkarray_df.shape)
print(wkarray_df.columns[:20])

(12686, 3969)
Index([u'caseid_1979', u'hrs_worked_wk_num0000_xrnd',
       u'hrs_worked_wk_num0001_xrnd', u'hrs_worked_wk_num0002_xrnd',
       u'hrs_worked_wk_num0003_xrnd', u'hrs_worked_wk_num0004_xrnd',
       u'hrs_worked_wk_num0005_xrnd', u'hrs_worked_wk_num0006_xrnd',
       u'hrs_worked_wk_num0007_xrnd', u'hrs_worked_wk_num0008_xrnd',
       u'hrs_worked_wk_num0009_xrnd', u'hrs_worked_wk_num0010_xrnd',
       u'hrs_worked_wk_num0011_xrnd', u'hrs_worked_wk_num0012_xrnd',
       u'hrs_worked_wk_num0013_xrnd', u'hrs_worked_wk_num0014_xrnd',
       u'hrs_worked_wk_num0015_xrnd', u'hrs_worked_wk_num0016_xrnd',
       u'hrs_worked_wk_num0017_xrnd', u'hrs_worked_wk_num0018_xrnd'],
      dtype='object')


## Transform weekly sequences


NLSY79 employment status:
* 100 TO 2615: ACTUAL SURVEY ROUND/JOB NUMBER
* 0: NO INFO REPORTED FOR WEEK
* 2: NOT WORKING (UNEMP V. OLF NOT DETERMINED)
* 3: ASSOC. WITH EMP, GAP DATES MISSING, ALL TIME NOT ACCTD FOR
* 4: UNEMPLOYED
* 5: OUT OF LABOR FORCE
* 7: ACTIVE MILITARY SERVICE


My status classification scheme:

If the status is an employer/job number (100 or higher; the NLSY79 range is 100 to 2615), I record the status as ‘w’ (i.e. working). 

If the hours >= 35, I record it as ‘ft’. 

If the hours is positive and < 35, I record it as pt. 

Then, 
* hours 0, status 0 —> missing 0
* hours ft and status ‘w’, or 3 —> fulltime 1
* hours 0 and status 7 (military) —> fulltime 1
* hours <=0 and status ‘w’, or 3 —> working without hours 2
* hours pt and status ‘w’ or 3 —> parttime 3
* hours <=0 and status ==2, 4, or 5 —> nonworking 4 (any remaining combination, e.g. positive hours with status 4 or 5, is also coded as nonworking 4)

In [7]:
# Merge the work history arrays into the analytic sample, matching on the
# respondent id column
merged = analytic_df.merge(wkarray_df, on='caseid_1979')

# Single-argument print in call form runs unchanged on Python 2 and 3
print(merged.shape)

(4926, 3994)


In [8]:
# Show the last five column names of the weekly arrays; useful for checking
# the highest week number present in the data
wkarray_df.columns[-5:]

Index([u'status_wk_num1979_xrnd', u'status_wk_num1980_xrnd',
       u'status_wk_num1981_xrnd', u'status_wk_num1982_xrnd',
       u'status_wk_num1983_xrnd'],
      dtype='object')

In [9]:
# Status codes written to the recoded weekly sequences
STATUS_MISSING = 0           # hours 0 and status 0
STATUS_FULLTIME = 1          # working 35+ hours, or military with hours 0
STATUS_WORKING_NO_HOURS = 2  # working, hours <= 0
STATUS_PARTTIME = 3          # working, 0 < hours < 35
STATUS_NONWORKING = 4        # everything else


def classify_week(hours, status):
    """
    Collapse one week's hours and employment status into a 0-4 status code.

    hours  : value of the week's hrs_worked_wk_num column
    status : value of the week's status_wk_num column

    Returns one of the STATUS_* codes defined above. The checks run in the
    order listed below, and the first one that matches wins.
    """
    # A status of 100 or more (a job number) and status 3 both count as working
    working = status >= 100 or status == 3

    if hours == 0 and status == 0:
        return STATUS_MISSING
    elif hours >= 35 and working:
        return STATUS_FULLTIME
    elif hours == 0 and status == 7:
        # status 7 (military) with hours 0 is counted as full time
        return STATUS_FULLTIME
    elif hours <= 0 and working:
        return STATUS_WORKING_NO_HOURS
    elif hours < 35 and working:
        # hours <= 0 was handled just above, so this is 0 < hours < 35
        return STATUS_PARTTIME
    else:
        # status 2, 4, or 5 with hours <= 0, plus every remaining
        # combination (e.g. the few cases of positive hours with status 4/5)
        return STATUS_NONWORKING


# Week numbers covered by the output sequences
FIRST_WEEK, LAST_WEEK = 1, 1983

# Build the (output, hours, status) column names once, rather than
# re-formatting the same strings for every row
week_columns = [
    ('week%s' % week_num,
     'hrs_worked_wk_num%04d_xrnd' % week_num,
     'status_wk_num%04d_xrnd' % week_num)
    for week_num in range(FIRST_WEEK, LAST_WEEK + 1)
]

# Create a dictionary to store data: column name -> list with one entry per row
data = defaultdict(list)

for i, row in merged.iterrows():  # iterate through weekly array rows

    # keep track of progress; the trailing comma keeps the counters on one
    # line under Python 2, and the statement still runs under Python 3
    if i % 100 == 0:
        print(i),

    # keep the respondent id alongside the weekly statuses
    data['caseid_1979'].append(row['caseid_1979'])

    # recode every week for this row
    for out_col, hours_col, status_col in week_columns:
        data[out_col].append(classify_week(row[hours_col], row[status_col]))

0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900


In [10]:
# Create the new weekly array dataframe with my edited employment status

# week number range: 1 - 1983

# order the columns in the new dataframe: id first, then the weeks in order
cols = ['caseid_1979'] + ['week%s' % n for n in range(1, 1984)]

full_df = pd.DataFrame(data, columns=cols)

# Show the first row as a spot check.
# Single-argument print in call form runs unchanged on Python 2 and 3.
print(full_df.iloc[0])

caseid_1979    1
week1          3
week2          3
week3          3
week4          3
week5          3
week6          3
week7          3
week8          3
week9          3
week10         3
week11         3
week12         3
week13         3
week14         3
week15         3
week16         3
week17         3
week18         3
week19         3
week20         3
week21         1
week22         1
week23         1
week24         1
week25         1
week26         1
week27         1
week28         1
week29         1
              ..
week1954       0
week1955       0
week1956       0
week1957       0
week1958       0
week1959       0
week1960       0
week1961       0
week1962       0
week1963       0
week1964       0
week1965       0
week1966       0
week1967       0
week1968       0
week1969       0
week1970       0
week1971       0
week1972       0
week1973       0
week1974       0
week1975       0
week1976       0
week1977       0
week1978       0
week1979       0
week1980       0
week1981      

In [11]:
# Export full_df to CSV, leaving out the DataFrame index column
full_df.to_csv('../data/full_wk_df_hope.csv', index=False)