# OptOut Project: Sequence Construction for Plotting Purpose

* Include an earlier year to show in state distribution plot
* Construct monthly sequences

In [1]:
#setup
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time
import random
from collections import defaultdict

# Week-to-week crosswalk

Create two functions to convert between week number and calendar year and month

In [2]:
#this is the file to switch between week number in NLSY79 and calendar year, month, and day
xl_file = pd.ExcelFile('../data/continuous_week_crosswalk_2012.xlsx') 

# 'weekdates1' is the sheet that holds the crosswalk: one row per week, giving the
# week's start month/day/year, its week-of-year number and its continuous week number
weekdf = xl_file.parse('weekdates1')
weekdf.head()

Unnamed: 0,Week Start: Month,Week Start: Day,Week Start: Year,Calendar Year Week Number,Continuous Week Number
0,1,1,1978,1,1
1,1,8,1978,2,2
2,1,15,1978,3,3
3,1,22,1978,4,4
4,1,29,1978,5,5


In [3]:
#column names
# NOTE: the headers come straight from the spreadsheet and contain embedded
# newlines and stray spaces (e.g. 'Week Start:\nMonth' vs 'Week Start: \nYear'),
# so they have to be spelled exactly like this when indexing weekdf
weekdf.columns

Index([u'Week Start:\nMonth', u'Week Start: \nDay', u'Week Start: \nYear',
       u'Calendar Year \nWeek Number ', u'Continuous \nWeek Number'],
      dtype='object')

In [4]:
# sanity check: the largest continuous week number should match the row count
largest_week = max(weekdf['Continuous \nWeek Number'].values)
row_count = len(weekdf)
print('%s %s' % (largest_week, row_count))

1879 1879


`weekdf` has a total of 1879 rows, and the continuous week number has a maximum value of 1879 too, so the two agree. If they ever differ, modify `weekdf` before continuing.

In [5]:
def get_week_num(year, month, which, df=weekdf):
    """
    Given a calendar year and month, return the corresponding week number in NLSY79.

    A week is assigned to the month in which it *starts* (the lookup matches the
    'Week Start' month and year columns of the crosswalk).

    Parameters
    ----------
    year : int
        Calendar year.
    month : int
        Calendar month.
    which : str
        If which=='first', return the first week in that month.
        If which=='last', return the last week in that month.
        "First" and "last" refer to row order in the crosswalk.
    df : DataFrame
        Week crosswalk to search. Defaults to the weekdf that exists when this
        function is defined.

    Returns
    -------
    int
        The continuous week number.

    Raises
    ------
    ValueError
        If `which` is neither 'first' nor 'last'.
    IndexError
        If no week in the crosswalk starts in the requested year and month.
    """

    if which not in ('first', 'last'):
        raise ValueError("which must be 'first' or 'last', got %r" % (which,))

    # use the crosswalk that was passed in, rather than always the global weekdf
    in_month = (df['Week Start:\nMonth'] == month) & (df['Week Start: \nYear'] == year)
    week_nums = df.loc[in_month, 'Continuous \nWeek Number']

    if len(week_nums) == 0:
        raise IndexError('no week in the crosswalk starts in year %s, month %s' % (year, month))

    position = 0 if which == 'first' else -1
    return int(week_nums.iloc[position])

In [6]:
def get_year_month(weeknum, df=weekdf):
    """
    Given a week number, return the calendar year and month in which that week starts.

    Parameters
    ----------
    weeknum : int
        Continuous week number.
    df : DataFrame
        Week crosswalk to search. Defaults to the weekdf that exists when this
        function is defined.

    Returns
    -------
    (int, int)
        (year, month), i.e. (yyyy, m) -- year first, then month.

    Raises
    ------
    IndexError
        If the week number does not appear in the crosswalk.
    """

    # use the crosswalk that was passed in, rather than always the global weekdf
    matches = df[df['Continuous \nWeek Number'] == weeknum]

    if len(matches) == 0:
        raise IndexError('week number %s is not in the crosswalk' % (weeknum,))

    the_week = matches.iloc[0]
    return int(the_week['Week Start: \nYear']), int(the_week['Week Start:\nMonth'])

# Read weekly data
My status classification scheme:

* 1 - full-time employment
* 2 - working with missing hours
* 3 - part-time employment
* 4 - nonworking
* 5 - maternity leave

In [7]:
# weekly status table: id, sequence start year/month, then one column per week
full_df = pd.read_csv('../data/full_wk_df.csv')
print(full_df.shape)
full_df.head()

(3434, 1882)


Unnamed: 0,id,start_y,start_m,week1,week2,week3,week4,week5,week6,week7,...,week1870,week1871,week1872,week1873,week1874,week1875,week1876,week1877,week1878,week1879
0,2,1993,4,4,4,4,4,4,4,4,...,1,1,1,1,1,1,1,1,1,1
1,3,1981,7,4,4,4,4,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,4,1980,9,4,4,4,4,4,4,4,...,0,0,0,0,0,0,0,0,0,0
3,8,1976,4,4,4,4,4,4,4,4,...,1,1,1,1,1,1,1,1,1,1
4,16,1990,3,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [8]:
# load the weekly arrays (hours worked per week), keyed by caseid_1979
wkarray_df = pd.read_csv('../data/weeklyarrays.csv')
print(wkarray_df.shape)
print(wkarray_df.columns[:20])

(12686, 3969)
Index([u'caseid_1979', u'hrs_worked_wk_num0000_xrnd',
       u'hrs_worked_wk_num0001_xrnd', u'hrs_worked_wk_num0002_xrnd',
       u'hrs_worked_wk_num0003_xrnd', u'hrs_worked_wk_num0004_xrnd',
       u'hrs_worked_wk_num0005_xrnd', u'hrs_worked_wk_num0006_xrnd',
       u'hrs_worked_wk_num0007_xrnd', u'hrs_worked_wk_num0008_xrnd',
       u'hrs_worked_wk_num0009_xrnd', u'hrs_worked_wk_num0010_xrnd',
       u'hrs_worked_wk_num0011_xrnd', u'hrs_worked_wk_num0012_xrnd',
       u'hrs_worked_wk_num0013_xrnd', u'hrs_worked_wk_num0014_xrnd',
       u'hrs_worked_wk_num0015_xrnd', u'hrs_worked_wk_num0016_xrnd',
       u'hrs_worked_wk_num0017_xrnd', u'hrs_worked_wk_num0018_xrnd'],
      dtype='object')


# Construct one-year-prebirth monthly sequences

In [9]:
def find_modal(l):
    """
    Given a list l, return the most common element(s) in the list, as a list.

    If one element is strictly more frequent than all others, the result has
    length 1. If there is a tie for most frequent, all tied elements are
    returned. An empty input returns an empty list.

    Elements must be hashable, because they are used as dictionary keys.
    """

    #first, count frequency of each unique item in the list
    counter = defaultdict(int)
    for item in l:
        counter[item] += 1

    # nothing to count: there is no modal element
    if not counter:
        return []

    # keep every item that reaches the highest count; iterating over the
    # dictionary keeps tied items in the dictionary's own order
    top_count = max(counter.values())
    return [item for item in counter if counter[item] == top_count]

# quick demonstration: a clear winner first, then a two-way tie
print(find_modal([1, 1, 2, 1, 5]))
print(find_modal([1, 1, 2, 1, 5, 2, 2, 33]))

[1]
[1, 2]


In [10]:
random.seed(7222014)

res_data = defaultdict(list) #create an empty dictionary to store data
count_ties = [] #how many we get ties of modal

for i, row in full_df.iterrows():
    
    # keep track of progress
    if i%100 == 0:
        print i,
    
    # keep basic info: id, sequence start year and month
    res_data['id'].append(row['id'])
    
    y = row['start_y']
    m = row['start_m']
    
    # look at weekly statuses    
    for j in range(-1, -13, -1): # from month -12 to month -1

        #first, move to previous month (we are starting y, m from post-birth)
        if m == 1:
            m = 12
            y -= 1
        else:
            m -= 1
        
        
        ### modification: prior to 1978 ###
        if y < 1978: 
            hrs_pre1978 = wkarray_df[wkarray_df.caseid_1979==row['id']].iloc[0]['hrs_worked_wk_num0000_xrnd']
            
            if hrs_pre1978 == 0: # never worked before 1978
                res_data['month%s'%j].append(4) # assign 4 - nonemployed
            else:
                res_data['month%s'%j].append(0) # otherwise, assign missing
        
        else: 
            start = get_week_num(y, m, 'first')
            end = get_week_num(y, m, 'last')
            wk_statuses = [row['week%i'%k] for k in range(start, end+1)]
            
            #### replace status 99 (working, w/ hour 0) as status 3 (part-time) #### 
            wk_statuses = [v if v != 99 else 3 for v in wk_statuses]
            ##########################################################

            #find unique items in the given month 
            distinct = list(set(wk_statuses))
            distinct.sort()

            if len(distinct) == 1:
                res_data['month%s'%j].append(distinct[0])

            elif len(distinct) >= 2:
                modal = find_modal(wk_statuses)
                if len(modal) == 1:
                    res_data['month%s'%j].append(modal[0])
                else:
                    res_data['month%s'%j].append(random.choice(modal))
                    count_ties.append(row['id'])


cols = ['id'] + ['month%s'%i for i in range(-12, 0)]
res_df = pd.DataFrame(res_data, columns=cols) #restricted dataframe

0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400


In [11]:
# preview the first rows: id plus one status column per month, month-12 .. month-1
res_df.head()

Unnamed: 0,id,month-12,month-11,month-10,month-9,month-8,month-7,month-6,month-5,month-4,month-3,month-2,month-1
0,2,1,1,1,1,1,1,1,1,1,4,4,5
1,3,1,1,1,1,1,1,1,1,1,1,4,4
2,4,4,4,4,4,4,4,4,4,4,4,4,4
3,8,4,4,4,4,4,4,4,4,4,4,4,4
4,16,1,1,1,1,1,1,1,1,1,1,1,1


In [12]:
# save the pre-birth monthly sequences; index=False leaves the row index out of the file
res_df.to_csv("../data/monthly_df_prebirth.csv", index=False)