# OptOut: Construct Pre-1978 Employment Sequences 

Using raw work history variables

In [1]:
#setup
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from collections import defaultdict
from operator import itemgetter
from datetime import datetime

# set some nicer defaults for matplotlib
from matplotlib import rcParams

#these colors come from colorbrewer2.org. Each is an RGB triplet
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
                (0.4, 0.4, 0.4)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
# 'axes.color_cycle' was deprecated in favour of 'axes.prop_cycle' and is rejected
# (KeyError) by later matplotlib releases; prefer the new key and fall back to the
# old one only when this matplotlib predates prop_cycle / plt.cycler
try:
    rcParams['axes.prop_cycle'] = plt.cycler('color', dark2_colors)
except (AttributeError, KeyError):
    rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = False
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'

def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Strip plot borders (spines) and axis ticks that add nothing to the chart

    axes is the Axes to modify; the current Axes (plt.gca()) is used when none is given.
    Each of top/right/left/bottom says whether that border, and the ticks on
    that side, should be drawn.
    """
    ax = axes if axes else plt.gca()

    # show or hide each of the four spines
    for side, shown in (('top', top), ('right', right), ('left', left), ('bottom', bottom)):
        ax.spines[side].set_visible(shown)

    # start from a plot without any ticks
    for axis in (ax.yaxis, ax.xaxis):
        axis.set_ticks_position('none')

    # then switch ticks back on for the sides that remain visible
    tick_switches = (
        (top, ax.xaxis, 'tick_top'),
        (bottom, ax.xaxis, 'tick_bottom'),
        (left, ax.yaxis, 'tick_left'),
        (right, ax.yaxis, 'tick_right'),
    )
    for shown, axis, method_name in tick_switches:
        if shown:
            getattr(axis, method_name)()



In [7]:
def diff_month(d1, d2):
    """
    Number of calendar months from d2 to d1 (negative when d1 is earlier).

    Only the year and month of each date are used; the day is ignored.
    """
    year_gap = d1.year - d2.year
    month_gap = d1.month - d2.month
    return year_gap * 12 + month_gap

# Find subsample with missing pre-1978 sequences

In [2]:
# read existing monthly sequences
monthly_df = pd.read_csv("../data/monthly_df.csv")
monthly_df.head()

Unnamed: 0,id,start_y,start_m,month1,month2,month3,month4,month5,month6,month7,...,month207,month208,month209,month210,month211,month212,month213,month214,month215,month216
0,2,1993,4,5,1,4,4,4,4,4,...,1,1,1,1,1,1,1,1,1,1
1,3,1981,7,4,4,4,4,4,4,4,...,1,1,1,1,1,1,1,1,1,1
2,4,1980,9,4,4,4,4,4,4,4,...,3,3,3,3,3,3,3,0,0,0
3,8,1976,4,4,4,4,4,4,4,4,...,1,1,1,1,1,1,1,1,1,1
4,16,1990,3,1,1,5,5,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [11]:
# take an explicit copy: columns are added to pre1978_df later, and assigning to a
# boolean-mask slice of monthly_df triggers pandas' SettingWithCopyWarning
pre1978_df = monthly_df[monthly_df.start_y<1978].copy()
print(pre1978_df.shape)
pre1978_df.head(10) # pre-1978 monthly statuses are either 0 (missing) or 4 (nonemployed)

(481, 219)


Unnamed: 0,id,start_y,start_m,month1,month2,month3,month4,month5,month6,month7,...,month207,month208,month209,month210,month211,month212,month213,month214,month215,month216
3,8,1976,4,4,4,4,4,4,4,4,...,1,1,1,1,1,1,1,1,1,1
20,86,1977,12,4,4,4,4,4,4,4,...,1,1,1,1,1,4,1,1,1,1
62,204,1975,8,4,4,4,4,4,4,4,...,1,1,1,1,1,1,1,1,1,1
64,208,1975,9,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,0,0,0
66,224,1975,1,4,4,4,4,4,4,4,...,3,3,3,3,4,3,3,3,3,3
67,227,1975,10,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4
76,244,1975,8,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
90,285,1977,11,4,4,4,4,4,4,4,...,1,1,1,1,1,1,1,1,1,1
117,382,1976,5,0,0,0,0,0,0,0,...,4,4,4,4,4,4,1,1,1,1
123,397,1976,2,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4


In [13]:
def get_num_missing(row):
    """
    Count the pre-1978 months in a respondent's sequence that have status 0 (missing)

    row must provide start_y, start_m and the month1..monthN status columns.
    """

    # months elapsed between the first month of the sequence and 1978/1/1
    seq_start = datetime(int(row['start_y']), int(row['start_m']), 1)
    n_pre1978 = diff_month(datetime(1978, 1, 1), seq_start)

    # month1 .. month<n_pre1978> are exactly the pre-1978 months
    return sum(row['month%s'%month_num]==0 for month_num in range(1, n_pre1978+1))
    
pre1978_df['num_missing'] = pre1978_df.apply(get_num_missing, axis=1)
pre1978_df.num_missing.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0     363
29      6
16      5
32      5
14      5
1       4
25      4
24      4
6       4
23      4
36      3
21      3
19      3
15      3
42      3
12      3
10      3
9       3
5       3
3       3
26      3
18      2
7       2
4       2
28      2
30      2
60      2
33      2
34      2
35      2
37      2
54      2
38      2
40      2
47      1
2       1
59      1
52      1
51      1
50      1
8       1
43      1
46      1
45      1
44      1
65      1
41      1
20      1
22      1
31      1
27      1
79      1
Name: num_missing, dtype: int64

In [14]:
# only need to consider respondents with positive number of missing months pre-1978
# 118 respondents only
# copy the filtered rows so that columns added afterwards (e.g. caseid_1979) are set on
# an independent frame rather than on a slice (avoids SettingWithCopyWarning)
pre1978_df = pre1978_df[pre1978_df.num_missing>0].copy()
print(pre1978_df.shape)
pre1978_df.head()

(118, 220)


Unnamed: 0,id,start_y,start_m,month1,month2,month3,month4,month5,month6,month7,...,month208,month209,month210,month211,month212,month213,month214,month215,month216,num_missing
76,244,1975,8,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,29
117,382,1976,5,0,0,0,0,0,0,0,...,4,4,4,4,4,1,1,1,1,20
156,491,1974,2,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,47
202,627,1976,9,0,0,0,0,0,0,0,...,4,4,4,4,4,4,4,4,4,16
294,932,1977,9,0,0,0,0,3,3,3,...,1,1,1,1,1,1,1,1,1,4


# Read work history raw variables

In [5]:
df79 = pd.read_csv('../data/work_history79.csv')
df79.head()

Unnamed: 0,caseid_1979,employer_startdate_01_m_1979,employer_startdate_01_d_1979,employer_startdate_01_y_1979,employer_startdate_02_m_1979,employer_startdate_02_d_1979,employer_startdate_02_y_1979,employer_startdate_03_m_1979,employer_startdate_03_d_1979,employer_startdate_03_y_1979,...,nonemployed_prior_int_01_1979,nonemployed_prior_int_02_1979,nonemployed_prior_int_03_1979,nonemployed_prior_int_04_1979,nonemployed_prior_int_05_1979,nonemployed_since_int_01_1979,nonemployed_since_int_02_1979,nonemployed_since_int_03_1979,nonemployed_since_int_04_1979,nonemployed_since_int_05_1979
0,1,5,22,78,9,20,75,-4,-4,-4,...,-4,-4,-4,-4,-4,0,-4,-4,-4,-4
1,2,5,8,78,-4,-4,-4,-4,-4,-4,...,-4,-4,-4,-4,-4,0,-4,-4,-4,-4
2,3,9,11,78,5,15,78,1,30,78,...,-4,-4,-4,-4,-4,0,0,0,-4,-4
3,4,-4,-4,-4,-4,-4,-4,-4,-4,-4,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
4,5,2,1,78,6,5,78,-4,-4,-4,...,-4,-4,-4,-4,-4,1,0,-4,-4,-4


In [16]:
pre1978_df['caseid_1979'] = pre1978_df.id
pre1978_df = pre1978_df.merge(df79, on='caseid_1979')
pre1978_df.employer_startdate_01_y_1979.value_counts()

-4     44
 77    23
 78    18
 79    16
 76    12
 74     2
 75     1
 72     1
 69     1
Name: employer_startdate_01_y_1979, dtype: int64

**However, having valid skip in first job does not mean having valid skip in all jobs.**

In [17]:
# for those with valid skip in first job, do they have valid skip in all jobs?
# show distribution of start year for each job

sub_df = pre1978_df[pre1978_df.employer_startdate_01_y_1979==-4]
for i in range(1, 6):
    print sub_df['employer_startdate_%02d_y_1979'%i].value_counts()
    print

-4    44
Name: employer_startdate_01_y_1979, dtype: int64

77    23
78    13
79     2
76     2
75     2
74     2
Name: employer_startdate_02_y_1979, dtype: int64

-4     26
 77    10
 78     4
 79     2
 76     2
Name: employer_startdate_03_y_1979, dtype: int64

-4     41
 77     2
 78     1
Name: employer_startdate_04_y_1979, dtype: int64

-4     43
 78     1
Name: employer_startdate_05_y_1979, dtype: int64



# Update monthly work hours pre-1978

### First, explore job dates
Three respondents had a missing job start month, but in each case the job's start year is before the sequence start year, so the missing month does not matter.

In [50]:
def find_job_dates(row):
    """Explore job dates and missingness"""
    
    job_dates = [] # (start_y, start_m, stop_y, stop_m)
    
    for i in range(1, 6): # iterate through jobs
        start_y = row['employer_startdate_%02d_y_1979'%i]
        start_m = row['employer_startdate_%02d_m_1979'%i]
        stop_y = row['employer_stopdate_%02d_y_1979'%i]
        stop_m = row['employer_stopdate_%02d_m_1979'%i]
        
        if start_y > 0 and start_m > 0:
            
            # job still current at 1979 interview, okay to assign 1979/1 as stop month 
            # for our purpose of reconstructing pre-1978 sequence
            if stop_y == -4 and stop_m == -4: 
                job_dates.append((start_y, start_m, 1979, 1))
                
            elif stop_y > 0 and stop_m > 0:
                job_dates.append((start_y, start_m, stop_y, stop_m))
                
            else:
                print 'missing job stop date:', row['caseid_1979'], 'seq start at:', (row['start_y'], row['start_m']), 'job dates:', (start_y, start_m, stop_y, stop_m)
                
        elif start_y == -4 and start_m == -4:
            pass
        
        else:
            print 'missing job start date:', row['caseid_1979'], 'seq start at:', (row['start_y'], row['start_m']), 'job dates:', (start_y, start_m, stop_y, stop_m)
            job_dates.append((start_y, start_m, stop_y, stop_m))
            
    print job_dates
    
res = pre1978_df.apply(find_job_dates, axis=1)

[(77, 11, 1979, 1), (78, 2, 1979, 1), (78, 2, 1979, 1), (77, 2, 79, 1)]
[(78, 6, 78, 9), (77, 10, 78, 2)]
[(78, 10, 1979, 1), (78, 5, 78, 10), (72, 2, 78, 4), (78, 1, 78, 1)]
[(74, 6, 1979, 1)]
[(77, 9, 78, 10)]
[(76, 9, 1979, 1)]
[(77, 10, 78, 1)]
[(76, 4, 79, 2)]
[(77, 2, 1979, 1)]
[(77, 3, 1979, 1)]
[(78, 8, 79, 1), (77, 3, 78, 8)]
[(77, 9, 78, 8)]
[(76, 4, 1979, 1)]
[(79, 3, 1979, 1), (77, 8, 79, 3)]
[(78, 8, 1979, 1), (77, 9, 78, 7)]
[(77, 11, 78, 2)]
[(78, 11, 1979, 1), (78, 5, 78, 7), (78, 2, 78, 4), (77, 11, 78, 1)]
[(77, 3, 1979, 1)]
[(75, 10, 79, 1)]
[(77, 9, 1979, 1)]
[(78, 8, 1979, 1), (77, 9, 78, 8)]
[(79, 1, 1979, 1), (73, 2, 79, 1)]
[(78, 8, 78, 12), (77, 9, 78, 8)]
[(78, 8, 1979, 1), (78, 7, 78, 8), (78, 7, 78, 7), (74, 6, 78, 1)]
[(77, 9, 78, 3)]
[(77, 3, 1979, 1)]
[(78, 10, 1979, 1), (77, 11, 78, 2), (78, 8, 78, 9)]
[(79, 1, 1979, 1), (78, 7, 78, 11), (77, 11, 78, 4)]
[(77, 7, 78, 7)]
[(78, 11, 1979, 1), (78, 5, 78, 10), (77, 10, 78, 5)]
[(79, 1, 79, 2), (76, 4, 78, 8

### Look into 1980 work history raw variables and see if any respondent reported pre-1978 jobs in 1980 survey

No, all jobs reported in 1980 survey started in 1979 or 1980 (possibly continuing from the jobs reported in 1979, but for our purpose of reconstructing pre-1978 sequences, we don't need the jobs reported in 1980 survey).

In [20]:
df80 = pd.read_csv('../data/work_history80.csv')
df80.head()

Unnamed: 0,caseid_1979,emp1previd_1980,same_employer_1_1980,employer_startdate_01_m_1980,employer_startdate_01_d_1980,employer_startdate_01_y_1980,working_at_job_01_1980,employer_stopdate_01_m_1980,employer_stopdate_01_d_1980,employer_stopdate_01_y_1980,...,qes_2e_05_1980,employer_startdate_05_m_1980,employer_startdate_05_d_1980,employer_startdate_05_y_1980,nonemployed_prior_int_05_1980,working_at_job_05_1980,employer_stopdate_05_m_1980,employer_stopdate_05_d_1980,employer_stopdate_05_y_1980,hours_per_week_job_05_1980
0,1,-5,-5,-5,-5,-5,-5,-5,-5,-5,...,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5
1,2,1,-4,2,28,79,1,-4,-4,-4,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
2,3,-4,-4,4,1,80,1,-4,-4,-4,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
3,4,-5,-5,-5,-5,-5,-5,-5,-5,-5,...,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5
4,5,1,-4,4,19,79,1,-4,-4,-4,...,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4


In [21]:
pre1978_df = pre1978_df.merge(df80, on='caseid_1979')

In [52]:
start_years = []

def find_job_dates_v2(row):
    """Check job dates reported in 1980 survey"""
    
    job_dates = []
    for i in range(1, 6):
        start_y = row['employer_startdate_%02d_y_1980'%i]
        start_m = row['employer_startdate_%02d_m_1980'%i]
#         stop_y = row['employer_stopdate_%02d_y_1980'%i]
#         stop_m = row['employer_stopdate_%02d_m_1980'%i]

        if start_y > 0 and start_m > 0:
            start_years.append(start_y)
            job_dates.append((start_y, start_m))
        elif start_y == -4 and start_m == -4:
            pass
        else:
            print 'missing job start date:', row['caseid_1979'], 'seq start at:', (row['start_y'], row['start_m']), 'job dates:', (start_y, start_m)
            job_dates.append((start_y, start_m))
            
    print job_dates
    
res = pre1978_df.apply(find_job_dates_v2, axis=1)

[(79, 3)]
[(79, 3)]
[(79, 2)]
[(79, 4)]
[(79, 3)]
[(79, 2)]
[(79, 8)]
[(79, 6), (79, 4)]
[(79, 3)]
[(79, 2)]
[(79, 6)]
[]
[(79, 3)]
[(79, 12), (79, 8), (79, 4), (79, 3)]
[(79, 3), (79, 9)]
[]
[(79, 3)]
[(79, 2)]
[(79, 7), (79, 3)]
[(79, 2)]
[(79, 3)]
missing job start date: 2193 seq start at: (1977, 7) job dates: (-5, -5)
missing job start date: 2193 seq start at: (1977, 7) job dates: (-5, -5)
missing job start date: 2193 seq start at: (1977, 7) job dates: (-5, -5)
missing job start date: 2193 seq start at: (1977, 7) job dates: (-5, -5)
missing job start date: 2193 seq start at: (1977, 7) job dates: (-5, -5)
[(-5, -5), (-5, -5), (-5, -5), (-5, -5), (-5, -5)]
[(79, 10), (79, 6)]
[(79, 3)]
[(80, 1), (79, 4)]
[(79, 2)]
[(79, 9), (79, 2)]
[(79, 6), (79, 2)]
[]
[(80, 2), (79, 3)]
[(79, 7)]
[(79, 3)]
[(79, 10), (79, 5), (79, 3)]
[(79, 8), (79, 4)]
[(79, 2)]
[(79, 10), (79, 3)]
missing job start date: 4179 seq start at: (1975, 12) job dates: (-5, -5)
missing job start date: 4179 seq start at:

In [53]:
set(start_years)

{79, 80}

### Now, update monthly work hours

In [54]:
def is_in_job_period(y, m, job_date):
    """
    Tell whether the month (y, m) falls inside a job spell.

    job_date is a 5-tuple (job number, start_y, start_m, stop_y, stop_m); the job
    number is not used here. Both the start month and the stop month count as
    inside the spell.
    """

    _job_num, start_y, start_m, stop_y, stop_m = job_date
    current = datetime(y, m, 1)
    return datetime(start_y, start_m, 1) <= current <= datetime(stop_y, stop_m, 1)


# update monthly statuses

# status codes written to the month columns
STATUS_FULL_TIME = 1       # working, total weekly hours >= 35
STATUS_MISSING_HOURS = 2   # working, but hours are missing for an overlapping job
STATUS_PART_TIME = 3       # working, total weekly hours < 35
STATUS_NONEMPLOYED = 4     # no reported job covers the month


def _collect_job_dates(row):
    """
    Return the usable jobs a respondent reported in the 1979 survey as a list of
    (job number, start_y, start_m, stop_y, stop_m) tuples with four-digit years.
    """
    jobs = []
    for job_num in range(1, 6):
        # cast to plain ints so the values can be handed to datetime()
        start_y = int(row['employer_startdate_%02d_y_1979' % job_num])
        start_m = int(row['employer_startdate_%02d_m_1979' % job_num])
        stop_y = int(row['employer_stopdate_%02d_y_1979' % job_num])
        stop_m = int(row['employer_stopdate_%02d_m_1979' % job_num])

        # edit years - years are often reported in two digit format
        if 0 < start_y < 1900:
            start_y += 1900
        if 0 < stop_y < 1900:
            stop_y += 1900

        if start_m < 0 and start_m != -4:
            print('missing job start date: %s seq start at: %s job dates: %s' % (
                row['caseid_1979'],
                (row['start_y'], row['start_m']),
                (start_y, start_m, stop_y, stop_m)))

            # job start year was before sequence start year, so the missing does not matter
            # assign sequence start as job start - sufficient for our purpose of
            # reconstructing pre-1978 sequence
            start_y = int(row['start_y'])
            start_m = int(row['start_m'])

        if start_y > 0 and start_m > 0:
            if stop_y == -4 and stop_m == -4:  # job still current at 1979 interview
                jobs.append((job_num, start_y, start_m, 1979, 1))
            elif stop_y > 0 and stop_m > 0:
                jobs.append((job_num, start_y, start_m, stop_y, stop_m))
            else:
                # not expected to happen in this sample; the job is skipped
                print('missing job stop date')
    return jobs


for row_idx, row in pre1978_df.iterrows():

    job_dates = _collect_job_dates(row)

    # sequence start month
    y = int(row['start_y'])
    m = int(row['start_m'])

    # number of months between sequence start month and 1978/1/1
    n_pre1978 = diff_month(datetime(1978, 1, 1), datetime(y, m, 1))

    # the pre-1978 months are month 1 to month n_pre1978. iterate through them
    for month_num in range(1, n_pre1978 + 1):
        overlap_job_nums = [job_date[0] for job_date in job_dates
                            if is_in_job_period(y, m, job_date)]

        if not overlap_job_nums:  # this month does not overlap with any job
            status = STATUS_NONEMPLOYED
        else:
            hours = [row['hours_per_week_job_%02d_1979' % j] for j in overlap_job_nums]

            if any(h < 0 for h in hours):  # has missing hours
                print('missing hours: %s %s %s' % (row['caseid_1979'], y, m))
                status = STATUS_MISSING_HOURS
            elif sum(hours) >= 35:
                status = STATUS_FULL_TIME
            else:
                status = STATUS_PART_TIME

        # write to the DataFrame itself: the row yielded by iterrows() may be a copy,
        # in which case assigning to it would silently leave pre1978_df unchanged
        pre1978_df.at[row_idx, 'month%s' % month_num] = status

        # go to next month
        if m == 12:
            m = 1
            y += 1
        else:
            m += 1

missing job start date: 5425 seq start at: (1976, 1) job dates: (1974, -2, 1978, 2)
missing hours: 5615 1977 11
missing hours: 5615 1977 12
missing job start date: 6642 seq start at: (1974, 11) job dates: (1972, -1, -4, -4)
missing hours: 6642 1974 11
missing hours: 6642 1974 12
missing hours: 6642 1975 1
missing hours: 6642 1975 2
missing hours: 6642 1975 3
missing hours: 6642 1975 4
missing hours: 6642 1975 5
missing hours: 6642 1975 6
missing hours: 6642 1975 7
missing hours: 6642 1975 8
missing hours: 6642 1975 9
missing hours: 6642 1975 10
missing hours: 6642 1975 11
missing hours: 6642 1975 12
missing hours: 6642 1976 1
missing hours: 6642 1976 2
missing hours: 6642 1976 3
missing hours: 6642 1976 4
missing hours: 6642 1976 5
missing hours: 6642 1976 6
missing hours: 6642 1976 7
missing hours: 6642 1976 8
missing hours: 6642 1976 9
missing hours: 6642 1976 10
missing hours: 6642 1976 11
missing hours: 6642 1976 12
missing hours: 6642 1977 1
missing hours: 6642 1977 2
missing hour

In [55]:
# get number of missing months after we've updated pre-1978 months
pre1978_df['num_missing_after'] = pre1978_df.apply(get_num_missing, axis=1)
pre1978_df.num_missing_after.value_counts()

0    118
Name: num_missing_after, dtype: int64

**<font color='red'>No more missing, but we have some months (shown above) working with missing hours (status 2)</font>**

## Update within-job gaps

Here are the variables I'm looking at:

* Variable: **ANY PERIODS NOT EMPLOYED  PRIOR TO LAST INT? (DATE IS < LAST INT) JOB #01**

ORIGINAL QUESTION NAME: S10Q08B1

BEFORE JAN. 1, 1978, WERE THERE ANY PERIODS OF ONE MONTH OR MORE DURING WHICH 
YOU WERE NOT WORKING FOR (EMPLOYER), NOT COUNTING PAID VACATION OR PAID SICK 
LEAVE?

UNIVERSE: R has employer; job began before DLI



* Variable: **ANY PERIODS NOT EMPLOYED  SINCE LAST INT? JOB #01**

ORIGINAL QUESTION NAME: S10Q09A

FOR ONE REASON OR ANOTHER, PEOPLE OFTEN DO NOT WORK FOR A WEEK, A MONTH, OR EVEN
LONGER. FOR EXAMPLE, STRIKES, LAYOFFS, AND EXTENDED ILLNESSES CAN CAUSE PEOPLE 
TO MISS WORK FOR A WEEK OR LONGER.

BETWEEN (DATE STARTED/JAN. 1, 1978) AND (DATE JOB ENDED/NOW), WERE THERE ANY 
PERIODS OF A FULL WEEK OR MORE DURING WHICH YOU DID NOT WORK FOR THIS EMPLOYER, 
NOT COUNTING PAID VACATIONS AND PAID SICK LEAVE?

UNIVERSE: R has employer

In [46]:
# any period nonemployed prior to last interview?
for i in range(1, 6):
    print pre1978_df['nonemployed_prior_int_%02d_1979'%i].value_counts()
    print 

-4    80
 0    27
 1    11
Name: nonemployed_prior_int_01_1979, dtype: int64

-4    71
 0    35
 1    12
Name: nonemployed_prior_int_02_1979, dtype: int64

-4    97
 0    15
 1     6
Name: nonemployed_prior_int_03_1979, dtype: int64

-4    115
 0      3
Name: nonemployed_prior_int_04_1979, dtype: int64

-4    118
Name: nonemployed_prior_int_05_1979, dtype: int64



In [47]:
# any period nonemployed since last interview?
for i in range(1, 6):
    print pre1978_df['nonemployed_since_int_%02d_1979'%i].value_counts()
    print 

 0    65
-4    43
 1    10
Name: nonemployed_since_int_01_1979, dtype: int64

 0    58
-4    44
 1    16
Name: nonemployed_since_int_02_1979, dtype: int64

-4    86
 0    25
 1     5
-3     2
Name: nonemployed_since_int_03_1979, dtype: int64

-4    111
 0      5
 1      2
Name: nonemployed_since_int_04_1979, dtype: int64

-4    117
 0      1
Name: nonemployed_since_int_05_1979, dtype: int64



In [57]:
# how many respondents had nonemployed period?

def has_nonemployed_period(row):
    """
    Tell whether the respondent reported a nonemployed period within any job

    Only the "prior to last interview" indicators (nonemployed_prior_int_*) of the
    five job slots are checked; the "since last interview" indicators are
    deliberately left out, on the working assumption that only periods prior to
    the interview matter here.
    """

    flags = []
    for job_num in range(1, 6):
        flags.append(row['nonemployed_prior_int_%02d_1979'%job_num])

    # a value of 1 in any job slot means such a period was reported
    return flags.count(1) > 0

# flag each respondent, then tabulate how many reported a nonemployed period
res = pre1978_df.apply(has_nonemployed_period, axis=1)
res.value_counts()

False    89
True     29
dtype: int64