In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Read Data
atus_data = pd.read_csv('SUM0319.txt', sep=",", header=0)

# Obtain year of survey: the first four characters of TUCASEID, parsed as an integer
case_id_text = atus_data['TUCASEID'].astype(str)
atus_data['year'] = case_id_text.str[0:4].astype(int)

# Calculate birth year from Age of participant and year of survey
atus_data['birth_year'] = atus_data['year'] - atus_data['TEAGE']

# Find Generation of Participant
# Each entry is (label, first birth year, last birth year); both ends are inclusive.
generation_ranges = [
    ("Boomers", 1946, 1964),
    ("Generation X", 1965, 1980),
    ("Millennials", 1981, 1996),
    ("Generation Z", 1997, 2012),
]
birth_years = atus_data['birth_year']
gen_conditions = [birth_years.between(first, last) for _label, first, last in generation_ranges]
gen_choices = [label for label, _first, _last in generation_ranges]
# NOTE: the default is a catch-all. Every row whose birth year falls outside all of the
# ranges above (or is missing) is labelled 'Silent Generation'.
atus_data['generation'] = np.select(gen_conditions, gen_choices, default='Silent Generation')

# Find Sex of Participants
# TESEX code 1 is labelled "Male" and code 2 "Female".
sex_conditions = [
    (atus_data['TESEX'] == 1),
    (atus_data['TESEX'] == 2)
]
sex_choices = [
    "Male",
    "Female"
]
# np.select's implicit default is the integer 0, which cannot be combined with string
# choices: older NumPy silently turns it into the string '0', newer NumPy raises a
# TypeError. Pass an explicit string so unexpected codes get a readable label instead.
atus_data['Sex'] = np.select(sex_conditions, sex_choices, default="Unknown")
# Find relevant coding of activities using Lexicon and group relevant activities together
def _columns_matching(pattern):
    """Return the names of atus_data columns in which `pattern` is found, in frame order."""
    return [column for column in atus_data.columns if re.search(pattern, column)]


sleep_cols = _columns_matching('t0101.*')
house_work_cols = _columns_matching('t02.*')
# Child care combines three code prefixes; columns are grouped prefix by prefix.
child_care_cols = [
    column
    for pattern in ('t0301.*', 't0302.*', 't0303.*')
    for column in _columns_matching(pattern)
]
work_cols = _columns_matching('t05.*')
education_cols = _columns_matching('t06.*')
# Leisure combines the t12 and t13 code prefixes, t12 columns first.
leisure_cols = _columns_matching('t12.*') + _columns_matching('t13.*')

# Find time spent on each activity (in hours)
# Maps each new summary column to the group of activity columns it totals.
# Insertion order is the order in which the columns are appended to atus_data.
activity_groups = {
    'sleep': sleep_cols,
    'house_work': house_work_cols,
    'child_care': child_care_cols,
    'work': work_cols,
    'education': education_cols,
    'leisure': leisure_cols,
}
for activity_name, activity_cols in activity_groups.items():
    # Row-wise total of the group's columns, divided by 60 to express it in hours.
    row_totals = atus_data[activity_cols].sum(axis=1)
    atus_data[activity_name] = row_totals / 60

In [3]:
# Keep respondents with TELFS == 1 who are aged 30 to 40 inclusive.
# NOTE(review): TELFS == 1 presumably means "employed - at work"; confirm against the
# ATUS data dictionary.
worker = atus_data[(atus_data.TELFS == 1) & (atus_data.TEAGE >= 30) & (atus_data.TEAGE <= 40)]

# Export the first 24 columns of the frame plus the derived columns. The derived columns
# are selected by name rather than by position (previously 455:465), so the export no
# longer depends on the raw file having exactly 455 columns.
derived_cols = [
    'year', 'birth_year', 'generation', 'Sex',
    'sleep', 'house_work', 'child_care', 'work', 'education', 'leisure',
]
export_cols = list(worker.columns[:24]) + derived_cols
worker[export_cols].to_csv("processed_atus_data.csv", index=False)
