In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Note: Values of -4, -3, -2, -1 indicate variable, blank, don't know or refused therefore convert to NaN
na_values = [-4, -3, -2, -1]

In [4]:
# Import summary csv to df (basic summary of activity times (each activity has a separate column))
dfsum = pd.read_csv("data/atussum_0315/atussum_0315.csv",
                    index_col=False,
                    na_values=na_values)

In [5]:
# Import respondent csv to df (more info on the respondent of the survey (i.e. TULINENO = 1))
dfresp = pd.read_csv("data/atusresp_0315/atusresp_0315.csv",
                     index_col=False,
                     na_values=na_values)

In [6]:
# Import activity csv to df
dfact = pd.read_csv("data/atusact_0315/atusact_0315.csv",
                    index_col=False,
                    na_values=na_values,
                    dtype={'TRCODEP': str})

In [7]:
# Import who file csv to df (who was involved in activity)
dfwho = pd.read_csv("data/atuswho_0315/atuswho_0315.csv",
                    index_col=False,
                    na_values=na_values)

In [8]:
# Import CPS file csv to df
dfcps = pd.read_csv("data/atuscps_0315/atuscps_0315.csv",
                    index_col=False,
                    na_values=na_values)

  interactivity=interactivity, compiler=compiler, result=result)


In [9]:
# Import activity code dictionary csv to df
dfactcodes = pd.read_csv("data/activity_codes.csv",
                         index_col=False,
                         sep=';',
                         dtype={'CODE': str, 'NAME': str})

In [10]:
# Import education level code dictionary csv to df
dfeducodes = pd.read_csv("data/edu_codes.csv",
                         index_col=False,
                         sep=';',
                         dtype={'NAME': str})

In [11]:
# Import who code dictionary csv to df
dfwhocodes = pd.read_csv("data/who_codes.csv",
                         index_col=False,
                         sep=';',
                         dtype={'NAME': str})

In [12]:
# Import occupation and industry code dictionary csv to df
dfindocc = pd.read_csv("data/indocc_codes.csv",
                       index_col=False,
                       sep=';',
                       dtype={'FLAG': str, 'NAME': str})

In [13]:
# Import state codes csv to df
dfstatecodes = pd.read_csv("data/state_codes.csv",
                           index_col=False,
                           sep=',',
                           dtype={'NAME': str})

In [14]:
# Add codepoint level (1, 2 or 3) and sort
dfactcodes['LEVEL'] = dfactcodes.CODE.str.len() / 2
dfactcodes = dfactcodes.sort_values('CODE').reset_index(drop=True)

In [15]:
# Useful columns for actvity df
dfact_use = dfact[['TUCASEID', 'TUACTIVITY_N',              # Case ID, Actvity number
                   'TUACTDUR24', 'TUACTDUR',                # Activity duration (24h cap), Activity duration
                   'TEWHERE', 'TRCODEP',                    # Activity where code, Activity code
                   'TUSTARTTIM', 'TUSTOPTIME']].copy()      # Start time, Stop time

In [16]:
# Useful columns for who file df
dfwho_use = dfwho[['TUCASEID', 'TUACTIVITY_N',              # Case ID, Actvity number
                   'TULINENO', 'TUWHO_CODE']].copy()        # Person line number, Who code

In [17]:
# Useful columns for respondent df
dfresp_use = dfresp[['TUCASEID', 'TRNUMHOU',                # Case ID, Number of people in household
                     'TRMJOCGR', 'TRDTOCC1', 'TRMJIND1',    # Major occupation, Detailed occupation, Major industry
                     'TUFWK', 'TUABSOT', 'TEIO1COW',        # Work code, Job code, Individual class of worker code
                     'TEERNPER',                            # Easiest way for you to report your earnings
                     'TUDIS',                               # Disability status
                     'TRERNHLY', 'TRERNWA']].copy()         # Hourly earnings, Weekly earnings

In [18]:
# Calculate actual weekly and hourly earnaings i.e. divide by 100 to get dollar amount
dfresp_use.TRERNHLY = dfresp_use.TRERNHLY / 100.
dfresp_use.TRERNWA = dfresp_use.TRERNWA / 100.

In [19]:
# Add weekly earning category
bins = [0, 200, 400, 600, 800, 1000, 1500, 2000, 3000]
labels = ["1", "2", "3", "4", "5", "6", "7", "8"]
dfresp_use['TRERNWA_CAT'] = pd.cut(dfresp_use['TRERNWA'], bins, labels=labels, right=True)

In [20]:
# Useful columns for activity summary df
dfsum_use = dfsum[['TUCASEID', 'TEAGE', 'TESEX',            # Case ID, Respondent age, Respondent sex
                   'TUYEAR', 'TRHOLIDAY', 'TUDIARYDAY',     # Year of study, Holiday boolean, Day of week
                   'GEMETSTA', 'GTMETSTA',                  # Metropolitan status (old), (new)
                   'TEHRUSLT', 'TELFS',                     # Hours worked per week, Labor force status
                   'TRDPFTPT',                              # FT or PT employment code
                   'TRSPPRES', 'TESPEMPNOT',                # Presence of S/P, employment status of S/P
                   'TESCHENR','TESCHLVL',                   # Enrolled in school, School level
                   'PEEDUCA', 'PTDTRACE',                   # Highest education level, Race code
                   'TRCHILDNUM',                            # Number of household children < 18
                   'TUFNWGTP']].copy()                      # Final ATUS weight

In [21]:
# Add age category
bins = [0, 17, 25, 30, 35, 40, 45, 50, 60, 70, 100]
labels = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
dfsum_use['TEAGE_CAT'] = pd.cut(dfsum_use['TEAGE'], bins, labels=labels, right=True)

In [22]:
# Useful columns for CPS df
dfcps_use = dfcps[dfcps.TULINENO == 1][['TUCASEID', 'GESTFIPS']].copy()     # Case ID, State code

In [23]:
# Activity totals columns for activity summary df
dfsum_acttotals = dfsum.filter(regex='TUCASEID|t')

In [24]:
# Add activity columns for supergroups (level 1 and 2)
df_actgrouped = pd.DataFrame()

for c in dfactcodes[dfactcodes.LEVEL == 2].CODE.values:
    df_actgrouped['t' + c] = dfsum_acttotals.filter(like='t' + c).sum(axis='columns')
for c in dfactcodes[dfactcodes.LEVEL == 1].CODE.values:
    df_actgrouped['t' + c] = dfsum_acttotals.filter(like='t' + c).sum(axis='columns')

In [25]:
# Merge Activity Summary and Respondent df with Activity totals merged at end
dfmerged = dfsum_use.merge(dfresp_use,
                           on='TUCASEID',
                           how='left',
                           copy=False) \
                    .merge(dfcps_use,
                           on='TUCASEID',
                           how='left',
                           copy=False) \
                    .merge(dfsum_acttotals,
                           on='TUCASEID',
                           how='left',
                           copy=False) \
                    .join(df_actgrouped,
                          how='left')

In [26]:
# Multiply activity times and other variables by weights (TUFNWGTP) to allow for sampling biases (append _W)
dfsum_acttotals_W = dfsum_acttotals.iloc[:,1:].multiply(dfsum_use.TUFNWGTP, axis='index')

dfsum_use_W = dfsum_use[['TEHRUSLT', 'TEAGE', 'TRCHILDNUM']].multiply(dfsum_use.TUFNWGTP, axis='index')

dfresp_use_W = dfresp_use[['TRNUMHOU', 'TRERNHLY', 'TRERNWA']].multiply(dfsum_use.TUFNWGTP, axis='index')

df_actgrouped_W = df_actgrouped.multiply(dfsum_use.TUFNWGTP, axis='index')

# Join all data frames together
dfmerged_W = dfmerged.join(dfsum_use_W,
                           how='left',
                           rsuffix='_W') \
                     .join(dfresp_use_W,
                           how='left',
                           rsuffix='_W') \
                     .join(dfsum_acttotals_W,
                           how='left',
                           rsuffix='_W') \
                     .join(df_actgrouped_W,
                           how='left',
                           rsuffix='_W')

In [27]:
# Export final dataframe as .csv
dfmerged_W.to_csv("data/alldata_0315.csv")

In [28]:
dfmerged_W.head(10)

Unnamed: 0,TUCASEID,TEAGE,TESEX,TUYEAR,TRHOLIDAY,TUDIARYDAY,GEMETSTA,GTMETSTA,TEHRUSLT,TELFS,...,t09_W,t10_W,t11_W,t12_W,t13_W,t14_W,t15_W,t16_W,t18_W,t50_W
0,20030100013280,60,1,2003,0,6,1.0,,30.0,2,...,0.0,0.0,40777310.0,2650525000.0,1631093000.0,0.0,0.0,0.0,0.0,0.0
1,20030100013344,41,2,2003,0,7,2.0,,30.0,1,...,0.0,0.0,156179000.0,1023840000.0,0.0,0.0,0.0,0.0,34706450.0,0.0
2,20030100013352,26,2,2003,0,7,1.0,,12.0,2,...,0.0,0.0,287289600.0,1302379000.0,0.0,0.0,0.0,229831600.0,38305270.0,0.0
3,20030100013848,36,2,2003,0,5,2.0,,,4,...,0.0,0.0,165550600.0,1754836000.0,0.0,0.0,0.0,0.0,0.0,0.0
4,20030100014165,51,1,2003,0,5,2.0,,80.0,1,...,0.0,0.0,282291600.0,184103200.0,184103200.0,0.0,0.0,0.0,239334200.0,0.0
5,20030100014169,32,2,2003,0,5,2.0,,40.0,2,...,0.0,0.0,224602600.0,1053905000.0,0.0,0.0,0.0,0.0,0.0,0.0
6,20030100014209,44,2,2003,0,2,1.0,,52.0,1,...,0.0,0.0,155593500.0,0.0,0.0,3275653.0,0.0,0.0,83529140.0,0.0
7,20030100014427,21,2,2003,0,3,1.0,,40.0,1,...,0.0,0.0,98616400.0,2070944000.0,0.0,0.0,0.0,197232800.0,394465600.0,0.0
8,20030100014550,33,2,2003,0,7,2.0,,40.0,1,...,0.0,0.0,106980700.0,183395600.0,0.0,0.0,0.0,0.0,45848890.0,0.0
9,20030100014758,39,2,2003,0,5,1.0,,40.0,1,...,0.0,0.0,64155790.0,2168466000.0,0.0,0.0,0.0,0.0,748484200.0,0.0
