# Current Population Survey

In [None]:
import pandas as pd
import numpy as np
import os
from scipy.special import expit

In [None]:
# Fixed seed, reused for row sampling and for the NumPy random generator.
seed = 0
rng = np.random.default_rng(seed=seed)

#### read data

Source data can be downloaded from ``https://cps.ipums.org/cps/``

In [None]:
# Load the raw CPS extract from the local source_data directory.
pth = "./source_data/cps_00001.csv"
df = pd.read_csv(filepath_or_buffer=pth)

In [None]:
# Order rows by CPSID, CPSIDP and MONTH, then keep only the first row
# for each CPSID value.
df = (
    df.sort_values(by=["CPSID", "CPSIDP", "MONTH"])
    .drop_duplicates(subset="CPSID", keep="first")
)

In [None]:
# Remove identifier / weight columns, then fix the feature column order.
drop_cols = [
    "YEAR",
    "MONTH",
    "SERIAL",
    "CPSID",
    "CPSIDV",
    "PERNUM",
    "ASECFLAG",
    "ASECWTH",
    "ASECWT",
    "CPSIDP",
]
df = df.drop(columns=drop_cols)

cols = [
    "AGE",
    "SEX",
    "EDUC",
    "MARST",
    "EMPSTAT",
    "UHRSWORKT",
    "INCWAGE",
    "NUMPREC",
    "HHINCOME",
    "WKSWORK1",
    "FIRMSIZE",
]
df = df[cols]

In [None]:
# Draw 6000 rows with the fixed seed, then renumber the index from 0.
df = df.sample(n=6000, random_state=seed)
df = df.reset_index(drop=True)

#### process features

In [None]:
# binarize sex (1 male, 0 female)
df["SEX"] = df["SEX"].eq(1).astype(int)

# binarize education (1 bachelor or higher, 0 otherwise)
df["EDUC"] = df["EDUC"].ge(101).astype(int)

# binarize marital status (1 married, 0 otherwise)
df["MARST"] = df["MARST"].isin([1, 2]).astype(int)

# binarize employment status (1 employed, 0 otherwise)
df["EMPSTAT"] = df["EMPSTAT"].isin([10, 12]).astype(int)

# usual hours worked: map the values 997 and 999 to 0
# NOTE(review): presumably these are special (non-numeric) codes rather than
# real hour counts -- confirm against the IPUMS codebook.
df["UHRSWORKT"] = df["UHRSWORKT"].replace({997: 0, 999: 0})

# wage income: log(1 + x) transform plus an indicator for zero wage income
# NOTE(review): no special/sentinel INCWAGE values are filtered before the
# transform -- confirm the extract does not contain any.
df["LOG_INCWAGE"] = np.log1p(df["INCWAGE"])
df = df.drop(columns="INCWAGE")
df["ZERO_INCWAGE"] = df["LOG_INCWAGE"].eq(0).astype(int)

# household income: floor negatives at 0, log(1 + x) transform, and an
# indicator for zero household income
hh_income = df["HHINCOME"].clip(lower=0)
df["LOG_HHINCOME"] = np.log1p(hh_income)
df = df.drop(columns="HHINCOME")
df["ZERO_HHINCOME"] = df["LOG_HHINCOME"].eq(0).astype(int)

# weeks worked: restrict to the range [0, 52] and store as integers
weeks_worked = df["WKSWORK1"].clip(lower=0, upper=52)
df["WKSWORK1"] = weeks_worked.astype(int)

In [None]:
# Persist the processed features; the row index is written as the first
# column.
df.to_csv("./cps_tmp.csv", index=True)

#### generate treatments and outcome

In [None]:
# Reload the processed features, using the first CSV column as the index.
df = pd.read_csv(filepath_or_buffer="./cps_tmp.csv", index_col=0)

In [None]:
# Employment indicator (as float): 1.0 when any of usual hours, weeks worked
# or log wage income is strictly positive, else 0.0.
has_hours = df["UHRSWORKT"].gt(0)
has_weeks = df["WKSWORK1"].gt(0)
has_wage = df["LOG_INCWAGE"].gt(0)
emp = (has_hours | has_weeks | has_wage).astype(float)

In [None]:
# Standardise the continuous columns (subtract the column mean, divide by the
# column standard deviation); binary columns are left untouched.
cont_cols = [
    "AGE",
    "UHRSWORKT",
    "LOG_INCWAGE",
    "NUMPREC",
    "LOG_HHINCOME",
    "WKSWORK1",
    "FIRMSIZE",
]
bin_cols = [
    "SEX",
    "EDUC",
    "MARST",
    "EMPSTAT",
    "ZERO_INCWAGE",
    "ZERO_HHINCOME",
]
cont_values = df[cont_cols]
df[cont_cols] = (cont_values - cont_values.mean()) / cont_values.std()

In [None]:
# Work on a separate (deep) copy so intermediate generation columns do not
# end up in the feature frame.
gen_df = df.copy(deep=True)

In [None]:
# s(x): linear score in age, log household income, education, an
# age-by-education interaction and the zero-household-income indicator.
age_s = gen_df["AGE"]
educ_s = gen_df["EDUC"]
gen_df["s"] = (
    -0.35 * age_s
    - 0.80 * gen_df["LOG_HHINCOME"]
    + 0.60 * educ_s
    - 0.25 * age_s * educ_s
    - 0.30 * gen_df["ZERO_HHINCOME"]
)

In [None]:
# v(x): score in household size (linear and quadratic), marital status
# and sex.
numprec_v = gen_df["NUMPREC"]
gen_df["v"] = (
    +0.7 * numprec_v
    + 0.3 * gen_df["MARST"]
    + 0.2 * gen_df["SEX"]
    - 0.15 * (numprec_v**2 - 1)
)

In [None]:
# e(x): propensity score and treatment assignment.
# The logit mixes v(x) and s(x); expit maps it into (0, 1), so the propensity
# e(x) = 0.1 + 0.7 * expit(.) lies strictly between 0.1 and 0.8.
logit_e = 0.6 * gen_df['v'] + 0.4 * gen_df['s']
gen_df['e'] = 0.1 + 0.7 * expit(logit_e)
# Draw T ~ Bernoulli(e) from the seeded generator `rng` rather than NumPy's
# global (unseeded) RNG, so the treatment assignment is reproducible.
gen_df['T'] = rng.binomial(1, gen_df['e'])

In [None]:
# mu_0(x): baseline (untreated) mean outcome.
# The tanh-squashed features are computed once and reused in the main effects
# and in the hours-by-weeks interaction.
tanh_hours = np.tanh(gen_df["UHRSWORKT"])
tanh_weeks = np.tanh(gen_df["WKSWORK1"])
tanh_firm = np.tanh(gen_df["FIRMSIZE"])
gen_df["M0"] = (
    -2.1
    + 1.6 * gen_df["AGE"]
    + 2.0 * emp
    + 1.1 * emp * gen_df["LOG_INCWAGE"]
    + 0.75 * emp * tanh_hours
    + 0.75 * emp * tanh_weeks
    + 0.5 * tanh_firm
    + 0.6 * emp * tanh_hours * tanh_weeks
    + 0.5 * gen_df["FIRMSIZE"] * gen_df["AGE"]
)

In [None]:
# tau(x): conditional treatment effect, a scaled sigmoid of s(x) plus a
# linear term in s(x).
score_s = gen_df["s"]
gen_df["cate"] = 4 * expit(1.5 * score_s - 0.25) + 0.35 * score_s

In [None]:
# mu_1(x): treated mean outcome = baseline mean + treatment effect.
gen_df["M1"] = gen_df["M0"].add(gen_df["cate"])

In [None]:
# y(x): potential outcomes with independent Gaussian noise, and the observed
# outcome selected by the treatment indicator.
sigma_y = 0.6
n_obs = len(gen_df)
gen_df["Y0"] = gen_df["M0"] + rng.normal(loc=0, scale=sigma_y, size=n_obs)
gen_df["Y1"] = gen_df["M1"] + rng.normal(loc=0, scale=sigma_y, size=n_obs)
is_treated = gen_df["T"] == 1
gen_df["Y"] = np.where(is_treated, gen_df["Y1"], gen_df["Y0"])

In [None]:
# Copy the generated columns onto the feature frame, in this order.
# Y0, Y1 and v stay only in gen_df.
for gen_col in ("T", "M0", "M1", "cate", "Y", "e", "s"):
    df[gen_col] = gen_df[gen_col]

In [None]:
# Write the final dataset; the row index is written as the first column.
df.to_csv("./cps.csv", index=True)