In [1]:
import pandas as pd
import numpy as np
from scipy.special import expit
import random 

In [2]:
# Maps each covariate name to the kind tag used to treat it downstream:
# 'bin' (binary), 'cat' (categorical), 'ord' (ordinal), 'cyc' (cyclical).
# 'infant_id' carries a free-text marker rather than a kind tag.
covar_type = {
    'adequacy': 'cat',
    'alcohol': 'bin',
    'anemia': 'bin',
    'birattnd': 'cat',
    'birmon': 'cyc',
    'bord': 'bin',
    'brstate': 'cat',
    'brstate_reg': 'cat',
    'cardiac': 'bin',
    'chyper': 'bin',
    'cigar6': 'cat',
    'crace': 'cat',
    'csex': 'bin',
    'data_year': 'cat',
    'dfageq': 'cat',
    'diabetes': 'bin',
    'dlivord_min': 'ord',
    'dmar': 'bin',
    'drink5': 'cat',
    'dtotord_min': 'ord',
    'eclamp': 'bin',
    'feduc6': 'cat',
    'frace': 'cat',
    'gestat10': 'cat',
    'hemo': 'bin',
    'herpes': 'bin',
    'hydra': 'bin',
    'incervix': 'bin',
    'infant_id': 'index do not use',
    'lung': 'bin',
    'mager8': 'cat',
    'meduc6': 'cat',
    'mplbir': 'cat',
    'mpre5': 'cat',
    'mrace': 'cat',
    'nprevistq': 'cat',
    'orfath': 'cat',
    'ormoth': 'cat',
    'othermr': 'bin',
    'phyper': 'bin',
    'pldel': 'cat',
    'pre4000': 'bin',
    'preterm': 'bin',
    'renal': 'bin',
    'rh': 'bin',
    'stoccfipb': 'cat',
    'stoccfipb_reg': 'cat',
    'tobacco': 'bin',
    'uterine': 'bin',
}

In [3]:
# Load the three twin-pair tables: T (holds the dbirwt_* columns used for the
# filter below), X (covariates) and Y (outcomes).
df_t = pd.read_csv('twin_pairs_T_3years_samesex.csv')
df_x = pd.read_csv('twin_pairs_X_3years_samesex.csv')
df_y = pd.read_csv('twin_pairs_Y_3years_samesex.csv')

# Keep only the pairs where both dbirwt_0 and dbirwt_1 are below 2000.
# The same positional mask is applied to all three frames, so they are
# assumed to be row-aligned -- TODO confirm against the data source.
mask = np.logical_and(df_t['dbirwt_0'].values < 2000, df_t['dbirwt_1'].values < 2000)

# .copy() detaches each filtered frame from its parent so the column drops and
# assignments below operate on an independent object instead of triggering
# pandas' SettingWithCopyWarning.
df_t = df_t[mask].copy()
df_x = df_x[mask].copy()
df_y = df_y[mask].copy()

# Drop the unnamed leftover index columns from every table.
df_t = df_t.drop(columns=['Unnamed: 0'])
df_y = df_y.drop(columns=['Unnamed: 0'])

# From the covariates, additionally drop identifiers and the columns that are
# not used as features.
df_x = df_x.drop(columns=[
    'Unnamed: 0',
    'Unnamed: 0.1',
    'infant_id_0',
    'infant_id_1',
    'data_year',
    'mplbir_reg',
    'stoccfipb',
    'stoccfipb_reg',
    'bord_0',
    'bord_1',
])

# Impute missing values with the column mean rounded to the nearest integer
# (presumably because the covariates are integer-coded -- verify).
for col in df_x.columns:
    df_x[col] = df_x[col].fillna(round(df_x[col].mean()))

# Split the remaining covariates into categorical vs. everything else,
# according to covar_type (raises KeyError for a column it does not list).
noncat_cols = [col for col in df_x.columns if covar_type[col] != 'cat']
cat_cols = [col for col in df_x.columns if covar_type[col] == 'cat']

df_x.columns

Index(['pldel', 'birattnd', 'brstate', 'mager8', 'ormoth', 'mrace', 'meduc6',
       'dmar', 'mplbir', 'mpre5', 'adequacy', 'orfath', 'frace', 'birmon',
       'gestat10', 'csex', 'anemia', 'cardiac', 'lung', 'diabetes', 'herpes',
       'hydra', 'hemo', 'chyper', 'phyper', 'eclamp', 'incervix', 'pre4000',
       'preterm', 'renal', 'rh', 'uterine', 'othermr', 'tobacco', 'alcohol',
       'cigar6', 'drink5', 'crace', 'nprevistq', 'dfageq', 'feduc6',
       'dlivord_min', 'dtotord_min', 'brstate_reg'],
      dtype='object')

In [5]:
# Simulate a treatment assignment whose probability depends on the covariates,
# then derive factual / counterfactual outcomes from the two outcome columns.
np.random.seed(1)

# 'gestat10' is separated from the other covariates and given its own weight.
x_cols = [col for col in df_x.columns if col != 'gestat10']
z_cols = 'gestat10'

x = df_x[x_cols].values
z = df_x[z_cols].values

# NOTE: the order of the random draws (w_o, w_h, then t) determines the
# simulated assignment; do not reorder them.
w_o = np.random.normal(0, 0.1, size=len(x_cols))
w_h = np.random.normal(5, 0.1)

# Logistic model for the treatment propensity.
p = expit(np.dot(w_o, x.T) + np.dot(w_h, z/10.0 - 0.1))

t = np.random.binomial(1, p)
print(t.mean())

# For every row pick the outcome column selected by t (factual) and the
# other one (counterfactual). This expects df_y to hold exactly two outcome
# columns at this point.
outcomes = df_y.values
row_idx = np.arange(df_y.shape[0])
y_factual = outcomes[row_idx, t]
y_cfactual = outcomes[row_idx, 1 - t]

# Build the final outcome frame without mutating the incoming df_y in place:
# set_axis raises if df_y does not have exactly two columns, and assign
# returns a new frame, so no chained-assignment warning is triggered.
df_y = (
    df_y.set_axis(['mu0', 'mu1'], axis=1)
    .assign(treatment=t, y_factual=y_factual, y_cfactual=y_cfactual)
    [['treatment', 'y_factual', 'y_cfactual', 'mu0', 'mu1']]
)

0.6284212283044058


In [6]:
# One-hot encoding of the categorical covariates is currently disabled;
# the snippet is kept below for reference.
# df_x_cat = []
# for col in cat_cols:
#     df_x_cat.append(pd.get_dummies(df_x[col],prefix=col))

# df_x_cat = pd.concat(df_x_cat, axis=1)
# df_x_noncat = df_x[noncat_cols]
# df_x = pd.concat([df_x_cat, df_x_noncat], axis=1)

print(df_x.shape)

# Names of the binary covariates that are present in x_cols, followed by
# their positions within x_cols in ascending order.
binary_cols = [name for name, kind in covar_type.items() if kind == 'bin' and name in x_cols]
print(sorted(x_cols.index(name) for name in binary_cols))

(11984, 44)
[7, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33]


In [7]:
# Join outcomes and covariates on their shared row index, renumber the rows
# from 0 and persist the combined table (with header, without index).
df_final = df_y.merge(df_x, left_index=True, right_index=True).reset_index(drop=True)
df_final.to_csv('twins.csv', index=False)

In [8]:
def partition(list_in, n, seed=None):
    """Randomly split the items of ``list_in`` into ``n`` chunks.

    The items are shuffled and then dealt out with a stride of ``n``, so the
    chunk sizes differ by at most one. The caller's list is left untouched;
    the shuffle is applied to a copy.

    Args:
        list_in: Sequence of items to split.
        n: Number of chunks to produce; must be at least 1.
        seed: Optional seed for a private random generator, giving a
            reproducible split. When ``None`` the module-level ``random``
            state is used, so a prior ``random.seed(...)`` still applies.

    Returns:
        A list of ``n`` lists that together contain every item exactly once.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError('n must be a positive integer, got {!r}'.format(n))
    shuffled = list(list_in)  # copy so the caller's list is not reordered
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled)
    return [shuffled[i::n] for i in range(n)]

num_replications = 10

# np.random.seed does not affect the stdlib `random` module that partition()
# shuffles with, so seed it here to make the split reproducible across runs.
random.seed(1)
partitions = partition(list(df_final.index), num_replications)

In [9]:
# One sub-frame per replication, each built from its own partition of rows.
# (Previously every replication reused partitions[0], so all splits were
# identical.) The partitions hold df_final's index values and are used as
# positions here; the two agree only while df_final has a default 0..n-1 index.
df_split = [df_final.iloc[partitions[i]] for i in range(num_replications)]

In [11]:
# Write each replication to its own CSV (files are numbered from 1), without
# index and without a header row.
for rep in range(1, num_replications + 1):
    split_frame = df_split[rep - 1]
    split_frame.to_csv('twins_{}.csv'.format(rep), index=False, header=False)