## cross-fitting 

In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import pandas as pd
import numpy as np
import sklearn
import xgboost as xgb

In [2]:
dir_path = '/export/projects2/jeryang_narrative_project/globe/log'
os.chdir(dir_path)

### first experiment

In [3]:
df1 = pd.read_csv('./df_exp1.csv')
df1 = df1.sort_values('subscriber_id')

df1_s = pd.read_csv('./df_surrogate_exp1.csv')
df1_s = df1_s.sort_values('subscriber_id')

In [4]:
# estimate an outcome model
Y = df1_s['rev3'].values # for 3-year revenue

#Y = df1_s['rev'].values # for 18-month revenue

#Y = df1_s['rev_6'].values # for surrogate index on 18-month revenue constructed with surrogates in the first 6 months
#Y = df1_s['rev_5'].values # for surrogate index on 18-month revenue constructed with surrogates in the first 5 months
#Y = df1_s['rev_4'].values # for surrogate index on 18-month revenue constructed with surrogates in the first 4 months
#Y = df1_s['rev_3'].values # for surrogate index on 18-month revenue constructed with surrogates in the first 3 months
#Y = df1_s['rev_2'].values # for surrogate index on 18-month revenue constructed with surrogates in the first 2 months
#Y = df1_s['rev_1'].values # for surrogate index on 18-month revenue constructed with surrogates in the first 1 month

#Y = df1_s['rev_m6'].values # for 6-month revenue
#Y = df1_s['rev_m5'].values # for 5-month revenue
#Y = df1_s['rev_m4'].values # for 4-month revenue
#Y = df1_s['rev_m3'].values # for 3-month revenue
#Y = df1_s['rev_m2'].values # for 2-month revenue
#Y = df1_s['rev_m1'].values # for 1-month revenue

ind = df1['subscriber_id'].values
p = df1['p_treated'].values
T = df1['treated'].values

df_est = df1.drop(columns=['subscriber_id', 'regi_user_key',
                 'treated', 'churn', 'time', 
                 'rev', 'rev1', 'rev2', 'rev3', 'subscription_status', 'last_stop_date'])

X1 = df_est[df_est.columns[df_est.dtypes != 'object']].values
X2 = pd.get_dummies(df_est,prefix=df_est.columns[df_est.dtypes == 'object']).values
X = np.column_stack((X1,X2))
X = np.nan_to_num(X)

In [5]:
df_est = np.column_stack((Y,T,X))

In [6]:
df_est

array([[9.05547424e+02, 0.00000000e+00, 3.90000000e+01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [8.86307129e+02, 0.00000000e+00, 4.60000000e+01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [9.12008484e+02, 0.00000000e+00, 1.56000000e+02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [8.74160767e+01, 0.00000000e+00, 4.05900000e+04, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [7.88006897e+02, 0.00000000e+00, 4.06320000e+04, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [8.21595581e+02, 0.00000000e+00, 4.06330000e+04, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [7]:
n = 3 # number of cross-fitting folds

df_n = np.array_split(df_est, n)

mu = {}

for i in range(n):
    
    df_train = np.vstack(np.delete(df_n, i)) # use -i folds to train the model 
    df_est = df_n[i] 
    
    df_train0 = df_train[df_train[:,1] == 0] # separate the training data by action (treated = 0 or 1)
    df_train1 = df_train[df_train[:,1] == 1]
    
    df_train_y0 = df_train0[:,0]
    df_train_y1 = df_train1[:,0]
    
    df_train_x0 = df_train0[:,2:]
    df_train_x1 = df_train1[:,2:]
    
    model0 = xgb.XGBRegressor(objective="reg:squarederror", learning_rate = 0.1,max_depth = 5, alpha = 10, n_estimators = 100)
    model1 = xgb.XGBRegressor(objective="reg:squarederror", learning_rate = 0.1,max_depth = 5, alpha = 10, n_estimators = 100)
    
    model0.fit(df_train_x0, df_train_y0)
    model1.fit(df_train_x1, df_train_y1)
    
    mu0 = model0.predict(df_est[:,2:]) # using the trained model to predict on the ith fold
    mu1 = model1.predict(df_est[:,2:])  
    
    mu[i] = np.stack((mu0, mu1))

In [8]:
mu = np.column_stack((mu[0],mu[1],mu[2]))
mu = np.transpose(mu)

In [9]:
mu # the columns are predicted y0 and y1

array([[775.7561 , 374.27005],
       [751.1249 , 745.0304 ],
       [689.084  , 702.1704 ],
       ...,
       [575.2128 , 700.2734 ],
       [564.3778 , 302.9008 ],
       [521.2075 , 649.86346]], dtype=float32)

### second experiment 

In [10]:
df2 = pd.read_csv('./df_exp2.csv')
df2_s = pd.read_csv('./df_surrogate_exp2.csv')

In [11]:
Y = df2_s['rev3'].values
ind = df2['subscriber_id'].values
p = df2['prob'].values
T = df2['condition'].values

In [12]:
df_est = df2.drop(columns=['subscriber_id', 
                 'treated', 'churn', 'time', 'condition', 'prob', 'hte', 'action', 'score_c', 'rev1', 
                 'rev','subscription_status','score_discrete'])

X1 = df2[df2.columns[df2.dtypes != 'object']].values
X2 = pd.get_dummies(df2,prefix=df2.columns[df2.dtypes == 'object']).values
X = np.column_stack((X1,X2))
X = np.nan_to_num(X)

In [13]:
df_est = np.column_stack((Y,T,X))

In [14]:
df_est

array([[802.850710362645, 'control', 1.0, ..., 0.0, 0.0, 0.0],
       [1302.83643412332, 'control', 2.0, ..., 0.0, 0.0, 0.0],
       [844.195749983284, 'control', 3.0, ..., 0.0, 0.0, 0.0],
       ...,
       [686.74324533559, '$5.99/8 weeks', 95552.0, ..., 0.0, 0.0, 0.0],
       [778.821398417977, 'control', 95553.0, ..., 0.0, 0.0, 0.0],
       [778.527627748459, 'control', 95554.0, ..., 0.0, 0.0, 0.0]],
      dtype=object)

In [15]:
n = 3 # number of cross-fitting folds

df_n = np.array_split(df_est, n)

mu = {}

for i in range(n):
    
    df_train = np.vstack(np.delete(df_n, i)) # use -i folds to train the model 
    df_est = df_n[i] 
    
    df_train0 = df_train[df_train[:,1] == 'control'] # separate the training data by action 
    df_train1 = df_train[df_train[:,1] == 'gift card']
    df_train2 = df_train[df_train[:,1] == 'thank you email only']
    df_train3 = df_train[df_train[:,1] == '$3.99/8 weeks']
    df_train4 = df_train[df_train[:,1] == '$4.99/8 weeks']
    df_train5 = df_train[df_train[:,1] == '$5.99/8 weeks']
    df_train6 = df_train[df_train[:,1] == '$5.99/4 weeks']
    
    df_train_y0 = df_train0[:,0]
    df_train_y1 = df_train1[:,0]
    df_train_y2 = df_train2[:,0]
    df_train_y3 = df_train3[:,0]
    df_train_y4 = df_train4[:,0]
    df_train_y5 = df_train5[:,0]
    df_train_y6 = df_train6[:,0]
    
    df_train_x0 = df_train0[:,2:]
    df_train_x1 = df_train1[:,2:]
    df_train_x2 = df_train2[:,2:]
    df_train_x3 = df_train3[:,2:]
    df_train_x4 = df_train4[:,2:]
    df_train_x5 = df_train5[:,2:]
    df_train_x6 = df_train6[:,2:]
    
    model0 = xgb.XGBRegressor(objective="reg:squarederror", learning_rate = 0.1,max_depth = 5, alpha = 10, n_estimators = 100)
    model1 = xgb.XGBRegressor(objective="reg:squarederror", learning_rate = 0.1,max_depth = 5, alpha = 10, n_estimators = 100)
    model2 = xgb.XGBRegressor(objective="reg:squarederror", learning_rate = 0.1,max_depth = 5, alpha = 10, n_estimators = 100)
    model3 = xgb.XGBRegressor(objective="reg:squarederror", learning_rate = 0.1,max_depth = 5, alpha = 10, n_estimators = 100)
    model4 = xgb.XGBRegressor(objective="reg:squarederror", learning_rate = 0.1,max_depth = 5, alpha = 10, n_estimators = 100)
    model5 = xgb.XGBRegressor(objective="reg:squarederror", learning_rate = 0.1,max_depth = 5, alpha = 10, n_estimators = 100)
    model6 = xgb.XGBRegressor(objective="reg:squarederror", learning_rate = 0.1,max_depth = 5, alpha = 10, n_estimators = 100)

    model0.fit(df_train_x0, df_train_y0)
    model1.fit(df_train_x1, df_train_y1)
    model2.fit(df_train_x2, df_train_y2)
    model3.fit(df_train_x3, df_train_y3)
    model4.fit(df_train_x4, df_train_y4)
    model5.fit(df_train_x5, df_train_y5)
    model6.fit(df_train_x6, df_train_y6)
    
    mu0 = model0.predict(df_est[:,2:]) # using the trained model to predict on the ith fold
    mu1 = model1.predict(df_est[:,2:])  
    mu2 = model2.predict(df_est[:,2:]) 
    mu3 = model3.predict(df_est[:,2:])  
    mu4 = model4.predict(df_est[:,2:])  
    mu5 = model5.predict(df_est[:,2:]) 
    mu6 = model6.predict(df_est[:,2:])  
    
    mu[i] = np.stack((mu0, mu1, mu2, mu3, mu4, mu5, mu6))

In [16]:
mu = np.column_stack((mu[0],mu[1],mu[2]))
mu = np.transpose(mu)

In [17]:
mu # the columns are predicted y0,y1,y2,y3,y4,y5,y6

array([[ 822.3403 ,  536.0684 ,  858.39404, ...,  600.83813,  762.11475,
         814.8152 ],
       [ 944.4714 ,  533.451  , 1107.6703 , ..., 1264.919  ,  912.13306,
         741.6337 ],
       [ 827.1564 ,  533.2358 ,  968.63293, ..., 1174.4739 ,  869.001  ,
         785.72064],
       ...,
       [ 872.9984 ,  444.99426, 1021.24713, ...,  534.91907,  654.7807 ,
         947.27124],
       [ 822.2045 ,  446.55606,  742.58563, ...,  609.23444,  799.2239 ,
         857.54456],
       [ 829.71155,  453.30045,  800.1953 , ...,  615.69403,  801.4896 ,
         886.19696]], dtype=float32)