In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from xgboost import XGBClassifier
import sklearn
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression

### Installation 

In [2]:
# !pip install xgboost

In [3]:
# !brew install libomp

## Preprocessing

In [4]:
def read_data_file(data_num):
    data = pd.read_csv("hw4_data_files/data" + str(data_num) + ".csv").drop(columns=['Unnamed: 0'])
    return data

In [5]:
def reformat_df(data):
    obj_cols = data.columns[data.dtypes == 'object'].values
    for col in obj_cols:
        one_hot = pd.get_dummies(data[col], prefix=col)
        data = data.join(one_hot)
        data = data.drop(columns=[col])
        
    ss_cols = [col for col in data.columns if col != 'Y' and col != 'T']
    bool_cols = [col for col in data if data[col].dropna().value_counts().index.isin([0,1]).all()]
    reg_cols = [col for col in data.columns if col not in ss_cols] + bool_cols
    
    data_stand = data[ss_cols]
    scaler = preprocessing.StandardScaler()
    data_stand = pd.DataFrame(scaler.fit_transform(data_stand), columns = data_stand.columns)  
    
    for col in reg_cols:
        data_stand[col] = data[col]  
    return data_stand

In [6]:
def split_df(data):
    treated = data[data['T'] == 1]
    control = data[data['T'] == 0]
    X_treated = treated.drop(columns=['Y', 'T'])
    X_control = control.drop(columns=['Y', 'T'])
    Y_treated = treated['Y']
    Y_control = control['Y']
    return X_treated, Y_treated, X_control, Y_control

## IPW

* 'learning_rate': [0.005, 0.01, 0.02, 0.05],
* 'max_depth': [5, 6, 7, 8, 10, 12, 15],
* 'colsample_bytree': [0, 0.1, 0.2, 0.3],
* 'min_child_weight': [1, 2],
* 'n_estimators': [300, 500, 700, 1000],
* 'gamma': [0, 0.5]

In [7]:
def calc_ipw(X_treated, Y_treated, X_control, Y_control, X, T):
    xgb_model = XGBClassifier(objective='binary:logistic', eval_metric="logloss", use_label_encoder=False,
                             colsample_bytree=0.1, max_depth=3, eta=0.1)
    xgb_model.fit(X, T)
    
    probabilities = xgb_model.predict_proba(X_control)
    prob_control = probabilities[:, 0]
    
    probabilities = xgb_model.predict_proba(X)
    propensity_scores = probabilities[:, 1]
    
    w = prob_control / (1 - prob_control)
    att = np.mean(Y_treated) - np.sum(Y_control * w) / np.sum(w)  
    return propensity_scores, att

## S-learner

In [8]:
def S_learner(data, X_treated, Y_treated, X_control, Y_control):
    new_X = data.drop(columns=['Y'])
    new_Y = data['Y']

    # model = XGBRegressor(max_depth=3, colsample_bytree=0.1, eta=0.1)
    model = LinearRegression()
    
    model.fit(new_X, new_Y)
    
    treatment_df = X_treated.copy()
    treatment_df['T'] = 1
        
    control_df = X_treated.copy()
    control_df['T'] = 0
    
    pred_treatment = model.predict(treatment_df)
    pred_control = model.predict(control_df)
    
    att = np.mean(pred_treatment) - np.mean(pred_control)
    return att

## T-learner

In [9]:
def T_learner(X_treated, Y_treated, X_control, Y_control):
    # model_treated = XGBRegressor()
    # model_control = XGBRegressor()
    
    model_treated = LinearRegression()
    model_control = LinearRegression()
    
    model_treated.fit(X_treated, Y_treated)
    model_control.fit(X_control, Y_control)
    
    pred_treatment = model_treated.predict(X_treated)
    pred_control = model_control.predict(X_treated)
    
    att = np.mean(pred_treatment) - np.mean(pred_control)
    return att

## Matching

In [10]:
def Matching(X_treated, Y_treated, X_control, Y_control):
    knn_model = KNeighborsRegressor(n_neighbors=1)
    knn_model.fit(X_control, Y_control)
    Y_pred = knn_model.predict(X_treated)
    att = np.mean(Y_treated - Y_pred)
    return att

# ATT

In [11]:
def calc_att_df(data_num):
    data1 = read_data_file(data_num)
    df = reformat_df(data1)

    X_treated, Y_treated, X_control, Y_control = split_df(df)
    X = df.drop(columns=['Y', 'T'])
    Y = df['Y']
    T = df['T']
    
    att_results = []
    
    print("Start calculating Att for dataset no." + str(data_num))
    
    propensity_scores_ipw, ipw_att = calc_ipw(X_treated, Y_treated, X_control, Y_control, X, T)
    print("IPW ATT:" ,ipw_att)
    att_results.append(ipw_att)

    s_learner_att = S_learner(df, X_treated, Y_treated, X_control, Y_control)
    print("S-Learner ATT:", s_learner_att)
    att_results.append(s_learner_att)

    t_learner_att = T_learner(X_treated, Y_treated, X_control, Y_control)
    print("T-Learner ATT:", t_learner_att)
    att_results.append(t_learner_att)
    
    matching_att = Matching(X_treated, Y_treated, X_control, Y_control)
    print("Matching ATT:", matching_att)
    att_results.append(matching_att)
        
    return att_results, propensity_scores_ipw

In [12]:
def get_att_results():
    att_results_1, propensity_scores_1 = calc_att_df(1)
    best_att_1 = 0.2 * att_results_1[1] + 0.5 * att_results_1[3] + 0.3 * att_results_1[2]
    att_results_1.append(best_att_1)
    print("")
    att_results_2, propensity_scores_2 = calc_att_df(2)
    best_att_2 = 0.2 * att_results_2[1] + 0.5 * att_results_2[3] + 0.3 * att_results_2[2]
    att_results_2.append(best_att_2)
    
    dict_df = {'Type': range(1,6), 'data1': att_results_1, 'data2':att_results_2}
    att_res = pd.DataFrame(data=dict_df)
    att_res.to_csv("final/ATT_results.csv", index=False)
    
    prop = [propensity_scores_1, propensity_scores_2]
    prop_df = pd.DataFrame(data=prop, index=['data1', 'data2'])
    prop_df.to_csv("final/models_propensity.csv", header=False)
    
    return att_res, prop_df

In [13]:
att_res, prop_df = get_att_results()

Start calculating Att for dataset no.1
IPW ATT: 6.343669477664777
S-Learner ATT: 4.631261263427119
T-Learner ATT: 4.334095686371272
Matching ATT: 4.009718240917792

Start calculating Att for dataset no.2
IPW ATT: 4.200080944487386
S-Learner ATT: 3.1236820096113656
T-Learner ATT: 3.4733605977485937
Matching ATT: 3.87412620809955


In [14]:
att_res

Unnamed: 0,Type,data1,data2
0,1,6.343669,4.200081
1,2,4.631261,3.123682
2,3,4.334096,3.473361
3,4,4.009718,3.874126
4,5,4.23134,3.603808


In [15]:
prop_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4792,4793,4794,4795,4796,4797,4798,4799,4800,4801
data1,0.500263,0.558599,0.669056,0.612377,0.660447,0.623255,0.745683,0.708029,0.661287,0.705857,...,0.547365,0.664137,0.675705,0.729696,0.703627,0.661912,0.506077,0.66224,0.658703,0.589029
data2,0.896465,0.508538,0.75625,0.296213,0.423098,0.430701,0.262317,0.697293,0.557922,0.690645,...,0.440974,0.300839,0.238827,0.220151,0.506504,0.469937,0.52975,0.560099,0.404936,0.588888
