In [1]:
!pip install xgboost



In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import random
import pickle

random.seed(42)
np.random.seed(42)

# S-Learner

In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPRegressor  # Import MLPRegressor

class S_learner:
    def __init__(self, x_train, y_train, x_test, y_test, T_train, T_test, train_on_full_data=False):
        # Validate data before initializing
        inputs = [x_train, y_train, x_test, y_test, T_train, T_test]
        for dataset in inputs:
            self.check_nulls_in_dataframe(dataset)
        
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        self.T_train = T_train
        self.T_test = T_test
        self.train_on_full_data = train_on_full_data
        self.models = {}
        self.measures = {}

    def fit(self):
        x_train_full = self.x_train.copy()
        x_train_full.loc[:, "T"] = self.T_train
        y_train = self.y_train.copy()
        
        if self.train_on_full_data:
            x_test_full = self.x_test.copy()
            x_test_full.loc[:, "T"] = self.T_test
            x_train_full = pd.concat([x_train_full, x_test_full], axis=0)
            y_test = self.y_test.copy()
            y_train = pd.concat([y_train, y_test], axis=0)
                
        self.models['linear_regression'] = LinearRegression()
        self.models['linear_regression'].fit(x_train_full, y_train)
        print("Fitted Linear Regression")

        self.models['svr_rbf'] = SVR(kernel='rbf')
        self.models['svr_rbf'].fit(x_train_full, y_train)
        print("Fitted SVR with RBF kernel")

        self.models['svr_poly'] = SVR(kernel='poly', degree=2)
        self.models['svr_poly'].fit(x_train_full, y_train)
        print("Fitted SVR with Polynomial kernel")

        self.models['gradient_boosting'] = GradientBoostingRegressor()
        self.models['gradient_boosting'].fit(x_train_full, y_train)
        print("Fitted Gradient Boosting Regressor")

        self.models['random_forest'] = RandomForestRegressor()
        self.models['random_forest'].fit(x_train_full, y_train)
        print("Fitted RandomForest Regressor")
        
        self.models['mlp'] = MLPRegressor(hidden_layer_sizes=(192,), max_iter=1500, activation='relu', 
                                          solver='adam', learning_rate_init=0.00005, random_state=42)
        self.models['mlp'].fit(x_train_full, y_train)
        print("Fitted MLP Regressor")
        

    def evaluate(self):
        x_train_full = self.x_train.copy()
        x_test_full = self.x_test.copy()
        
        x_train_full.loc[:, "T"] = self.T_train
        x_test_full.loc[:, "T"] = self.T_test
        
        results = {}
        for name, model in self.models.items():
            train_pred = model.predict(x_train_full)
            test_pred = model.predict(x_test_full)
            train_mse = mean_squared_error(self.y_train, train_pred)
            test_mse = mean_squared_error(self.y_test, test_pred)
            results[name] = {'Train MSE': train_mse, 'Test MSE': test_mse}
        
        for name, scores in results.items():
            print(f"{name}: Train MSE = {scores['Train MSE']:.4f}, Test MSE = {scores['Test MSE']:.4f}")
    
    def compute_ATE(self, with_prints=True):
        x_combined = pd.concat([self.x_train, self.x_test], axis=0)
        self.compute_effect(x=x_combined, measure_name='ATE', with_prints=with_prints)
    
    def compute_ATE_final(self, list_of_models):
        final_ATE = 0
        for model in list_of_models:
            final_ATE += self.measures['ATE'][model]
        final_ATE = final_ATE / len(list_of_models)
        print("\nFinal ATE:", final_ATE)
        return final_ATE
        
    def compute_effect(self, x, measure_name, with_prints=True):
        self.measures[measure_name] = {}
        
        x_with_T_zero = x.copy()
        x_with_T_zero['T'] = 0

        x_with_T_one = x.copy()
        x_with_T_one['T'] = 1
        
        for key in self.models:
            predictions_one = self.models[key].predict(x_with_T_one)
            predictions_zero = self.models[key].predict(x_with_T_zero)
            diffrences = predictions_one - predictions_zero
            self.measures[measure_name][key] = np.mean(diffrences)
        
        if with_prints:
            print()
            print(f"The {measure_name} are ", self.measures[measure_name])
            print()
               
    
    def check_nulls_in_dataframe(self, df):
        # Check if there are any null values in the DataFrame
        if df.isnull().any().any():
            print("The DataFrame contains null values.")
            # Count of nulls in each column
            null_counts = df.isnull().sum()
            print("Count of null values in each column:")
            print(null_counts[null_counts > 0])
        else:
            print("The DataFrame does not contain any null values.")


# T-Learner

In [4]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPRegressor  # Import MLPRegressor

class T_learner:
    def __init__(self, x_train, y_train, x_test, y_test, T_train, T_test, train_on_full_data=False):
        # Validate data before initializing
        inputs = [x_train, y_train, x_test, y_test, T_train, T_test]
        for dataset in inputs:
            self.check_nulls_in_dataframe(dataset)
        
        treated_indices_train = T_train == 1
        treated_indices_test = T_test == 1
        
        # x
        self.x = {'train': {}, 'test': {}}
        self.x['train'][0] = x_train[~treated_indices_train]
        self.x['train'][1] = x_train[treated_indices_train]
        self.x['test'][0] = x_test[~treated_indices_test]
        self.x['test'][1] = x_test[treated_indices_test]
        
        # y
        self.y = {'train': {}, 'test': {}}
        self.y['train'][0] = y_train[~treated_indices_train]
        self.y['train'][1] = y_train[treated_indices_train]
        self.y['test'][0] = y_test[~treated_indices_test]
        self.y['test'][1] = y_test[treated_indices_test]
        
        # else
        self.train_on_full_data = train_on_full_data
        self.models = {0:{}, 1:{}}
        self.measures = {}
        

    def fit(self):
        for T in [0,1]:
            print(f"\nFitting models with T = {T}")
            
            x_train_full = self.x['train'][T].copy()
            y_train = self.y['train'][T].copy()

            if self.train_on_full_data:
                x_test_full = self.x['test'][T].copy()
                x_train_full = pd.concat([x_train_full, x_test_full], axis=0)
                y_test = self.y['test'][T].copy()
                y_train = pd.concat([y_train, y_test], axis=0)

            self.models[T]['linear_regression'] = LinearRegression()
            self.models[T]['linear_regression'].fit(x_train_full, y_train)
            print("Fitted Linear Regression")

            self.models[T]['svr_rbf'] = SVR(kernel='rbf')
            self.models[T]['svr_rbf'].fit(x_train_full, y_train)
            print("Fitted SVR with RBF kernel")

            self.models[T]['svr_poly'] = SVR(kernel='poly', degree=2)
            self.models[T]['svr_poly'].fit(x_train_full, y_train)
            print("Fitted SVR with Polynomial kernel")

            self.models[T]['gradient_boosting'] = GradientBoostingRegressor()
            self.models[T]['gradient_boosting'].fit(x_train_full, y_train)
            print("Fitted Gradient Boosting Regressor")

            self.models[T]['random_forest'] = RandomForestRegressor()
            self.models[T]['random_forest'].fit(x_train_full, y_train)
            print("Fitted RandomForest Regressor")

            self.models[T]['mlp'] = MLPRegressor(hidden_layer_sizes=(192,), max_iter=1500, activation='relu', 
                                          solver='adam', learning_rate_init=0.00005, random_state=42)
            self.models[T]['mlp'].fit(x_train_full, y_train)
            print("Fitted MLP Regressor")
        

    def evaluate(self):
        results = {0 : {}, 1 : {}}
        for T in [0,1]:
            x_train_full = self.x['train'][T].copy()
            x_test_full = self.x['test'][T].copy()

            for name, model in self.models[T].items():
                train_pred = model.predict(x_train_full)
                test_pred = model.predict(x_test_full)
                train_mse = mean_squared_error(self.y['train'][T], train_pred)
                test_mse = mean_squared_error(self.y['test'][T], test_pred)
                results[T][name] = {'Train MSE': train_mse, 'Test MSE': test_mse}
        
        for T in [0,1]: 
            for name, scores in results[T].items():
                print(f"T: {T} | {name}: Train MSE = {scores['Train MSE']:.4f}, Test MSE = {scores['Test MSE']:.4f}")
    
    def compute_ATE(self, with_prints=True):
        x_combined = pd.concat([self.x['train'][0], self.x['train'][1], self.x['test'][0], self.x['test'][1]], axis=0)
        self.compute_effect(x=x_combined, measure_name='ATE', with_prints=with_prints)
    
    def compute_ATE_final(self, list_of_models):
        final_ATE = 0
        for model in list_of_models:
            final_ATE += self.measures['ATE'][model]
        final_ATE = final_ATE / len(list_of_models)
        print("\nFinal ATE:", final_ATE)
        return final_ATE
        
    def compute_effect(self, x, measure_name, with_prints=True):
        self.measures[measure_name] = {}
        
        for key in self.models[0]:
            predictions_one = self.models[1][key].predict(x)
            predictions_zero = self.models[0][key].predict(x)
            diffrences = predictions_one - predictions_zero
            #print(f"{measure_name}, diffrences for {key}:", diffrences)
            self.measures[measure_name][key] = np.mean(diffrences)
        
        if with_prints:
            print()
            print(f"The {measure_name} are ", self.measures[measure_name])
            print()
               
    
    def check_nulls_in_dataframe(self, df):
        # Check if there are any null values in the DataFrame
        if df.isnull().any().any():
            print("The DataFrame contains null values.")
            # Count of nulls in each column
            null_counts = df.isnull().sum()
            print("Count of null values in each column:")
            print(null_counts[null_counts > 0])
        else:
            print("The DataFrame does not contain any null values.")


# Matching

In [5]:
from sklearn.neighbors import NearestNeighbors

class Matching:
    def __init__(self, x, y, T):
        self.x = x.copy()
        self.y = y.copy()
        self.T = T.copy()
        # Splitting the data based on T
        self.x1 = x[T == 1].reset_index(drop=True)
        self.y1 = y[T == 1].reset_index(drop=True)
        self.x0 = x[T == 0].reset_index(drop=True)
        self.y0 = y[T == 0].reset_index(drop=True)
        
        self.ATE = {}
        self.ATT = {}

    def compute_ATE(self, k):
        # Ensure k is a positive integer
        if k <= 0 or not isinstance(k, int):
            raise ValueError("k must be a positive integer")
        
        # Initialize the nearest neighbors models
        nn_0 = NearestNeighbors(n_neighbors=k)
        nn_1 = NearestNeighbors(n_neighbors=k)
        
        nn_0.fit(self.x0.values)
        nn_1.fit(self.x1.values)
        
        # Calculate CAT for T=1
        CAT_1 = []
        for i, xi in self.x1.iterrows():
            _, indices = nn_0.kneighbors([xi.values])
            y_neighbors = self.y0.iloc[indices[0]]
            CAT_1.append(self.y1.loc[i] - y_neighbors.mean())

        # Calculate CAT for T=0
        CAT_0 = []
        for i, xi in self.x0.iterrows():
            _, indices = nn_1.kneighbors([xi.values])
            y_neighbors = self.y1.iloc[indices[0]]
            CAT_0.append(y_neighbors.mean() - self.y0.loc[i])

        # Calculate and return ATE
        all_CAT = CAT_1 + CAT_0
        self.ATE[k] = np.mean(all_CAT)
        print(f"ATE for k = {k} : {self.ATE[k]}")

# Young, no children

In [6]:
with open('./preprocessed_data/df_young_no_children_dict.pickle', 'rb') as f:
    df_young_no_children_dict = pickle.load(f)

for key in df_young_no_children_dict:
    print(key, df_young_no_children_dict[key].shape)

threshold = 2
df_young_no_children_dict['T_train'] = df_young_no_children_dict['T_train'].apply(lambda x: 0 if x <= threshold else 1)
df_young_no_children_dict['T_test'] = df_young_no_children_dict['T_test'].apply(lambda x: 0 if x <= threshold else 1)

X_train_normalized (1591, 97)
X_test_normalized (281, 97)
Y_train (1591,)
Y_test (281,)
T_train (1591,)
T_test (281,)


In [7]:
df_young_no_children_dict['Y_train']

0       2
1       2
2       4
3       2
4       2
       ..
1586    2
1587    1
1588    3
1589    2
1590    3
Name: # BIO CHILDREN REPORTED, Length: 1591, dtype: int64

### s-learner

In [64]:
s_learner_young_no_children = S_learner(x_train=df_young_no_children_dict['X_train_normalized'], 
                      y_train=df_young_no_children_dict['Y_train'], 
                      x_test=df_young_no_children_dict['X_test_normalized'],
                      y_test=df_young_no_children_dict['Y_test'],
                      T_train=df_young_no_children_dict['T_train'],
                      T_test=df_young_no_children_dict['T_test'])

s_learner_young_no_children.fit()
s_learner_young_no_children.evaluate()
s_learner_young_no_children.compute_ATE()
_ = s_learner_young_no_children.compute_ATE_final(['svr_rbf', 'gradient_boosting', 'random_forest', 'mlp'])

The DataFrame does not contain any null values.
The DataFrame does not contain any null values.
The DataFrame does not contain any null values.
The DataFrame does not contain any null values.
The DataFrame does not contain any null values.
The DataFrame does not contain any null values.
Fitted Linear Regression
Fitted SVR with RBF kernel
Fitted SVR with Polynomial kernel
Fitted Gradient Boosting Regressor
Fitted RandomForest Regressor




Fitted MLP Regressor
linear_regression: Train MSE = 1.7393, Test MSE = 3121135964352433241456640.0000
svr_rbf: Train MSE = 1.4127, Test MSE = 2.1832
svr_poly: Train MSE = 1.4298, Test MSE = 2.2454
gradient_boosting: Train MSE = 1.3942, Test MSE = 2.1804
random_forest: Train MSE = 0.2786, Test MSE = 2.1812
mlp: Train MSE = 0.4767, Test MSE = 2.5022

The ATE are  {'linear_regression': 0.0855716803135016, 'svr_rbf': 0.08173014394129184, 'svr_poly': 0.00583226934077547, 'gradient_boosting': 0.03563232264890556, 'random_forest': 0.05887286324786324, 'mlp': 0.0996685064526924}


Final ATE: 0.06897595907268826


### t-learner

In [65]:
t_learner_df_young_no_children = T_learner(x_train=df_young_no_children_dict['X_train_normalized'], 
                      y_train=df_young_no_children_dict['Y_train'], 
                      x_test=df_young_no_children_dict['X_test_normalized'],
                      y_test=df_young_no_children_dict['Y_test'],
                      T_train=df_young_no_children_dict['T_train'],
                      T_test=df_young_no_children_dict['T_test'])

t_learner_df_young_no_children.fit()
t_learner_df_young_no_children.evaluate()
t_learner_df_young_no_children.compute_ATE()
_ = t_learner_df_young_no_children.compute_ATE_final(['svr_rbf', 'gradient_boosting', 'random_forest', 'mlp'])

The DataFrame does not contain any null values.
The DataFrame does not contain any null values.
The DataFrame does not contain any null values.
The DataFrame does not contain any null values.
The DataFrame does not contain any null values.
The DataFrame does not contain any null values.

Fitting models with T = 0
Fitted Linear Regression
Fitted SVR with RBF kernel
Fitted SVR with Polynomial kernel
Fitted Gradient Boosting Regressor
Fitted RandomForest Regressor




Fitted MLP Regressor

Fitting models with T = 1
Fitted Linear Regression
Fitted SVR with RBF kernel
Fitted SVR with Polynomial kernel
Fitted Gradient Boosting Regressor
Fitted RandomForest Regressor




Fitted MLP Regressor
T: 0 | linear_regression: Train MSE = 1.6339, Test MSE = 2432753395077570138996736.0000
T: 0 | svr_rbf: Train MSE = 1.3076, Test MSE = 1.6435
T: 0 | svr_poly: Train MSE = 1.3001, Test MSE = 1.6442
T: 0 | gradient_boosting: Train MSE = 1.2119, Test MSE = 1.6899
T: 0 | random_forest: Train MSE = 0.2765, Test MSE = 1.5896
T: 0 | mlp: Train MSE = 0.3872, Test MSE = 2.0440
T: 1 | linear_regression: Train MSE = 1.7328, Test MSE = 2013389047060483919870689280.0000
T: 1 | svr_rbf: Train MSE = 1.3986, Test MSE = 3.0785
T: 1 | svr_poly: Train MSE = 1.4122, Test MSE = 3.2042
T: 1 | gradient_boosting: Train MSE = 1.1119, Test MSE = 3.3940
T: 1 | random_forest: Train MSE = 0.2954, Test MSE = 3.2934
T: 1 | mlp: Train MSE = 0.5252, Test MSE = 3.7379

The ATE are  {'linear_regression': 3518195482422.8013, 'svr_rbf': 0.2707701616161646, 'svr_poly': 0.24632254178976506, 'gradient_boosting': 0.12951836030500222, 'random_forest': 0.10128205128205128, 'mlp': 0.13467215900461355}


Fina

### matching

In [66]:
matching_df_young_no_children = Matching(x=pd.concat([df_young_no_children_dict['X_train_normalized'].reset_index(drop=True), df_young_no_children_dict['X_test_normalized'].reset_index(drop=True)], axis=0),
                     y=pd.concat([df_young_no_children_dict['Y_train'].reset_index(drop=True), df_young_no_children_dict['Y_test'].reset_index(drop=True)], axis=0),
                     T=pd.concat([df_young_no_children_dict['T_train'], df_young_no_children_dict['T_test']], axis=0))

matching_df_young_no_children.compute_ATE(1)
matching_df_young_no_children.compute_ATE(3)
matching_df_young_no_children.compute_ATE(5)
matching_df_young_no_children.compute_ATE(9)
matching_df_young_no_children.compute_ATE(15)
matching_df_young_no_children.compute_ATE(50)

ATE for k = 1 : 0.22382478632478633
ATE for k = 3 : 0.1898148148148148
ATE for k = 5 : 0.18782051282051282
ATE for k = 9 : 0.18209876543209877
ATE for k = 15 : 0.1772792022792023
ATE for k = 50 : 0.19371794871794873


# Mature, no children

In [67]:
with open('df_mature_no_children_dict.pickle', 'rb') as f:
    df_mature_no_children_dict = pickle.load(f)

for key in df_young_no_children_dict:
    print(key, df_mature_no_children_dict[key].shape)

threshold = 2
df_mature_no_children_dict['T_train'] = df_mature_no_children_dict['T_train'].apply(lambda x: 0 if x <= threshold else 1)
df_mature_no_children_dict['T_test'] = df_mature_no_children_dict['T_test'].apply(lambda x: 0 if x <= threshold else 1)

X_train_normalized (1256, 93)
X_test_normalized (222, 93)
Y_train (1256,)
Y_test (222,)
T_train (1256,)
T_test (222,)


### s-learner

In [68]:
s_learner_mature_no_children = S_learner(x_train=df_mature_no_children_dict['X_train_normalized'], 
                      y_train=df_mature_no_children_dict['Y_train'], 
                      x_test=df_mature_no_children_dict['X_test_normalized'],
                      y_test=df_mature_no_children_dict['Y_test'],
                      T_train=df_mature_no_children_dict['T_train'],
                      T_test=df_mature_no_children_dict['T_test'])

s_learner_mature_no_children.fit()
s_learner_mature_no_children.evaluate()
s_learner_mature_no_children.compute_ATE()
_ = s_learner_mature_no_children.compute_ATE_final(['svr_rbf', 'gradient_boosting', 'random_forest', 'mlp'])

The DataFrame does not contain any null values.
The DataFrame does not contain any null values.
The DataFrame does not contain any null values.
The DataFrame does not contain any null values.
The DataFrame does not contain any null values.
The DataFrame does not contain any null values.
Fitted Linear Regression
Fitted SVR with RBF kernel
Fitted SVR with Polynomial kernel
Fitted Gradient Boosting Regressor
Fitted RandomForest Regressor




Fitted MLP Regressor
linear_regression: Train MSE = 1.4152, Test MSE = 38713129794933195014144.0000
svr_rbf: Train MSE = 1.1050, Test MSE = 1.8404
svr_poly: Train MSE = 1.1164, Test MSE = 1.9354
gradient_boosting: Train MSE = 1.1491, Test MSE = 1.7347
random_forest: Train MSE = 0.2221, Test MSE = 1.7637
mlp: Train MSE = 0.2999, Test MSE = 2.3847

The ATE are  {'linear_regression': 0.18460078878234945, 'svr_rbf': 0.07764656932449003, 'svr_poly': 0.0056766361526181764, 'gradient_boosting': 0.08976166312302018, 'random_forest': 0.07942489851150203, 'mlp': 0.10186885315376218}


Final ATE: 0.0871754960281936


### t-learner

In [69]:
t_learner_mature_no_children = T_learner(x_train=df_mature_no_children_dict['X_train_normalized'], 
                      y_train=df_mature_no_children_dict['Y_train'], 
                      x_test=df_mature_no_children_dict['X_test_normalized'],
                      y_test=df_mature_no_children_dict['Y_test'],
                      T_train=df_mature_no_children_dict['T_train'],
                      T_test=df_mature_no_children_dict['T_test'])

t_learner_mature_no_children.fit()
t_learner_mature_no_children.evaluate()
t_learner_mature_no_children.compute_ATE()
_ = t_learner_df_young_no_children.compute_ATE_final(['svr_rbf', 'gradient_boosting', 'random_forest', 'mlp'])

The DataFrame does not contain any null values.
The DataFrame does not contain any null values.
The DataFrame does not contain any null values.
The DataFrame does not contain any null values.
The DataFrame does not contain any null values.
The DataFrame does not contain any null values.

Fitting models with T = 0
Fitted Linear Regression
Fitted SVR with RBF kernel
Fitted SVR with Polynomial kernel
Fitted Gradient Boosting Regressor
Fitted RandomForest Regressor




Fitted MLP Regressor

Fitting models with T = 1
Fitted Linear Regression
Fitted SVR with RBF kernel
Fitted SVR with Polynomial kernel
Fitted Gradient Boosting Regressor
Fitted RandomForest Regressor




Fitted MLP Regressor
T: 0 | linear_regression: Train MSE = 1.3012, Test MSE = 176867084656900840108326912.0000
T: 0 | svr_rbf: Train MSE = 0.9950, Test MSE = 1.8766
T: 0 | svr_poly: Train MSE = 0.9907, Test MSE = 1.9281
T: 0 | gradient_boosting: Train MSE = 1.0236, Test MSE = 1.8589
T: 0 | random_forest: Train MSE = 0.2170, Test MSE = 1.8541
T: 0 | mlp: Train MSE = 0.2213, Test MSE = 2.4112
T: 1 | linear_regression: Train MSE = 1.4105, Test MSE = 1852155952696930955662721024.0000
T: 1 | svr_rbf: Train MSE = 1.1146, Test MSE = 1.4887
T: 1 | svr_poly: Train MSE = 1.0744, Test MSE = 1.5752
T: 1 | gradient_boosting: Train MSE = 0.8716, Test MSE = 1.6661
T: 1 | random_forest: Train MSE = 0.2413, Test MSE = 1.7045
T: 1 | mlp: Train MSE = 0.2619, Test MSE = 2.2909

The ATE are  {'linear_regression': 2831750159435.3564, 'svr_rbf': 0.1869374979536639, 'svr_poly': 0.22930162133879478, 'gradient_boosting': 0.17368177382689085, 'random_forest': 0.17094722598105547, 'mlp': 0.10818375225112814}


Fi

### matching

In [70]:
matching_mature_no_children = Matching(x=pd.concat([df_mature_no_children_dict['X_train_normalized'].reset_index(drop=True), df_mature_no_children_dict['X_test_normalized'].reset_index(drop=True)], axis=0),
                     y=pd.concat([df_mature_no_children_dict['Y_train'].reset_index(drop=True), df_mature_no_children_dict['Y_test'].reset_index(drop=True)], axis=0),
                     T=pd.concat([df_mature_no_children_dict['T_train'], df_mature_no_children_dict['T_test']], axis=0))

matching_mature_no_children.compute_ATE(1)
matching_mature_no_children.compute_ATE(3)
matching_mature_no_children.compute_ATE(5)
matching_mature_no_children.compute_ATE(9)
matching_mature_no_children.compute_ATE(15)
matching_mature_no_children.compute_ATE(50)

ATE for k = 1 : 0.2780784844384303
ATE for k = 3 : 0.24515110509697793
ATE for k = 5 : 0.23315290933694183
ATE for k = 9 : 0.23372425199218164
ATE for k = 15 : 0.24023455119530898
ATE for k = 50 : 0.24376184032476322


# Mature, with children

In [71]:
with open('df_mature_with_children_dict.pickle', 'rb') as f:
    df_mature_with_children_dict = pickle.load(f)

for key in df_mature_with_children_dict:
    print(key, df_mature_with_children_dict[key].shape)

threshold = 2
df_mature_with_children_dict['T_train'] = df_mature_with_children_dict['T_train'].apply(lambda x: 0 if x <= threshold else 1)
df_mature_with_children_dict['T_test'] = df_mature_with_children_dict['T_test'].apply(lambda x: 0 if x <= threshold else 1)

X_train_normalized (460, 82)
X_test_normalized (82, 82)
Y_train (460,)
Y_test (82,)
T_train (460,)
T_test (82,)


### s-learner 

In [72]:
s_learner_mature_with_children = S_learner(x_train=df_mature_with_children_dict['X_train_normalized'], 
                      y_train=df_mature_with_children_dict['Y_train'], 
                      x_test=df_mature_with_children_dict['X_test_normalized'],
                      y_test=df_mature_with_children_dict['Y_test'],
                      T_train=df_mature_with_children_dict['T_train'],
                      T_test=df_mature_with_children_dict['T_test'])

s_learner_mature_with_children.fit()
s_learner_mature_with_children.evaluate()
s_learner_mature_with_children.compute_ATE()
_ = s_learner_mature_with_children.compute_ATE_final(['svr_rbf', 'gradient_boosting', 'random_forest', 'mlp'])

The DataFrame does not contain any null values.
The DataFrame does not contain any null values.
The DataFrame does not contain any null values.
The DataFrame does not contain any null values.
The DataFrame does not contain any null values.
The DataFrame does not contain any null values.
Fitted Linear Regression
Fitted SVR with RBF kernel
Fitted SVR with Polynomial kernel
Fitted Gradient Boosting Regressor
Fitted RandomForest Regressor




Fitted MLP Regressor
linear_regression: Train MSE = 1.2246, Test MSE = 58081881843523414654976.0000
svr_rbf: Train MSE = 0.9753, Test MSE = 2.5108
svr_poly: Train MSE = 1.0456, Test MSE = 2.6612
gradient_boosting: Train MSE = 0.6733, Test MSE = 2.4924
random_forest: Train MSE = 0.2179, Test MSE = 2.6054
mlp: Train MSE = 0.3364, Test MSE = 2.7386

The ATE are  {'linear_regression': 0.2433473931907288, 'svr_rbf': 0.05985493576233995, 'svr_poly': 0.004449037556042961, 'gradient_boosting': 0.06512305003700121, 'random_forest': 0.13470479704797048, 'mlp': 0.1014327708222044}


Final ATE: 0.090278888417379


### t-learner

In [73]:
t_learner_mature_with_children = T_learner(x_train=df_mature_with_children_dict['X_train_normalized'], 
                      y_train=df_mature_with_children_dict['Y_train'], 
                      x_test=df_mature_with_children_dict['X_test_normalized'],
                      y_test=df_mature_with_children_dict['Y_test'],
                      T_train=df_mature_with_children_dict['T_train'],
                      T_test=df_mature_with_children_dict['T_test'])

t_learner_mature_with_children.fit()
t_learner_mature_with_children.evaluate()
t_learner_mature_with_children.compute_ATE()
_ = t_learner_mature_with_children.compute_ATE_final(['svr_rbf', 'gradient_boosting', 'random_forest', 'mlp'])

The DataFrame does not contain any null values.
The DataFrame does not contain any null values.
The DataFrame does not contain any null values.
The DataFrame does not contain any null values.
The DataFrame does not contain any null values.
The DataFrame does not contain any null values.

Fitting models with T = 0
Fitted Linear Regression
Fitted SVR with RBF kernel
Fitted SVR with Polynomial kernel
Fitted Gradient Boosting Regressor
Fitted RandomForest Regressor




Fitted MLP Regressor

Fitting models with T = 1
Fitted Linear Regression
Fitted SVR with RBF kernel
Fitted SVR with Polynomial kernel
Fitted Gradient Boosting Regressor
Fitted RandomForest Regressor




Fitted MLP Regressor
T: 0 | linear_regression: Train MSE = 0.8996, Test MSE = 2340197915879636683718656.0000
T: 0 | svr_rbf: Train MSE = 0.7297, Test MSE = 2.8617
T: 0 | svr_poly: Train MSE = 0.7078, Test MSE = 3.0380
T: 0 | gradient_boosting: Train MSE = 0.3640, Test MSE = 3.1521
T: 0 | random_forest: Train MSE = 0.1794, Test MSE = 2.9553
T: 0 | mlp: Train MSE = 0.2229, Test MSE = 3.3425
T: 1 | linear_regression: Train MSE = 1.1260, Test MSE = 14494535059815162745867730944.0000
T: 1 | svr_rbf: Train MSE = 1.0580, Test MSE = 1.9187
T: 1 | svr_poly: Train MSE = 1.2484, Test MSE = 2.0536
T: 1 | gradient_boosting: Train MSE = 0.3548, Test MSE = 2.2106
T: 1 | random_forest: Train MSE = 0.2699, Test MSE = 1.8323
T: 1 | mlp: Train MSE = 0.3991, Test MSE = 2.2422

The ATE are  {'linear_regression': 7355349974780.64, 'svr_rbf': 0.5114343025331958, 'svr_poly': 0.5551432471321722, 'gradient_boosting': 0.30124083173016875, 'random_forest': 0.3122509225092251, 'mlp': 0.24033011734063342}


Final A

### matching

In [74]:
matching_mature_with_children = Matching(x=pd.concat([df_mature_with_children_dict['X_train_normalized'].reset_index(drop=True), df_mature_with_children_dict['X_test_normalized'].reset_index(drop=True)], axis=0),
                     y=pd.concat([df_mature_with_children_dict['Y_train'].reset_index(drop=True), df_mature_with_children_dict['Y_test'].reset_index(drop=True)], axis=0),
                     T=pd.concat([df_mature_with_children_dict['T_train'], df_mature_with_children_dict['T_test']], axis=0))

matching_mature_with_children.compute_ATE(1)
matching_mature_with_children.compute_ATE(3)
matching_mature_with_children.compute_ATE(5)
matching_mature_with_children.compute_ATE(9)
matching_mature_with_children.compute_ATE(15)
matching_mature_with_children.compute_ATE(50)

ATE for k = 1 : 0.48154981549815495
ATE for k = 3 : 0.42988929889298894
ATE for k = 5 : 0.4612546125461255
ATE for k = 9 : 0.45202952029520294
ATE for k = 15 : 0.4253382533825339
ATE for k = 50 : 0.45845018450184505
