In [12]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [13]:
# Load the dataset
dataset = pd.read_csv('../data/dataset.csv')

In [14]:
# Display basic info of the dataset to check for missing values and data types
print("Dataset Information:")
print(dataset.info())
print("Dataset Description:")
print(dataset.describe())

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10391 entries, 0 to 10390
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   schoolid  10391 non-null  int64  
 1   Z         10391 non-null  int64  
 2   Y         10391 non-null  float64
 3   S3        10391 non-null  int64  
 4   C1        10391 non-null  int64  
 5   C2        10391 non-null  int64  
 6   C3        10391 non-null  int64  
 7   XC        10391 non-null  int64  
 8   X1        10391 non-null  float64
 9   X2        10391 non-null  float64
 10  X3        10391 non-null  float64
 11  X4        10391 non-null  float64
 12  X5        10391 non-null  float64
dtypes: float64(6), int64(7)
memory usage: 1.0 MB
None
Dataset Description:
           schoolid             Z             Y            S3            C1  \
count  10391.000000  10391.000000  10391.000000  10391.000000  10391.000000   
mean      39.888846      0.325666     -0.096742    

In [18]:
def data_preprocessing(dataset):

    # Display basic info of the dataset to check for missing values and data types
    print("Dataset Information:")
    print(dataset.info())
    print("Dataset Description:")
    print(dataset.describe())

    # Step 1: Feature Engineering (optional, you can add more interactions based on domain knowledge)
    # Example: Creating an interaction feature between mindset and school achievement level
    dataset['Mindset_School_Achievement_Interaction'] = dataset['X1'] * dataset['X2']

    # Step 2: Define covariates (features), treatment, and outcome
    outcome_col = 'Y'  # Student Achievement Score (Outcome)
    treatment_col = 'Z'  # Growth Mindset Intervention (Treatment)
    covariate_cols = ['S3', 'C1', 'C2', 'C3', 'XC', 'X1', 'X2', 'X3', 'X4', 'X5', 'Mindset_School_Achievement_Interaction']

    # Step 3: Categorical and Numerical Feature Identification
    # Categorical columns that need one-hot encoding
    categorical_cols = ['C1', 'C2', 'C3', 'XC']

    # Numerical columns that need to be standardized
    numerical_cols = ['X1', 'X2', 'X3', 'X4', 'X5', 'Mindset_School_Achievement_Interaction']

    # Step 4: Preprocessing Pipeline
    # A pipeline that will scale numerical features and one-hot encode categorical ones
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),  # Scaling numerical columns
            ('cat', OneHotEncoder(drop='first'), categorical_cols)  # One-hot encoding categorical columns
        ])

    # Step 5: Train-Test Split
    X = dataset[covariate_cols]  # Features (covariates)
    y = dataset[outcome_col]  # Outcome (Student Achievement Score)

    # Split data into training and test sets (80% train, 20% test)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Step 6: Apply the Preprocessing Pipeline
    # Fit the preprocessor on the training data and transform both train and test sets
    X_train_transformed = preprocessor.fit_transform(X_train)
    X_test_transformed = preprocessor.transform(X_test)

    # Retrieve the feature names after one-hot encoding and scaling
    # This ensures that our transformed data has the correct column names
    feature_names = numerical_cols + list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols))

    # Convert the transformed data back to DataFrames for easier use
    X_train_df = pd.DataFrame(X_train_transformed, columns=feature_names)
    X_test_df = pd.DataFrame(X_test_transformed, columns=feature_names)

    return X_train_df, X_test_df, y_train, y_test,X_train, X_test

In [19]:
X_train_df, X_test_df, y_train, y_test,X_train, X_test = data_preprocessing(dataset)

# Step 7: Output the results to verify the transformations
print("Transformed Training Set:")
print(X_train_df.head())
print("\nTransformed Test Set:")
print(X_test_df.head())

# Output the shapes of the datasets
print(f"\nTraining data shape: {X_train_df.shape}, Test data shape: {X_test_df.shape}")


Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10391 entries, 0 to 10390
Data columns (total 14 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   schoolid                                10391 non-null  int64  
 1   Z                                       10391 non-null  int64  
 2   Y                                       10391 non-null  float64
 3   S3                                      10391 non-null  int64  
 4   C1                                      10391 non-null  int64  
 5   C2                                      10391 non-null  int64  
 6   C3                                      10391 non-null  int64  
 7   XC                                      10391 non-null  int64  
 8   X1                                      10391 non-null  float64
 9   X2                                      10391 non-null  float64
 10  X3                                   

In [21]:
# Import necessary libraries
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
import numpy as np

# Function to handle missing values and train the S-Learner
def s_learner(X_train, X_test, y_train, y_test, treatment_train, treatment_test,s_model):
    # Append treatment column to training and test sets
    X_train_with_treatment = X_train.copy()
    X_test_with_treatment = X_test.copy()
    
    # Add treatment indicator to the feature set
    X_train_with_treatment['treatment'] = treatment_train
    X_test_with_treatment['treatment'] = treatment_test
    
    # Handle missing values by imputing them
    imputer = SimpleImputer(strategy='mean')  # For numerical values
    X_train_imputed = imputer.fit_transform(X_train_with_treatment)
    X_test_imputed = imputer.transform(X_test_with_treatment)
    
    # Train the model on the combined feature set (covariates + treatment indicator)
    s_model.fit(X_train_imputed, y_train)
    
    # Predict outcomes for both treated and untreated cases
    X_test_with_treatment['treatment'] = 1  # Treated case
    y_pred_treated = s_model.predict(X_test_imputed)
    
    X_test_with_treatment['treatment'] = 0  # Untreated case
    y_pred_control = s_model.predict(X_test_imputed)
    
    # Calculate the treatment effect (difference between treated and control)
    treatment_effect = y_pred_treated - y_pred_control
    
    # Evaluate the model performance using mean squared error (MSE)
    mse = mean_squared_error(y_test, s_model.predict(X_test_imputed))
    print(f"S-Learner MSE: {mse}")
    
    return treatment_effect, s_model

# Extract treatment column (Z) for both training and testing datasets
treatment_train = dataset.loc[X_train.index, 'Z']
treatment_test = dataset.loc[X_test.index, 'Z']

 # Use a RandomForestRegressor as the base model for the S-learner
s_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train and evaluate the S-learner with missing value handling
s_treatment_effect, s_model = s_learner(X_train_df, X_test_df, y_train, y_test, treatment_train, treatment_test,s_model)

# Print the estimated treatment effects
print("Estimated treatment effects (S-Learner):\n", s_treatment_effect[:5])  # Showing first 5 treatment effects

S-Learner MSE: 0.45278502287687317
Estimated treatment effects (S-Learner):
 [0. 0. 0. 0. 0.]


In [3]:
# Import necessary libraries
from sklearn.utils import resample

# Combine training data with treatment indicator and target
X_train_with_treatment = X_train_df.copy()
X_train_with_treatment['Z'] = treatment_train
X_train_with_treatment['Y'] = y_train  # Add the target to the dataset

# Split treated and untreated groups
treated = X_train_with_treatment[X_train_with_treatment['Z'] == 1]
untreated = X_train_with_treatment[X_train_with_treatment['Z'] == 0]

# Oversample treated group to match the size of the untreated group
treated_oversampled = resample(treated, 
                               replace=True,  # Sample with replacement
                               n_samples=len(untreated),  # Match untreated sample size
                               random_state=42)

# Combine the oversampled treated group with the untreated group
balanced_train = pd.concat([untreated, treated_oversampled])

# Separate the features, treatment labels, and target again
X_train_balanced = balanced_train.drop(columns=['Z', 'Y'])  # Features
treatment_train_balanced = balanced_train['Z']  # Treatment indicator
y_train_balanced = balanced_train['Y']  # Target (Outcome)

 # Use a RandomForestRegressor as the base model for the S-learner
s_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Now train the S-learner on the balanced dataset
s_treatment_effect_balanced, s_model_balanced = s_learner(X_train_balanced, X_test_df, y_train_balanced, y_test, treatment_train_balanced, treatment_test,s_model)

# Print the estimated treatment effects
print("Estimated treatment effects (S-Learner, Balanced):\n", s_treatment_effect_balanced[:5])


S-Learner MSE: 0.520193741079241
Estimated treatment effects (S-Learner, Balanced):
 [0. 0. 0. 0. 0.]


In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error

# Step 1: Estimate propensity scores using Logistic Regression
propensity_model = LogisticRegression(random_state=42)
propensity_model.fit(X_train_df, treatment_train)

# Get propensity scores for training and test sets
propensity_scores_train = propensity_model.predict_proba(X_train_df)[:, 1]
propensity_scores_test = propensity_model.predict_proba(X_test_df)[:, 1]

# Step 2: Add propensity scores to the training and test datasets
X_train_with_ps = X_train_df.copy()
X_train_with_ps['propensity_score'] = propensity_scores_train

X_test_with_ps = X_test_df.copy()
X_test_with_ps['propensity_score'] = propensity_scores_test

# Step 3: Handle missing values by imputing them
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train_with_ps)
X_test_imputed = imputer.transform(X_test_with_ps)

# Step 4: Train the S-Learner using Gradient Boosting
s_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
s_model.fit(X_train_with_ps, y_train)

# Predict outcomes for treated and untreated cases to calculate treatment effects
X_test_with_ps['treatment'] = 1  # For treated cases
y_pred_treated = s_model.predict(X_test_imputed)

X_test_with_ps['treatment'] = 0  # For untreated cases
y_pred_control = s_model.predict(X_test_imputed)

# Calculate the treatment effect
treatment_effect = y_pred_treated - y_pred_control

# Step 5: Evaluate the model performance using mean squared error (MSE)
mse = mean_squared_error(y_test, s_model.predict(X_test_imputed))
print(f"S-Learner MSE: {mse}")

# Print the estimated treatment effects
print("Estimated treatment effects (S-Learner, with Propensity Scores):\n", treatment_effect[:5])


S-Learner MSE: 0.37981158788422975
Estimated treatment effects (S-Learner, with Propensity Scores):
 [0. 0. 0. 0. 0.]




In [23]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np

# Step 1: Estimate propensity scores using Logistic Regression
propensity_model = LogisticRegression(random_state=42)
propensity_model.fit(X_train_df, treatment_train)

# Get propensity scores for training and test sets
propensity_scores_train = propensity_model.predict_proba(X_train_df)[:, 1]
propensity_scores_test = propensity_model.predict_proba(X_test_df)[:, 1]

# Step 2: Add propensity scores to the training and test datasets
X_train_with_ps = X_train_df.copy()
X_train_with_ps['propensity_score'] = propensity_scores_train

X_test_with_ps = X_test_df.copy()
X_test_with_ps['propensity_score'] = propensity_scores_test

# Step 3: Handle missing values by imputing them
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train_with_ps)
X_test_imputed = imputer.transform(X_test_with_ps)

# Step 4: Create polynomial features
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_train_poly = poly.fit_transform(X_train_imputed)
X_test_poly = poly.transform(X_test_imputed)

# Step 5: Add treatment column to imputed numpy arrays using np.column_stack
X_train_poly_with_treatment = np.column_stack((X_train_poly, treatment_train))
X_test_poly_with_treatment = np.column_stack((X_test_poly, treatment_test))

# Step 6: Train the S-Learner using Gradient Boosting
s_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
s_model.fit(X_train_poly_with_treatment, y_train)

# Predict outcomes for both treated and untreated cases
X_test_poly_with_treatment[:, -1] = 1  # For treated cases
y_pred_treated = s_model.predict(X_test_poly_with_treatment)

X_test_poly_with_treatment[:, -1] = 0  # For untreated cases
y_pred_control = s_model.predict(X_test_poly_with_treatment)

# Calculate the treatment effect
treatment_effect = y_pred_treated - y_pred_control

# Step 7: Evaluate the model performance using mean squared error (MSE)
mse = mean_squared_error(y_test, s_model.predict(X_test_poly_with_treatment))
print(f"S-Learner MSE: {mse}")

# Print the estimated treatment effects
print("Estimated treatment effects (S-Learner, Polynomial Features):\n", treatment_effect[:5])


S-Learner MSE: 0.3864978186060779
Estimated treatment effects (S-Learner, Polynomial Features):
 [0.16798218 0.16823798 0.35765285 0.43156245 0.28123992]
