In [4]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
from sklearn.linear_model import LogisticRegression

## CSV 8.1

Using homework_8.1.csv, find the average treatment effect (ATE) with inverse probability weighting. Then, include your code and a written explanation of your work (mentioning any choices or strategies you made in writing the code) in your homework reflection.



Here are some steps to follow: 



* Estimate the propensity scores using logistic regression. Fit the model so that the $Z$ values predict $X$.

* Use the model to predict the propensity scores (e.g., using predict_proba if you are using sklearn). 

* Calculate inverse probability weights ($\frac{1}{P}$ for $X = 1$ and $\frac{1}{1 - P}$ for $X = 0$).

* Estimate the average treatment effect (the $Y$ difference between $X = 1$ and $X = 0$, using the appropriate weights for each).

In [5]:
# Load the homework 8.1 data and drop the 'Unnamed: 0' column that comes in with the CSV.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where that exact folder exists.
df_1 = pd.read_csv(
    r'C:\Users\JT von Seggern\DS Masters Repos\2025-summer-mod-6\homework_8.1.csv'
).drop(columns=['Unnamed: 0'])
df_1.head()

Unnamed: 0,X,Y,Z
0,1,4.109218,1.764052
1,0,2.259504,0.400157
2,0,-0.647584,0.978738
3,0,2.106071,2.240893
4,1,3.583464,1.867558


In [6]:
# Print column dtypes / non-null counts, then display summary statistics
# (describe() is the last expression, so it is what the cell renders).
df_1.info()
df_1.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X       1000 non-null   int64  
 1   Y       1000 non-null   float64
 2   Z       1000 non-null   float64
dtypes: float64(2), int64(1)
memory usage: 23.6 KB


Unnamed: 0,X,Y,Z
count,1000.0,1000.0,1000.0
mean,0.481,1.014397,-0.045257
std,0.499889,1.998531,0.987527
min,0.0,-5.491184,-3.046143
25%,0.0,-0.509696,-0.69842
50%,0.0,1.013902,-0.058028
75%,1.0,2.56796,0.606951
max,1.0,6.150865,2.759355


In [7]:
# Find the average treatment effect with inverse probability weighting
def average_treatment_effect(df, treatment_col, outcome_col, confounder_col):
    """Estimate the average treatment effect (ATE) by inverse probability weighting.

    A logistic regression of the treatment on the confounder(s) gives every row
    a propensity score. Treated rows are weighted by ``1 / P`` and untreated
    rows by ``1 / (1 - P)``; the ATE is the difference between the
    weight-normalised outcome means of the two groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is left untouched; all new columns go on a copy.
    treatment_col : str
        Column with the treatment indicator (rows equal to 1 are treated,
        rows equal to 0 are untreated).
    outcome_col : str
        Column with the outcome.
    confounder_col : str or list of str
        Column name, or list of column names, used to predict the treatment.

    Returns
    -------
    tuple
        ``(weighted_df, ate, treated, control)``: ``weighted_df`` is a copy of
        ``df`` with ``propensity_score`` and ``weights`` columns added, ``ate``
        is the estimated effect, and ``treated`` / ``control`` are the rows of
        ``weighted_df`` whose treatment is 1 / 0 (DataFrames, not means).
    """
    # Work on a copy so the caller's frame is not silently given extra columns.
    weighted = df.copy()

    # Accept one column name or a list of them; the model always gets a 2-D array.
    if isinstance(confounder_col, (list, pd.Index)):
        confounder_cols = list(confounder_col)
    else:
        confounder_cols = [confounder_col]
    Z = weighted[confounder_cols].to_numpy()

    # Fit the propensity model: confounders predict treatment.
    model = LogisticRegression()
    model.fit(Z, weighted[treatment_col])

    # Column 1 of predict_proba is the probability of the second class,
    # i.e. treatment == 1 when the treatment is coded 0/1.
    weighted['propensity_score'] = model.predict_proba(Z)[:, 1]

    # Inverse probability weights: 1/P for treated rows, 1/(1-P) for the rest.
    weighted['weights'] = np.where(
        weighted[treatment_col] == 1,
        1 / weighted['propensity_score'],
        1 / (1 - weighted['propensity_score']),
    )

    treated = weighted[weighted[treatment_col] == 1]
    control = weighted[weighted[treatment_col] == 0]

    # Weighted outcome means, each normalised by its own group's total weight.
    treated_mean = (treated[outcome_col] * treated['weights']).sum() / treated['weights'].sum()
    control_mean = (control[outcome_col] * control['weights']).sum() / control['weights'].sum()
    ate = treated_mean - control_mean

    print(f"ATE: {ate}, Treated Mean: {treated_mean}, Control Mean: {control_mean}")
    return weighted, ate, treated, control

In [8]:
# Run the IPW estimate on the 8.1 data (treatment X, outcome Y, confounder Z).
# NOTE(review): the 3rd and 4th return values are the treated / control
# sub-DataFrames, not scalar means, despite the *_mean_* variable names.
df_8_1, ate_8_1, treated_mean_8_1, control_mean_8_1 = average_treatment_effect(df_1, 'X', 'Y', 'Z')

ATE: 2.2743411898510133, Treated Mean: 2.2366970237600547, Control Mean: -0.037644166090958685


In [9]:
# Preview the returned frame, which carries the propensity_score and weights columns.
df_8_1.head()

Unnamed: 0,X,Y,Z,propensity_score,weights
0,1,4.109218,1.764052,0.840114,1.190315
1,0,2.259504,0.400157,0.584646,2.407585
2,0,-0.647584,0.978738,0.711082,3.461195
3,0,2.106071,2.240893,0.892793,9.327719
4,1,3.583464,1.867558,0.853089,1.172211


## CSV 8.2

Using homework_8.2.csv, match all treated items to the single nearest untreated item using the Mahalanobis distance. (Do this with replacement — the same untreated item can be used again.) 



* Use the Mahalanobis function from scipy.spatial.distance 

* For the inverse covariance matrix, use all $Z_1$ values and all $Z_2$ values, make them into a $2 \times N$ matrix, find its $2 \times 2$ covariance, and invert.

In [10]:
# Load the homework 8.2 data and drop the 'Unnamed: 0' column that comes in with the CSV.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where that exact folder exists.
df_2 = pd.read_csv(
    r'C:\Users\JT von Seggern\DS Masters Repos\2025-summer-mod-6\homework_8.2.csv'
).drop(columns=['Unnamed: 0'])
df_2.head()

Unnamed: 0,X,Y,Z1,Z2
0,1,4.652085,1.764052,2.320015
1,1,2.590221,0.400157,1.292631
2,1,3.898981,0.978738,0.556423
3,1,5.857179,2.240893,2.345607
4,1,3.647489,1.867558,2.095611


In [11]:
# Print column dtypes / non-null counts, then display summary statistics
# (describe() is the last expression, so it is what the cell renders).
df_2.info()
df_2.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X       1000 non-null   int64  
 1   Y       1000 non-null   float64
 2   Z1      1000 non-null   float64
 3   Z2      1000 non-null   float64
dtypes: float64(3), int64(1)
memory usage: 31.4 KB


Unnamed: 0,X,Y,Z1,Z2
count,1000.0,1000.0,1000.0,1000.0
mean,0.483,1.552274,-0.045257,-0.03164
std,0.499961,2.650695,0.987527,1.361324
min,0.0,-5.797134,-3.046143,-4.024931
25%,0.0,-0.601263,-0.69842,-1.007259
50%,0.0,1.512986,-0.058028,-0.017216
75%,1.0,3.628108,0.606951,0.879308
max,1.0,8.041867,2.759355,4.174226


In [None]:
def match_treated_to_control(df, treatment_col='X', outcome_col='Y', confounder_cols=('Z1', 'Z2')):
    """Match every treated row to its nearest untreated row (Mahalanobis, with replacement).

    The inverse covariance matrix is computed from the confounder columns of
    ALL rows (treated and untreated together). Each treated row is paired with
    the untreated row at the smallest Mahalanobis distance; one untreated row
    may serve as the match for several treated rows.

    The reported effect is the mean of ``outcome(treated) - outcome(match)``
    over the treated rows only. It is printed under the label "ATE" to keep the
    existing output format.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    treatment_col : str, default 'X'
        Treatment indicator column (1 = treated, 0 = untreated).
    outcome_col : str, default 'Y'
        Outcome column.
    confounder_cols : sequence of str, default ('Z1', 'Z2')
        Columns used for the distance computation.

    Returns
    -------
    tuple
        ``(treated_reset, ate, farthest_treated_unit, nearest_control_unit,
        farthest_distance)``:

        * ``treated_reset`` - treated rows (index reset) with added
          ``matched_control_Y`` and ``mahalanobis_distance`` columns;
        * ``ate`` - mean treated-minus-matched outcome difference;
        * ``farthest_treated_unit`` - the treated row whose nearest untreated
          row is farthest away (weakest common support), as a Series;
        * ``nearest_control_unit`` - that row's matched untreated row;
        * ``farthest_distance`` - the distance between those two rows.

    Raises
    ------
    ValueError
        If there are no treated rows or no untreated rows to match.
    """
    confounder_cols = list(confounder_cols)

    # Positional (0..n-1) indexing is used throughout, so reset both indexes.
    treated_reset = df[df[treatment_col] == 1].reset_index(drop=True)
    control_reset = df[df[treatment_col] == 0].reset_index(drop=True)
    if treated_reset.empty or control_reset.empty:
        raise ValueError(
            "Matching needs at least one treated row and one untreated row "
            f"(got {len(treated_reset)} treated, {len(control_reset)} untreated)."
        )

    # Covariance over all rows: np.cov expects variables in rows, hence the transpose.
    # atleast_2d keeps the inversion valid when only one confounder is given.
    cov_matrix = np.atleast_2d(np.cov(df[confounder_cols].to_numpy().T))
    inv_cov_matrix = np.linalg.inv(cov_matrix)

    # Pairwise distances: rows = treated units, columns = untreated units.
    distance_matrix = cdist(
        treated_reset[confounder_cols].to_numpy(),
        control_reset[confounder_cols].to_numpy(),
        metric='mahalanobis',
        VI=inv_cov_matrix,
    )

    # Nearest untreated unit (and its distance) for every treated unit.
    closest_control_indices = np.argmin(distance_matrix, axis=1)
    min_distances_to_controls = np.min(distance_matrix, axis=1)

    # The treated unit with the LARGEST nearest-neighbour distance has the
    # least common support.
    farthest_treated_index = np.argmax(min_distances_to_controls)
    farthest_distance = min_distances_to_controls[farthest_treated_index]

    # Taken before the match columns are added, so these Series hold only the
    # original columns.
    farthest_treated_unit = treated_reset.iloc[farthest_treated_index]
    closest_control_to_farthest = closest_control_indices[farthest_treated_index]
    nearest_control_unit = control_reset.iloc[closest_control_to_farthest]

    # Attach each treated unit's matched outcome and match distance.
    treated_reset['matched_control_Y'] = control_reset[outcome_col].to_numpy()[closest_control_indices]
    treated_reset['mahalanobis_distance'] = min_distances_to_controls

    ate = (treated_reset[outcome_col] - treated_reset['matched_control_Y']).mean()

    print(f"ATE: {ate:.4f}")
    print(f"\nTreated unit with least common support (farthest from any control):")
    print(f"  Treated Index: {farthest_treated_index}")
    for col in confounder_cols:
        print(f"  {col}: {farthest_treated_unit[col]:.4f}")
    print(f"  {outcome_col}: {farthest_treated_unit[outcome_col]:.4f}")
    print(f"  Distance to nearest control: {farthest_distance:.4f}")
    print()
    print(f"Its nearest control unit:")
    print(f"  Control Index: {closest_control_to_farthest}")
    for col in confounder_cols:
        print(f"  {col}: {nearest_control_unit[col]:.4f}")
    print(f"  {outcome_col}: {nearest_control_unit[outcome_col]:.4f}")

    return treated_reset, ate, farthest_treated_unit, nearest_control_unit, farthest_distance

In [15]:
# Run nearest-neighbour Mahalanobis matching on the 8.2 data; the function also
# prints the estimate and the treated unit with the weakest common support.
treated_reset, ate, farthest_treated_unit, nearest_control_unit, farthest_distance = match_treated_to_control(df_2)

ATE: 3.4377

Treated unit with least common support (farthest from any control):
  Treated Index: 241
  Z1: 2.6962
  Z2: 0.5382
  Y: 6.5402
  Distance to nearest control: 1.3830

Its nearest control unit:
  Control Index: 217
  Z1: 1.5200
  Z2: -1.2822
  Y: 5.4080


In [16]:
# Preview the treated rows with their matched control outcome and match distance.
treated_reset.head()

Unnamed: 0,X,Y,Z1,Z2,matched_control_Y,mahalanobis_distance
0,1,4.652085,1.764052,2.320015,0.428954,0.648812
1,1,2.590221,0.400157,1.292631,-0.034844,0.066003
2,1,3.898981,0.978738,0.556423,1.164988,0.082207
3,1,5.857179,2.240893,2.345607,1.79745,0.974003
4,1,3.647489,1.867558,2.095611,1.79745,0.687321
