In [6]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import mahalanobis
from scipy.optimize import linear_sum_assignment

# Step 1: Fix the RNG seed and the cohort size (400 synthetic patients)
np.random.seed(42)
n_patients = 400

# Step 2: Number the patients 1..n_patients and draw a random 0/1 treatment
# flag for each (1 = treated, 0 = not yet treated)
patient_ids = np.arange(1, n_patients + 1)
is_treated = np.random.choice([0, 1], size=n_patients)

# Step 3: Draw six integer symptom scores per patient, each in 0..9
# (three symptoms, each recorded at baseline and at treatment)
columns = [
    f'{symptom}_{phase}'
    for phase in ('baseline', 'treatment')
    for symptom in ('pain', 'urgency', 'frequency')
]
symptom_data = np.random.randint(0, 10, size=(n_patients, len(columns)))

patient_df = pd.DataFrame(symptom_data, columns=columns).assign(
    ID=patient_ids,
    treated=is_treated,
)

# Step 4: Recode every symptom score as three 0/1 indicators (low / mid / high)
# split at its empirical 33.33rd and 66.67th percentiles.  Exactly one of the
# three indicators is 1 for each patient, so each trio is collinear.
for name in columns:
    scores = patient_df[name]
    lower_cut, upper_cut = np.percentile(scores, [33.33, 66.67])

    in_low = scores < lower_cut
    in_high = scores >= upper_cut

    patient_df[f'{name}_low'] = in_low.astype(int)
    patient_df[f'{name}_mid'] = (~in_low & ~in_high).astype(int)
    patient_df[f'{name}_high'] = in_high.astype(int)

# Keep only the indicator columns (plus ID and treatment flag) for matching
indicator_tags = ('_low', '_mid', '_high')
binary_columns = [
    name for name in patient_df.columns
    if any(tag in name for tag in indicator_tags)
]
binary_data = patient_df[['ID', 'treated', *binary_columns]]

# Step 5: Solve a minimum-cost assignment problem (optimal one-to-one matching on Mahalanobis distance) to pair treated with not-yet-treated patients

def create_balanced_matching(binary_data):
    """Pair treated with not-yet-treated patients at minimum total distance.

    Every column other than ``'ID'`` and ``'treated'`` is used as a matching
    covariate.  Pairwise costs are Mahalanobis distances based on the
    covariance of the covariates over all rows of ``binary_data``; the
    one-to-one pairing with the smallest summed cost is found with
    ``scipy.optimize.linear_sum_assignment``.

    Parameters
    ----------
    binary_data : pandas.DataFrame
        Must contain an ``'ID'`` column, a ``'treated'`` column
        (1 = treated, 0 = not yet treated) and at least one numeric
        covariate column.  Column order does not matter.

    Returns
    -------
    list of tuple
        ``(treated_id, not_yet_treated_id)`` pairs, ordered by the treated
        patient's position in ``binary_data``.  The list holds
        ``min(n_treated, n_not_yet_treated)`` pairs and no patient appears
        in more than one pair.  It is empty when either group is empty.
        When several pairings share the minimum total cost, which one is
        returned is up to the solver.

    Raises
    ------
    ValueError
        If ``binary_data`` has no covariate columns to match on.
    """
    # Function-scope import: the file's import block does not provide cdist.
    from scipy.spatial.distance import cdist

    # Derive the covariates from the argument itself instead of relying on a
    # module-level column list or on fixed column positions.
    feature_columns = [
        name for name in binary_data.columns if name not in ('ID', 'treated')
    ]
    if not feature_columns:
        raise ValueError("binary_data has no covariate columns to match on")

    treated_mask = (binary_data['treated'] == 1).to_numpy()
    control_mask = (binary_data['treated'] == 0).to_numpy()
    if not treated_mask.any() or not control_mask.any():
        return []

    features = binary_data[feature_columns].to_numpy(dtype=float)
    ids = binary_data['ID'].to_numpy()

    # np.cov returns a 0-d array for a single covariate; pinv needs 2-D.
    cov_matrix = np.atleast_2d(np.cov(features, rowvar=False))
    # Pseudo-inverse: indicator sets that sum to a constant make the
    # covariance matrix singular, so a plain inverse would fail.
    inv_cov_matrix = np.linalg.pinv(cov_matrix)

    # All treated-vs-control Mahalanobis distances in one vectorised call.
    cost_matrix = cdist(
        features[treated_mask],
        features[control_mask],
        metric='mahalanobis',
        VI=inv_cov_matrix,
    )

    # The solver handles rectangular matrices and assigns each row and each
    # column at most once, so no extra de-duplication of controls is needed.
    row_ind, col_ind = linear_sum_assignment(cost_matrix)

    treated_ids = ids[treated_mask]
    control_ids = ids[control_mask]
    return list(zip(treated_ids[row_ind], control_ids[col_ind]))

# Run the matching on the indicator table
balanced_pairs = create_balanced_matching(binary_data)

# Show a heading followed by the first five (treated, not-yet-treated) ID pairs
print(
    "Matched Pairs (Treated ID, Not-Yet-Treated ID):",
    balanced_pairs[:5],
    sep="\n",
)


Matched Pairs (Treated ID, Not-Yet-Treated ID):
[(np.int64(2), np.int64(387)), (np.int64(6), np.int64(369)), (np.int64(10), np.int64(191)), (np.int64(15), np.int64(338)), (np.int64(17), np.int64(139))]
