In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
import statsmodels.api as sm

# Step 1: Setup
trial_pp_dir = "trial_pp"
trial_itt_dir = "trial_itt"
os.makedirs(trial_pp_dir, exist_ok=True)
os.makedirs(trial_itt_dir, exist_ok=True)




In [None]:
# Step 2: Data Preparation
df = pd.read_csv("data/data_censored.csv")
print(df.head())
print(df.describe())

df = df.sort_values(by=["id", "period"]).reset_index(drop=True)

# Per-protocol
trial_pp = {
    "data": df,
    "id": "id",
    "period": "period",
    "treatment": "treatment",
    "outcome": "outcome",
    "eligible": "eligible"
}
print("## Trial Sequence Object")
print("## Estimand: Per-Protocol")
print("##")
print(f"## Data: \n##  - N: {df.shape[0]} observations from {df['id'].nunique()} patients")
print(df.head(10).to_string(index=False))
print("##")
print("## IPW for informative censoring:")
print("##  - No weight model specified")
print("##")
print("## Sequence of Trials Data:")
print("## - Use set_expansion_options() and expand_trials() to construct the sequence of trials dataset.")
print("##")
print("## Outcome model:")
print("##  - Outcome model not specified. Use set_outcome_model()")
# ITT
trial_itt = {
    "data": df,
    "id": "id",
    "period": "period",
    "treatment": "treatment",
    "outcome": "outcome",
    "eligible": "eligible"
}



   id  period  treatment  x1        x2  x3        x4  age     age_s  outcome  \
0   1       0          1   1  1.146148   0  0.734203   36  0.083333        0   
1   1       1          1   1  0.002200   0  0.734203   37  0.166667        0   
2   1       2          1   0 -0.481762   0  0.734203   38  0.250000        0   
3   1       3          1   0  0.007872   0  0.734203   39  0.333333        0   
4   1       4          1   1  0.216054   0  0.734203   40  0.416667        0   

   censored  eligible  
0         0         1  
1         0         0  
2         0         0  
3         0         0  
4         0         0  
               id      period   treatment          x1          x2          x3  \
count  725.000000  725.000000  725.000000  725.000000  725.000000  725.000000   
mean    49.278621    7.051034    0.467586    0.405517   -0.173552    0.486897   
std     28.119313    5.802351    0.499293    0.491331    0.997552    0.500173   
min      1.000000    0.000000    0.000000    0.0000

In [4]:

# Step 3: Weight Models and Censoring
# 3.1 Censoring due to treatment switching
switch_model = LogisticRegression()
switch_model.fit(df[["age", "x1", "x3"]], df["treatment"])
df["switch_weights"] = switch_model.predict_proba(df[["age", "x1", "x3"]])[:, 1]

# 3.2 Other informative censoring
censor_model = LogisticRegression()
censor_model.fit(df[["x2", "x1"]], df["censored"])
df["censor_weights"] = censor_model.predict_proba(df[["x2", "x1"]])[:, 1]

In [6]:
# Step 4: Calculate Weights
df["weights"] = 1 / (df["switch_weights"] + df["censor_weights"] + 1e-6)

# Step 5: Specify Outcome Model
X = df[['treatment', 'age', 'sex_male']] if "sex_male" in df.columns else df[['treatment', 'age']]
y = df['outcome']
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
print(model.summary())

# Step 6: Expand Trials
df_expanded = df.copy()
df_expanded['trial_id'] = np.arange(len(df))

# Step 7: Clustering Mechanism (TTE-v2 Modification)
kmeans = KMeans(n_clusters=3, random_state=42)
df_expanded['cluster'] = kmeans.fit_predict(df_expanded[['age', 'treatment']])

# Step 8: Fit Marginal Structural Model
X_clustered = df_expanded[['treatment', 'age', 'sex_male', 'cluster']] if "sex_male" in df_expanded.columns else df_expanded[['treatment', 'age', 'cluster']]
X_clustered = sm.add_constant(X_clustered)
model_clustered = sm.OLS(y, X_clustered).fit()
print(model_clustered.summary())

# Step 9: Inference
print("Standard Model Coefficients:")
print(model.params)
print("Clustered Model Coefficients:")
print(model_clustered.params)


                            OLS Regression Results                            
Dep. Variable:                outcome   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.004
Date:                Sun, 09 Mar 2025   Prob (F-statistic):              0.367
Time:                        17:21:29   Log-Likelihood:                 496.07
No. Observations:                 725   AIC:                            -986.1
Df Residuals:                     722   BIC:                            -972.4
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0096      0.021      0.458      0.6