# Prepare data

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import ast
import matplotlib.pyplot as plt
import dill
import torch
import nbimporter
import shap

import os
import sys

# Work from the prediction project root: the relative paths used in this
# notebook ("src/main/python" below, and the treatment-combinations JSON read
# later) are resolved against the current working directory.
# NOTE(review): this absolute path is machine-specific — confirm before running
# on another host.
os.chdir('/data/repos/actin-personalization/prediction')
# Put the project's Python source directory first on the import path;
# presumably this is what makes `models`, `data` and `utils` importable below.
sys.path.insert(0, os.path.abspath("src/main/python"))

from models import *
from data.data_processing import DataSplitter, DataPreprocessor
from data.lookups import lookup_manager
from utils.settings import settings
from src.main.python.analysis.predictive_algorithms_training import get_data, plot_different_models_survival_curves

# Preprocessor configured from the project settings; it is used below to load
# and preprocess the data.
preprocessor = DataPreprocessor(settings.db_config_path, settings.db_name)

In [None]:
def get_preprocessed_data_with_sourceId(preprocessor):
    """Load the raw data, preprocess it and attach each row's ``sourceId``.

    Args:
        preprocessor: Object exposing ``load_data()`` and
            ``preprocess_data(features, df=...)``.

    Returns:
        A tuple ``(df_raw, df_all, updated_features)``: the raw frame, the
        preprocessed frame with an added ``sourceId`` column, and the feature
        list returned by the preprocessing step.
    """
    raw = preprocessor.load_data()
    processed, features_out, _ = preprocessor.preprocess_data(
        lookup_manager.features, df=raw
    )
    # Copy the identifier over from the raw frame, selecting by the index
    # labels of the preprocessed rows.
    # (The same lookup for "reasonRefrainmentFromTreatment" is currently disabled.)
    processed["sourceId"] = raw.loc[processed.index, "sourceId"]
    return raw, processed, features_out


df_raw, df_all, updated_features = get_preprocessed_data_with_sourceId(preprocessor)

In [None]:
import json

# Load the treatment-combination definitions from the project sources.
# The path is relative, so the working directory must be the prediction
# project root. The file is decoded explicitly as UTF-8 instead of relying on
# the platform's locale-dependent default encoding.
with open('src/main/python/data/treatment_combinations.json', 'r', encoding='utf-8') as f:
    valid_treatment_combinations = json.load(f)

# Preparation of propensity scores

Exclude non-covariate columns

In [None]:
# Columns that must not be used as covariates of the propensity model:
# outcome columns, treatment indicators, the row identifier, and two features
# deliberately left out of the covariate set.
exclude = [
    'hadSurvivalEvent',
    'systemicTreatmentPlan_5-FU',
    'systemicTreatmentPlan_oxaliplatin',
    'systemicTreatmentPlan_irinotecan',
    'systemicTreatmentPlan_bevacizumab',
    'systemicTreatmentPlan_panitumumab',
    'systemicTreatmentPlan_pembrolizumab',
    'systemicTreatmentPlan_nivolumab',
    'hasTreatment',
    'survivalDaysSinceMetastaticDiagnosis',
    'investigatedLymphNodesCountPrimaryDiagnosis',
    'hasRasMutation',
    # `sourceId` is attached to df_all right after preprocessing; it identifies
    # the row and is not a patient characteristic, so it must not be a covariate.
    'sourceId',
    # Treatment label derived from the indicator columns further down in the
    # notebook; listed so that re-running this cell afterwards cannot leak the
    # treatment into the covariates.
    'actual_treatment',
]

df_covariate = df_all.copy()

#df_covariate["hasInvestigatedLymphNodes"] = (
#    df_covariate["investigatedLymphNodesCountPrimaryDiagnosis"] > 0
#)

# Besides the explicit list, drop every treatment-indicator column and every
# previously computed propensity column by prefix, so the covariate set stays
# free of treatment information even if new treatments are added or the cell is
# re-run after the propensity scores were written to df_all.
prefixed_cols = [
    col for col in df_covariate.columns
    if col.startswith(("systemicTreatmentPlan_", "propensity_"))
]

# errors='ignore': not every listed column has to be present in the frame.
df_covariate = df_covariate.drop(columns=exclude + prefixed_cols, errors='ignore')



Create treatment table

In [None]:
treatment_prefix = "systemicTreatmentPlan_"
treatment_cols = [name for name in df_all.columns if name.startswith(treatment_prefix)]


def extract_actual_treatment(row):
    """Build a readable label of the treatments flagged on one patient row.

    Args:
        row: A row of ``df_all`` containing the treatment indicator columns.

    Returns:
        The names of all indicator columns equal to 1 (prefix removed), joined
        with ", ", or "No Treatment" when none is set.
    """
    given = [
        name.replace(treatment_prefix, "")
        for name in treatment_cols
        if row[name] == 1
    ]
    return ", ".join(given) if given else "No Treatment"


# One label per patient, evaluated row by row.
df_all["actual_treatment"] = df_all.apply(extract_actual_treatment, axis=1)


# Calculation of propensity scores

Logistic regression based on whole dataset

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np

# Fit a propensity model on ALL patients: the target is the treatment label,
# the inputs are the covariates. The predicted probability of every treatment
# label is then stored per patient in a `propensity_<label>` column of df_all.
treatments = df_all["actual_treatment"].astype("category")
covariates = df_covariate.copy()

# Dummy-encode any categorical covariates, then drop columns without variance.
covariates_encoded = pd.get_dummies(covariates, drop_first=True)
constant_cols = covariates_encoded.columns[covariates_encoded.std() == 0]
if len(constant_cols) > 0:
    print(f"Dropping constant columns (for StandardScaler): {list(constant_cols)}")
    covariates_encoded = covariates_encoded.drop(columns=constant_cols)

# Give both frames the same 0..n-1 index so that row positions and index
# labels coincide in the cells that follow.
covariates_encoded = covariates_encoded.reset_index(drop=True)
df_all = df_all.reset_index(drop=True)

pipe = Pipeline([
    ("scale", StandardScaler()),
    # `multi_class` is intentionally not passed: the argument is deprecated in
    # recent scikit-learn releases, and with the lbfgs solver the default
    # already fits a multinomial model whenever there are three or more
    # treatment classes. NOTE(review): the manual softmax used later in this
    # notebook assumes at least three classes.
    ("clf", LogisticRegression(
        solver="lbfgs",
        max_iter=1000,
        random_state=42
    ))
])

pipe.fit(covariates_encoded, treatments)

# In-sample class probabilities; the columns follow the order of `classes_`.
propensity_probs = pipe.predict_proba(covariates_encoded)
treatment_classes = pipe.named_steps["clf"].classes_

for i, label in enumerate(treatment_classes):
    df_all[f"propensity_{label}"] = propensity_probs[:, i]


Logistic regression based on trainset

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Same propensity model as the whole-dataset variant, but fitted on an 80%
# training split only; probabilities are still predicted for every patient.
treatments = df_all["actual_treatment"].astype("category")
covariates = df_covariate.copy()

# Dummy-encode any categorical covariates, then drop columns without variance.
covariates_encoded = pd.get_dummies(covariates, drop_first=True)
constant_cols = covariates_encoded.columns[covariates_encoded.std() == 0]
if len(constant_cols) > 0:
    print(f"Dropping constant columns (for StandardScaler): {list(constant_cols)}")
    covariates_encoded = covariates_encoded.drop(columns=constant_cols)

# Give both frames the same 0..n-1 index so that the index of the test split
# can later be used to select the matching rows of df_all.
covariates_encoded = covariates_encoded.reset_index(drop=True)
df_all = df_all.reset_index(drop=True)

# Stratified split: every treatment label keeps its share in both parts.
covariates_train, covariates_test, treatments_train, treatments_test = train_test_split(
    covariates_encoded, treatments, test_size=0.2, random_state=42, stratify=treatments
)

pipe = Pipeline([
    ("scale", StandardScaler()),
    # `multi_class` is intentionally not passed: the argument is deprecated in
    # recent scikit-learn releases, and with the lbfgs solver the default
    # already fits a multinomial model whenever there are three or more
    # treatment classes. NOTE(review): the manual softmax used later in this
    # notebook assumes at least three classes.
    ("clf", LogisticRegression(
        solver="lbfgs",
        max_iter=1000,
        random_state=42
    ))
])

pipe.fit(covariates_train, treatments_train)

# Probabilities for all patients (train and test rows alike); the columns
# follow the order of `classes_`.
propensity_probs = pipe.predict_proba(covariates_encoded)
treatment_classes = pipe.named_steps["clf"].classes_

for i, label in enumerate(treatment_classes):
    df_all[f"propensity_{label}"] = propensity_probs[:, i]


# Random patient generation

import random

# Every patient identifier present in the preprocessed data.
valid_ids = set(df_all['sourceId'].unique())


def get_valid_random_patient_id():
    """Return one ``sourceId`` drawn uniformly at random from ``valid_ids``.

    Returns:
        A patient identifier contained in ``valid_ids``.

    Raises:
        ValueError: If ``valid_ids`` is empty.
    """
    # Sample the known identifiers directly. Guessing integers until one
    # happens to exist never terminates for an empty id set and can never
    # return an identifier outside the guessed range.
    if not valid_ids:
        raise ValueError("No valid patient ids available.")
    return random.choice(list(valid_ids))


random_patient_id = get_valid_random_patient_id()
print(random_patient_id)


In [None]:
import random

threshold_days = 2000

# Restrict the pool to patients without treatment whose survival since the
# metastatic diagnosis exceeds the threshold.
is_untreated = df_all['hasTreatment'] == 0
is_long_survivor = df_all['survivalDaysSinceMetastaticDiagnosis'] > threshold_days
eligible_patients = df_all[is_untreated & is_long_survivor]

valid_ids = set(eligible_patients['sourceId'].unique())


def get_valid_random_patient_id():
    """Pick a random ``sourceId`` among the eligible patients.

    Returns:
        One element of ``valid_ids``.

    Raises:
        ValueError: If no patient satisfies the selection criteria.
    """
    if valid_ids:
        return random.choice(list(valid_ids))
    raise ValueError("No valid patients meet the criteria.")


random_patient_id = get_valid_random_patient_id()
print(random_patient_id)


# Personalized patient propensity score estimation

In [None]:
import pandas as pd
import numpy as np
from collections import Counter
import re
from IPython.display import display, Markdown

# Explain the predicted treatment propensities of a single patient: which
# treatments the model considers likely, and which covariates push the model
# towards the most likely treatment.
patient_id = random_patient_id
# NOTE(review): this hard-coded id overrides the randomly drawn one assigned on
# the previous line — confirm that pinning the patient is intentional.
patient_id = 6774820
# `row` stays a DataFrame (needed for scaler.transform); the squeezed variants
# are used for label-based access to the patient's values.
row = df_all[df_all["sourceId"] == patient_id]
treatment_row = df_all[df_all["sourceId"] == patient_id].squeeze()
raw_row = df_raw[df_raw["sourceId"] == patient_id].squeeze()

# Pull the fitted pipeline steps apart so scores can be decomposed per feature.
model = pipe.named_steps["clf"]
scaler = pipe.named_steps["scale"]
feature_names = covariates_encoded.columns
classes = model.classes_
coef_matrix = model.coef_
intercepts = model.intercept_

# Table of this patient's stored propensity scores: one row per treatment,
# rounded to two decimals, keeping only scores of at least 0.05, highest first.
propensity_cols = [col for col in df_all.columns if col.startswith("propensity_")]
patient_propensity = row[propensity_cols].T
patient_propensity.columns = ["Propensity Score"]
patient_propensity.index = [col.replace("propensity_", "") for col in patient_propensity.index]
patient_propensity["Propensity Score"] = patient_propensity["Propensity Score"].astype(float).round(2)
patient_propensity = patient_propensity[patient_propensity["Propensity Score"] >= 0.05]
patient_propensity = patient_propensity.sort_values(by="Propensity Score", ascending=False)

# Recompute the class probabilities by hand: standardize the covariates,
# compute one linear score per class, then apply a softmax.
# NOTE(review): exponentiating the raw scores can overflow for very large
# values; presumably not an issue for standardized inputs — confirm.
patient_cov_std = scaler.transform(row[feature_names])[0]
logits = coef_matrix @ patient_cov_std + intercepts
probs = np.exp(logits) / np.sum(np.exp(logits))

# Indices of the two most probable classes, most probable first.
top_two_idx = np.argsort(probs)[-2:][::-1]
top_idx, second_idx = top_two_idx
top_class = classes[top_idx]
second_class = classes[second_idx]

# Per-feature contribution to the top class score: standardized value times
# the top class coefficient. It is compared with the average contribution to
# all other classes, weighted by how often each class occurs in `treatments`.
top_coef = coef_matrix[top_idx]
top_contrib = patient_cov_std * top_coef
treatment_counts = Counter(treatments)
total = sum(treatment_counts[c] for c in classes if c != top_class)
weights = [treatment_counts[c] / total for c in classes if c != top_class]
avg_other_contrib = np.average(
    [patient_cov_std * coef_matrix[i] for i, c in enumerate(classes) if c != top_class],
    axis=0, weights=np.array(weights)
)
delta_contrib_first = top_contrib - avg_other_contrib
# Unscaled covariate values of the patient, in feature order.
actual_values = row[feature_names].values.flatten()

df_first = pd.DataFrame({
    "feature": feature_names,
    "value": np.round(actual_values, 2),
    "top_contribution": np.round(top_contrib, 2),
    "weighted_avg_other_contribution": np.round(avg_other_contrib, 2),
    "Δ_contribution": np.round(delta_contrib_first, 2)
})
# Keep only the features that favour the top class (positive rounded
# difference), strongest first.
df_first = df_first[df_first["Δ_contribution"] > 0].sort_values("Δ_contribution", ascending=False)

# Same decomposition against the runner-up class only: positive differences
# favour the top class, negative ones favour the second class.
second_coef = coef_matrix[second_idx]
second_contrib = patient_cov_std * second_coef
delta_contrib_second = top_contrib - second_contrib

df_second = pd.DataFrame({
    "feature": feature_names,
    "value": np.round(actual_values, 2),
    f"{top_class} contribution": np.round(top_contrib, 2),
    f"{second_class} contribution": np.round(second_contrib, 2),
    "Δ_contribution": np.round(delta_contrib_second, 2)
})
# Order by the size of the difference, regardless of its direction.
df_second["abs_Δ_contribution"] = df_second["Δ_contribution"].abs()
df_second = df_second.sort_values("abs_Δ_contribution", ascending=False)

def interpret_value(v):
    """Translate a covariate value into a short qualitative label.

    Args:
        v: Numeric covariate value.

    Returns:
        "positive" for exactly 1, "negative" for exactly 0, "relatively high"
        for any other value above 0, "relatively low" for values below 0, and
        "unknown" when none of the comparisons holds (e.g. NaN).
    """
    # The exact matches on 1 and 0 are checked before the sign-based labels.
    if v == 1.0:
        label = "positive"
    elif v == 0.0:
        label = "negative"
    elif v > 0:
        label = "relatively high"
    elif v < 0:
        label = "relatively low"
    else:
        label = "unknown"
    return label

def get_raw_value(encoded_feature):
    """Look up the selected patient's raw value for a model feature.

    Args:
        encoded_feature: Feature name as used by the model.

    Returns:
        ``raw_row[encoded_feature]`` when the raw record has that entry;
        otherwise the entry named by the part of the feature name before the
        first underscore, or NaN when that is missing as well.
    """
    if encoded_feature not in raw_row:
        # Fall back to the part of the name before the first underscore.
        base_name = encoded_feature.partition('_')[0]
        return raw_row.get(base_name, np.nan)
    return raw_row[encoded_feature]

def prettify_feature(feat):
    """Turn a feature name into a readable label.

    Camel-case words are separated by spaces and the result is capitalized.
    Names containing an underscore that do not start with "has" are split at
    the first underscore and rendered as "<Base>: <Category>", with the
    remaining underscores of the category replaced by spaces.

    Args:
        feat: Feature name.

    Returns:
        The readable label.
    """
    def _spaced(text):
        # Insert a space before every capital letter except a leading one.
        return re.sub(r'(?<!^)(?=[A-Z])', ' ', text).strip().capitalize()

    if feat.startswith("has") or "_" not in feat:
        return _spaced(feat)

    base, _, category = feat.partition("_")
    category = category.replace("_", " ").lower().capitalize()
    return f"{_spaced(base)}: {category}"

# Add display-oriented columns to both explanation tables.
for df in (df_first, df_second):
    df["interpretation"] = df["value"].apply(interpret_value)
    df["actual_value"] = df["feature"].apply(get_raw_value)
    df["feature_written"] = df["feature"].apply(prettify_feature)

# Treatments actually recorded for this patient; indicator values are compared
# as text so that 1, 1.0 and True all count as "given".
truthy_flags = {"1", "1.0", "True", "true"}
treatment_cols = [col for col in treatment_row.index if col.startswith("systemicTreatmentPlan_")]
actual_treatments = [
    col.replace("systemicTreatmentPlan_", "")
    for col in treatment_cols
    if str(treatment_row[col]).strip() in truthy_flags
]
if actual_treatments:
    actual_treatment_str = " + ".join(actual_treatments)
else:
    actual_treatment_str = "No treatment"
survival_days = treatment_row.get("survivalDaysSinceMetastaticDiagnosis", "Unknown")

# Scores are only available for treatments that are listed in the propensity
# table of this patient.
if top_class in patient_propensity.index:
    top_score_percent = f"{patient_propensity.loc[top_class, 'Propensity Score'] * 100:.0f}%"
else:
    top_score_percent = "N/A"

show_second = second_class in patient_propensity.index
if show_second:
    second_score_percent = f"{patient_propensity.loc[second_class, 'Propensity Score'] * 100:.0f}%"
else:
    second_score_percent = None


def _driver_bullet(entry):
    """Format one feature of an explanation table as a markdown bullet."""
    return f"- **{entry['feature_written']} →** {entry['interpretation']} ({entry['actual_value']})"


output = [
    f"**Actual Treatment:** {actual_treatment_str}  ",
    f"**Observed Survival:** {survival_days} days",
    "",
    f"**{top_class}** ({top_score_percent})",
]

# Features that clearly favour the most likely treatment.
for _, row_ in df_first[df_first["Δ_contribution"] > 0.1].iterrows():
    output.append(_driver_bullet(row_))

if show_second:
    output.append("")
    output.append(f"**{second_class}** ({second_score_percent})")
    # Negative differences are the features favouring the runner-up.
    for _, row_ in df_second[df_second["Δ_contribution"] < -0.1].iterrows():
        output.append(_driver_bullet(row_))

output.append("")
# Remaining treatments are listed with their score only.
for treatment, score in patient_propensity["Propensity Score"].items():
    if treatment not in [top_class, second_class] and score >= 0.05:
        output.extend([f"**{treatment}** ({score * 100:.0f}%)", ""])

display(Markdown("\n".join(output)))


# Accuracy estimation

Based on test set

In [None]:
import re
import numpy as np
import pandas as pd

def normalize_treatment_string(treat_str):
    """Convert a treatment label into a set of lower-cased component names.

    Components may be separated by "," or "+"; surrounding whitespace and
    empty components are ignored.

    Args:
        treat_str: Treatment label, e.g. "5-FU + oxaliplatin".

    Returns:
        The set of component names; an empty set for an empty/missing label
        or for the labels "none" and "no treatment" (case-insensitive).
    """
    if not treat_str:
        return set()
    if treat_str.strip().lower() in {"none", "no treatment"}:
        return set()
    tokens = (piece.strip() for piece in re.split(r"[,+]", treat_str))
    return {token.lower() for token in tokens if token}

# Accuracy on the held-out split: compare the most probable treatment
# predicted by the model with the treatment actually recorded.
df_test = df_all.loc[covariates_test.index].reset_index(drop=True)
test_set = covariates_test.reset_index(drop=True)

# Score the whole test set in one pass instead of calling the scaler once per
# row: standardize, compute one linear score per class, apply a softmax.
test_std = scaler.transform(test_set)
test_logits = test_std @ coef_matrix.T + intercepts
# Subtracting the row-wise maximum leaves the softmax unchanged but prevents
# overflow in the exponential.
test_logits -= test_logits.max(axis=1, keepdims=True)
test_probs = np.exp(test_logits)
test_probs /= test_probs.sum(axis=1, keepdims=True)
best_idx = test_probs.argmax(axis=1)

# The treatment indicator columns are the same for every row; find them once.
treatment_cols = [col for col in df_test.columns if col.startswith("systemicTreatmentPlan_")]

treatment_comparisons = []

# df_test and test_set were both re-indexed 0..n-1 above, so `idx` is also the
# row position in the probability matrix.
for idx, row in df_test.iterrows():
    source_id = row["sourceId"]

    # Indicator values are compared as text so 1, 1.0 and True all count.
    actual_treatments = [
        col.replace("systemicTreatmentPlan_", "")
        for col in treatment_cols
        if str(row[col]).strip() in {"1", "1.0", "True", "true"}
    ]
    actual_treatment_str = " + ".join(actual_treatments) if actual_treatments else "None"

    top_class = classes[best_idx[idx]]
    top_prob = test_probs[idx, best_idx[idx]]

    # Compare as sets of component names so that separator and ordering
    # differences between the two labels do not matter.
    predicted_set = normalize_treatment_string(top_class)
    actual_set = normalize_treatment_string(actual_treatment_str)
    is_correct = predicted_set == actual_set

    treatment_comparisons.append({
        "sourceId": source_id,
        "predicted_treatment": top_class,
        "predicted_probability": round(top_prob, 3),
        "actual_treatment": actual_treatment_str,
        "correct_match": is_correct
    })

df_treatment_comparison = pd.DataFrame(treatment_comparisons)

accuracy = df_treatment_comparison["correct_match"].mean()
print(f"Correct treatment match accuracy (test set): {accuracy:.2%}")

mismatches = df_treatment_comparison[~df_treatment_comparison["correct_match"]]
display(mismatches.head(100))


Based on whole dataset

def normalize_treatment_string(treat_str):
    """Split a treatment label into a set of lower-cased component names.

    Args:
        treat_str: Treatment label whose components are separated by "," or
            "+", e.g. "5-FU, oxaliplatin".

    Returns:
        The set of non-empty, whitespace-trimmed, lower-cased components; an
        empty set when the label is empty/missing or equals "none" or
        "no treatment" (case-insensitive).
    """
    no_treatment_labels = {"none", "no treatment"}
    if not treat_str or treat_str.strip().lower() in no_treatment_labels:
        return set()

    components = set()
    for piece in re.split(r"[,+]", treat_str):
        cleaned = piece.strip()
        if cleaned:
            components.add(cleaned.lower())
    return components


# Accuracy on all patients: compare the most probable treatment predicted by
# the model with the treatment actually recorded.

# Score every patient in one pass instead of building a one-row frame and
# calling the scaler for each patient: standardize, compute one linear score
# per class, apply a softmax.
all_std = scaler.transform(df_all[feature_names])
all_logits = all_std @ coef_matrix.T + intercepts
# Subtracting the row-wise maximum leaves the softmax unchanged but prevents
# overflow in the exponential.
all_logits -= all_logits.max(axis=1, keepdims=True)
all_probs = np.exp(all_logits)
all_probs /= all_probs.sum(axis=1, keepdims=True)
best_idx = all_probs.argmax(axis=1)

# The treatment indicator columns are the same for every row; find them once.
treatment_cols = [col for col in df_all.columns if col.startswith("systemicTreatmentPlan_")]

treatment_comparisons = []

# `pos` is the row position in the probability matrix; it does not depend on
# the index labels of df_all.
for pos, (idx, row) in enumerate(df_all.iterrows()):
    source_id = row["sourceId"]

    # Indicator values are compared as text so 1, 1.0 and True all count.
    actual_treatments = [
        col.replace("systemicTreatmentPlan_", "")
        for col in treatment_cols
        if str(row[col]).strip() in {"1", "1.0", "True", "true"}
    ]
    actual_treatment_str = " + ".join(actual_treatments) if actual_treatments else "None"

    top_class = classes[best_idx[pos]]
    top_prob = all_probs[pos, best_idx[pos]]

    # Compare as sets of component names so that separator and ordering
    # differences between the two labels do not matter.
    predicted_set = normalize_treatment_string(top_class)
    actual_set = normalize_treatment_string(actual_treatment_str)

    is_correct = predicted_set == actual_set

    treatment_comparisons.append({
        "sourceId": source_id,
        "predicted_treatment": top_class,
        "predicted_probability": round(top_prob, 3),
        "actual_treatment": actual_treatment_str,
        "correct_match": is_correct
    })

df_treatment_comparison = pd.DataFrame(treatment_comparisons)

accuracy = df_treatment_comparison["correct_match"].mean()
print(f"Correct treatment match accuracy: {accuracy:.2%}")

mismatches = df_treatment_comparison[~df_treatment_comparison["correct_match"]]
display(mismatches.head(100))


# Propensity overlap

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One figure per treatment: density of that treatment's propensity score among
# the patients who received it versus all other patients.
for label in treatment_classes:
    score_col = f"propensity_{label}"
    received = df_all["actual_treatment"] == label
    sns.kdeplot(df_all.loc[received, score_col], label=f"{label} - treated")
    sns.kdeplot(df_all.loc[~received, score_col], label=f"{label} - others")
    plt.title(f"Propensity Overlap for {label}")
    plt.legend()
    plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt


# Bring both label columns to the same spelling before cross-tabulating:
# actual treatments are joined with " + ", predicted ones with ", ".
# The separator is replaced as a literal string (regex=False). Depending on
# the pandas version, the pattern is otherwise treated either as a regular
# expression or as literal text, and the previous escaped pattern only matched
# in regex mode.
df_treatment_comparison['actual_treatment'] = (
    df_treatment_comparison['actual_treatment']
    .str.replace(" + ", ", ", regex=False)
    .str.title()
)
df_treatment_comparison['predicted_treatment'] = df_treatment_comparison['predicted_treatment'].str.title()


# Row-normalised: for every predicted treatment, the percentage of each
# actual treatment among the patients with that prediction.
cross_tab = pd.crosstab(
    df_treatment_comparison['predicted_treatment'],
    df_treatment_comparison['actual_treatment'],
    normalize='index'
) * 100


ax = cross_tab.plot(kind='bar', stacked=True, figsize=(12, 6))
plt.ylabel('Percentage of Actual Treatments')
plt.title('Distribution of Actual Treatments per Predicted Treatment')
# Legend placed outside the axes so it does not cover the bars.
plt.legend(title='Actual Treatment', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
