In [None]:
import pandas as pd 
import pymysql

In [None]:
# Load the complete knownPalliativeTreatments table into a DataFrame.
# Connection settings are taken from the 'RAnalysis' group of the MySQL
# option file passed as read_default_file; only the schema name is set here.
db_connection = pymysql.connect(
    read_default_file='/home/jupyter/.my.cnf',
    read_default_group='RAnalysis',
    db = 'actin_personalization'
)

query = "SELECT * FROM knownPalliativeTreatments"

try:
    df = pd.read_sql(query, db_connection)
finally:
    # Release the connection even when the query fails, so a failed or
    # re-run cell does not leave an open session behind.
    db_connection.close()

# Show the available column names as the cell output.
list(df.columns)

In [None]:
from lifelines import CoxPHFitter
# Complete-case subset used by the first pair of models: a row is dropped
# when any of the columns listed below is missing.
df_clean = df.dropna(
    axis="index",
    how="any",
    subset=[
        # duration and event columns read by cox_model
        'observedPfsDays',
        'hadProgressionEvent',
        # covariates that appear in the model formulas
        'consolidatedTumorType',
        'ageAtDiagnosis',
        'whoStatusPreTreatmentStart',
        'tumorIncidenceYear',
    ],
)

def cox_model(df, formula):
    """Fit a Cox proportional-hazards model on ``df`` and return the fitter.

    Args:
        df: DataFrame holding the duration column ``observedPfsDays``, the
            event column ``hadProgressionEvent``, the stratification column
            ``systemicTreatmentPlan`` and every column named in ``formula``.
        formula: Right-hand-side model formula, e.g. ``"a + b"``.

    Returns:
        Whatever ``CoxPHFitter.fit`` returns (the fitted estimator).
    """
    fit_options = {
        "duration_col": "observedPfsDays",
        "event_col": "hadProgressionEvent",
        "formula": formula,
        # Stratify by treatment plan rather than modelling it as a covariate.
        "strata": ["systemicTreatmentPlan"],
    }
    fitter = CoxPHFitter()
    return fitter.fit(df, **fit_options)

# Full model: tumour type on top of the three base covariates.
full_model = cox_model(
    df=df_clean,
    formula="consolidatedTumorType + ageAtDiagnosis + whoStatusPreTreatmentStart + tumorIncidenceYear",
)
full_model.print_summary()

In [None]:
# Reduced model: the same rows as the full model, without consolidatedTumorType.
reduced_model = cox_model(
    df=df_clean,
    formula="ageAtDiagnosis + whoStatusPreTreatmentStart + tumorIncidenceYear",
)
reduced_model.print_summary()

In [None]:
def likelihood_ratio(llmin, llmax):
    """Return the likelihood-ratio statistic ``2 * (llmax - llmin)``.

    Args:
        llmin: Log-likelihood of the smaller (reduced) model.
        llmax: Log-likelihood of the larger (full) model.

    Returns:
        Twice the difference between the two log-likelihoods.
    """
    improvement = llmax - llmin
    return 2 * improvement

In [None]:
from dataclasses import dataclass
from math import log10

@dataclass
class TestResult:
    """Outcome of one single-feature likelihood-ratio test (built by test_feature)."""
    # Name of the column that was added to the base formula.
    feature: str
    # First entry of the full model's hazard_ratios_ (printed as "exp(coeff)").
    survival_impact: float
    # Chi-squared survival-function value of the likelihood-ratio statistic.
    p_val: float
    # Degrees of freedom used for the chi-squared test.
    dof_delta: int
    # Likelihood-ratio statistic: 2 * (full log-likelihood - reduced log-likelihood).
    log_likelihood_ratio: float

# def is_complete(feature_result):
#     test_result = feature_result[1]
#     return len(test_result) > 3 and test_result[3] == 'OK' # and test_result[1] < 0.05

# complete_results = list(map(lambda kv: TestResult(kv[0], kv[1][2], kv[1][1], -log10(kv[1][1])), dict(filter(is_complete, result.items())).items()))

# result_df = pd.DataFrame(complete_results)

# result_df[result_df["p_val"] < 0.1]

In [None]:
from scipy.stats.distributions import chi2
from sklearn.preprocessing import StandardScaler

# Complete cases for the outcome columns and the three base covariates only;
# missing values in the candidate features are handled per feature in
# test_feature. The copy keeps the scaling below from touching `df`.
base_df = df.dropna(subset = [
    'observedPfsDays',
    'hadProgressionEvent',
    'ageAtDiagnosis',
    'whoStatusPreTreatmentStart',
    'tumorIncidenceYear'
]).copy()


# NOTE: `features` is defined in a later cell of this notebook. On a fresh
# kernel that cell has to be executed before this one, otherwise the list
# below raises NameError.
scaler = StandardScaler()
cols_to_standardize = [
    f
    for f in (['ageAtDiagnosis', 'whoStatusPreTreatmentStart', 'tumorIncidenceYear'] + features)
    if pd.api.types.is_numeric_dtype(base_df[f])
]

# Replace the columns wholesale instead of writing through .loc[:, cols]:
# the scaled values are floats, and setting floats in place into int/bool
# columns relies on implicit upcasting, which pandas >= 2.1 deprecates.
base_df[cols_to_standardize] = scaler.fit_transform(base_df[cols_to_standardize])

def test_feature(df, simple_formula, feature):
    """Likelihood-ratio test for adding ``feature`` to ``simple_formula``.

    Rows where ``feature`` is missing are dropped, then a reduced model
    (``simple_formula``) and a full model (``feature + simple_formula``) are
    fitted on the same rows with ``cox_model``. The statistic from
    ``likelihood_ratio`` is compared against a chi-squared distribution.

    Args:
        df: DataFrame accepted by ``cox_model`` that also contains ``feature``.
        simple_formula: Formula of the reduced model.
        feature: Column name to add to the formula.

    Returns:
        A ``TestResult``, or ``None`` when anything in the procedure raises
        (the error is printed, not re-raised, so a scan over many features
        keeps going).
    """
    try:
        df_clean = df.dropna(subset = [feature])
        reduced_model = cox_model(df_clean, simple_formula)
        full_model = cox_model(df_clean, " + ".join([feature, simple_formula]))
        # Numeric columns are counted as one degree of freedom; any other
        # column as one per distinct non-missing value beyond the first.
        dof = 1 if pd.api.types.is_numeric_dtype(df_clean[feature]) else df_clean[feature].nunique() - 1
        log_likelihood_ratio = likelihood_ratio(reduced_model.log_likelihood_, full_model.log_likelihood_)
        p = chi2.sf(log_likelihood_ratio, dof)
        # hazard_ratios_ is indexed by covariate name, so take the first entry
        # by position with .iloc; a bare [0] on a label-indexed Series is
        # deprecated in pandas and would end up in the except branch below.
        # NOTE(review): presumably the first coefficient belongs to `feature`
        # because it is listed first in the formula; for a multi-level feature
        # this is the hazard ratio of a single level only -- confirm.
        first_hazard_ratio = full_model.hazard_ratios_.iloc[0]
        print(f"Testing feature {feature}: p-value {p}, exp(coeff) {first_hazard_ratio}")
        return TestResult(feature, first_hazard_ratio, p, dof, log_likelihood_ratio)
    except Exception as e:
        # Deliberate best effort: report the failure and let the caller skip it.
        print(f"Failed to test {feature}: {str(e)}")
        return None

# Single-feature check: test consolidatedTumorType on top of the base covariates.
test_feature(
    base_df,
    simple_formula="ageAtDiagnosis + whoStatusPreTreatmentStart + tumorIncidenceYear",
    feature="consolidatedTumorType",
)

In [None]:
# Candidate columns for the single-feature scan: each name is passed to
# test_feature together with base_formula at the end of this cell. The list
# is also read by the cell that builds base_df to decide which columns to
# standardise. Entries that are commented out are excluded from the scan;
# the reason is not recorded here.
features = [
     'sex',
     'consolidatedTumorType',
     'hasHadPriorTumor',
     'cci',
     'cciNumberOfCategories',
     'cciHasAids',
     'cciHasCongestiveHeartFailure',
     'cciHasCollagenosis',
     'cciHasCopd',
     'cciHasCerebrovascularDisease',
     'cciHasDementia',
     'cciHasDiabetesMellitus',
     'cciHasDiabetesMellitusWithEndOrganDamage',
     'cciHasOtherMalignancy',
     'cciHasOtherMetastaticSolidTumor',
     'cciHasMyocardialInfarct',
     'cciHasMildLiverDisease',
     #'cciHasHemiplegiaOrParaplegia',
     'cciHasPeripheralVascularDisease',
     'cciHasRenalDisease',
     'cciHasLiverDisease',
     'cciHasUlcerDisease',
     'presentedWithIleus',
     'presentedWithPerforation',
     'anorectalVergeDistanceCategory',
     'hasMsi',
     'hasBrafMutation',
     'hasBrafV600EMutation',
     'hasRasMutation',
     'hasKrasG12CMutation',
     'asaClassificationPreSurgeryOrEndoscopy',
     'tumorBasisOfDiagnosis',
     'tumorLocation',
     'tumorDifferentiationGrade',
     'tnmCT',
     'tnmCN',
     'tnmCM',
     #'tnmPT',
     'tnmPN',
     'tnmPM',
     'stageCTNM',
     'stagePTNM',
     'stageTNM',
     'investigatedLymphNodesNumber',
     'positiveLymphNodesNumber',
     'distantMetastasesDetectionStatus',
     'numberOfLiverMetastases',
     'maximumSizeOfLiverMetastasisMm',
     'hasDoublePrimaryTumor',
     'mesorectalFasciaIsClear',
     'distanceToMesorectalFasciaMm',
     'venousInvasionDescription',
     'lymphaticInvasionCategory',
     'extraMuralInvasionCategory',
     'tumorRegression',
     'hasParticipatedInTrial',
     'metastasesSurgeries',
     'radiotherapies',
     #'metastasesRadiotherapies',
     #'response',
     'metastasisLocationGroupsPriorToSystemicTreatment'
]
# Covariates shared by every reduced model in the scan.
base_formula = "ageAtDiagnosis + whoStatusPreTreatmentStart + tumorIncidenceYear"

# Run the likelihood-ratio test once per candidate feature and keep only the
# tests that produced a result (test_feature returns None on failure).
result = [
    r
    for f in features
    if (r := test_feature(df=base_df, simple_formula=base_formula, feature=f)) is not None
]
result

In [None]:
# Number of features whose test completed (failed tests were filtered out of `result`).
len(result)

In [None]:
from math import log10

def is_complete(feature_result):
    """Tell whether a ``(key, record)`` pair holds a completed test record.

    Args:
        feature_result: Indexable pair whose element ``1`` is a sized,
            indexable record.

    Returns:
        ``True`` when the record has more than three entries and its entry
        at index 3 equals ``'OK'``; ``False`` otherwise.

    NOTE(review): this shape looks like an earlier dict-of-tuples result
    format rather than the TestResult records produced by test_feature --
    confirm whether this helper is still needed.
    """
    record = feature_result[1]
    if len(record) <= 3:
        return False
    # A p-value cut-off (record[1] < 0.05) was considered here but is not applied.
    return record[3] == 'OK'

# `result` is a list of TestResult records (failed tests were already dropped
# when it was built), not a dict, so read the fields by attribute instead of
# calling .items() and indexing tuples.
complete_results = [r for r in result if r is not None]

result_df = pd.DataFrame(data={
    "feature": [r.feature for r in complete_results],
    "survival_impact": [r.survival_impact for r in complete_results],
    # Raw p-value, kept so later cells can filter on it directly.
    "p_val": [r.p_val for r in complete_results],
    # -log10(p); a p-value that underflowed to 0 maps to +inf rather than
    # raising "math domain error".
    "log10_p": [-log10(r.p_val) if r.p_val != 0 else float("inf") for r in complete_results],
})

result_df

In [None]:
import seaborn

# Plot the features with p < 0.1. The filter uses the log10_p column
# (-log10(p) > 1 is the same cut), which result_df is always built with, and
# passes the frame as `data=` so the call does not depend on seaborn
# accepting it positionally.
seaborn.scatterplot(
    data=result_df[result_df["log10_p"] > 1],
    x="survival_impact",
    y="log10_p",
)