In [None]:
import pandas as pd 
import pymysql

In [None]:
# Load the whole knownPalliativeTreatments table into a DataFrame.
# Connection settings are read from the 'RAnalysis' group of the option file,
# so no credentials are passed explicitly here.
db_connection = pymysql.connect(
    read_default_file='/home/jupyter/.my.cnf',
    read_default_group='RAnalysis',
    database='actin_personalization'  # `db` is a deprecated alias of `database`
)

query = "SELECT * FROM knownPalliativeTreatments"

# Close the connection even when the query raises, so a failed or interrupted
# run of this cell does not leave a connection open.
try:
    df = pd.read_sql(query, db_connection)
finally:
    db_connection.close()

list(df.columns)

In [None]:
# Count how often each value of metastasisLocationGroupsPriorToSystemicTreatment occurs.
df['metastasisLocationGroupsPriorToSystemicTreatment'].value_counts()

In [None]:
from lifelines import CoxPHFitter
# Drop every row that lacks the survival outcome or one of the covariates
# listed here.
full_model_required_columns = [
    'observedPfsDays',
    'hadProgressionEvent',
    'consolidatedTumorType',
    'ageAtDiagnosis',
    'whoStatusPreTreatmentStart',
    'tumorIncidenceYear',
]
df_clean = df.dropna(subset=full_model_required_columns)

def cox_model(df, formula):
    """Fit a Cox proportional-hazards model stratified by systemicTreatmentPlan.

    Args:
        df: DataFrame holding ``observedPfsDays`` (duration),
            ``hadProgressionEvent`` (event indicator), ``systemicTreatmentPlan``
            (strata) and the columns referenced by ``formula``.
        formula: Formula string naming the covariates to include.

    Returns:
        The result of ``CoxPHFitter.fit`` for these settings.
    """
    fit_options = {
        "duration_col": "observedPfsDays",
        "event_col": "hadProgressionEvent",
        "formula": formula,
        "strata": ["systemicTreatmentPlan"],
    }
    fitter = CoxPHFitter()
    return fitter.fit(df, **fit_options)

# Model that includes consolidatedTumorType alongside the other three covariates.
full_formula = "consolidatedTumorType + ageAtDiagnosis + whoStatusPreTreatmentStart + tumorIncidenceYear"
full_model = cox_model(df_clean, full_formula)
full_model.print_summary()

In [None]:
# Same rows as df_clean, but fitted without consolidatedTumorType.
reduced_formula = "ageAtDiagnosis + whoStatusPreTreatmentStart + tumorIncidenceYear"
reduced_model = cox_model(df_clean, reduced_formula)

reduced_model.print_summary()

In [None]:
from scipy.stats.distributions import chi2
def likelihood_ratio(llmin, llmax):
    """Return the likelihood-ratio statistic ``2 * (llmax - llmin)``.

    Args:
        llmin: Log-likelihood that is subtracted (the callers in this notebook
            pass the reduced model's value).
        llmax: Log-likelihood that is subtracted from (the callers in this
            notebook pass the full model's value).

    Returns:
        Twice the difference between the two log-likelihoods.
    """
    log_likelihood_gain = llmax - llmin
    return 2 * log_likelihood_gain

# Likelihood-ratio test of the full model against the reduced model.
LR = likelihood_ratio(reduced_model.log_likelihood_, full_model.log_likelihood_)

# Degrees of freedom = number of coefficients the full model has on top of the
# reduced one. Derived from the fitted models instead of being hard-coded, so
# it stays correct when the number of consolidatedTumorType levels changes.
extra_dof = len(full_model.params_) - len(reduced_model.params_)

p = chi2.sf(LR, extra_dof)

p

In [None]:
# Check whether pandas treats the consolidatedTumorType column as a string dtype.
tumor_type_column = df['consolidatedTumorType']
pd.api.types.is_string_dtype(tumor_type_column)

In [None]:
# Rows usable by the baseline model: the outcome columns and the three
# baseline covariates must all be present.
baseline_required_columns = [
    'observedPfsDays',
    'hadProgressionEvent',
    'ageAtDiagnosis',
    'whoStatusPreTreatmentStart',
    'tumorIncidenceYear',
]
base_df = df.dropna(subset=baseline_required_columns)

def test_feature(df, simple_formula, feature):
    try:
        df_clean = df.dropna(subset = [feature])
        reduced_model = cox_model(df_clean, simple_formula)
        full_model = cox_model(df_clean, " + ".join([feature, simple_formula]))
        dof = 1 if pd.api.types.is_numeric_dtype(df[feature]) else df[feature].nunique() - 1
        p = chi2.sf(likelihood_ratio(reduced_model.log_likelihood_, full_model.log_likelihood_), dof)
        print(f"Testing feature {feature}: p-value {p}, exp(coeff) {full_model.hazard_ratios_[0]}")
        return [dof, p, full_model.hazard_ratios_[0] , "OK"]
    except Exception as e:
        print(f"Failed to test {feature}: {str(e)}")
        return [0, 0, str(e)]

# test_feature(base_df, "ageAtDiagnosis + whoStatusPreTreatmentStart + tumorIncidenceYear", "consolidatedTumorType")

In [None]:
# Column names that are each passed, one at a time, to test_feature.
# Entries that are commented out are not tested.
# NOTE(review): the reason for excluding them is not recorded here — confirm
# before re-enabling.
features = [
     'sex',
     'consolidatedTumorType',
     'hasHadPriorTumor',
     'cci',
     'cciNumberOfCategories',
     'cciHasAids',
     'cciHasCongestiveHeartFailure',
     'cciHasCollagenosis',
     'cciHasCopd',
     'cciHasCerebrovascularDisease',
     'cciHasDementia',
     'cciHasDiabetesMellitus',
     'cciHasDiabetesMellitusWithEndOrganDamage',
     'cciHasOtherMalignancy',
     'cciHasOtherMetastaticSolidTumor',
     'cciHasMyocardialInfarct',
     'cciHasMildLiverDisease',
     #'cciHasHemiplegiaOrParaplegia',
     'cciHasPeripheralVascularDisease',
     'cciHasRenalDisease',
     'cciHasLiverDisease',
     'cciHasUlcerDisease',
     'presentedWithIleus',
     'presentedWithPerforation',
     'anorectalVergeDistanceCategory',
     'hasMsi',
     'hasBrafMutation',
     'hasBrafV600EMutation',
     'hasRasMutation',
     'hasKrasG12CMutation',
     'asaClassificationPreSurgeryOrEndoscopy',
     'tumorBasisOfDiagnosis',
     'tumorLocation',
     'tumorDifferentiationGrade',
     'tnmCT',
     'tnmCN',
     'tnmCM',
     #'tnmPT',
     'tnmPN',
     'tnmPM',
     'stageCTNM',
     'stagePTNM',
     'stageTNM',
     'investigatedLymphNodesNumber',
     'positiveLymphNodesNumber',
     'distantMetastasesDetectionStatus',
     'numberOfLiverMetastases',
     'maximumSizeOfLiverMetastasisMm',
     'hasDoublePrimaryTumor',
     'mesorectalFasciaIsClear',
     'distanceToMesorectalFasciaMm',
     'venousInvasionDescription',
     'lymphaticInvasionCategory',
     'extraMuralInvasionCategory',
     'tumorRegression',
     'hasParticipatedInTrial',
     'metastasesSurgeries',
     'radiotherapies',
     #'metastasesRadiotherapies',
     #'response',
     'metastasisLocationGroupsPriorToSystemicTreatment'
]

# Test every candidate feature against the same baseline formula and collect
# the outcomes keyed by feature name.
baseline_formula = "ageAtDiagnosis + whoStatusPreTreatmentStart + tumorIncidenceYear"
result = {}
for feature_name in features:
    result[feature_name] = test_feature(base_df, baseline_formula, feature_name)
result