In [None]:
import pandas as pd 
import pymysql

In [None]:
# Connection settings that are not passed explicitly are read from the
# [RAnalysis] group of the MySQL option file.
db_connection = pymysql.connect(
    read_default_file='/home/jupyter/.my.cnf',
    read_default_group='RAnalysis',
    db = 'actin_personalization'
)

query = "SELECT * FROM knownPalliativeTreatments"

# Close the connection even when the query raises, so that a failed run does
# not leave an open connection behind in the kernel.
try:
    df = pd.read_sql(query, db_connection)
finally:
    db_connection.close()

# list(df.columns)

In [None]:
from lifelines import CoxPHFitter

def cox_model(df, formula):
    """Fit a Cox proportional-hazards model on ``df``.

    The duration column is ``observedPfsDays``, the event column is
    ``hadProgressionEvent`` and the fit is stratified on
    ``systemicTreatmentPlan``; all three columns must be present in ``df``.

    Args:
        df: Data frame handed to ``CoxPHFitter.fit``.
        formula: Right-hand-side formula string naming the covariates.

    Returns:
        The value returned by ``CoxPHFitter.fit``.
    """
    fit_options = dict(
        duration_col="observedPfsDays",
        event_col="hadProgressionEvent",
        formula=formula,
        strata=["systemicTreatmentPlan"],
    )
    fitter = CoxPHFitter()
    return fitter.fit(df, **fit_options)

In [None]:
from numpy import nan

# Keep only the rows in which the outcome columns and the three covariates of
# the base formula are all present. The copy gives base_df its own data, so
# later column assignments do not touch df.
required_columns = [
    'observedPfsDays',
    'hadProgressionEvent',
    'ageAtDiagnosis',
    'whoStatusPreTreatmentStart',
    'tumorIncidenceYear',
]
base_df = df.dropna(subset=required_columns).copy()

# Numeric encoding of the overall TNM stage codes. The integer part is the
# main stage and the fractional part separates the sub-stages; "NA" and "X"
# become missing values.
# NOTE(review): "M" is encoded with the same value as "IV" (4.0) — confirm
# that this is the intended meaning of that code.
stageTnm_lookup = {
    "ZERO": 0.0,
    "I": 1.0,
    "IA1": 1.1,
    "IA": 1.15,
    "IA2": 1.2,
    "IA3": 1.25,
    "IB": 1.4,
    "II": 2.0,
    "IIA": 2.1,
    "IIB": 2.2,
    "IIC": 2.3,
    "III": 3.0,
    "IIIA": 3.1,
    "IIIB": 3.2,
    "IIIC": 3.3,
    "IV": 4.0,
    "IVA": 4.1,
    "IVB": 4.2,
    "IVC": 4.3,
    "M": 4.0,
    "NA": nan,
    "X": nan,
}

# Numeric encoding of the M component; "M_MINUS" is encoded like "M0" and "X"
# becomes a missing value.
tnmM_lookup = {
    "M0": 0,
    "M1": 1,
    "M1A": 1.1,
    "M1B": 1.2,
    "M1C": 1.3,
    "M_MINUS": 0,
    "X": nan,
}

# Numeric encoding of the N component; "N1M" is encoded like "N1" and "X"
# becomes a missing value.
tnmN_lookup = {
    "N0": 0,
    "N1": 1,
    "N1A": 1.1,
    "N1B": 1.2,
    "N1C": 1.3,
    "N1M": 1,
    "N2": 2,
    "N2A": 2.1,
    "N2B": 2.2,
    "X": nan
}

# Numeric encoding of the T component; "X" becomes a missing value.
# NOTE(review): there is an entry for "T4A" and "T4B" but none for a plain
# "T4". Codes without an entry are turned into None by the lookup.get call in
# the encoding loop below — confirm that no such codes occur in the data.
tnmT_lookup = {
    "T0": 0,
    "T_IS": 0.5,
    "T1": 1,
    "T2": 2,
    "T3": 3,
    "T4A": 4.1,
    "T4B": 4.2,
    "X": nan
}

# Maps a column name to the table that converts its category codes to numbers.
# The clinical/pathological TNM columns share the tables defined above.
lookup_dictionary = {
    # Each distance band is encoded by a single number (2.5, 7.5, 12.5, 17.5).
    "anorectalVergeDistanceCategory": {
        "ZERO_TO_FIVE_CM": 2.5,
        "FIVE_TO_TEN_CM": 7.5,
        "TEN_TO_FIFTEEN_CM": 12.5,
        "OVER_FIFTEEN_CM": 17.5,
    },
    "cciNumberOfCategories": {
        "ZERO_CATEGORIES": 0,
        "ONE_CATEGORY": 1,
        "TWO_OR_MORE_CATEGORIES": 2,
    },
    # NOTE(review): "MULTIPLE_BUT_EXACT_NUMBER_UNKNOWN" is encoded as 3, the
    # same value as "THREE" — confirm this choice.
    "numberOfLiverMetastases": {
        "ONE": 1,
        "TWO": 2,
        "THREE": 3,
        "FOUR": 4,
        "FIVE_OR_MORE": 5,
        "MULTIPLE_BUT_EXACT_NUMBER_UNKNOWN": 3
    },
    "asaClassificationPreSurgeryOrEndoscopy": {
        "I": 1,
        "II": 2,
        "III": 3,
        "IV": 4,
        "V": 5,
        "VI": 6,
    },
    "venousInvasionDescription": {  # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1769571/
        "EXTRAMURAL": 1,
        "INTRAMURAL": 1,
        "NA": nan,
        "NONE": 0,
        "SUSPECT": 0.7,
    },
    "lymphaticInvasionCategory": {
        "NONE": 0,
        "PRESENT": 1,
        "SUSPECT": 0.7,
        "NA": nan
    },
    # NOTE(review): "NA" is encoded as 0 here, whereas the other tables in
    # this file encode "NA" as nan — confirm that the difference is deliberate.
    "extraMuralInvasionCategory": {
        "NA": 0,
        "LESS_THAN_FIVE_MM": 3,
        "ABOVE_FIVE_MM": 7,
    },
    "tumorRegression": {
        "CANNOT_BE_DETERMINED": nan,
        "FULL_REGRESSION": 1,
        "MINIMAL_FOCI": 0.8,  # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4946373/
        "MINIMAL_REGRESSION": 0.2,
        "MODERATE_REGRESSION": 0.5,
        "NO_SIGNS_OF_REGRESSION": 0,
        "NA": nan,
    },
    "tumorDifferentiationGrade": {
        "GRADE_1_OR_WELL_DIFFERENTIATED": 1,
        "GRADE_2_OR_MODERATELY_DIFFERENTIATED": 2,
        "GRADE_3_OR_POORLY_DIFFERENTIATED": 3,
        "GRADE_4_OR_UNDIFFERENTIATED_OR_ANAPLASTIC_OR_GGG4": 4
    },
    "tnmCT": tnmT_lookup,
    "tnmPT": tnmT_lookup,
    "tnmCN": tnmN_lookup,
    "tnmPN": tnmN_lookup,
    "tnmCM": tnmM_lookup,
    "tnmPM": tnmM_lookup,
    "stageCTNM": stageTnm_lookup,
    "stagePTNM": stageTnm_lookup,
    "stageTNM": stageTnm_lookup,
}

# Replace every column named in lookup_dictionary by its numeric encoding.
# dict.get returns None for a value that has no entry in the column's table.
for column in lookup_dictionary:
    base_df[column] = base_df[column].apply(lookup_dictionary[column].get)

In [None]:
def likelihood_ratio(llmin, llmax):
    """Return twice the difference between two log-likelihoods.

    Args:
        llmin: Log-likelihood that is subtracted (the reduced model in
            ``test_feature``).
        llmax: Log-likelihood that is subtracted from (the full model in
            ``test_feature``).

    Returns:
        ``2 * (llmax - llmin)``.
    """
    gap = llmax - llmin
    return 2 * gap

In [None]:
from dataclasses import dataclass
from math import log10

@dataclass
class TestResult:
    """Result of testing one candidate feature, as produced by ``test_feature``."""

    # Name of the tested column.
    feature: str
    # First entry of the full model's hazard_ratios_ (exp(coefficient)).
    survival_impact: float
    # chi2.sf(log_likelihood_ratio, dof_delta).
    p_val: float
    # Degrees of freedom used for the chi-square test.
    dof_delta: int
    # 2 * (log-likelihood of the full model - log-likelihood of the reduced model).
    log_likelihood_ratio: float


In [None]:
# Candidate columns that are tested one at a time with test_feature and that
# are used for the correlation analysis further down. Entries that are
# commented out are excluded from both.
features = [
     'sex',
     'consolidatedTumorType',
     'hasHadPriorTumor',
     'cci',
     'cciNumberOfCategories',
     'cciHasAids',
     'cciHasCongestiveHeartFailure',
     'cciHasCollagenosis',
     'cciHasCopd',
     'cciHasCerebrovascularDisease',
     'cciHasDementia',
     'cciHasDiabetesMellitus',
     'cciHasDiabetesMellitusWithEndOrganDamage',
     'cciHasOtherMalignancy',
     'cciHasOtherMetastaticSolidTumor',
     'cciHasMyocardialInfarct',
     'cciHasMildLiverDisease',
     #'cciHasHemiplegiaOrParaplegia',
     'cciHasPeripheralVascularDisease',
     'cciHasRenalDisease',
     'cciHasLiverDisease',
     'cciHasUlcerDisease',
     'presentedWithIleus',
     'presentedWithPerforation',
     'anorectalVergeDistanceCategory',
     'hasMsi',
     'hasBrafMutation',
     'hasBrafV600EMutation',
     'hasRasMutation',
     'hasKrasG12CMutation',
     'asaClassificationPreSurgeryOrEndoscopy',
     'tumorBasisOfDiagnosis',
     'tumorLocation',
     'tumorDifferentiationGrade',
     'tnmCT',
     'tnmCN',
     'tnmCM',
     #'tnmPT',
     'tnmPN',
     'tnmPM',
     'stageCTNM',
     'stagePTNM',
     'stageTNM',
     'investigatedLymphNodesNumber',
     'positiveLymphNodesNumber',
     'distantMetastasesDetectionStatus',
     'numberOfLiverMetastases',
     'maximumSizeOfLiverMetastasisMm',
     'hasDoublePrimaryTumor',
     'mesorectalFasciaIsClear',
     'distanceToMesorectalFasciaMm',
     'venousInvasionDescription',
     'lymphaticInvasionCategory',
     'extraMuralInvasionCategory',
     'tumorRegression',
     'hasParticipatedInTrial',
     'metastasesSurgeries',
     'radiotherapies',
     #'metastasesRadiotherapies',
     #'response',
     'metastasisLocationGroupsPriorToSystemicTreatment'
]

In [None]:
from scipy.stats.distributions import chi2
from sklearn.preprocessing import StandardScaler


# Standardise every numeric covariate (the three base covariates plus the
# numeric candidate features) with StandardScaler.
# NOTE(review): is_numeric_dtype presumably also accepts bool columns, so
# boolean flags would be standardised as well — confirm that this is wanted.
scaler = StandardScaler()
cols_to_standardize = [f for f in (['ageAtDiagnosis', 'whoStatusPreTreatmentStart', 'tumorIncidenceYear'] + features) if pd.api.types.is_numeric_dtype(base_df[f])]

# Replace the columns instead of writing through .loc[:, cols]: the scaled
# values are floats, and setting floats in place into integer or boolean
# columns is deprecated in recent pandas versions.
base_df[cols_to_standardize] = scaler.fit_transform(base_df[cols_to_standardize])

def test_feature(df, simple_formula, feature):
    try:
        df_clean = df.dropna(subset = [feature])
        reduced_model = cox_model(df_clean, simple_formula)
        full_model = cox_model(df_clean, " + ".join([feature, simple_formula]))
        dof = 1 if pd.api.types.is_numeric_dtype(df[feature]) else df[feature].nunique() - 1
        log_likelihood_ratio = likelihood_ratio(reduced_model.log_likelihood_, full_model.log_likelihood_)
        p = chi2.sf(log_likelihood_ratio, dof)
        print(f"Testing feature {feature}: p-value {p}, exp(coeff) {full_model.hazard_ratios_[0]}")
        return TestResult(feature, full_model.hazard_ratios_[0], p, dof, log_likelihood_ratio)
    except Exception as e:
        print(f"Failed to test {feature}: {str(e)}")
        return None

# test_feature(base_df, "ageAtDiagnosis + whoStatusPreTreatmentStart + tumorIncidenceYear", "consolidatedTumorType")

In [None]:
# Test every candidate feature against the same baseline formula and keep
# only the tests that produced a result (test_feature returns None on failure).
base_formula = "ageAtDiagnosis + whoStatusPreTreatmentStart + tumorIncidenceYear"
result = []
for candidate in features:
    outcome = test_feature(base_df, base_formula, candidate)
    if outcome is not None:
        result.append(outcome)

In [None]:
from math import log10

# One row per successfully tested feature, with -log10(p) added for plotting.
result_df = pd.DataFrame(result)
# A p-value of exactly 0.0 (floating-point underflow for a very large test
# statistic) would make math.log10 raise ValueError; report +inf instead.
result_df["log10_p"] = result_df["p_val"].apply(lambda p: float("inf") if p == 0 else -log10(p))
result_df.tail()

In [None]:
# Features whose likelihood-ratio test has a p-value below 0.05.
result_df.loc[result_df["p_val"] < 0.05]

In [None]:
import seaborn

# Hazard ratio against -log10(p) for the features with p < 0.05. The frame is
# passed as data= because seaborn releases before 0.12 do not accept it as the
# first positional argument.
seaborn.scatterplot(data=result_df[result_df["p_val"] < 0.05], x="survival_impact", y="log10_p")

In [None]:
# Pearson correlation between the candidate features. Numeric and boolean
# columns are selected explicitly: since pandas 2.0 DataFrame.corr no longer
# drops non-numeric columns by default and raises on them instead.
correlation = base_df[features].select_dtypes(include=['number', 'bool']).corr('pearson')

In [None]:
# Heatmap of the feature correlation matrix in its original column order.
seaborn.heatmap(correlation)

In [None]:
# Frequency of each distinct value in the tumorDifferentiationGrade column of base_df.
base_df['tumorDifferentiationGrade'].value_counts()

In [None]:
from numpy import argsort
import scipy.cluster.hierarchy as sch

def cluster_corr(corr_df):
    """Reorder a correlation matrix so that clustered variables are adjacent.

    The rows of ``corr_df`` are clustered hierarchically (complete linkage on
    the pairwise distances between rows). The tree is cut at half of the
    largest pairwise distance and the rows and columns are sorted by the
    resulting cluster label.

    Args:
        corr_df: Square data frame without missing values.

    Returns:
        A new, reordered (and transposed) data frame; ``corr_df`` itself is
        left unchanged.
    """
    condensed = sch.distance.pdist(corr_df)
    tree = sch.linkage(condensed, method='complete')
    cutoff = condensed.max() / 2
    labels = sch.fcluster(tree, cutoff, criterion='distance')
    order = argsort(labels)

    # Selecting rows and columns together and transposing yields the same
    # frame as reordering the rows, transposing and reordering the rows again.
    return corr_df.iloc[order, order].T

In [None]:
# Missing correlations are replaced by 0 before the matrix is clustered.
clustered_corr = cluster_corr(correlation.fillna(0))
clustered_corr.head()

In [None]:
# Manual re-ordering of the clustered matrix by row/column position.
# NOTE(review): the index list holds positions 0-2, 25, 4-23 and 26 onwards,
# so positions 3 and 24 are left out and the two variables at those positions
# do not appear in corrected_corr — confirm that this is intended.
# NOTE(review): the positions are hard-coded, so they presumably match one
# particular clustering result — re-check them whenever the data or the
# feature list changes.
idx = list(range(0, 3)) + [25] + list(range(4, 24)) + list(range(26, len(clustered_corr)))
corrected_corr = clustered_corr.copy().iloc[idx, :].T.iloc[idx, :]

In [None]:
# Heatmap of the re-ordered correlation matrix with a tick label for every
# row and column, on a 12 x 8.5 inch figure.
heatmap = seaborn.heatmap(corrected_corr, xticklabels=True, yticklabels=True)
heatmap.figure.set_size_inches(12, 8.5)

In [None]:
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import pdist

# Single-linkage hierarchical clustering on correlation distance, computed on
# the numeric feature columns with missing values replaced by 0.
# NOTE(review): pdist measures the distances between the ROWS of its input,
# and the rows of base_df[features] look like individual records rather than
# features — if a dendrogram of the features was intended, the frame would
# need to be transposed first. Confirm the intent.
Y = pdist(base_df[features].select_dtypes('number').fillna(0), 'correlation')
# Y is already a condensed distance matrix, so the metric argument passed to
# linkage is not used to recompute distances.
Z = linkage(Y, 'single', 'correlation')
dendrogram(Z, color_threshold=0)