# Analyzing Performance on HDP Dataset

In [None]:
# Basic Imports
import numpy as np
from scipy.stats import uniform, invwishart, matrix_normal, norm
from scipy.stats import multivariate_normal as mvn
import pandas as pd
from sklearn.metrics import mean_squared_error as mse
import matplotlib.pyplot as plt
import seaborn as sns
from time import localtime, strftime

# sklearn imports
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.tree import DecisionTreeRegressor as DTR
from sklearn.linear_model import LogisticRegression as LR
from sklearn.naive_bayes import GaussianNB as GNB

import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn.preprocessing import OneHotEncoder

In [None]:
# Load the HDP data and z-score tumorsize, the regression target used below.
HDP_data = pd.read_csv("https://stats.idre.ucla.edu/stat/data/hdp.csv")

tumorsize_mean = HDP_data["tumorsize"].mean()
tumorsize_std = HDP_data["tumorsize"].std()
HDP_data["tumorsize"] = (HDP_data["tumorsize"] - tumorsize_mean) / tumorsize_std

HDP_data.head()

In [None]:
# Discard the Experience, School and Lawsuits columns.
HDP_data = HDP_data.drop(columns=["Experience", "School", "Lawsuits"])

In [None]:
# Inspect the column labels of HDP_data.
HDP_data.columns

In [None]:
# Inspect the (rows, columns) dimensions of HDP_data.
HDP_data.shape

In [None]:
# One-hot encode the categorical columns; later cells refer to the generated
# indicator columns by name (e.g. FamilyHx_no, Sex_female, CancerStage_I).
HDP_data = pd.get_dummies(HDP_data)

In [None]:
# Report how many distinct doctor (DID) and hospital (HID) identifiers occur.
doctor_count = len(HDP_data["DID"].unique())
hospital_count = len(HDP_data["HID"].unique())

print("Number of unique Doctors:", doctor_count)
print("Number of unique Hospitals:", hospital_count)

In [None]:
# 5-fold cross validation over doctors: whole doctors are held out together.
# NOTE(review): assumes the DID values are exactly 1..num_groups -- confirm.

num_groups = len(HDP_data.DID.unique())

# Edge k is the first ID of fold k+1; the last edge is one past the final ID.
fold_edges = [k * num_groups // 5 + 1 for k in range(6)]

fold1, fold2, fold3, fold4, fold5 = (
    list(range(first_id, stop_id))
    for first_id, stop_id in zip(fold_edges, fold_edges[1:])
)

folds = [fold1, fold2, fold3, fold4, fold5]

# X_train = data[data["DID"] < last_group]
# X_test = data[data["DID"] >= last_group]

## Regular Decision Tree

In [None]:
# Baseline: a single regression tree, scored with doctor-grouped 5-fold CV.

excluded = ["DID", "HID", "tumorsize"]  # group IDs and the target are not predictors
tree_mses = []

for held_out_ids in folds:
    is_test = HDP_data["DID"].isin(held_out_ids)
    X_train, X_test = HDP_data[~is_test], HDP_data[is_test]

    tree = DTR(random_state=0).fit(X_train.drop(columns=excluded), X_train["tumorsize"])
    pred = tree.predict(X_test.drop(columns=excluded))
    tree_mses.append(mse(X_test["tumorsize"], pred))

print("Tree MSE:", np.mean(tree_mses))

## Regular Random Forest

In [None]:
# Baseline: an 81-tree random forest, scored with doctor-grouped 5-fold CV.

excluded = ["DID", "HID", "tumorsize"]  # group IDs and the target are not predictors
forest_mses = []

for held_out_ids in folds:
    is_test = HDP_data["DID"].isin(held_out_ids)
    X_train, X_test = HDP_data[~is_test], HDP_data[is_test]

    tree = RFR(random_state=0, n_estimators=81)
    tree.fit(X_train.drop(columns=excluded), X_train["tumorsize"])
    pred = tree.predict(X_test.drop(columns=excluded))
    forest_mses.append(mse(X_test["tumorsize"], pred))

print("Forest MSE:", np.mean(forest_mses))

# LMM

In [None]:
# Right-hand side of the mixed-model formula: every predictor joined by " + ".
fixed_effect_terms = [
    'co2', 'pain', 'wound', 'mobility', 'ntumors', 'nmorphine',
    'remission', 'lungcapacity', 'Age', 'Married', 'LengthofStay',
    'WBC', 'RBC', 'BMI', 'IL6', 'CRP', 'Medicaid',
    'FamilyHx_no', 'FamilyHx_yes',
    'SmokingHx_current', 'SmokingHx_former', 'SmokingHx_never',
    'Sex_female', 'Sex_male',
    'CancerStage_I', 'CancerStage_II', 'CancerStage_III', 'CancerStage_IV',
]
formula = " + ".join(fixed_effect_terms)

In [None]:

# Linear mixed model with a random intercept per doctor, scored with the same
# doctor-grouped 5-fold CV as the tree baselines.
# NOTE(review): `formula` lists every level of each one-hot encoded factor
# alongside the implicit intercept, so the fixed-effects design looks
# rank-deficient -- consider dropping one level per factor.
lmm_mses = []

for i in range(5):

    X_train = HDP_data[~HDP_data["DID"].isin(folds[i])]
    X_test = HDP_data[HDP_data["DID"].isin(folds[i])]

    # Passing `groups` with no re_formula gives a random intercept per doctor.
    # The lme4-style " ~ (1|DID)" is not understood by the formula layer:
    # "1|DID" is evaluated as a Python bitwise OR and becomes an extra random
    # slope that is constant within each doctor, i.e. redundant with the
    # random intercept.
    md = smf.mixedlm("tumorsize ~ " + formula, X_train, groups=X_train["DID"])
    mdf = md.fit()
    # Test doctors were never seen during fitting, so no fitted random effect
    # exists for them.
    pred = mdf.predict(X_test)
    lmm_mses.append(mse(X_test["tumorsize"], pred))

print("LMM MSE:", np.mean(lmm_mses))

## Sum of Trees

In [None]:

# Mixture of per-doctor trees, scored with doctor-grouped 5-fold CV.
# Every training doctor gets its own regression tree; a classifier's predicted
# doctor-membership probabilities weight the trees' predictions on test rows.
my_mses = []
id_and_target = ["DID", "HID", "tumorsize"]  # group IDs and the target are not predictors

for fold in folds:

    held_out = HDP_data["DID"].isin(fold)
    X_train = HDP_data[~held_out]
    X_test = HDP_data[held_out]
    train_features = X_train.drop(id_and_target, axis=1)
    test_features = X_test.drop(id_and_target, axis=1)

    # Build group classifier (alternatives considered: GNB, DTC, RFC).
    group_clf = LR()
    group_clf.fit(train_features, X_train["DID"])
    group_pred = group_clf.predict_proba(test_features)

    # All rows of one held-out doctor share that doctor's mean membership
    # probabilities. Iterate over the doctor IDs in the fold themselves;
    # range(len(fold)) would yield 0..k-1 and match the wrong doctors or none.
    for test_group in fold:
        rows = np.where(X_test["DID"] == test_group)[0]
        if rows.size:
            group_pred[rows] = group_pred[rows].mean(axis=0)

    # Mixture of Trees: one tree per training doctor, built in classes_ order
    # so that tree k lines up with column k of group_pred. This also removes
    # the hard-coded doctor count and skips IDs absent from the training data.
    list_of_trees = []
    for group in group_clf.classes_:
        in_group = X_train["DID"] == group
        tree = DTR()
        tree.fit(train_features[in_group], X_train.loc[in_group, "tumorsize"])
        list_of_trees.append(tree)

    # preds has shape (n_test_rows, n_trees) after the transpose.
    preds = np.array([tree.predict(test_features) for tree in list_of_trees]).T
    # Probability-weighted sum of the tree predictions for every test row.
    pred = np.sum(preds * group_pred, axis=1)

    my_mses.append(mse(X_test["tumorsize"], pred))

print("Mixture of Trees MSE:", np.mean(my_mses))


# HDP but on Hospital

In [None]:
# 5-fold cross validation over hospitals: whole hospitals are held out together.
# NOTE(review): assumes the HID values are exactly 1..num_groups -- confirm.

num_groups = len(HDP_data.HID.unique())

# Edge k is the first ID of fold k+1; the last edge is one past the final ID.
fold_edges = [k * num_groups // 5 + 1 for k in range(6)]

fold1, fold2, fold3, fold4, fold5 = (
    list(range(first_id, stop_id))
    for first_id, stop_id in zip(fold_edges, fold_edges[1:])
)

folds = [fold1, fold2, fold3, fold4, fold5]

## Regular Decision Tree

In [None]:
# Baseline: a single regression tree, scored with hospital-grouped 5-fold CV.

excluded = ["DID", "HID", "tumorsize"]  # group IDs and the target are not predictors
tree_mses = []

for held_out_ids in folds:
    is_test = HDP_data["HID"].isin(held_out_ids)
    X_train, X_test = HDP_data[~is_test], HDP_data[is_test]

    tree = DTR(random_state=0).fit(X_train.drop(columns=excluded), X_train["tumorsize"])
    pred = tree.predict(X_test.drop(columns=excluded))
    tree_mses.append(mse(X_test["tumorsize"], pred))

print("Tree MSE:", np.mean(tree_mses))

## Regular Random Forest

In [None]:
# Baseline: an 81-tree random forest, scored with hospital-grouped 5-fold CV.

excluded = ["DID", "HID", "tumorsize"]  # group IDs and the target are not predictors
forest_mses = []

for held_out_ids in folds:
    is_test = HDP_data["HID"].isin(held_out_ids)
    X_train, X_test = HDP_data[~is_test], HDP_data[is_test]

    tree = RFR(random_state=0, n_estimators=81)
    tree.fit(X_train.drop(columns=excluded), X_train["tumorsize"])
    pred = tree.predict(X_test.drop(columns=excluded))
    forest_mses.append(mse(X_test["tumorsize"], pred))

print("Forest MSE:", np.mean(forest_mses))

# LMM

In [None]:
# Right-hand side of the mixed-model formula: every predictor joined by " + ".
fixed_effect_terms = [
    'co2', 'pain', 'wound', 'mobility', 'ntumors', 'nmorphine',
    'remission', 'lungcapacity', 'Age', 'Married', 'LengthofStay',
    'WBC', 'RBC', 'BMI', 'IL6', 'CRP', 'Medicaid',
    'FamilyHx_no', 'FamilyHx_yes',
    'SmokingHx_current', 'SmokingHx_former', 'SmokingHx_never',
    'Sex_female', 'Sex_male',
    'CancerStage_I', 'CancerStage_II', 'CancerStage_III', 'CancerStage_IV',
]
formula = " + ".join(fixed_effect_terms)

In [None]:

# Linear mixed model with a random intercept per hospital, scored with the
# same hospital-grouped 5-fold CV as the tree baselines.
# NOTE(review): `formula` lists every level of each one-hot encoded factor
# alongside the implicit intercept, so the fixed-effects design looks
# rank-deficient -- consider dropping one level per factor.
lmm_mses = []

for i in range(5):

    X_train = HDP_data[~HDP_data["HID"].isin(folds[i])]
    X_test = HDP_data[HDP_data["HID"].isin(folds[i])]

    # Group by hospital to match the hospital-level folds; grouping by DID
    # here was a leftover from the doctor-level experiment.
    # Passing `groups` with no re_formula gives a random intercept per group.
    # The lme4-style " ~ (1|HID)" is not understood by the formula layer:
    # "1|HID" is evaluated as a Python bitwise OR and becomes a spurious
    # random slope.
    md = smf.mixedlm("tumorsize ~ " + formula, X_train, groups=X_train["HID"])
    mdf = md.fit()
    # Test hospitals were never seen during fitting, so no fitted random
    # effect exists for them.
    pred = mdf.predict(X_test)
    lmm_mses.append(mse(X_test["tumorsize"], pred))

print("LMM MSE:", np.mean(lmm_mses))

## Sum of Trees

In [None]:

# Mixture of per-hospital trees, scored with hospital-grouped 5-fold CV.
# Every training hospital gets its own regression tree; a classifier's
# predicted hospital-membership probabilities weight the trees' predictions
# on test rows.
my_mses = []
id_and_target = ["DID", "HID", "tumorsize"]  # group IDs and the target are not predictors

for fold in folds:

    held_out = HDP_data["HID"].isin(fold)
    X_train = HDP_data[~held_out]
    X_test = HDP_data[held_out]
    train_features = X_train.drop(id_and_target, axis=1)
    test_features = X_test.drop(id_and_target, axis=1)

    # Build group classifier (alternatives considered: GNB, DTC, RFC).
    group_clf = LR()
    group_clf.fit(train_features, X_train["HID"])
    group_pred = group_clf.predict_proba(test_features)

    # All rows of one held-out hospital share that hospital's mean membership
    # probabilities. Iterate over the hospital IDs in the fold themselves;
    # range(len(fold)) would yield 0..k-1 and match the wrong hospitals or none.
    for test_group in fold:
        rows = np.where(X_test["HID"] == test_group)[0]
        if rows.size:
            group_pred[rows] = group_pred[rows].mean(axis=0)

    # Mixture of Trees: one tree per training hospital, built in classes_
    # order so that tree k lines up with column k of group_pred. This also
    # removes the hard-coded hospital count and skips IDs absent from the
    # training data.
    list_of_trees = []
    for group in group_clf.classes_:
        in_group = X_train["HID"] == group
        tree = DTR()
        tree.fit(train_features[in_group], X_train.loc[in_group, "tumorsize"])
        list_of_trees.append(tree)

    # preds has shape (n_test_rows, n_trees) after the transpose.
    preds = np.array([tree.predict(test_features) for tree in list_of_trees]).T
    # Probability-weighted sum of the tree predictions for every test row.
    pred = np.sum(preds * group_pred, axis=1)

    my_mses.append(mse(X_test["tumorsize"], pred))

print("Mixture of Trees MSE:", np.mean(my_mses))


# Sarcoma Dataset

In [None]:
# Load the combined sarcoma table; spaces and hyphens in column names become
# underscores.
data = pd.read_csv("sarcoma_data/combined_sarcoma_data.csv")
data.columns = data.columns.str.replace(' ', '_').str.replace('-', '_')

# Numeric coding: MALE -> 0, FEMALE -> 1 (the replacement is applied table-wide).
data = data.replace({"MALE": 0, "FEMALE": 1})

# Keep only rows without missing values.
data = data.dropna()

# z-score the response variable.
leukocyte = data['Leukocyte_Fraction']
data['Leukocyte_Fraction'] = (leukocyte - leukocyte.mean()) / leukocyte.std()

# Predictors and response used by every model below.
features = [
    'age_at_diagnosis', 'gender',
    'JUN', 'VGLL3', 'TERT', 'MAP3K5', 'UST', 'CDKN2A', 'YAP1', 'CDKN1B',
    'PTPRQ', 'RB1', 'TP53', 'MYOCD', 'NF1', 'CCNE1', 'CEBPA', 'ZNF552',
    'ATRX', 'PTEN', 'DDIT3', 'CDK4', 'HMGA2', 'MDM2', 'FRS2',
    'Silent_per_Mb', 'Non_silent_per_Mb',
]
output = 'Leukocyte_Fraction'

# Train-test split: the SS and MPNST histologies are held out entirely.
is_held_out = data["short_histo"].isin(['SS', 'MPNST'])
X_train = data[~is_held_out]
X_test = data[is_held_out]

In [None]:
# Regular Tree: mean test MSE over 100 differently seeded regression trees.


def _single_tree_mse(seed):
    """Fit one seeded regression tree on the training split; return its test MSE."""
    model = DTR(random_state=seed)
    model.fit(X_train[features], X_train[output])
    return mse(X_test[output], model.predict(X_test[features]))


performances = [_single_tree_mse(seed) for seed in range(100)]

print("Tree MSE:", np.mean(performances))

In [None]:
# Regular random forest: mean test MSE over 100 differently seeded 5-tree forests.

performances = []

for i in range(100):
    # Seed with the repetition index, as the single-tree cell does; a fixed
    # seed would make all 100 repetitions identical and the average pointless.
    tree = RFR(n_estimators=5, random_state=i)
    tree.fit(X_train[features], X_train[output])
    pred = tree.predict(X_test[features])
    forest_mse = mse(X_test[output], pred)
    performances.append(forest_mse)

print("Forest MSE:", np.mean(performances))

In [None]:
# Group classifier: for every test sample, estimate the probability of
# belonging to each training histology (alternatives considered: GNB, DTC, RFC).
group_clf = LR()
group_clf.fit(X_train[features], X_train['short_histo'])
group_pred = group_clf.predict_proba(X_test[features])

# Rescale so that every row of probabilities sums to one.
group_pred = group_pred / group_pred.sum(axis=1, keepdims=True)

# Every sample of a held-out histology receives that histology's mean
# probability vector.
for test_group in ['SS', 'MPNST']:
    member_mask = (X_test['short_histo'] == test_group).to_numpy()
    group_pred[member_mask] = group_pred[member_mask].mean(axis=0)

In [None]:
# Mixture of Trees: one regression tree per training histology, combined per
# test sample with the group classifier's membership probabilities.
# The trees are unseeded, so the fit is repeated 100 times and the MSE averaged.

performances = []

# The test feature matrix does not change between repetitions.
test_features = X_test[features]

for i in range(100):

    list_of_trees = []
    # Build the trees in group_clf.classes_ order: predict_proba columns follow
    # that (sorted) order, so tree k must belong to the same histology as
    # column k of group_pred. A hand-written group order misaligns the weights.
    for group in group_clf.classes_:
        group_rows = X_train[X_train["short_histo"] == group]
        tree = DTR()
        tree.fit(group_rows[features], group_rows[output])
        list_of_trees.append(tree)

    # preds has shape (n_test_rows, n_trees) after the transpose.
    preds = np.array([tree.predict(test_features) for tree in list_of_trees]).T
    # Probability-weighted sum of the tree predictions for every test row.
    pred = np.sum(preds * group_pred, axis=1)
    performances.append(mse(X_test[output], pred))

print("Mixture of Trees MSE:", np.mean(performances))

In [None]:
# LMM: linear mixed model grouped by histology, fitted on the training
# histologies and scored on the held-out ones.

formula = " ~ " + " + ".join(features)

mixed_model = smf.mixedlm(
    output + formula,
    X_train,
    groups=X_train["short_histo"],
)
mixed_fit = mixed_model.fit()

pred = mixed_fit.predict(X_test)
lmm_mse = mse(X_test[output], pred)

print("LMM MSE:", lmm_mse)
