In [118]:
import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve, roc_auc_score, RocCurveDisplay
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
from hyperopt.pyll.base import scope

import os

In [98]:
features = ['patient_age', 'psa', 'psad', 'prostate_volume', 'f0_max']

In [99]:
df = pd.read_csv('results_unet_full.csv')
df = df.dropna(subset=features)
df.head()

Unnamed: 0.1,Unnamed: 0,patient_id,study_id,mri_date,patient_age,psa,psad,prostate_volume,case_csPCa,f0_sum,...,f1_mean,f2_sum,f2_max,f2_mean,f3_sum,f3_max,f3_mean,f4_sum,f4_max,f4_mean
0,1,10001,1000001,2016-05-27,64.0,8.7,0.09,102.0,NO,6063.7334,...,0.00536,7947.8203,0.262723,0.006064,5701.035,0.191592,0.00435,7182.457,0.179602,0.00548
1,2,10002,1000002,2021-04-18,58.0,4.2,0.06,74.0,NO,5907.2393,...,0.005448,7774.4526,0.180468,0.005931,5649.919,0.131438,0.004311,7113.611,0.123889,0.005427
2,3,10003,1000003,2019-04-05,72.0,13.0,0.18,71.5,NO,5770.549,...,0.005521,7699.9746,0.088626,0.005875,5707.0527,0.080355,0.004354,7131.079,0.076757,0.005441
3,4,10004,1000004,2020-10-21,67.0,8.0,0.1,78.0,NO,5748.8384,...,0.005515,7726.0347,0.131269,0.005894,5904.078,0.14264,0.004504,7282.0786,0.116884,0.005556
4,5,10005,1000005,2012-07-18,64.0,12.1,0.24,51.0,YES,5822.135,...,0.005471,7686.206,0.153504,0.005864,6143.2334,0.16981,0.004687,7490.6484,0.134061,0.005715


In [100]:
unet_results = pd.read_csv('UNet/infer/overview.csv')
unet_results['study_id'] = unet_results['study_id'].apply(lambda x: int(x))
unet_results.head()

Unnamed: 0.1,Unnamed: 0,study_id,pred_score
0,0,1000001,0.15858
1,1,1000003,0.110075
2,2,1000006,0.115803
3,3,1000017,0.162361
4,4,1000020,0.180278


In [101]:
print(f"Before filtering:\n{len(df)} cases, {len(df[df['case_csPCa'] == 'YES'])} cancers")
df = df[[*features, 'case_csPCa', 'study_id']]
df = df.dropna()
print(f"After filtering:\n{len(df)} cases, {len(df[df['case_csPCa'] == 'YES'])} cancers")
df['case_csPCa'] = df['case_csPCa'].apply(lambda v: v == 'YES')

Before filtering:
1498 cases, 424 cancers
After filtering:
1498 cases, 424 cancers


In [102]:
unet_results = pd.read_csv('UNet/infer/overview.csv')

train_df = df[~df['study_id'].isin(unet_results['study_id'])]
val_df = df[df['study_id'].isin(unet_results['study_id'])]

print(len(train_df), len(val_df))

1202 296


In [103]:
print(f"Before filtering:\n{len(df)} cases, {len(df[df['case_csPCa'] == 'YES'])} cancers")
df = df[[*features, 'case_csPCa']]
df = df.dropna()
print(f"After filtering:\n{len(df)} cases, {len(df[df['case_csPCa'] == 'YES'])} cancers")
df['case_csPCa'] = df['case_csPCa'].apply(lambda v: v == 'YES')
df.head()

Before filtering:
1498 cases, 0 cancers
After filtering:
1498 cases, 0 cancers


Unnamed: 0,patient_age,psa,psad,prostate_volume,f0_max,case_csPCa
0,64.0,8.7,0.09,102.0,0.220088,False
1,58.0,4.2,0.06,74.0,0.149725,False
2,72.0,13.0,0.18,71.5,0.070739,False
3,67.0,8.0,0.1,78.0,0.056298,False
4,64.0,12.1,0.24,51.0,0.067053,False


In [104]:
model = xgb.XGBClassifier(max_depth=3, max_leaves=10, n_estimators=50, learning_rate=0.12, objective='binary:logistic', eval_metric='auc', grow_policy='lossguide', tree_method='exact', booster='dart', scale_pos_weight=0.28, gamma=0.9)

X_train = train_df[features].values
y_train = train_df['case_csPCa'].values
X_val = val_df[features].values
y_val = val_df['case_csPCa'].values


In [105]:
space = {
    'max_depth': scope.int(hp.quniform('max_depth', 2, 8, 1)),
    'max_leaves': scope.int(hp.quniform('max_leaves', 0, 1000, 1)),
    'n_estimators': scope.int(hp.quniform('n_estimators', 10, 1000, 1)),
    'learning_rate': hp.loguniform('learning_rate', 0.0001, 0.5),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'grow_policy': hp.choice('grow_policy', ['lossguide']),
    'tree_method': hp.choice('tree_method', ['exact']),
    'booster': hp.choice('booster', ['dart', 'gbtree']),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 0.1, 0.5),
    'gamma': hp.uniform('gamma', 0.0001, 10),
}

In [106]:
def objective(params):
    xgb_model = xgb.XGBClassifier(**params, objective='binary:logistic', eval_metric='auc')
    xgb_model.fit(X_train, y_train)
    y_pred = xgb_model.predict_proba(X_val)[:,1]
    y_pred_train = xgb_model.predict_proba(X_train)[:,1]
    auc = roc_auc_score(y_val, y_pred)
    auc_train = roc_auc_score(y_train, y_pred_train)

    return {'auc': auc, 'auc_train': auc_train, 'loss': -auc, 'status': STATUS_OK}

In [107]:
xgb_model = xgb.XGBClassifier()

In [108]:
trials = Trials()
best = fmin(objective, space, algo=tpe.suggest, max_evals=100, trials=trials)
best_trial = trials.best_trial

100%|██████████| 100/100 [10:28<00:00,  6.29s/trial, best loss: -0.7724806201550387]


In [119]:
space_eval(space, best)

{'booster': 'dart',
 'gamma': 6.358574341247602,
 'grow_policy': 'lossguide',
 'learning_rate': 1.1665405892449185,
 'max_depth': 2,
 'max_leaves': 342,
 'n_estimators': 532,
 'scale_pos_weight': 0.3464430619377608,
 'subsample': 0.9180788532736023,
 'tree_method': 'exact'}

In [120]:
model = xgb.XGBClassifier(**space_eval(space, best), objective='binary:logistic', eval_metric='auc')

In [121]:
model.fit(X_train, y_train)

In [122]:
preds = model.predict_proba(X_val)[:, 1]

In [123]:
print(roc_auc_score(y_val, preds))

0.7724806201550387


In [124]:
val_df['preds'] = preds

In [125]:
val_df.head()

Unnamed: 0,patient_age,psa,psad,prostate_volume,f0_max,case_csPCa,study_id,preds
0,64.0,8.7,0.09,102.0,0.220088,False,1000001,0.025608
2,72.0,13.0,0.18,71.5,0.070739,False,1000003,0.039453
5,73.0,6.2,0.23,27.0,0.098344,False,1000006,0.196482
16,67.0,5.5,0.13,64.06,0.224346,False,1000017,0.206518
19,43.0,4.6,0.11,47.0,0.205439,False,1000020,0.206518


In [126]:
for file in os.listdir('UNet_XGBoost/infer/results_F0'):
    study_id = int(file.split('.')[0])

    pred = val_df[val_df['study_id'] == study_id]['preds'].values[0]

    # Open file as numpy, fill numpy array with pred and save again
    npy = np.load(f'UNet_XGBoost/infer/results_F0/{file}')
    print(npy.shape)
    npy[:, :, :] = pred
    np.save(f'UNet_XGBoost/infer/results_F0/{file}', npy)


(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 256)
(20, 256, 