In [1]:
import warnings
# Install a global 'ignore' filter: no warning is shown for the rest of the session.
# NOTE(review): this also hides any deprecation or numerical warnings raised by the
# libraries used below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.metrics import roc_curve, f1_score, precision_score, recall_score, mean_squared_error, roc_auc_score
from sklearn.preprocessing import LabelBinarizer

from source import read_preprocessed_data
from transform import load_process_and_store_spreadsheets
from pipelines import benchmark_pipelines, build_classifier_pipelines, build_regression_pipelines, pipeline_Richard, pipeline_Julian
from pipelines import categorical_input_columns

# Seed reused further down as CatBoost's `random_seed` and as the `random_state`
# argument of `view_pipelines`.
RANDOM_STATE = 1234

In [3]:
def harmonic_mean(t0, t1):
    """Return the harmonic mean of `t0` and `t1`, i.e. ``2 * t0 * t1 / (t0 + t1)``.

    Only arithmetic operators are used, so the function applies element-wise to
    any operands that support them (plain numbers, NumPy arrays, ...).

    The factor 2 is what makes this a mean: for equal inputs the result equals
    the input. Without it the expression yields half the harmonic mean.

    Note: a zero sum ``t0 + t1`` is not guarded against; plain numbers raise
    ``ZeroDivisionError`` in that case.
    """
    return 2 * t0 * t1 / (t0 + t1)

In [4]:
# Run the spreadsheet preprocessing with `harmonic_mean` as the transformation.
# presumably this step produces the output/*.tsv files read below — verify in `transform`.
load_process_and_store_spreadsheets(transformation=harmonic_mean)

# Read each preprocessed split as a (features, targets) pair.
X_train, y_train = read_preprocessed_data("output/train.tsv")
X_test, y_test = read_preprocessed_data("output/test.tsv")

## Classification

In [5]:
# Classification target: the grouped response column of each split.
y_train_resp, y_test_resp = (
    targets["response_grouped"] for targets in (y_train, y_test)
)

In [6]:
# Build the classifier pipelines. They are indexed further down as
# ps_class[<estimator name>][<pipeline name>], e.g. ps_class['DecisionTreeClassifier']['Julian'].
ps_class = build_classifier_pipelines()

We want to predict non-responders:

In [7]:
# The two response classes; the positive class for the metrics is picked by position.
response_labels = ['non responder (sd+pd)', 'responder (pr+cr)']
pos_label = response_labels[0]  # the non-responders
# pos_label = response_labels[1]  # alternative: the responders

In [8]:
def F1(x, y):
    """F1 score for `pos_label`; `x` and `y` are forwarded to `f1_score` in that order."""
    return f1_score(x, y, labels=response_labels, pos_label=pos_label)

In [9]:
from catboost import CatBoostClassifier

# Settings handed to the CatBoost classifier; seeded with the notebook-wide RANDOM_STATE.
params = dict(
    iterations=400,
    random_seed=RANDOM_STATE,
    custom_loss=['Precision', 'AUC', 'Accuracy', 'F1', 'Recall'],
    eval_metric='F1',
    logging_level='Silent',
)
cat = CatBoostClassifier(**params)

In [10]:
%%javascript
// Replace the output area's `_should_scroll` check with one that always answers false.
// NOTE(review): relies on a global `IPython.OutputArea` object — presumably only
// present in the classic Notebook front end; confirm before running elsewhere.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [11]:
# Positional index, within X_train's columns, of every categorical input column.
categorical_feature_indices = list(
    map(X_train.columns.get_loc, categorical_input_columns)
)

In [12]:
# cat.fit(
#     X_train, y_train_resp,
#     cat_features=categorical_feature_indices,
#     plot=True,
#     eval_set=(X_test, y_test_resp),
# )

## Architecture pipeline

In [13]:
# Calculate F_1 scores for the classifiers.
# NOTE(review): the saved output of this cell is
# "AttributeError: 'numpy.ndarray' object has no attribute 'count'". The traceback
# is not part of the saved output, so the failing call cannot be identified from
# here; every later cell is unexecuted (`In [None]`).
b = benchmark_pipelines(
    ps_class, X_train, y_train_resp, X_test, y_test_resp, metric=F1,
)
b

AttributeError: 'numpy.ndarray' object has no attribute 'count'

In [None]:
# Maximum of the benchmark table along axis 0, a separator, then along axis 1.
print(b.max(axis=0), '--' * 10, b.max(axis=1), sep='\n')

In [None]:
from sklearn.preprocessing import LabelBinarizer
def auc(y_true, y_pred):
    """Weighted ROC AUC of `y_pred` against `y_true`.

    Both label vectors are binarized with a `LabelBinarizer` that is fitted on
    `y_true` only, and the binarized vectors are passed to `roc_auc_score`.
    """
    binarizer = LabelBinarizer()
    binarizer.fit(y_true)
    truth = binarizer.transform(y_true)
    predicted = binarizer.transform(y_pred)
    return roc_auc_score(truth, predicted, average='weighted')

In [None]:
# Re-run the classifier benchmark with `auc` as the metric.
# Note: this rebinds `b`, replacing the F1 table from the earlier benchmark cell.
b = benchmark_pipelines(
    ps_class, X_train, y_train_resp, X_test, y_test_resp, metric=auc,
)
b

In [None]:
# Maximum of the AUC table along axis 0, a separator, then along axis 1.
print(b.max(axis=0), '--' * 10, b.max(axis=1), sep='\n')

In [None]:
# Show the column labels of the training features.
X_train.columns

In [None]:
from views import view_decision_tree_julian
# Pass the 'Julian' DecisionTreeClassifier pipeline to the project's decision-tree viewer.
# presumably the pipeline was fitted by the benchmark runs above — verify in `pipelines`.
view_decision_tree_julian(ps_class['DecisionTreeClassifier']['Julian'])

In [None]:
from pipelines import hybrid_classifier

# Fit the hybrid classifier on the training split and predict both splits.
p = hybrid_classifier()
p.fit(X_train, y_train_resp)
y_train_pred = p.predict(X_train)
y_test_pred = p.predict(X_test)
# Cell output: AUC of the predicted test labels against the true test labels.
auc(y_test_resp, y_test_pred)

In [None]:
# Pipeline whose test-set confusion matrix is plotted below.
# NOTE(review): the name suggests the best-scoring pipeline, but the choice is
# hard-coded here rather than derived from the benchmark tables.
p_win = ps_class['DecisionTreeClassifier']['Julian']

The confusion matrix $M_{ij}$: predicted $i$ but truth is $j$.

In [None]:
from views import plot_confusion_matrix
# Confusion matrix of `p_win`'s test-set predictions against the true response labels,
# with the class names taken from `response_labels`.
plot_confusion_matrix(y_test_resp, p_win.predict(X_test), classes=response_labels)

Precision:
$$p = \frac{\mathrm{TP}}{\mathrm{TP} + \mathrm{FP}}$$

Recall or sensitivity:
$$r = \frac{\mathrm{TP}}{\mathrm{TP} + \mathrm{FN}}$$

In [None]:
def r(x, y):
    """Recall for `pos_label`; `x` and `y` are forwarded to `recall_score` in that order."""
    return recall_score(x, y, labels=response_labels, pos_label=pos_label)


def p(x, y):
    """Precision for `pos_label`; `x` and `y` are forwarded to `precision_score` in that order."""
    return precision_score(x, y, labels=response_labels, pos_label=pos_label)


# Benchmark every classifier pipeline with recall as the metric.
recall = benchmark_pipelines(
    ps_class, X_train, y_train_resp, X_test, y_test_resp, metric=r,
)
recall

In [None]:
# Benchmark every classifier pipeline with precision as the metric.
# `p` is the precision metric defined in the previous cell (the name was bound to
# the hybrid classifier before that).
precision = benchmark_pipelines(
    ps_class, X_train, y_train_resp, X_test, y_test_resp, metric=p,
)
precision

In [None]:
from views import plot_confusion_matrix
# Confusion matrix of the 'Lev' LogisticRegression pipeline on the test split.
# Class names come from the fixed `response_labels` list, as in the other
# confusion-matrix cell: `y_test_resp.unique()` lists labels in order of first
# appearance, so its ordering depends on the data.
plot_confusion_matrix(
    y_test_resp,
    ps_class['LogisticRegression']['Lev'].predict(X_test),
    classes=response_labels,
)

## Dimensional representation

In [None]:
from views import view_pipelines

# Visualise the LogisticRegression pipelines on the training split, seeded with RANDOM_STATE.
# presumably a low-dimensional embedding, going by the section title — see `views`.
view_pipelines(ps_class['LogisticRegression'], X_train, y_train_resp, random_state=RANDOM_STATE)

## Regression

In [None]:
# Regression targets per split: the 'OS_days' and 'PFS_days' columns.
y_train_os, y_train_pfs = y_train['OS_days'], y_train['PFS_days']
y_test_os, y_test_pfs = y_test['OS_days'], y_test['PFS_days']

In [None]:
# Training split: OS on the x-axis against PFS on the y-axis, coloured by response class.
sns.scatterplot(x=y_train_os, y=y_train_pfs, hue=y_train_resp)

### Overall survival (OS)

In [None]:
# Distribution of the OS target in the training split.
# NOTE(review): `distplot` is deprecated in recent seaborn releases — confirm the
# pinned seaborn version still provides it before re-running.
sns.distplot(y_train_os, norm_hist=False)

In [None]:
# Benchmark freshly built regression pipelines on the OS target, scored with
# mean squared error. Note: this rebinds `b`, replacing the classification table.
ps = build_regression_pipelines()
b = benchmark_pipelines(ps, X_train, y_train_os, X_test, y_test_os, metric=mean_squared_error)
# Root mean squared error.
b = b**0.5

In [None]:
# Mean of the OS error table along axis 0, a separator, then along axis 1.
print(b.mean(axis=0), '--' * 10, b.mean(axis=1), sep='\n')

In [None]:
# Display the table of root mean squared errors for the OS target.
b

### Progression-free survival (PFS)

In [None]:
# NOTE(review): these two assignments repeat the ones made at the start of the
# Regression section.
y_train_pfs = y_train['PFS_days']
y_test_pfs = y_test['PFS_days']
# Distribution of the PFS target in the training split.
# NOTE(review): `distplot` is deprecated in recent seaborn releases — confirm the
# pinned seaborn version still provides it before re-running.
sns.distplot(y_train_pfs)

In [None]:
# Benchmark freshly built regression pipelines on the PFS target, scored with
# mean squared error. Note: this rebinds `ps` and `b`, replacing the OS results.
ps = build_regression_pipelines()
b = benchmark_pipelines(ps, X_train, y_train_pfs, X_test, y_test_pfs, metric=mean_squared_error)
# Root mean squared error.
b = b**0.5

In [None]:
# Mean of the PFS error table along axis 0, a separator, then along axis 1.
print(b.mean(axis=0), '--' * 10, b.mean(axis=1), sep='\n')

In [None]:
# Display the table of root mean squared errors for the PFS target.
b

## Multitask model


In [None]:
from sklearn.linear_model import MultiTaskElasticNet
from pipelines import pipelines
# Build the project pipelines around MultiTaskElasticNet, passing VotingEstimator=None.
# Note: this rebinds `ps`, replacing the regression pipelines built earlier.
ps = pipelines(MultiTaskElasticNet, VotingEstimator=None)

In [None]:
# Two-column targets (OS and PFS) for the multitask model.
Y_train, Y_test = (
    targets[['OS_days', 'PFS_days']] for targets in (y_train, y_test)
)


def rmse(a, b):
    """Square root of the per-output mean squared error of `b` against `a`."""
    return mean_squared_error(a, b, multioutput='raw_values') ** 0.5


benchmark_pipelines({'bla': ps}, X_train, Y_train, X_test, Y_test, metric=rmse)
# p = ps['Richard']
# p.fit(X_train, Y_train)