In [1]:
%matplotlib inline
from catboost import CatBoostClassifier
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTENC
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.metrics import roc_curve, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer, LabelEncoder, OrdinalEncoder, OneHotEncoder, LabelBinarizer
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_graphviz
from sklearn.utils.multiclass import unique_labels

from source import load_avenio_files
from transform import categorical_columns_to_lower, ClassifierAsTransformer, clean_mutation_columns, dummy_encode_mutations, get_top_genes, mutation_train_test_split, patient_allele_frequencies, CustomCatBoostClassifier
from views import plot_confusion_matrix


# One seed for the whole notebook: it seeds NumPy's global generator here and
# is reused wherever a random_state / random_seed argument is needed.
RANDOM_STATE = 1234
np.random.seed(seed=RANDOM_STATE)

ImportError: cannot import name 'categorical_columns_to_lower' from 'transform' (/home/donkerhc/avenio/transform.py)

In [None]:
# Read the mutation table and the phenotype table (spreadsheet and SPSS files).
mutation_data_frame, phenotypes = load_avenio_files()

# The gene vocabulary is taken from the complete, not yet cleaned, table rather
# than from a training subset; otherwise we run into problems during inference.
gene_vocabulary = mutation_data_frame['Gene'].unique()

# T0/T1 columns holding the number of mutant molecules per mL.
# The allele-fraction columns "T0: Allele \nFraction" and "T1: Allele Fraction"
# are the alternative choice.
# TODO: double-check that these are the intended columns.
allele_columns = [
    "T0: No. Mutant \nMolecules per mL",
    "T1: No. Mutant \nMolecules per mL",
]

# Convert particular columns to numbers and drop rows with missing data.
mutation_data_frame = clean_mutation_columns(mutation_data_frame)

def f_t(t_0, t_1):
    """
    Relative change between two measurements.

    Computes (t_1 - t_0) / t_0, i.e. the change from `t_0` to `t_1` expressed
    as a fraction of `t_0`. Works on anything that supports subtraction and
    true division (plain numbers, NumPy arrays, pandas objects).

    No guard is applied for `t_0 == 0`: plain Python numbers raise
    ZeroDivisionError in that case.
    """
    change = t_1 - t_0
    return change / t_0

In [None]:
# Build the mutation feature table from the cleaned mutation rows, the gene
# vocabulary and the transformation f_t.
# NOTE(review): `allele_columns` is defined earlier but not passed here --
# confirm which columns patient_allele_frequencies actually reads.
patient_mutation_frequencies = patient_allele_frequencies(
    mutation_data_frame,
    gene_vocabulary,
    f_t,
)

## Feature reduction
Keep only the mutation columns selected by `get_top_genes`; all other mutation features are dropped.

In [None]:
def select_frequent_mutation_columns(X):
    """
    Select mutations that occur above a certain threshold.

    Returns `X` restricted to the columns listed by `get_top_genes`.
    """
    # NOTE(review): the column list is computed from the notebook-level
    # `patient_mutation_frequencies`, not from the argument `X`. This pins the
    # same columns for every input, but makes the function depend on global
    # state -- confirm that this is intended.
    selected_genes = get_top_genes(patient_mutation_frequencies)
    return X[selected_genes]

In [None]:
# Wrap the column selection in a scikit-learn transformer and apply it.
# validate=False passes the DataFrame to the function without input checks.
frequent_mutation_selector = FunctionTransformer(
    select_frequent_mutation_columns,
    validate=False,
)
patient_mutation_frequencies = frequent_mutation_selector.fit_transform(
    patient_mutation_frequencies
)

## Add phenotype features

In [None]:
# Phenotype columns that are fed to the model as input.
phenotype_features = [
    "gender",
    "leeftijd",
    "stage",
    "therapyline",
    "smokingstatus",
    "Systemischetherapie",
    "histology_grouped",
    "lymfmeta",
    "brainmeta",
    "adrenalmeta",
    "livermeta",
    "lungmeta",
    "skeletonmeta",
]

# Phenotype columns that are candidate prediction targets.
phenotype_labels = ["Clinical_Response", "response_grouped", "progressie"]

# Columns to extract from the SPSS file: the inputs followed by the labels.
phenotypes_to_keep = [*phenotype_features, *phenotype_labels]

# Every input feature except "leeftijd" is categorical (labels not counted).
categorical_input_columns = [
    feature for feature in phenotype_features if feature != "leeftijd"
]

# Join the mutation features with the selected phenotype columns on their
# shared index. No `how` is given, so pandas performs an inner join and only
# rows present in both tables survive.
X = patient_mutation_frequencies.merge(
    phenotypes[phenotypes_to_keep],
    left_index=True,
    right_index=True,
)
X = categorical_columns_to_lower(X)
# Keep a tab-separated copy of the combined table on disk.
X.to_csv('combined_data.tsv', sep='\t')

In [None]:
# Keep only the rows for which the grouped response is known.
X = X.dropna(subset=['response_grouped'])

# Split the label columns off the feature table. Each pop returns the column
# and removes it from X, so X holds input features only afterwards.
y_resp, y_resp_gp, y_prog = (
    X.pop(label)
    for label in ('Clinical_Response', 'response_grouped', 'progressie')
)

## Encode labels for CatBoost

In [None]:
# Positional indices of the categorical columns within X.
categorical_feature_indices = list(map(X.columns.get_loc, categorical_input_columns))

# Replace the grouped-response class names by integer codes. The fitted encoder
# is kept so the original class names stay available via `classes_`.
response_encoder = LabelEncoder()
y_resp_gp = response_encoder.fit_transform(y_resp_gp)

## Train-test split

In [None]:
# Fraction of the samples held out as test set.
f_test = 0.3
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_resp_gp,
    test_size=f_test,
    random_state=RANDOM_STATE,
)

## Oversampling

In [None]:
# SMOTE-NC: synthetic minority oversampling technique for nominal and
# continuous data. Only the training split is resampled.
smote_nc = SMOTENC(
    categorical_features=categorical_feature_indices,
    random_state=RANDOM_STATE,
)
X_resample, y_resample = smote_nc.fit_resample(X_train, y_train)

# Wrap the resampled data in pandas containers again, reusing the training
# column names for the features.
X_resample = pd.DataFrame(X_resample, columns=X_train.columns)
y_resample = pd.Series(y_resample)

## Train CatBoost

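The metrics are defined in terms of true positives ($tp$), false positives ($fp$) and false negatives ($fn$):

Precision: $p = \frac{tp}{tp + fp}$

Recall: $r = \frac{tp}{tp + fn}$

$F_1$ score: $F_1 = \frac{2pr}{p + r}$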

In [None]:
# CatBoost settings: the metrics listed in custom_loss are tracked in addition
# to the evaluation metric F1; the seed is the notebook-wide RANDOM_STATE.
params = dict(
    iterations=400,
    random_seed=RANDOM_STATE,
    custom_loss=['Precision', 'AUC', 'Accuracy', 'F1', 'Recall'],
    eval_metric='F1',
    logging_level='Silent',
)
classifier = CatBoostClassifier(**params)


In [None]:
%%javascript
// Override the output area's scroll check so that it always answers false.
IPython.OutputArea.prototype._should_scroll = (lines) => false;

In [None]:
# Train on the oversampled training data. The held-out split is passed as
# eval_set so that plot=True can draw the metric curves for it.
#
# use_best_model=False: when an eval_set is supplied CatBoost by default keeps
# only the trees up to the iteration that scores best on that set. Here the
# eval_set is the test split, so the default would tune the model on the very
# data that is used afterwards to report F1 and the confusion matrix, making
# those numbers optimistic. With the flag off, the test split is monitored
# but never used for model selection.
classifier.fit(
    X_resample, y_resample,
    cat_features=categorical_feature_indices,
    eval_set=(X_test, y_test),
    use_best_model=False,
    plot=True,
)

In [None]:
# Predict the held-out test set; the bare last expression displays its F1 score.
y_test_pred = classifier.predict(X_test)
f1_score(y_true=y_test, y_pred=y_test_pred)

In [None]:
# Confusion matrix of the test-set predictions against the true test labels.
# NOTE(review): the third argument is y_test again; if the helper expects the
# class names in that position this is likely wrong -- confirm against
# views.plot_confusion_matrix.
plot_confusion_matrix(y_test, y_test_pred, y_test)