# Load data

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import NMF, PCA
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import CountVectorizer

from source import load_avenio_files
from transform import dummy_encode_mutations, mutation_train_test_split, patient_allele_frequencies


# Fixed seed: used to seed NumPy's global random number generator here, and
# passed as `random_state` to the t-SNE embedding of the mutation data below.
RANDOM_STATE = 1234
np.random.seed(RANDOM_STATE)

In [None]:
# Load data from spreadsheet and SPSS files.
# The loader returns two objects: the mutation records (one of its columns is
# 'Gene') and the patient phenotypes.
# NOTE(review): both are presumably pandas DataFrames, with `phenotypes` indexed
# by patient — confirm in `source.load_avenio_files`.
mutation_data_frame, phenotypes = load_avenio_files()

Remove records for which no allele frequencies are available.

In [None]:
# The vocabulary is built from the entire dataset, not only the training set.
# Otherwise we run into problems during inference (genes absent from the
# training split would be unknown).
gene_vocabulary = mutation_data_frame['Gene'].unique()
allele_columns = ["T0: Allele \nFraction", "T1: Allele Fraction"]

# 1) Convert to float; entries that can not be parsed become NaN.
# Whole-column assignment (`frame[col] = ...`) is used on purpose:
# `frame.loc[:, col] = ...` writes into the existing column in place on recent
# pandas versions, which can leave the column with its original object dtype.
columns_to_numeric = allele_columns
for column_name in columns_to_numeric:
    mutation_data_frame[column_name] = pd.to_numeric(
        mutation_data_frame[column_name], errors='coerce'
    )
# 2) Drop rows for which any of the columns could not be converted.
mutation_data_frame = mutation_data_frame.dropna(subset=allele_columns)

Split the data into a training set and a validation set.

In [None]:
# Fraction of dataset we want to use for the validation set.
f_val = 0.3
# Split accordingly. Note that the held-out part is called `test_mutations`
# here, although it is the validation set referred to above.
# NOTE(review): how the split is drawn (per patient or per record, and whether
# it depends on the NumPy seed) is decided inside `transform` — confirm there.
train_mutations, test_mutations = mutation_train_test_split(
    mutation_data_frame, test_fraction=f_val,
)

Calculate allele frequency transformation for the training data.

In [None]:
# Only the training mutations are transformed, but against the gene vocabulary
# that was collected from the full dataset before the split.
# NOTE(review): presumably the result has one row per patient and one column
# per gene in the vocabulary — confirm in `transform.patient_allele_frequencies`.
patient_mutation_frequencies = patient_allele_frequencies(train_mutations, gene_vocabulary)

Combine calculated mutation data with phenotype data

In [None]:
# Phenotype columns to attach to the per-patient mutation frequencies.
phenotypes_to_keep = ['Clinical_Response', 'response_grouped', 'leeftijd', 'progressie']

# Index-on-index merge. The join type is pandas' default (inner), so only
# index values present in both tables end up in the result.
df_with_phenotype = pd.merge(
    patient_mutation_frequencies,
    phenotypes[phenotypes_to_keep],
    how='inner',
    left_index=True,
    right_index=True,
)

## Decompositions

In [None]:
# Project the per-patient mutation frequencies onto the first principal components.
n_columns = 4
X_pca = PCA(n_columns).fit_transform(patient_mutation_frequencies)
# Small Gaussian jitter, so that patients with identical profiles do not overlap exactly.
X_pca += np.random.normal(scale=0.0025, size=X_pca.shape)

# `x` and `y` are passed by keyword: recent seaborn releases accept them only as
# keyword arguments (the first positional argument is `data`).
# NOTE(review): the colours come from `df_with_phenotype`, while the points come
# from `patient_mutation_frequencies`; this assumes the merge kept every patient,
# in the same order — confirm.
ax = sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=df_with_phenotype['Clinical_Response'])
ax.set(xlabel='PC 1', ylabel='PC 2', title='PCA of patient mutation frequencies');

In [None]:
# Two-dimensional t-SNE embedding of the per-patient mutation frequencies.
X_embed = TSNE(n_components=2, random_state=RANDOM_STATE).fit_transform(patient_mutation_frequencies)
# Gaussian jitter, so that coinciding points remain distinguishable.
X_embed += np.random.normal(scale=0.25, size=X_embed.shape)

# `x` and `y` are passed by keyword: recent seaborn releases accept them only as
# keyword arguments. The boolean hue highlights patients whose clinical response is 'SD'.
ax = sns.scatterplot(x=X_embed[:, 0], y=X_embed[:, 1], hue=df_with_phenotype['Clinical_Response'] == 'SD')
ax.set(xlabel='t-SNE 1', ylabel='t-SNE 2', title='t-SNE of patient mutation frequencies');

# Dummy encode phenotypes
To dummy encode all phenotypes, concatenate the categorical features of each patient into a single string (one word per feature). Since every category value is a unique word, we can simply use the `CountVectorizer` for one-hot encoding.

In [None]:
# Categorical phenotype columns. Further on, the values of these columns are
# joined into one string per patient and one-hot encoded.
columns_to_hot_encode = [
    "gender",
    "stage",
    "therapyline",
    "previoustherapy",
    "Systemischetherapie",
    "smokingstatus",
    "histology_grouped",
    "progressie",
]
# NOTE(review): presumably the numeric phenotype columns (age and the
# metastasis-site indicators); this list is not used in the cells shown
# here — confirm where it is consumed.
columns_numeric = [
    "leeftijd",
    "lymfmeta",
    "brainmeta",
    "adrenalmeta",
    "livermeta",
    "lungmeta",
    "skeletometa",
]

In [None]:
# Clean data: an empty previous-therapy entry gets an explicit placeholder word.
# NOTE: this cell overwrites columns of `phenotypes` in place, so it can only be
# run once per kernel session (a second run fails on `astype(int)`, because the
# columns then already contain words).
empty_previous_therapy = phenotypes['previoustherapy'].eq('')
phenotypes.loc[empty_previous_therapy, 'previoustherapy'] = 'unknown_therapy'

# Map numbers to words, so that each value becomes a distinct token.
phenotypes['stage'] = phenotypes['stage'].astype(int).map('stage{}'.format)
phenotypes['therapyline'] = phenotypes['therapyline'].astype(int).map('line{}'.format)
phenotypes['progressie'] = (
    phenotypes['progressie']
    .astype(int)
    .map({0: 'noprogress', 1: 'progress'})
)

Convert all the phenotypes to a string (collection of words).

In [None]:
# One space-separated "sentence" per patient, built from the categorical columns.
# All values in these columns must already be strings for the join to work.
phenotype_as_str = [
    " ".join(patient_words)
    for patient_words in phenotypes[columns_to_hot_encode].itertuples(index=False, name=None)
]

Use a simple pipeline to one-hot-encode the data and decompose it using both NMF and PCA.

In [None]:
# Set up the operations: word counting followed by two alternative
# decompositions with the same number of components.
n_components = 4
vectoriser = CountVectorizer()
nmf_decomp = NMF(n_components=n_components)
pca_decomp = PCA(n_components=n_components)

# Count the words of every patient string (sparse patient x word matrix).
X_pheno_hot = vectoriser.fit_transform(phenotype_as_str)

# NMF: `W` holds the per-patient component weights, `H` the components themselves.
W = nmf_decomp.fit_transform(X_pheno_hot)
H = nmf_decomp.components_

# PCA on the densified count matrix.
# NOTE: this reuses the name `X_pca`, replacing the PCA of the mutation
# frequencies computed in the "Decompositions" section.
X_pca = pca_decomp.fit_transform(X_pheno_hot.toarray())

Using t-SNE:

In [None]:
# Two-dimensional t-SNE embedding of the one-hot encoded phenotypes.
# `random_state` is fixed, as for the t-SNE of the mutation data, so that the
# embedding does not change between runs of this cell.
X_embed = TSNE(n_components=2, random_state=RANDOM_STATE).fit_transform(X_pheno_hot.toarray())

# `x` and `y` are passed by keyword: recent seaborn releases accept them only as
# keyword arguments (the first positional argument is `data`).
ax = sns.scatterplot(x=X_embed[:, 0], y=X_embed[:, 1], hue=phenotypes['response_grouped'])
ax.set(xlabel='t-SNE 1', ylabel='t-SNE 2', title='t-SNE of one-hot encoded phenotypes');

In [None]:
# Class balance of the outcome columns. Each column is reported once
# ('progressie' used to be printed both first and last).
separator = '--' * 10
for position, outcome_column in enumerate(['progressie', 'Clinical_Response', 'response_grouped']):
    if position > 0:
        print(separator)
    print(phenotypes[outcome_column].value_counts())

In [None]:
# Add jittering to help visualisation (patients with identical phenotypes
# would otherwise be drawn on top of each other).
W_jit = W + np.random.normal(scale=0.025, size=W.shape)
# Jittered PCA scores, kept available for plotting the PCA instead of the NMF.
X_pca_jit = X_pca + np.random.normal(scale=0.025, size=X_pca.shape)

# Phenotype column used to colour the points; alternatives that were tried are
# 'Clinical_Response' and 'progressie'.
hue_column = 'response_grouped'

# One scatter plot for every pair (i, j), i < j, of NMF components.
for i in range(n_components):
    for j in range(i + 1, n_components):
        plt.figure()
        plt.title('({}, {})'.format(i, j))
        plt.xlabel(r'$W_{i' + str(i) + '}$')
        plt.ylabel(r'$W_{i' + str(j) + '}$')
        # `x` and `y` are passed by keyword, as recent seaborn releases require.
        # The former `x_jitter`/`y_jitter` arguments are dropped: they had no
        # effect in seaborn and are rejected by recent releases; the jitter is
        # already contained in `W_jit`.
        sns.scatterplot(x=W_jit[:, i], y=W_jit[:, j], hue=phenotypes[hue_column])