## Load spreadsheet

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import NMF, PCA
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.manifold import TSNE

from source import load_avenio_files
from transform import dummy_encode_mutations, mutation_train_test_split

# Seed NumPy's global random number generator once, up front, so that code
# drawing from it (e.g. the jitter added to the plots) is repeatable when the
# notebook is run top to bottom.
RANDOM_STATE = 1234
np.random.seed(RANDOM_STATE)

In [None]:
# Read the mutation table and the phenotype table via the project loader
# (spreadsheet and SPSS files).
avenio_tables = load_avenio_files()
mutation_data_frame, phenotypes = avenio_tables

To make a fair comparison, we should split the data into a training and a validation set by randomly selecting patients.

In [None]:
# Share of the dataset reserved for the validation set.
f_val = 0.3
# Hand the fraction to the project's split helper and unpack both parts.
split_options = {'test_fraction': f_val}
train_mutations, test_mutations = mutation_train_test_split(
    mutation_data_frame, **split_options
)

## Aggregated mutation statistics
Let us first look at the data as a whole (not just the training data), to explore the dataset.

In [None]:
# Restrict the mutation table to the patient identifier and the mutated gene;
# the other columns are not needed for the exploration below.
columns_to_keep = ['Patient ID', 'Gene']
mutation_data_frame = mutation_data_frame.loc[:, columns_to_keep]

In [None]:
# Count occurrences of each gene over all rows, regardless of how many of
# them belong to the same patient.
gene_set = mutation_data_frame['Gene'].unique()
gene_count = mutation_data_frame.groupby('Gene').count()

print(gene_count)

print(gene_set)

# Genes with strictly more than this many occurrences are drawn in red, the
# rest in blue. (The previous comment said 3 while the code tested against 5;
# the code's value is kept so the figure does not change.)
highlight_threshold = 5
gene_colour = gene_count['Patient ID'].apply(
    lambda x: 'red' if x > highlight_threshold else 'blue'
)
ax = gene_count['Patient ID'].plot(kind='bar', color=gene_colour)
ax.set_ylabel('# occurences')

Clearly, the majority of the mutations are unique, whilst the presence of TP53 is quite ubiquitous. 

How are the mutations distributed? One per patient?

In [None]:
# Number of mutation rows recorded for each patient.
mutation_counts = mutation_data_frame.groupby(by='Patient ID').count()

# Histogram of those per-patient counts (no density estimate).
ax = sns.distplot(mutation_counts, kde=False)
ax.set(xlabel='# Mutations', ylabel='Frequency')

# Start the x-axis at one mutation and end at the largest observed count.
most_mutations = max(mutation_counts['Gene'])
ax.set_xlim([1, most_mutations])

The figure above indicates that patients usually have one or two mutations. 

It can happen that there are multiple mutations in the same gene. 
How often does this occur?

In [None]:
# Question: how many patients carry more than one mutation in a single gene?
# Step 1: number of rows per (patient, gene) pair.
pair_sizes = mutation_data_frame.groupby(columns_to_keep).size()
num_gene_mutations = pair_sizes.to_frame('size')
# Step 2: keep only the pairs that occur more than once.
is_repeated = num_gene_mutations['size'] > 1
same_gene_mutations = num_gene_mutations[is_repeated]

# Step 3: express the affected patients as a fraction of all patients.
num_patients_multi_mutation = same_gene_mutations.groupby(level='Patient ID').ngroups
num_patients = mutation_data_frame['Patient ID'].nunique()
f = num_patients_multi_mutation / num_patients

summary_template = 'Number of patients with multiple mutations in same gene: {}/{} ({:.2f} %)'
print(summary_template.format(num_patients_multi_mutation, num_patients, f * 100.0))

same_gene_mutations

But how many mutations, and what mutations?

In [None]:
# Histogram of the repeated (patient, gene) mutation counts, i.e. only the
# pairs with at least two mutations, using unit-wide bins from 2 to 5.
count_bins = list(range(2, 6))
ax = sns.distplot(same_gene_mutations, kde=False, bins=count_bins)
ax.set(xlabel='# mutations in single gene', ylabel='# patients')

# Per-gene summary of the repeated counts. Author's observation: multiple
# mutations in the same gene occur almost solely in TP53.
same_gene_mutations.groupby('Gene').describe()

## Phenotypes


In [None]:
# Phenotype columns whose pairwise relations we want to inspect.
features_to_investigate = ['gender', 'leeftijd', 'stage', 'smokingstatus', 'histology_grouped', 'metastasescount']

# One pair plot per grouping column, colouring the points by that column.
# (The original cell also contained a bare `phenotypes.columns[:20]`
# expression; since it was not the last line of the cell it was never
# displayed, so it has been removed.)
for hue_feature in ['gender', 'histology_grouped', 'smokingstatus']:
    sns.pairplot(phenotypes[features_to_investigate], hue=hue_feature)

## Text analysis

We can borrow some tricks that are also used in text analysis. For example, in the bag-of-words approach one collects all words (the vocabulary) and counts the occurrences of each word per text document. I will do the same below, with _gene_ $\leftrightarrow$ _word_ and _patient_ $\leftrightarrow$ _text document_. That is, for each patient (row), count the number of mutations per gene (column).

In [None]:
# The vocabulary is built from every gene in the full dataset rather than
# from the training split alone; otherwise genes that only occur outside the
# training set would cause problems during inference.
all_gene_labels = mutation_data_frame.loc[:, 'Gene']
gene_vocabulary = all_gene_labels.unique()

# Encode the training mutations against that vocabulary.
dummy_data_frame = dummy_encode_mutations(train_mutations, gene_vocabulary)

Combine with phenotype data

In [None]:
# Phenotype columns to attach to the per-patient gene counts.
phenotypes_to_keep = ['Clinical_Response', 'response_grouped', 'leeftijd', 'progressie']

# Join on the index of both frames. A *left* join is used (instead of the
# default inner join) so that every row of `dummy_data_frame` is kept, in its
# original order: the plots further down pair rows of matrices derived from
# `dummy_data_frame` with columns of `df_with_phenotype` by position, so a
# silently dropped patient would shift or break that pairing. Patients without
# phenotype data get NaN in the phenotype columns.
df_with_phenotype = pd.merge(
    left=dummy_data_frame,
    right=phenotypes[phenotypes_to_keep],
    how='left',
    left_index=True,
    right_index=True,
)

# Duplicate index entries in `phenotypes` would multiply rows; fail loudly.
assert len(df_with_phenotype) == len(dummy_data_frame), (
    'merge changed the number of rows; check phenotypes index for duplicates'
)

In [None]:
# Show which patient IDs are present in the training split.
train_mutations.loc[:, 'Patient ID'].unique()

In [None]:
# Spot-check the encoding: for a few hand-picked (patient, gene) pairs the
# encoded count must equal the expected number of mutations.
expected_counts = [
    (1022, 'TP53', 2),
    (1172, 'TP53', 4),
    (1172, 'STK11', 2),
]
for patient_id, gene, expected in expected_counts:
    assert dummy_data_frame.loc[patient_id][gene] == expected

## Decomposition on counts
Let us first do a decomposition using non-negative matrix factorisation (NMF):
$$X = WH$$

In [None]:
# Factorise the count matrix X (one row per entry of `dummy_data_frame`) as
# X ~ W H with three non-negative components.
n_components = 3
# Pass the seed explicitly: without it the estimator draws from NumPy's global
# generator, so the result depends on how many random numbers earlier cells
# consumed and changes when this cell is re-run on its own.
nmf_decomp = NMF(n_components=n_components, random_state=RANDOM_STATE).fit(dummy_data_frame)
# W: per-row loadings on the components; H: components over the columns.
W = nmf_decomp.transform(dummy_data_frame)
H = nmf_decomp.components_

# Add jittering to help visualisation (many rows share identical counts).
W_jit = W + np.random.normal(scale=0.025, size=W.shape)

In [None]:
# Heat map of the NMF components: after transposing, each row is one column
# of `dummy_data_frame` (labelled on the y-axis) and each column a component.
ax = plt.figure(figsize=(16, 12)).gca()
num_rows = H.shape[1]
ax.set_yticks(range(num_rows))
ax.set_yticklabels(dummy_data_frame.columns)
plt.imshow(H.T)

So it seems that the components are completely determined by the presence of TP53 and KRAS.

Now let's zoom in on the response:

In [None]:
# Flag patients whose clinical response is either 'SD' or 'PD'.
non_response = df_with_phenotype['Clinical_Response'].isin(['SD', 'PD'])

# One scatter plot for every pair of NMF components (i < j), coloured by the
# flag above. `W_jit` already contains manually added jitter.
for i in range(n_components):
    for j in range(i + 1, n_components):
        plt.figure()
        plt.title('({}, {})'.format(i, j))
        plt.xlabel(r'$W_{i' + str(i) + '}$')
        plt.ylabel(r'$W_{i' + str(j) + '}$')
        # x/y are passed by keyword because newer seaborn releases no longer
        # accept them positionally. The x_jitter/y_jitter arguments were
        # dropped: seaborn documents them as non-functional for scatterplot.
        sns.scatterplot(x=W_jit[:, i], y=W_jit[:, j], hue=non_response)

It turns out that patients with stable disease (SD) have no KRAS mutations.

In [None]:
# Same pairwise component plots, now highlighting only the patients whose
# clinical response equals 'SD'.
is_stable_disease = df_with_phenotype['Clinical_Response'] == 'SD'
for i in range(n_components):
    for j in range(i + 1, n_components):
        plt.figure()
        plt.title('({}, {})'.format(i, j))
        plt.xlabel(r'$W_{i' + str(i) + '}$')
        plt.ylabel(r'$W_{i' + str(j) + '}$')
        # Keyword x/y: newer seaborn releases reject positional x and y.
        sns.scatterplot(x=W_jit[:, i], y=W_jit[:, j], hue=is_stable_disease)

### PCA
PCA essentially gives the same results as NMF.

In [None]:
# Project the count matrix onto its first two principal components. The seed
# is passed explicitly so the result does not depend on the global RNG state
# should a randomised SVD solver be selected.
pca_decomp = PCA(n_components=2, random_state=RANDOM_STATE).fit(dummy_data_frame)
L = pca_decomp.transform(dummy_data_frame)
# Add jittering to help visualisation.
L += np.random.normal(scale=0.075, size=L.shape)
# Keyword x/y: newer seaborn releases reject positional x and y.
sns.scatterplot(x=L[:, 0], y=L[:, 1], hue=df_with_phenotype['Clinical_Response'] == 'SD')

### t-SNE

In [None]:
# Two-dimensional t-SNE embedding of the count matrix. t-SNE is stochastic,
# so the seed is passed explicitly to get the same embedding on every run.
X_embed = TSNE(n_components=2, random_state=RANDOM_STATE).fit_transform(dummy_data_frame)
# Keyword x/y: newer seaborn releases reject positional x and y.
sns.scatterplot(x=X_embed[:, 0], y=X_embed[:, 1], hue=df_with_phenotype['Clinical_Response'])

## Term frequency - inverse document frequency (TF-IDF)
Since the matrix decomposition is completely dominated by TP53 and KRAS, let us use TF-IDF to help find patterns.

In [None]:
# Re-weight the raw counts with TF-IDF so that columns occurring in many rows
# are down-weighted. The result is a sparse matrix (it is densified with
# `.toarray()` before plotting). `TfidfTransformer` is imported in the import
# cell at the top of the notebook.
X_tfidf = TfidfTransformer().fit_transform(dummy_data_frame)

In [None]:
# Heat map of the raw count matrix, transposed so that the columns of
# `dummy_data_frame` run along the vertical axis.
plt.imshow(dummy_data_frame.transpose())

In [None]:
# Heat map of the TF-IDF weighted matrix. It is transposed so that it has the
# same orientation as the heat map of the raw counts (`dummy_data_frame.T`),
# which makes the two figures directly comparable.
plt.imshow(X_tfidf.toarray().T)

Let's do the same NMF trick again and see if the results differ substantially.

In [None]:
# Repeat the three-component NMF, this time on the TF-IDF weighted matrix.
n_components = 3
# Pass the seed explicitly so the factorisation does not depend on how much of
# NumPy's global random state earlier cells have consumed.
nmf_decomp = NMF(n_components=n_components, random_state=RANDOM_STATE).fit(X_tfidf)
W_tfidf = nmf_decomp.transform(X_tfidf)
H_tfidf = nmf_decomp.components_

# Add jittering to help visualisation.
W_jit2 = W_tfidf + np.random.normal(scale=0.025, size=W_tfidf.shape)

In [None]:
# Heat map of the TF-IDF based NMF components: after transposing, each row is
# one column of `dummy_data_frame` (labelled on the y-axis) and each column a
# component.
ax = plt.figure(figsize=(16, 12)).gca()
num_rows = H_tfidf.shape[1]
ax.set_yticks(range(num_rows))
ax.set_yticklabels(dummy_data_frame.columns)
plt.imshow(H_tfidf.T)

In [None]:
# Pairwise plots of the TF-IDF based NMF components (i < j), coloured by the
# grouped response. `W_jit2` already contains manually added jitter.
for i in range(n_components):
    for j in range(i + 1, n_components):
        plt.figure()
        # A single title carrying both pieces of information; previously a
        # second plt.title() call overwrote the '(i, j)' title.
        plt.title('Grouped response ({}, {})'.format(i, j))
        plt.xlabel(r'$W_{i' + str(i) + '}$')
        plt.ylabel(r'$W_{i' + str(j) + '}$')
        # x/y are passed by keyword because newer seaborn releases no longer
        # accept them positionally. The x_jitter/y_jitter arguments were
        # dropped: seaborn documents them as non-functional for scatterplot.
        sns.scatterplot(x=W_jit2[:, i], y=W_jit2[:, j], hue=df_with_phenotype['response_grouped'])

In [None]:
# Pairwise plots of the TF-IDF based NMF components, highlighting the
# patients whose clinical response equals 'SD'.
is_stable_disease = df_with_phenotype['Clinical_Response'] == 'SD'
for i in range(n_components):
    for j in range(i + 1, n_components):
        plt.figure()
        plt.title('({}, {})'.format(i, j))
        plt.xlabel(r'$W_{i' + str(i) + '}$')
        plt.ylabel(r'$W_{i' + str(j) + '}$')
        # Keyword x/y: newer seaborn releases reject positional x and y.
        sns.scatterplot(x=W_jit2[:, i], y=W_jit2[:, j], hue=is_stable_disease)