# Mutations
In this notebook we will focus on each of the mutations individually.

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import NMF, PCA
from sklearn.manifold import TSNE

from source import load_avenio_files
from transform import dummy_encode_mutations, get_top_correlated, mutation_train_test_split, patient_allele_frequencies


# Fixed seed so that stochastic steps relying on NumPy's global generator are repeatable.
RANDOM_STATE = 1234
np.random.seed(seed=RANDOM_STATE)

In [None]:
# Load data from spreadsheet and SPSS files.
# `load_avenio_files` comes from the project's `source` module and returns two objects:
# the mutation table (it has 'Gene', 'Patient ID' and allele fraction columns, which are
# used below) and the phenotype table (looked up by patient ID further down).
mutation_data_frame, phenotypes = load_avenio_files()

In [None]:
# Vocabulary is the entire dataset, not only training set. Otherwise we run into problems during inference.
gene_vocabulary = mutation_data_frame['Gene'].unique()
allele_columns = ["T0: Allele \nFraction", "T1: Allele Fraction"]

# 1) Convert to float. Values that can not be parsed become NaN (errors='coerce').
# The columns are *replaced* through `assign` instead of written with
# `.loc[:, column] = ...`: the latter sets values in place on recent pandas versions
# and therefore keeps the column's original dtype, so the conversion would not stick.
# `assign` also returns a new frame, leaving the object returned by the loader untouched.
columns_to_numeric = allele_columns
mutation_data_frame = mutation_data_frame.assign(
    **{
        column_name: pd.to_numeric(mutation_data_frame[column_name], errors='coerce')
        for column_name in columns_to_numeric
    }
)
# 2) Drop rows for which the columns can not be converted.
mutation_data_frame = mutation_data_frame.dropna(subset=allele_columns)

In [None]:
# Hold out this fraction of the dataset for validation.
f_val = 0.3
# Partition the mutation table into a training part and a held-out part.
train_mutations, test_mutations = mutation_train_test_split(mutation_data_frame, test_fraction=f_val)

Calculate fraction as $$f_t = \frac{\Delta t}{t_0} \equiv \frac{t_1 - t_0}{t_0} .$$

In [None]:
def f_t(t_0, t_1):
    """Return the relative change from `t_0` to `t_1`, i.e. ``(t_1 - t_0) / t_0``.

    Works on anything that supports subtraction and true division, so both plain
    numbers and element-wise containers (e.g. the pandas columns it is applied to
    later in this notebook) are accepted. No special handling of `t_0 == 0` is done.
    """
    delta_t = t_1 - t_0
    return delta_t / t_0

In [None]:
# Build the per-patient table from the training mutations only, passing the full gene
# vocabulary and the relative-change function `f_t` to the project helper.
# NOTE(review): presumably one row per patient and one column per gene — confirm in `transform`.
patient_mutation_frequencies = patient_allele_frequencies(train_mutations, gene_vocabulary, f_t)

# Correlations
How do the genes' fractions correlate?

In [None]:
# Pairwise correlation between the columns; undefined entries (NaN) are shown as 0.
corr = patient_mutation_frequencies.corr()
corr = corr.fillna(0)
# Colour the whole matrix on one common scale (axis=None).
corr.style.background_gradient(axis=None, cmap='coolwarm')

There is essentially no negative correlation.

In [None]:
# Number of rows per gene in the training set, handed to the ranking helper.
gene_counts = train_mutations['Gene'].value_counts()
# Four entries from the ascending end of the correlation ranking.
get_top_correlated(corr, top_count=4, ascending=True, gene_counts=gene_counts)

### Top correlators

In [None]:
# Same ranking helper, now in descending order and with its default `top_count`.
# NOTE(review): presumably this lists the most strongly correlated pairs first — confirm in `transform`.
get_top_correlated(corr, gene_counts=gene_counts, ascending=False)

## Do responders show an increase in mutational allele frequency?

Calculate quantities to compare.

In [None]:
# Allele fractions at the two time points (T0 and T1).
allele_t0 = mutation_data_frame[allele_columns[0]]
allele_t1 = mutation_data_frame[allele_columns[1]]

# Relative change, absolute change and ratio between T1 and T0.
mutation_data_frame['f_t'] = f_t(allele_t0, allele_t1)
mutation_data_frame['dt'] = allele_t1 - allele_t0
mutation_data_frame['ratio'] = allele_t1 / allele_t0

Add phenotype data.

In [None]:
# Copy two phenotype fields onto every mutation row, looked up by the row's patient ID.
for target_column, phenotype_column in (
    ('response', 'response_grouped'),
    ('progression', 'progressie'),
):
    mutation_data_frame[target_column] = mutation_data_frame['Patient ID'].apply(
        lambda patient_id, source=phenotype_column: phenotypes.loc[patient_id, source]
    )

In [None]:
# Boolean mask selecting the rows of the four genes that are compared in the plots.
genes_of_interest = ['TP53', 'KRAS', 'PIK3CA', 'NFE2L2']
gene_subset = mutation_data_frame['Gene'].isin(genes_of_interest)

# Violin plot of the relative change `f_t` per gene, split by response group.
g = sns.catplot(
    data=mutation_data_frame[gene_subset],
    kind='violin',
    x='Gene',
    y='f_t',
    hue='response',
)
g.fig.set_size_inches(16, 8)

Absolute difference

In [None]:
# Violin plot of the absolute change `dt` per gene, split by response group.
# Options that are currently switched off: faceting with col='progression' and
# col_wrap=1, and clipping the y-axis with g.ax.set_ylim([-.5, .5]).
g = sns.catplot(
    data=mutation_data_frame[gene_subset],
    kind='violin',
    x='Gene',
    y='dt',
    hue='response',
)
g.fig.set_size_inches(16, 8)

In [None]:
# Violin plot of the T1/T0 `ratio` per gene, split by response group.
# Options that are currently switched off: faceting with col='progression' and
# clipping the y-axis with g.ax.set_ylim([-.5, .5]).
g = sns.catplot(
    data=mutation_data_frame[gene_subset],
    kind='violin',
    x='Gene',
    y='ratio',
    hue='response',
)
g.fig.set_size_inches(16, 8)