# Mutations
In this notebook we will focus on each of the mutations individually.

In [None]:
%matplotlib inline
from scipy.stats import pearsonr
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import NMF, PCA
from sklearn.manifold import TSNE

from source import load_avenio_files
from transform import (
    clean_mutation_columns, 
    dummy_encode_mutations, 
    get_top_correlated, 
    mutation_train_test_split, 
    patient_allele_frequencies,
)


# Seed value used for random number generation in this notebook.
RANDOM_STATE = 1234
# Seeds NumPy's global (legacy) random state with RANDOM_STATE.
np.random.seed(RANDOM_STATE)

In [None]:
# Load data from spreadsheet and SPSS files.
# `load_avenio_files` (project module `source`) is unpacked into two objects:
# the mutation table (it has 'Gene' and 'Patient ID' columns) and the phenotype
# table, which is looked up by patient ID via `phenotypes.loc[...]` further down.
mutation_data_frame, phenotypes = load_avenio_files()

Two ways to calculate the differences:
- Using the allele frequencies.
- Using the mutant concentration.

In [None]:
# Vocabulary is the entire dataset, not only training set. Otherwise we run into problems during inference.
# NOTE: taken *before* the cleaning step below, so it is based on every row of the loaded table.
gene_vocabulary = mutation_data_frame['Gene'].unique()

# Convert particular columns to numbers and drop rows with missing data.
# NOTE(review): this rebinds `mutation_data_frame`, so re-running the cell passes
# already-cleaned data to `clean_mutation_columns` again — presumably harmless, confirm.
mutation_data_frame = clean_mutation_columns(mutation_data_frame)

Calculate the r-fraction as $$r = \frac{\Delta t}{t_0} \equiv \frac{t_1 - t_0}{t_0} .$$

In [None]:
def r(t_0, t_1):
    """Return the relative change from `t_0` to `t_1`, i.e. (t_1 - t_0) / t_0.

    Only arithmetic operators are applied, so the arguments may be plain
    numbers or objects that overload `-` and `/` (in this notebook the
    function is also called with pandas columns).
    """
    difference = t_1 - t_0
    return difference / t_0

In [None]:
# Options shared by both tables below:
# - transformation: calculate r(t_0, t_1);
# - handle_duplicates: sum mutation values per gene in each patient.
r_fraction_options = {"transformation": r, "handle_duplicates": "sum"}

# r-fraction of the allele frequencies.
mutant_allele_frequencies = patient_allele_frequencies(
    mutation_data_frame,
    gene_vocabulary,
    allele_columns=["T0: Allele \nFraction", "T1: Allele Fraction"],
    **r_fraction_options,
)

# r-fraction of the mutant concentrations (mutant molecules per mL).
concentration_columns = [
    "T0: No. Mutant \nMolecules per mL",
    "T1: No. Mutant \nMolecules per mL",
]
mutant_allele_concentration = patient_allele_frequencies(
    mutation_data_frame,
    gene_vocabulary,
    allele_columns=concentration_columns,
    **r_fraction_options,
)

# Correlations
How do the genes' r-fractions correlate? We calculate the Pearson correlation coefficient, which is defined as:
$$C_{ij} = \frac{1}{N} \sum_{m=1}^{N} \frac{(X_{mi} - \mu_i)(X_{mj} - \mu_j)}{\sigma_i \sigma_j} \, ,$$
with $\mu_i$ and $\sigma_i$ the mean and (population) standard deviation of column $i$, respectively, and $N$ the number of rows.

In [None]:
def pearson_pval(x, y):
    """Return only the p-value that `scipy.stats.pearsonr` reports for `x` and `y`.

    The correlation coefficient itself (first element of the result) is
    discarded, which makes this usable as a `method=` callable for
    `DataFrame.corr` to build a matrix of p-values.
    """
    _, p_value = pearsonr(x, y)
    return p_value

We will do a comparison for:
- the allele frequency.
- the mutant concentration.

## Allele frequency

In [None]:
# Pairwise correlation between the columns of the allele-frequency table
# (pandas' default method, Pearson); undefined (NaN) entries are set to 0.
corr = mutant_allele_frequencies.corr()
corr = corr.fillna(0)

# Matching matrix of p-values; undefined (NaN) entries are set to 1.
pval_corr = mutant_allele_frequencies.corr(method=pearson_pval)
pval_corr = pval_corr.fillna(1)

# Show the correlations as a heat map, with one colour scale for the whole table.
corr.style.background_gradient(axis=None, cmap='coolwarm')

### Negative correlation

In [None]:
# Number of rows in the mutation table for each gene.
gene_counts = mutation_data_frame['Gene'].value_counts()

# Four entries, ranked in ascending order (this cell sits under the
# "Negative correlation" heading).
ranking_options = dict(gene_counts=gene_counts, ascending=True, top_count=4)
get_top_correlated(corr, pval_corr, **ranking_options)

The p-values should not be taken too seriously. The fact that the p-values are extremely low is easy to understand:
- All columns are zero.
- Except the columns where the two unique mutations happen to coincide.

This immediately implies that the p-value should be near zero.

## Mutant concentration
Instead of the allele frequencies, we now use the mutant concentration.

In [None]:
# Pairwise correlation between the columns of the mutant-concentration table
# (pandas' default method, Pearson); undefined (NaN) entries are set to 0.
concentration_correlations = mutant_allele_concentration.corr()
corr = concentration_correlations.fillna(0)

# Matching matrix of p-values; undefined (NaN) entries are set to 1.
concentration_pvalues = mutant_allele_concentration.corr(method=pearson_pval)
pval_corr = concentration_pvalues.fillna(1)

# Show the correlations as a heat map, with one colour scale for the whole table.
corr.style.background_gradient(axis=None, cmap='coolwarm')

### Negative correlation

In [None]:
# Number of rows in the mutation table for each gene.
gene_counts = mutation_data_frame['Gene'].value_counts()

# Four entries, ranked in ascending order (this cell sits under the
# "Negative correlation" heading).
get_top_correlated(
    corr,
    pval_corr,
    top_count=4,
    ascending=True,
    gene_counts=gene_counts,
)

The p-values should not be taken too seriously. The fact that the p-values are extremely low is easy to understand:
- All columns are zero.
- Except the columns where the two unique mutations happen to coincide.

This immediately implies that the p-value should be near zero.

### Positive correlation

In [None]:
# Twenty entries, ranked in descending order (this cell sits under the
# "Positive correlation" heading). Uses `corr`, `pval_corr` and `gene_counts`
# as assigned by the cells that ran before this one.
get_top_correlated(
    corr,
    pval_corr,
    gene_counts=gene_counts,
    ascending=False,
    top_count=20,
)

## Do responders show an increase in mutational allele frequency?

Calculate quantities to compare.

In [None]:
# Allele fractions at the two time points (T0 and T1).
allele_columns = ["T0: Allele \nFraction", "T1: Allele Fraction"]
t0_fraction = mutation_data_frame[allele_columns[0]]
t1_fraction = mutation_data_frame[allele_columns[1]]

# Row-wise comparison of T1 against T0, stored as new columns.
mutation_data_frame["f_t"] = r(t0_fraction, t1_fraction)  # relative change, (T1 - T0) / T0
mutation_data_frame["dt"] = t1_fraction - t0_fraction  # absolute change, T1 - T0
mutation_data_frame["ratio"] = t1_fraction / t0_fraction  # ratio, T1 / T0

In [None]:
# Mutant concentrations (mutant molecules per mL) at the two time points.
# Note: the name `allele_columns` is kept, although these are concentration columns.
allele_columns = [
    "T0: No. Mutant \nMolecules per mL",
    "T1: No. Mutant \nMolecules per mL",
]
t0_concentration = mutation_data_frame[allele_columns[0]]
t1_concentration = mutation_data_frame[allele_columns[1]]

# Row-wise comparison of T1 against T0, stored as new columns (suffix "2").
mutation_data_frame["f_t2"] = r(t0_concentration, t1_concentration)  # relative change, (T1 - T0) / T0
mutation_data_frame["dt2"] = t1_concentration - t0_concentration  # absolute change, T1 - T0
mutation_data_frame["ratio2"] = t1_concentration / t0_concentration  # ratio, T1 / T0


Add phenotype data.

In [None]:
# Attach the phenotype labels of the corresponding patient to every mutation row.
# `phenotypes` is looked up by patient ID, so one label-based `.loc` selection
# per column fetches all values at once (instead of a Python-level lookup per
# row). A patient ID that is absent from `phenotypes` raises a KeyError
# (pandas >= 1.0), just like a per-row `phenotypes.loc[patient_id, column]` would.
patient_ids = mutation_data_frame['Patient ID'].to_numpy()

# `.to_numpy()` drops the patient-ID index of the looked-up values, so they are
# assigned by position (row order of `mutation_data_frame`), not aligned by label.
mutation_data_frame['response'] = phenotypes.loc[patient_ids, 'response_grouped'].to_numpy()
mutation_data_frame['progression'] = phenotypes.loc[patient_ids, 'progressie'].to_numpy()

In [None]:
# Boolean mask selecting the rows of four genes.
genes_to_plot = ['TP53', 'KRAS', 'PIK3CA', 'NFE2L2']
gene_subset = mutation_data_frame['Gene'].isin(genes_to_plot)

# Violin plot of `f_t` (relative change in allele fraction) per gene,
# split by the `response` column.
g = sns.catplot(
    data=mutation_data_frame[gene_subset],
    kind='violin',
    x='Gene',
    y='f_t',
    hue='response',
)
g.fig.set_size_inches(16, 8)

In [None]:
# Boolean mask selecting the rows of four genes, and the matching rows.
gene_subset = mutation_data_frame['Gene'].isin(['TP53', 'KRAS', 'PIK3CA', 'NFE2L2'])
selected_mutations = mutation_data_frame[gene_subset]

# Violin plot of `f_t2` (relative change in mutant concentration) per gene,
# split by the `response` column.
g = sns.catplot(x='Gene', y='f_t2', hue='response', kind='violin', data=selected_mutations)
g.fig.set_size_inches(16, 8)

Absolute difference

In [None]:
# Violin plot of `dt` (absolute change in allele fraction) per gene, split by
# the `response` column. `gene_subset` is the boolean gene mask from an earlier cell.
# Optional tweaks: pass col='progression' with col_wrap=1 to facet by progression,
# or call g.ax.set_ylim([-.5, .5]) afterwards to zoom in.
violin_options = dict(x='Gene', y='dt', hue='response', kind='violin')
g = sns.catplot(data=mutation_data_frame[gene_subset], **violin_options)
g.fig.set_size_inches(16, 8)

In [None]:
# Violin plot of `dt2` (absolute change in mutant concentration) per gene, split
# by the `response` column. `gene_subset` is the boolean gene mask from an earlier cell.
# Optional tweaks: pass col='progression' with col_wrap=1 to facet by progression,
# or call g.ax.set_ylim([-.5, .5]) afterwards to zoom in.
g = sns.catplot(
    data=mutation_data_frame[gene_subset],
    kind='violin',
    hue='response',
    y='dt2',
    x='Gene',
)
g.fig.set_size_inches(16, 8)

In [None]:
# Violin plot of `ratio` (T1 / T0 allele fraction) per gene, split by the
# `response` column. `gene_subset` is the boolean gene mask from an earlier cell.
# Optional tweaks: pass col='progression' to facet by progression, or call
# g.ax.set_ylim([-.5, .5]) afterwards to zoom in.
rows_to_plot = mutation_data_frame[gene_subset]
g = sns.catplot(x='Gene', y='ratio', hue='response', data=rows_to_plot, kind='violin')
g.fig.set_size_inches(16, 8)

In [None]:
# Violin plot of `ratio2` (T1 / T0 mutant concentration) per gene, split by the
# `response` column. `gene_subset` is the boolean gene mask from an earlier cell.
# Optional tweaks: pass col='progression' to facet by progression, or call
# g.ax.set_ylim([-.5, .5]) afterwards to zoom in.
ratio2_plot_options = {
    'x': 'Gene',
    'y': 'ratio2',
    'hue': 'response',
    'kind': 'violin',
}
g = sns.catplot(data=mutation_data_frame[gene_subset], **ratio2_plot_options)
g.fig.set_size_inches(16, 8)