# Mutations
In this notebook we will focus on each of the mutations individually.

In [1]:
%matplotlib inline
from scipy.stats import pearsonr
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import NMF, PCA
from sklearn.manifold import TSNE

from source import load_avenio_files
from transform import dummy_encode_mutations, get_top_correlated, mutation_train_test_split, patient_allele_frequencies


# Fixed seed so that repeated runs of this notebook use the same random numbers.
RANDOM_STATE = 1234
# Seeds NumPy's global (legacy) random number generator.
np.random.seed(RANDOM_STATE)

In [2]:
# Load data from spreadsheet and SPSS files.
# `load_avenio_files` is a project helper imported from `source`; it yields two objects:
# the mutation table (has at least 'Gene' and 'Patient ID' columns, as used below) and
# the phenotype table, which is looked up by patient ID further down.
mutation_data_frame, phenotypes = load_avenio_files()

Two ways to calculate the differences:
- Using the allele frequencies.
- Using the mutant concentration.

In [3]:
# Vocabulary is the entire dataset, not only the training set. Otherwise we run
# into problems during inference.
gene_vocabulary = mutation_data_frame['Gene'].unique()
# Columns containing allele frequencies (T0 first, T1 second).
allele_columns = ["T0: Allele \nFraction", "T1: Allele Fraction"]
# # Columns containing mutant concentration.
# allele_columns = ["T0: No. Mutant \nMolecules per mL", "T1: No. Mutant \nMolecules per mL"]

# 1) Convert to float; values that can not be parsed become NaN.
#    The columns are *replaced* through `assign` rather than written with
#    `.loc[:, column] = ...`: on recent pandas versions the `.loc` form sets the
#    values in place and can leave the column with its original `object` dtype.
columns_to_numeric = allele_columns
mutation_data_frame = mutation_data_frame.assign(
    **{
        column_name: pd.to_numeric(mutation_data_frame[column_name], errors='coerce')
        for column_name in columns_to_numeric
    }
)
# 2) Drop rows for which the columns can not be converted.
#    The explicit copy makes the result an independent frame, so that adding
#    columns to it in later cells does not trigger SettingWithCopy warnings.
mutation_data_frame = mutation_data_frame.dropna(subset=allele_columns).copy()

Calculate the r-fraction as $$r = \frac{\Delta t}{t_0} \equiv \frac{t_1 - t_0}{t_0} .$$

In [26]:
def r(t_0, t_1):
    """Return the relative change from `t_0` to `t_1`: (t_1 - t_0) / t_0.

    The arithmetic is applied as-is, so the inputs may be plain numbers or any
    objects that support subtraction and true division with each other.
    """
    delta_t = t_1 - t_0
    return delta_t / t_0

In [27]:
# Build the table of r-values on which the correlation analysis below operates.
# NOTE(review): presumably one row per patient and one column per gene in
# `gene_vocabulary` — confirm against `transform.patient_allele_frequencies`.
patient_mutation_frequencies = patient_allele_frequencies(
    mutation_data_frame, 
    gene_vocabulary, 
    # Calculate r(t_0, t_1).
    r,
    # Sum mutation values per gene in each patient.
    handle_duplicates="sum",
)

# Correlations
How do the genes' fractions correlate? We calculate the Pearson correlation coefficient, which is defined as:
$$C_{ij} = \frac{1}{N} \sum_{m=1}^{N} \frac{(X_{mi} - \mu_i)(X_{mj} - \mu_j)}{\sigma_i \sigma_j} \, ,$$
with $\mu_i$ and $\sigma_i$ the mean and standard deviation of column $i$, respectively, and $N$ the number of rows of $X$.

In [28]:
def pearson_pval(x, y):
    """Return only the p-value of the Pearson correlation between `x` and `y`.

    `scipy.stats.pearsonr` yields the correlation coefficient first and the
    p-value second; the coefficient is discarded here.
    """
    _coefficient, p_value = pearsonr(x, y)
    return p_value

In [29]:
# Pairwise correlation between the columns (pandas' default method is Pearson).
# Undefined entries (NaN) are replaced by 0, i.e. treated as "no correlation".
pearson_matrix = patient_mutation_frequencies.corr()
corr = pearson_matrix.fillna(0)

# Matching matrix of p-values, computed with the helper `pearson_pval`.
# Undefined entries are replaced by 1, i.e. treated as "not significant".
pvalue_matrix = patient_mutation_frequencies.corr(method=pearson_pval)
pval_corr = pvalue_matrix.fillna(1)

# Display the correlation matrix with one colour scale shared by the whole table.
corr.style.background_gradient(cmap='coolwarm', axis=None)

Unnamed: 0,TP53,KRAS,FGFR1,PTEN,FBXW7,KDR,MTOR,EGFR,MET,CDKN2A,BRAF,APC,KEAP1,ALK,AR,ERBB2,NRAS,NFE2L2,TSC2,GNAS,STK11,CD274,CTNNB1,MAP2K2,IDH1,NF2,MAP2K1,PIK3CA,IDH2,FLT4,ESR1,DDR2,KIT,PTCH1,SMAD4,SMO,RNF43,FGFR2,JAK2,CCND1,GATA3,PDGFRA
TP53,1.0,0.375138,0.0555314,-0.0500797,0.0694674,-0.0147783,0.0838118,-0.00841225,0,0.363908,0.0292296,0.0143716,0.116676,0,-0.0147783,-0.00991446,0.0147783,0.143288,-0.0147783,0.0382829,0.0748715,0.0147783,-0.0150245,0.0147783,0.0537418,0,0,0.259162,0.0147783,0.0374386,0.115401,0.00751786,0.0539583,0,-0.0149104,0.114162,0,0,0.154658,0.0147783,0.0838118,0.154119
KRAS,0.375138,1.0,-0.00988976,0.00325205,0.00947636,-0.00988976,0.00988976,-0.0141612,0,-0.00650629,-0.00901678,0.0097515,-0.0229458,0,-0.00988976,-0.00988976,0.00988976,-0.0054182,-0.00988976,0.227301,0.181428,0.0984317,-0.0115278,0.0984317,0.0648421,0,0,0.0200241,0.00988976,0.00988976,0.150237,0.0396138,0.00988976,0,-0.00997813,-0.0789846,0,0,0.00988976,0.00988976,0.00988976,0.0729919
FGFR1,0.0555314,-0.00988976,1.0,0.660249,0.00921345,-0.00961538,0.00961538,-0.00882299,0,-0.00632578,0.00079359,0.00942822,-0.000568427,0,-0.00961538,-0.00961538,0.00961538,0.02441,-0.00961538,-0.00961538,-0.00565118,0.00961538,-0.00977558,0.00961538,0.00961538,0,0,0.00561984,0.00961538,0.00961538,0.00961538,0.00961538,0.00961538,0,-0.0097013,0.00961538,0,0,0.00961538,0.00961538,0.00961538,-0.00961538
PTEN,-0.0500797,0.00325205,0.660249,1.0,0.401111,0.0469781,-0.00316182,0.00290126,0,-0.333554,-0.000260956,-0.00310028,0.000186916,0,0.00316182,0.00316182,-0.00316182,0.208079,0.00316182,0.00316182,0.00185828,-0.00316182,0.0032145,-0.00316182,-0.00316182,0,0,-0.00184797,-0.00316182,-0.00316182,-0.00316182,-0.00316182,-0.00316182,0,0.00319008,-0.00316182,0,0,-0.00316182,-0.00316182,-0.00316182,0.00316182
FBXW7,0.0694674,0.00947636,0.00921345,0.401111,1.0,0.00921345,-0.00921345,0.00845417,0,0.0864404,-0.000760417,-0.00903411,0.000544666,0,0.00921345,0.00921345,-0.00921345,0.201027,0.00921345,0.00921345,0.00541495,-0.00921345,0.00936695,-0.00921345,-0.00921345,0,0,-0.00538492,-0.00921345,-0.00921345,-0.00921345,-0.00921345,-0.00921345,0,0.00929577,-0.00921345,0,0,-0.00921345,-0.00921345,-0.00921345,0.00921345
KDR,-0.0147783,-0.00988976,-0.00961538,0.0469781,0.00921345,1.0,0.00961538,-0.00882299,0,-0.00632578,0.00079359,0.00942822,-0.000568427,0,-0.00961538,-0.00961538,0.00961538,-0.583724,-0.00961538,-0.00961538,-0.00565118,0.00961538,-0.00977558,0.00961538,0.00961538,0,0,0.00561984,0.00961538,0.00961538,0.00961538,0.00961538,0.00961538,0,-0.0097013,0.00961538,0,0,0.00961538,0.00961538,0.00961538,-0.00961538
MTOR,0.0838118,0.00988976,0.00961538,-0.00316182,-0.00921345,0.00961538,1.0,0.00882299,0,0.00632578,-0.00079359,-0.00942822,0.000568427,0,0.00961538,0.00961538,-0.00961538,0.0478529,0.00961538,0.00961538,0.00565118,-0.00961538,0.00977558,-0.00961538,-0.00961538,0,0,0.276855,-0.00961538,-0.00961538,-0.00961538,-0.00961538,-0.00961538,0,0.0097013,-0.00961538,0,0,-0.00961538,-0.00961538,1.0,0.00961538
EGFR,-0.00841225,-0.0141612,-0.00882299,0.00290126,0.00845417,-0.00882299,0.00882299,1.0,0,-0.00580448,0.000728191,0.00865124,-0.000521583,0,-0.00882299,-0.00882299,0.00882299,0.0223984,-0.00882299,-0.00882299,-0.00518547,0.00882299,-0.00896998,0.00882299,0.00882299,0,0,0.00515671,0.00882299,0.00882299,0.00882299,0.00882299,0.00882299,0,-0.00890182,0.00882299,0,0,0.00882299,0.00882299,0.00882299,-0.00882299
MET,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.0
CDKN2A,0.363908,-0.00650629,-0.00632578,-0.333554,0.0864404,-0.00632578,0.00632578,-0.00580448,0,1.0,0.000522088,0.304923,0.116855,0,-0.00632578,-0.00632578,0.00632578,0.0195654,-0.00632578,-0.00632578,-0.00371781,0.00632578,-0.00643118,0.00632578,0.00632578,0,0,0.0266693,0.00632578,0.00632578,0.00632578,0.00632578,0.00632578,0,-0.00638231,0.00632578,0,0,0.0492498,0.00632578,0.00632578,-0.00632578


### Negative correlation

In [30]:
# Number of rows in the mutation table per gene; shown next to each gene pair
# (the '# gene 1' / '# gene 2' columns of the output) to judge how much data
# supports a correlation.
gene_counts = mutation_data_frame['Gene'].value_counts()
# Rank gene pairs by correlation in ascending order, so the most negative
# correlations come first, and keep the first four.
get_top_correlated(
    corr, 
    pval_corr,
    gene_counts=gene_counts, 
    ascending=True, 
    top_count=4,
)

Unnamed: 0,gene 1,gene 2,correlation,p-value,# gene 1,# gene 2
0,NFE2L2,KDR,-0.583724,6.341728e-11,9.0,1.0
2,PTEN,CDKN2A,-0.333554,0.0005067958,4.0,7.0
4,KRAS,SMO,-0.078985,0.4231809,37.0,1.0
6,TP53,PTEN,-0.05008,0.6119164,100.0,4.0


The p-values should not be taken too seriously. The fact that some of the p-values are extremely low is easy to understand:
- Nearly all entries of the two columns are zero.
- The exceptions are the entries where the two rare mutations happen to coincide.

This immediately implies that the p-value is near zero.

### Positive correlation

In [31]:
# Same ranking as for the negative correlations, but in descending order so the
# strongest positive correlations come first. `top_count` is left at the
# helper's default. Relies on `gene_counts` from the previous code cell.
get_top_correlated(corr, pval_corr, gene_counts=gene_counts, ascending=False)

Unnamed: 0,gene 1,gene 2,correlation,p-value,# gene 1,# gene 2
1762,MTOR,GATA3,1.0,0.0,1.0,1.0
1726,CD274,MAP2K2,1.0,0.0,1.0,1.0
1724,SMAD4,CTNNB1,0.999821,2.902454e-179,2.0,2.0
1722,STK11,GNAS,0.929957,1.470575e-46,4.0,1.0
1720,PTEN,FGFR1,0.660249,1.833417e-14,4.0,1.0
1718,JAK2,PIK3CA,0.534706,4.215318e-09,1.0,7.0
1716,FBXW7,PTEN,0.401111,2.230812e-05,4.0,4.0
1714,APC,KEAP1,0.396476,2.827031e-05,2.0,4.0
1712,SMO,STK11,0.375571,7.88319e-05,1.0,4.0
1710,TP53,KRAS,0.375138,8.046542e-05,100.0,37.0


## Do responders show an increase in mutational allele frequency?

Calculate quantities to compare.

In [None]:
# Per-mutation quantities comparing the two time points
# (allele_columns[0] is the T0 column, allele_columns[1] the T1 column).
allele_t0 = mutation_data_frame[allele_columns[0]]
allele_t1 = mutation_data_frame[allele_columns[1]]

# `assign` returns a new frame, which avoids writing into a frame that may be a
# slice of the originally loaded data.
mutation_data_frame = mutation_data_frame.assign(
    # Relative change (t_1 - t_0) / t_0, computed with `r` defined earlier in
    # this notebook. No function named `f_t` exists in the notebook; the column
    # keeps the name 'f_t' because the plots below refer to it.
    f_t=r(allele_t0, allele_t1),
    # Absolute difference t_1 - t_0.
    dt=allele_t1 - allele_t0,
    # Ratio t_1 / t_0.
    ratio=allele_t1 / allele_t0,
)

Add phenotype data.

In [None]:
# Copy each patient's phenotype labels onto every mutation row of that patient.
# Keys are the new columns in the mutation table, values the phenotype columns
# they are read from; `phenotypes` is looked up by patient ID.
patient_ids = mutation_data_frame['Patient ID']
phenotype_columns = {'response': 'response_grouped', 'progression': 'progressie'}
for new_column, phenotype_column in phenotype_columns.items():
    mutation_data_frame[new_column] = patient_ids.apply(
        # Bind the source column as a default argument so each lambda keeps its own.
        lambda patient_id, source=phenotype_column: phenotypes.loc[patient_id, source]
    )

In [None]:
# Restrict the violin plots to a handful of genes.
genes_of_interest = ['TP53', 'KRAS', 'PIK3CA', 'NFE2L2']
gene_subset = mutation_data_frame['Gene'].isin(genes_of_interest)

# Distribution of the 'f_t' column per gene, split by the 'response' column.
violin_options = dict(x='Gene', y='f_t', hue='response', kind='violin')
g = sns.catplot(data=mutation_data_frame[gene_subset], **violin_options)
g.fig.set_size_inches(16, 8)

Absolute difference

In [None]:
# Violin plot of the absolute difference 'dt' per gene, split by 'response',
# for the genes selected by `gene_subset`.
# Options that were tried and are currently disabled:
#   - facetting with col='progression', col_wrap=1
#   - fixing the y-range with g.ax.set_ylim([-.5, .5])
selected_mutations = mutation_data_frame[gene_subset]
g = sns.catplot(data=selected_mutations, x='Gene', y='dt', hue='response', kind='violin')
g.fig.set_size_inches(16, 8)

In [None]:
# Violin plot of the 'ratio' column per gene, split by 'response',
# for the genes selected by `gene_subset`.
# Options that were tried and are currently disabled:
#   - facetting with col='progression'
#   - fixing the y-range with g.ax.set_ylim([-.5, .5])
selected_mutations = mutation_data_frame[gene_subset]
g = sns.catplot(data=selected_mutations, x='Gene', y='ratio', hue='response', kind='violin')
g.fig.set_size_inches(16, 8)