# Interactively Visualize SNPs

In [15]:
import pandas as pd
import numpy as np
import altair as alt

# Remove Altair's default limit of ~5000 rows so the full SNP table can be
# passed to the charts below.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [16]:
# Path (relative to the notebook's working directory) of the SNP frequency CSV
snps_file = "./expanded_snp_freq.csv"

In [17]:
# Load the SNP frequency table and preview the first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# row index saved into the CSV; consider `index_col=0` — confirm before changing.
snps_df = pd.read_csv(snps_file)
snps_df.head()

Unnamed: 0,SNP,Tissue,AF,Haplotype,Background,Gene_Name,AA_Change,Effect,POS,ALT,REF,DP,ADAR,n,min_freq
0,G330T,Frontal Cortex 2,0.1182,subclonal,subclonal,N,Gly75Cys,Missense,330,T,G,217010,False,1.0,0.1
1,G330T,Parietal Lobe,0.0,subclonal,subclonal,N,Gly75Cys,Missense,330,T,G,167174,False,1.0,0.1
2,G330T,Hippocampus,0.0,subclonal,subclonal,N,Gly75Cys,Missense,330,T,G,96688,False,1.0,0.1
3,G330T,Temporal Lobe,0.0,subclonal,subclonal,N,Gly75Cys,Missense,330,T,G,200923,False,1.0,0.1
4,G330T,Midbrain,0.0,subclonal,subclonal,N,Gly75Cys,Missense,330,T,G,60711,False,1.0,0.1


## Define plot-wide parameters

Define the parameters that carry through to multiple plots for easy adjustment. 

In [84]:
# Shared chart dimensions
width = 1500   # width of the line plot and zoom bar
height = 300   # height of the line plot and scatter plot

# Tooltips for scatterplot and names.
# Each entry is (field shorthand, keyword options passed to alt.Tooltip).
point_tooltip = [
    alt.Tooltip(field, **options)
    for field, options in (
        ("SNP:N", {"title": "Mutation"}),
        ("AF:Q", {"title": "Allele Frequency", "format": ".2f"}),
        ("DP:Q", {"title": "Depth", "format": ".0f"}),
        ("Haplotype:N", {"title": "Haplotype"}),
        ("Background:N", {"title": "Background"}),
        ("Gene_Name:N", {"title": "Gene"}),
        ("AA_Change:N", {"title": "Amino Acid Mutation"}),
    )
]

## Define shared selections

Define the selections that will be shared by the final linked plots. 

In [65]:
# Interactive legend to select the variant background.
# Single-value selection on the "Haplotype" field, bound to the chart legend;
# the chart cells below reuse it for opacity and for filtering.
haplotype_selection = alt.selection_single(fields=["Haplotype"], bind="legend")


## Line + Scatter Plots 

Plot of mutations across every tissue. 

In [48]:
# Line Plot
# Encodings shared by both layers: one series per SNP across tissues,
# coloured by haplotype and faded unless its haplotype is picked in the legend.
base = alt.Chart(snps_df).encode(
    x=alt.X('Tissue:N', title="Tissue"),
    y=alt.Y("AF:Q", title="Allele Frequency"),
    color=alt.Color(
        "Haplotype:N",
        legend=alt.Legend(orient="top", title="Haplotype: "),
    ),
    opacity=alt.condition(haplotype_selection, alt.value(1.0), alt.value(0.01)),
    detail="SNP:N",
)

# Line layer (no point markers drawn on the line itself)
line = base.mark_line(point=False)

# Circle layer carrying the hover tooltips
point = base.mark_circle().encode(tooltip=point_tooltip)

# Layer both marks, register the legend selection, and size the chart
(
    alt.layer(line, point)
    .add_selection(haplotype_selection)
    .properties(width=width, height=height)
)

In [85]:
base = alt.Chart(snps_df)

# Background layer: every SNP series drawn as a faint grey line
background = (
    base
    .mark_line(color="grey", opacity=0.02)
    .encode(x="Tissue:N", y="AF:Q", detail="SNP:N")
    .properties(width=width, height=height)
)

# Foreground layer: series kept by the legend selection, coloured by
# haplotype, with point markers and hover tooltips
selection = (
    base
    .mark_line(point=True)
    .encode(
        x="Tissue:N",
        y="AF:Q",
        detail="SNP:N",
        color="Haplotype:N",
        tooltip=point_tooltip,
    )
    .transform_filter(haplotype_selection)
    .add_selection(haplotype_selection)
    .properties(width=width, height=height)
)

# Overlay the two layers and set the size of the point markers
alt.layer(background, selection).configure_point(size=50)

In [86]:
from scipy.stats import binom

In [95]:
# Worked example: find the smallest number of trials for which the binomial
# probability of seeing more than `k` successes exceeds 0.95.
p = .14                  # success probability per trial
k = 1                    # success-count threshold
n = np.arange(1, 101)    # candidate numbers of trials: 1..100

# P(X > k) for X ~ Binomial(n, p), evaluated for every candidate n
a = 1 - binom.cdf(k, n, p)

# Smallest candidate whose probability is above 0.95
np.min(n[a > .95])

32

In [96]:
def calculate_n_trials(halplotype_prob, maximum_clones, minimum_target_obsv, target_liklihood):
    """Return the smallest number of trials that reaches a target likelihood.

    For every candidate ``n`` in ``1..maximum_clones`` the binomial upper-tail
    probability ``P(X > minimum_target_obsv)`` with
    ``X ~ Binomial(n, halplotype_prob)`` is evaluated, and the smallest ``n``
    whose probability is strictly greater than ``target_liklihood`` is returned.

    NOTE(review): the tail is strict (more than ``minimum_target_obsv``
    observations), not "at least"; confirm this matches the intended meaning
    of "minimum".

    Parameters
    ----------
    halplotype_prob : float
        Per-trial success probability passed to the binomial distribution.
    maximum_clones : int
        Largest number of trials considered.
    minimum_target_obsv : int
        Observation count that must be exceeded.
    target_liklihood : float
        Probability threshold that must be exceeded.

    Returns
    -------
    int
        Smallest qualifying number of trials.

    Raises
    ------
    ValueError
        If no candidate up to ``maximum_clones`` exceeds ``target_liklihood``.
    """
    n = np.arange(1, maximum_clones + 1)
    # Survival function: sf(k) == 1 - cdf(k), computed without the subtraction.
    liklihoods = binom.sf(minimum_target_obsv, n, halplotype_prob)
    reached = n[liklihoods > target_liklihood]
    if reached.size == 0:
        # Previously surfaced as numpy's opaque "zero-size array" ValueError.
        raise ValueError(
            f"no number of trials up to maximum_clones={maximum_clones} gives a "
            f"likelihood above {target_liklihood}"
        )
    return int(reached.min())

In [98]:
# halplotype_prob=0.14, maximum_clones=100, minimum_target_obsv=1, target_liklihood=0.99
calculate_n_trials(.14, 100, 1, .99)

45

In [99]:
# halplotype_prob=0.20, maximum_clones=100, minimum_target_obsv=1, target_liklihood=0.99
calculate_n_trials(.20, 100, 1, .99)

31

In [100]:
# halplotype_prob=0.40, maximum_clones=100, minimum_target_obsv=1, target_liklihood=0.99
calculate_n_trials(.40, 100, 1, .99)

14

In [101]:
# halplotype_prob=0.50, maximum_clones=100, minimum_target_obsv=1, target_liklihood=0.99
calculate_n_trials(.50, 100, 1, .99)

11