# Create interactive plots for the entire NS5 protein
To do this, we will concatenate the relevant datasets that have been generated for each tile. This means the per-tile analysis must be run before this analysis can be completed. 

First, we will concatenate the 'host_adaptation.csv' tables for each tile, which include the average mutation effect and differential selection data for each site in both our Huh-7.5-selected and C6-36-selected conditions. Then we will re-plot in Altair and regenerate a list of the most interesting mutations for each selection condition.

In [1]:
# import necessary Python modules and packages
import glob
import os
import subprocess
import shutil

import Bio.SeqIO

import dms_tools2
from dms_tools2.ipython_utils import showPDF
from dms_tools2.plot import COLOR_BLIND_PALETTE_GRAY as CBPALETTE
import dms_tools2.prefs
import dms_tools2.utils
print(f"Using dms_tools2 {dms_tools2.__version__}")

from IPython.display import display, HTML

import pandas as pd

import altair as alt
from plotnine import *

import numpy

import dms_variants.plotnine_themes

Using dms_tools2 2.6.10


Disable max rows in Altair. This was leading to a bug in the chart generation step.

In [2]:
# Turn off Altair's max-rows check so that charts built from the full
# concatenated dataset (all tiles) are not rejected for having too many rows.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

Create a dictionary of pandas dataframes, one for the host_adaptation.csv file in each tile's results folder.

In [3]:
# Load each tile's 'host_adaptation.csv' into a dataframe keyed by tile name,
# then stack the per-tile dataframes into one dataframe covering all tiles.
results = './results/'

tile_list = ['tile_1', 'tile_2', 'tile_3', 'tile_4',
             'tile_5', 'tile_6', 'tile_7', 'tile_8']

d = {}
for tile in tile_list:
    # pass the path components separately so `os.path.join` supplies the
    # separators instead of relying on hand-built string concatenation
    tilepath = os.path.join(results, tile, 'host_adaptation', 'host_adaptation.csv')
    d[tile] = pd.read_csv(tilepath)

# concatenate in `tile_list` order (driven by the list rather than a
# hard-coded copy of it); each tile's original row index is retained
alltiles_hostadapt = pd.concat([d[tile] for tile in tile_list])

# show which mutation-type labels are present across all tiles
alltiles_hostadapt.mutation_type.unique()

array(['stop', 'all-others', 'conserved-site'], dtype=object)

Now we can save the concatenated file in a new results folder. 

In [4]:
# create the 'all_tiles' directory within the results folder
# (`exist_ok=True` makes re-running this cell safe)
alltiles_dir = './results/all_tiles'
os.makedirs(alltiles_dir, exist_ok=True)

# save the concatenated dataframe as 'alltiles_host_adaptation.csv', without
# the row index; directory and filename are joined as separate components
alltiles_file = os.path.join(alltiles_dir, 'alltiles_host_adaptation.csv')
alltiles_hostadapt.to_csv(alltiles_file, index=False)
print('Saving concatenated data to "results/all_tiles/" folder. Here are first few lines...')
# printing the dataframe gives pandas' abbreviated view (leading and trailing rows)
print(alltiles_hostadapt)

Saving concatenated data to "results/all_tiles/" folder. Here are first few lines...
      site wildtype mutant mutation  muteffect_C636  muteffect_Huh75  \
0        0        R      *      R0*         -6.5355          -6.5253   
1        0        R      A      R0A         -5.4941          -5.4757   
2        0        R      C      R0C         -6.5587          -6.5714   
3        0        R      D      R0D         -4.2456          -4.2537   
4        0        R      E      R0E         -4.6242          -4.6178   
...    ...      ...    ...      ...             ...              ...   
2389   903        L      S    L903S         -4.0944          -8.0479   
2390   903        L      T    L903T         -7.3498          -7.3545   
2391   903        L      V    L903V         -6.9133          -6.7640   
2392   903        L      W    L903W         -4.2839          -3.6808   
2393   903        L      Y    L903Y         -5.8816          -5.8905   

      foldchange_C636  foldchange_Huh75  diffsel_H

Now we can produce the Altair charts for the entire NS5 protein, like we did for the per-tile analysis.

In [5]:
# selection that follows the single mark nearest the mouse pointer
nearest = alt.selection(type='single', empty='none', nearest=True, on='mouseover')

# plotting data: shorter column labels for axes and tooltips, plus a constant
# 'dummy' column that gives the strip chart a single x position
column_labels = {
    'muteffect_C636': 'effect C636',
    'muteffect_Huh75': 'effect Huh75',
    'diffsel_Huh75_vs_C636': 'Huh75 vs C636',
}
plot_df = alltiles_hostadapt.rename(columns=column_labels).assign(dummy=0)

# encodings shared by both panels: the hovered mark is highlighted in orange
# at full opacity, all other marks are gray and semi-transparent
highlight_fill = alt.condition(nearest, alt.value('orange'), alt.value('gray'))
highlight_opacity = alt.condition(nearest, alt.value(1), alt.value(0.4))
hover_fields = ['mutation', 'effect C636', 'effect Huh75', 'Huh75 vs C636']

basechart = (
    alt.Chart(plot_df)
    .add_selection(nearest)
    .encode(
        fill=highlight_fill,
        opacity=highlight_opacity,
        tooltip=hover_fields,
        color='mutation_type',
    )
    .interactive()
)

# left panel: mutation effect in C636 versus Huh75
scatter_panel = (
    basechart
    .encode(x='effect C636:Q', y='effect Huh75:Q')
    .mark_point()
    .properties(width=500, height=500)
)

# right panel: narrow strip of the differential selection values
strip_panel = (
    basechart
    .encode(x=alt.X('dummy:O', title=None), y='Huh75 vs C636:Q')
    .properties(width=50, height=500)
    .mark_tick()
)

# place the two panels side by side
chart = scatter_panel | strip_panel

# write the interactive plot to an HTML file
plotfile = os.path.join(alltiles_dir, 'select_muts_chart.html')
print(f"Saving interactive plot to {plotfile}")
chart.save(plotfile)

# display the chart as the cell output
chart

Saving interactive plot to ./results/all_tiles/select_muts_chart.html


The above interactive plots make it easy to identify mutations.

As mentioned above, Huh-7.5-specific mutations will:
  - have *effect Huh-7.5* $> 0$ in the scatter plot at left (be favorable in Huh-7.5 cells)
  - have *effect C636* $< 0$ in the scatter plot at left (be unfavorable in C636 cells)
  - have *Huh-7.5 vs C636* $> 0$ in the strip chart at right (be favored in Huh-7.5 over C636)
  
The C636-specific mutations will:
  - have *effect Huh-7.5* $< 0$ in the scatter plot at left (be unfavorable in Huh-7.5 cells)
  - have *effect C636* $> 0$ in the scatter plot at left (be favorable in C636 cells)
  - have *Huh-7.5 vs C636* $< 0$ in the strip chart at right (be favored in C636 over Huh-7.5)
  
You can use the mouse to hover over marks and they will turn orange in both the scatter plot and the strip chart, and a box will appear giving detailed information on the mutations.
You can also use the mouse scroll wheel to zoom in and out.

*Note: the interactive plot will only render interactively in the Jupyter notebook itself! If you are viewing an HTML rendering, the plot will be static. In that case, open the interactive plot saved to the HTML file above separately.*

The best way to pick mutations will be to look at the charts above, but below we also simply list what appear to be some of the top candidates in tabular form using simple criteria.

In [6]:
# One entry per host: (heading to print, filter keeping mutations with a
# positive effect in that host, whether to sort differential selection
# ascending). Huh-7.5 candidates have the largest 'diffsel_Huh75_vs_C636',
# C6-36 candidates the smallest.
candidate_tables = [
    ("The top Huh-7.5-specific mutations appear to be...",
     'muteffect_Huh75 > 0',
     False),
    ("The top C6-36-specific mutations appear to be...",
     'muteffect_C636 > 0',
     True),
]

for heading, favorable_filter, sort_ascending in candidate_tables:
    print(heading)
    top_muts = (
        alltiles_hostadapt
        .query(favorable_filter)
        .sort_values('diffsel_Huh75_vs_C636', ascending=sort_ascending)
        .head(n=20)
    )
    # render the top rows as an HTML table without the row index
    display(HTML(top_muts.to_html(index=False)))

The top Huh-7.5-specific mutations appear to be...


site,wildtype,mutant,mutation,muteffect_C636,muteffect_Huh75,foldchange_C636,foldchange_Huh75,diffsel_Huh75_vs_C636,mutation_type
697,K,E,K697E,-2.4412,1.1265,0.1841,2.1833,3.109808,all-others
697,K,D,K697D,-4.2001,1.2247,0.0544,2.3371,2.974878,all-others
22,L,A,L22A,-2.4251,0.4003,0.1862,1.3198,2.874315,all-others
22,L,S,L22S,-2.9724,0.3339,0.1274,1.2604,2.827964,all-others
22,L,G,L22G,-3.2188,0.4154,0.1074,1.3336,2.706757,all-others
697,K,L,K697L,-1.8649,0.4199,0.2745,1.3379,2.509215,all-others
257,V,T,V257T,-2.0868,0.5048,0.2354,1.4189,2.441824,all-others
697,K,A,K697A,0.1298,2.4846,1.0941,5.5968,2.429595,all-others
697,K,M,K697M,-1.3641,1.2061,0.3885,2.3071,2.417421,all-others
697,K,S,K697S,-0.0676,2.4559,0.9542,5.4866,2.410419,all-others


The top C6-36-specific mutations appear to be...


site,wildtype,mutant,mutation,muteffect_C636,muteffect_Huh75,foldchange_C636,foldchange_Huh75,diffsel_Huh75_vs_C636,mutation_type
513,L,T,L513T,0.1834,-3.0965,1.1356,0.1169,-2.661233,all-others
572,Y,Q,Y572Q,0.9593,-1.9276,1.9444,0.2629,-2.433683,all-others
299,H,S,H299S,0.0764,-3.5792,1.0544,0.0837,-2.329245,all-others
171,W,Y,W171Y,0.7507,-1.401,1.6827,0.3787,-2.050828,all-others
581,V,T,V581T,0.0507,-2.1191,1.0358,0.2302,-1.943392,all-others
4,T,F,T4F,0.5062,-1.5112,1.4203,0.3508,-1.893925,all-others
142,T,V,T142V,0.4556,-1.3955,1.3713,0.3801,-1.676757,all-others
787,V,D,V787D,0.395,-2.2839,1.3149,0.2053,-1.609872,all-others
27,Y,F,Y27F,1.1972,-0.726,2.2929,0.6046,-1.524421,all-others
171,W,F,W171F,0.0174,-1.8956,1.0121,0.2688,-1.522757,all-others
