In [1]:
import glob
import gzip
import itertools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from scipy import stats
import sys

from Bio.Seq import Seq
from collections import Counter
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as offline
from plotly.subplots import make_subplots
import seaborn as sns

import matrix_transform
import visualize

%matplotlib inline
sns.set(font="Arial")
sns.set_theme(style="ticks")

In [2]:
# Output location for exported figures. It is used as a plain string prefix
# when building file names, so the trailing slash is required.
fig_folder = 'Figures/'
# Make sure the folder exists so figure export works on a fresh checkout.
os.makedirs(fig_folder, exist_ok=True)

# Path to the sample spreadsheet (a CSV file, despite the `_dir` suffix).
sample_dir = 'sample_spreadsheet_final.csv'
samples = pd.read_csv(sample_dir)

# Single-letter residue codes plus '*' (presumably the stop codon), stored in
# reverse alphabetical order: 'Y' first, '*' last.
amino_acid_list = list('*ACDEFGHIKLMNPQRSTVWY')[::-1]

# The same 21 symbols in a hand-picked grouped order.
# NOTE(review): the grouping looks physicochemical -- confirm the intent.
grouped_aa = list('HKRDECMNQSTAILVFWYGP*')

# Wild-type sequence as a list of one-character strings.
# NOTE(review): compared with wt_full, this sequence has no leading 'M', ends
# with '*', and lacks the 24-residue stretch 'ACYNGSPSGVYQCAMRPNFTIKGS' that
# follows '...QTFSVL' in wt_full -- confirm the omission is intentional.
wt_ = list(
    'SGFRKMAFPSGKVEGCMVQVTCGTTTLNGLWLDDVVYCPRHVICTSEDM'
    'LNPNYEDLLIRKSNHNFLVQAGNVQLRVIGHSMQNCVLKLKVDTANPKTP'
    'KYKFVRIQPGQTFSVLFLNGSCGSVG'
    'FNIDYDCVSFCYMHHMELPTGVHAGTDLEGNFYGPFVDRQTAQAAGTDTT'
    'ITVNVLAWLYAAVINGDRWFLNRFTTTLNDFNLVAMKYNYEPLTQDHVDI'
    'LGPLSAQTGIAVLDMCASLKELLQNGMNGRTILGSALLEDEFTPFDVVRQCSGVTFQ*'
)

# Full-length wild-type sequence (starts with 'M', no trailing '*'), also as
# a list of one-character strings.
wt_full = list(
    'MSGFRKMAFPSGKVEGCMVQVTCGTTTLNGLWLDDVVYCPRHVICT'
    'SEDMLNPNYEDLLIRKSNHNFLVQAGNVQLRVIGHSMQNCVLKLKV'
    'DTANPKTPKYKFVRIQPGQTFSVLACYNGSPSGVYQCAMRPNFTIK'
    'GSFLNGSCGSVGFNIDYDCVSFCYMHHMELPTGVHAGTDLEGNFYG'
    'PFVDRQTAQAAGTDTTITVNVLAWLYAAVINGDRWFLNRFTTTLND'
    'FNLVAMKYNYEPLTQDHVDILGPLSAQTGIAVLDMCASLKELLQNG'
    'MNGRTILGSALLEDEFTPFDVVRQCSGVTFQ'
)

# Library set identifiers: integers 1-20 followed by the string label 'R1'.
sets = list(range(1, 21)) + ['R1']
# Set 21 is kept in its own list and passed separately to the loader.
set21 = [21]
# Labels of the redo sets.
res_redo = ['8R', '13R1', '13R2', '14R', '16R']

# Every set label in one list; sets 8-10 are deliberately placed after 21,
# followed by 'R1' and the redo labels.
all_sets = (
    list(range(1, 8))
    + list(range(11, 22))
    + [8, 9, 10, 'R1']
    + ['8R', '13R1', '13R2', '14R', '16R']
)

## Activity (glu/gal)

In [14]:
# Load both replicates of the glu/gal scores with identical arguments; only
# the replicate index differs (0 -> replicate 1, 1 -> replicate 2).
all_res0, all_res1 = (
    matrix_transform.replicate(rep_idx, 'replicate_folder', '_gal_glu.csv',
                               samples, sets, res_redo, set21)
    for rep_idx in (0, 1)
)

### Plotting the overall correlation

In [15]:
# Flatten each replicate table, row by row, into a single flat list of scores.
rep1 = list(itertools.chain.from_iterable(all_res0.values))
rep2 = list(itertools.chain.from_iterable(all_res1.values))

# Pearson correlation between the replicates, rounded for the on-plot label.
corr = pd.Series(rep1).corr(pd.Series(rep2), method='pearson').round(2)

# Replicate 1 vs replicate 2 as semi-transparent grey markers.
scatter = go.Scatter(
    x=rep1,
    y=rep2,
    mode='markers',
    marker={'color': '#545555', 'size': 8, 'opacity': 0.5},
)
fig = go.Figure()
fig.add_trace(scatter)

# Same frame on both axes: black border mirrored on all sides, no grid lines.
axis_frame = {
    'showline': True,
    'linewidth': 1,
    'linecolor': 'black',
    'mirror': True,
    'showgrid': False,
}
fig.update_xaxes(**axis_frame)
fig.update_yaxes(**axis_frame)

# Correlation label, positioned in data coordinates.
fig.add_annotation(
    x=-2.5,
    y=2.,
    xref="x",
    yref="y",
    text="R=" + str(corr),
    showarrow=False,
    font={'family': "Arial", 'size': 16, 'color': "#545555"},
)

# Fixed-size square figure on a white background.
fig.update_layout(
    title="Mutation effects on protease activity",
    xaxis_title="Replicate 1",
    yaxis_title="Replicate 2",
    paper_bgcolor='rgba(255,255,255,100)',
    plot_bgcolor='rgba(255,255,255,100)',
    autosize=False,
    width=800,
    height=800,
    font={'family': "Arial", 'size': 16, 'color': "#020202"},
)
fig.show()
fig.write_image(fig_folder + "Fig1_glu_gal_replicate_correlation.pdf")

In [16]:
# Pair the two replicates, keep only rows where both values are present,
# and report Pearson's r together with its p-value.
replicates = pd.DataFrame({'rep1': rep1, 'rep2': rep2}).dropna()
stats.pearsonr(replicates['rep1'], replicates['rep2'])

(0.8238969604575074, 0.0)

## Drug resistance (glu/gc)

In [17]:
# Load both replicates of the glu/gc scores with identical arguments; only
# the replicate index differs (0 -> replicate 1, 1 -> replicate 2).
# NOTE(review): set 21 is appended here as the string '21', whereas set21
# holds the integer 21 -- confirm replicate_sigma expects the string form.
all_res0, all_res1 = (
    matrix_transform.replicate_sigma(rep_idx, 'replicate_folder_gc', '_gc.csv',
                                     samples, sets + ['21'], res_redo)
    for rep_idx in (0, 1)
)

### Plotting correlation

In [18]:
# Flatten each replicate table, row by row, into a single flat list of scores.
rep1 = list(itertools.chain.from_iterable(all_res0.values))
rep2 = list(itertools.chain.from_iterable(all_res1.values))

# Pearson correlation between the replicates, rounded for the on-plot label.
corr = pd.Series(rep1).corr(pd.Series(rep2), method='pearson').round(2)

# Replicate 1 vs replicate 2 as semi-transparent grey markers.
scatter = go.Scatter(
    x=rep1,
    y=rep2,
    mode='markers',
    marker={'color': '#545555', 'size': 8, 'opacity': 0.5},
)
fig = go.Figure()
fig.add_trace(scatter)

# Same frame on both axes: black border mirrored on all sides, no grid lines.
axis_frame = {
    'showline': True,
    'linewidth': 1,
    'linecolor': 'black',
    'mirror': True,
    'showgrid': False,
}
fig.update_xaxes(**axis_frame)
fig.update_yaxes(**axis_frame)

# Correlation label, positioned in data coordinates.
fig.add_annotation(
    x=-6.9,
    y=5.5,
    xref="x",
    yref="y",
    text="R=" + str(corr),
    showarrow=False,
    font={'family': "Arial", 'size': 16, 'color': "#545555"},
)

# Fixed-size square figure on a white background.
fig.update_layout(
    title="Mutation effects on protease activity in presence of gc",
    xaxis_title="Replicate 1",
    yaxis_title="Replicate 2",
    paper_bgcolor='rgba(255,255,255,100)',
    plot_bgcolor='rgba(255,255,255,100)',
    autosize=False,
    width=800,
    height=800,
    font={'family': "Arial", 'size': 16, 'color': "#020202"},
)
fig.show()
# NOTE(review): the file name contains the typo "correlaiont"; it is kept
# unchanged in case other files reference it.
fig.write_image(fig_folder + "Fig1_glu_gc_replicate_correlaiont.pdf")

In [19]:
# Pair the two replicates, keep only rows where both values are present,
# and report Pearson's r together with its p-value.
replicates = pd.DataFrame({'rep1': rep1, 'rep2': rep2}).dropna()
stats.pearsonr(replicates['rep1'], replicates['rep2'])

(0.6341918833534984, 0.0)

## Drug condition (glu/grl)

In [20]:
# Replicate 1
all_res0 = matrix_transform.replicate_sigma(0, 'replicate_folder_grl', '_grl.csv', 
                                  samples, sets + ['21'], res_redo)
# Replicate 2
all_res1 = matrix_transform.replicate_sigma(1, 'replicate_folder_grl', '_grl.csv', 
                                  samples, sets + ['21'], res_redo)

### Plotting correlation

In [8]:
# Scatter plot of replicate 1 vs replicate 2 for the glu/grl condition,
# annotated with the Pearson correlation and exported as a PDF.
# NOTE(review): the saved execution count of this cell (8) is lower than that
# of the grl loading cell (20); re-run top to bottom so the figure is drawn
# from the grl data rather than from whatever all_res0/all_res1 held earlier.

# Flatten each replicate table, row by row, into a single flat list of scores.
rep1 = [item for sublist in all_res0.values for item in sublist]
rep2 = [item for sublist in all_res1.values for item in sublist]

fig = go.Figure()
fig.add_trace(go.Scatter(x=rep1, y=rep2, marker=dict(
            color='#545555',
            size=8,
            opacity=0.5), mode='markers'))

# Pearson correlation between the replicates, rounded for the on-plot label.
corr = pd.Series(rep1).corr(pd.Series(rep2), method='pearson').round(2)

# Black frame mirrored on all sides, no grid lines, on both axes.
fig.update_xaxes(showline=True, linewidth=1, linecolor='black', mirror=True,
                 showgrid=False)
fig.update_yaxes(showline=True, linewidth=1, linecolor='black', mirror=True,
                 showgrid=False)

# Correlation label, positioned in data coordinates.
fig.add_annotation(
        x=-6.9,
        y=8,
        xref="x",
        yref="y",
        text="R="+str(corr), showarrow=False,
        font=dict(family="Arial",
                  size=16,
                  color="#545555"
                  ))

# Fixed-size square figure on a white background.
fig.update_layout(
    title="Mutation effects on protease activity in presence of grl",
    xaxis_title="Replicate 1",
    yaxis_title="Replicate 2",
    paper_bgcolor='rgba(255,255,255,100)',
    plot_bgcolor='rgba(255,255,255,100)',
    autosize=False,
    width=800,
    height=800,
    font=dict(family="Arial",
              size=16,
              color="#020202"
              )
    )
fig.show()
# The closing quote was missing here, which made the whole cell a SyntaxError.
# NOTE(review): the file name itself ("glu_g", "correlaiont") looks like a
# truncation/typo of "glu_grl ... correlation"; it is kept unchanged in case
# other files reference it -- confirm and rename if safe.
fig.write_image(fig_folder+"Fig1_glu_g_replicate_correlaiont.pdf")

In [27]:
# Pair the two replicates, keep only rows where both values are present,
# and report Pearson's r together with its p-value.
replicates = pd.DataFrame({'rep1': rep1, 'rep2': rep2}).dropna()
stats.pearsonr(replicates['rep1'], replicates['rep2'])

(0.7907895279629658, 0.0)