# Figure generation using polyclonal
We would like to use some functions from polyclonal to visualize the selection datasets.

Note: running this notebook requires the conda environment specified in "environment_polyclonal.yml"

In [1]:
# load modules
import os
import glob
import numpy
import pandas as pd
import scipy
import collections 
import requests
import tempfile
import altair as alt

import polyclonal
from polyclonal import pdb_utils

In [2]:
# identify data and results directories
datadir = "./data/"
resultsdir = "./results/"
# pass the components separately so os.path.join supplies the separator itself
# (joining a pre-concatenated string only works while resultsdir ends in "/")
polyclonaldir = os.path.join(resultsdir, 'polyclonal')

In [3]:
# Lift Altair's cap on the number of rows a chart may embed; the
# mutation-level table plotted later in this notebook has ~10,000 rows.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

## Visualize with mutant escape plots
We use polyclonal to visualize antibody escape with heatmaps and graphs.

In [4]:
# wild-type activity at each epitope (read from a small CSV in the data directory);
# this table is passed to polyclonal.Polyclonal as `activity_wt_df` further down
temp_wt_datafile = os.path.join(datadir, 'temp_wt_data.csv')
activity_wt_df = pd.read_csv(temp_wt_datafile)
activity_wt_df

Unnamed: 0,epitope,activity
0,1,3


In [6]:
# configuration: antibody to plot and the summary statistic used as its escape score
antibody = "SiGN_3C"
escape_score_metric = "median"

# column of the input table that holds the chosen per-mutation statistic
column_name = f"{escape_score_metric}mut_diffsel"

# read the per-antibody table and expose the chosen statistic as "escape"
mut_escape_df = pd.read_csv(os.path.join(polyclonaldir, f"{antibody}_data.csv"))
mut_escape_df = mut_escape_df.rename(columns={column_name: "escape"})

# keep only the columns needed downstream, under the names polyclonal is given below
mut_escape_df = mut_escape_df[['condition', 'mutant', 'escape']]
mut_escape_df = mut_escape_df.rename(columns={'condition': 'epitope',
                                              'mutant': 'mutation'})

# discard incomplete and repeated rows (rows with escape <= 0 are kept)
mut_escape_df = mut_escape_df.dropna().drop_duplicates()

# every row is assigned to a single epitope labelled 1
mut_escape_df = mut_escape_df.assign(epitope=1)

mut_escape_df

Unnamed: 0,epitope,mutation,escape
0,1,I1A,-0.000099
1,1,I1Q,-0.000099
2,1,I1P,-0.000099
3,1,I1K,-0.000099
4,1,I1L,-0.597518
...,...,...,...
10074,1,A504C,0.000050
10075,1,A504E,0.000050
10076,1,A504H,0.000050
10077,1,A504M,0.000050


In [7]:
# Build a Polyclonal model object directly from the tables prepared above:
# the wild-type activity table and the per-mutation escape table.
# The alphabet constant is, going by its name, the amino acids plus stop and gap
# characters -- NOTE(review): confirm against the polyclonal documentation.
bnabs_poly = polyclonal.Polyclonal(
    activity_wt_df=activity_wt_df, 
    mut_escape_df=mut_escape_df,
    alphabet=polyclonal.AAS_WITHSTOP_WITHGAP,
)

# quick sanity check of what the model contains
print(f"Epitopes: {bnabs_poly.epitopes}")
print(f"Number of mutations: {len(bnabs_poly.mutations)}")
print(f"Number of sites: {len(bnabs_poly.sites)}")

Epitopes: ('1',)
Number of mutations: 9576
Number of sites: 504


In [8]:
# per-site summary of the mutation escape values (one row per epitope and site)
bnabs_poly.mut_escape_site_summary_df()

Unnamed: 0,epitope,site,wildtype,mean,total positive,max,min,total negative,n mutations
0,1,1,I,-0.097733,0.000000,-0.000094,-0.597518,-1.856926,19
1,1,2,R,-0.000034,0.682980,0.426309,-0.395626,-0.683635,19
2,1,3,C,-0.014047,0.222032,0.154066,-0.263085,-0.488922,19
3,1,4,I,-0.070003,0.207046,0.205969,-0.579148,-1.537104,19
4,1,5,G,0.001577,0.377192,0.193355,-0.347096,-0.347227,19
...,...,...,...,...,...,...,...,...,...
499,1,500,T,-0.221382,0.921918,0.592961,-1.266258,-5.128174,19
500,1,501,A,-0.088251,0.337452,0.315226,-0.681259,-2.014225,19
501,1,502,V,-0.011041,0.347323,0.263112,-0.254745,-0.557108,19
502,1,503,S,-0.092106,0.349752,0.349619,-0.424852,-2.099758,19


In [9]:
# NBVAL_IGNORE_OUTPUT
# display the wild-type activity values as a bar plot
bnabs_poly.activity_wt_barplot()

In [11]:
# NBVAL_IGNORE_OUTPUT
# destination for an SVG export of the plot; currently unused because the
# .save() call at the bottom of this cell is commented out
mutescapeplotfile = os.path.join(polyclonaldir, str(antibody) + '_mut_escape.svg')

# display the mutation-escape plot inline
bnabs_poly.mut_escape_plot()
# to also write the figure to disk, chain onto the call above: .save(mutescapeplotfile)

## Reassign b-factor in PDB files
We want to overwrite the B-factor column in PDB files with data from our selections. This will allow us to visualize the data in PyMOL, which gives us greater flexibility in figure generation than dms-view.

In [12]:
# input tables (file names inside the polyclonal results directory) from which
# the reassigned B-factor PDB files below are generated
polyclonal_datafiles = ['alldata.csv']

In [13]:
# PDB entry used for the main set of reassigned structures
pdb_url = 'https://files.rcsb.org/download/5IRE.pdb'

# make the reassigned_pdb_files output directory (no error if it already exists);
# the path components are passed separately so os.path.join supplies the separator
reassigneddir = os.path.join(resultsdir, "reassigned_pdb_files")
os.makedirs(reassigneddir, exist_ok=True)

In [14]:
# antibodies to generate reassigned B-factor PDB files for
antibodylist = ['EDE1_C10',
                'EDE1_C8',
                'MZ4',
                'ZV_67',
                'ZKA_64']

# B-factor written for residues that have no measurement:
# -1 for sites in chain C, 0 for sites in every other chain
missing_metric = collections.defaultdict(lambda: 0)
missing_metric['C'] = -1

# the structure is the same for every antibody, so download it once up front
r = requests.get(pdb_url, timeout=60)
# stop here on an HTTP error rather than writing an error page out as a PDB file
r.raise_for_status()

for antibody in antibodylist:
    print(f'Generating reassigned B-factor PDB file for {antibody}...')
    for data in polyclonal_datafiles:
        # load the table, drop the per-mutation columns, and keep the rows for this
        # antibody; 'protein_chain'/'protein_site' become the 'chain'/'site' columns
        # that are handed to reassign_b_factor
        dms_data = os.path.join(polyclonaldir, data)
        df = (pd.read_csv(dms_data, index_col=False)
              .drop(['label_site', 'site', 'wildtype', 'mutation'], axis=1)
              .rename(columns={'protein_chain': 'chain',
                               'protein_site': 'site'})
              .drop_duplicates()
              .dropna()
              .query('condition == @antibody')
              )

        # write the downloaded PDB to a scratch directory, reassign its B factors,
        # and read back the lines of the reassigned file
        with tempfile.TemporaryDirectory() as tmpdir:
            original_pdbfile = os.path.join(tmpdir, 'original.pdb')
            with open(original_pdbfile, 'wb') as f:
                f.write(r.content)
            reassigned_pdbfile = os.path.join(tmpdir, 'reassigned.pdb')
            pdb_utils.reassign_b_factor(input_pdbfile=original_pdbfile,
                                        output_pdbfile=reassigned_pdbfile,
                                        df=df,
                                        metric_col='site_median_positive_diffsel',
                                        missing_metric=missing_metric)
            with open(reassigned_pdbfile) as f:
                pdb_text = f.readlines()

        # save the reassigned PDB file; the lines returned by readlines() already end
        # in a newline, so write them unchanged (adding another "\n" would put a
        # blank line after every record)
        outfile = os.path.join(reassigneddir, f"{antibody}_5IRE.pdb")
        with open(outfile, 'w') as f:
            f.writelines(pdb_text)

Generating reassigned B-factor PDB file for EDE1_C10...
Generating reassigned B-factor PDB file for EDE1_C8...
Generating reassigned B-factor PDB file for MZ4...
Generating reassigned B-factor PDB file for ZV_67...
Generating reassigned B-factor PDB file for ZKA_64...


In [15]:
# peek at the first line of the reassigned PDB text
# (pdb_text holds the lines of the last file processed by the loop above)
print(pdb_text[0].strip())

ATOM      1  N   ILE A   1    -161.070 -67.005-130.595  1.00  0.30           N


In [16]:
# peek at a slice of lines from the same reassigned PDB text to see the
# B-factor column changing from one residue to the next
print('\n'.join(line.strip() for line in pdb_text[5010: 5025]))

ATOM   5009  N   THR C 205    -109.384-140.147-124.230  1.00  0.93           N
ATOM   5010  CA  THR C 205    -109.051-140.764-125.507  1.00  0.93           C
ATOM   5011  C   THR C 205    -109.063-139.722-126.612  1.00  0.93           C
ATOM   5012  O   THR C 205    -110.125-139.192-126.945  1.00  0.93           O
ATOM   5013  CB  THR C 205    -110.053-141.848-125.872  1.00  0.93           C
ATOM   5014  OG1 THR C 205    -111.313-141.232-126.144  1.00  0.93           O
ATOM   5015  CG2 THR C 205    -110.243-142.777-124.728  1.00  0.93           C
ATOM   5016  N   MET C 206    -107.908-139.468-127.207  1.00  0.07           N
ATOM   5017  CA  MET C 206    -107.863-138.843-128.515  1.00  0.07           C
ATOM   5018  C   MET C 206    -108.099-139.927-129.555  1.00  0.07           C
ATOM   5019  O   MET C 206    -108.562-141.018-129.234  1.00  0.07           O
ATOM   5020  CB  MET C 206    -106.543-138.126-128.743  1.00  0.07           C
ATOM   5021  CG  MET C 206    -106.392-136.871-127.9

In [17]:
# for EDE1-C10, we'd also like to generate reassigned B-factor files for 2 other cryo-EM structures
pdb_list = ['5H37',
            '5H30']

# generate files for this antibody
antibody = 'EDE1_C10'

# B-factor written for residues that have no measurement:
# -1 for sites in chain C, 0 for sites in every other chain
missing_metric = collections.defaultdict(lambda: 0)
missing_metric['C'] = -1

for pdb in pdb_list:
    print(f'Generating reassigned B-factor PDB file for {antibody}...')
    for data in polyclonal_datafiles:
        # load the table, drop the per-mutation columns, and keep the rows for this
        # antibody; 'protein_chain'/'protein_site' become the 'chain'/'site' columns
        # that are handed to reassign_b_factor
        dms_data = os.path.join(polyclonaldir, data)
        df = (pd.read_csv(dms_data, index_col=False)
              .drop(['label_site', 'site', 'wildtype', 'mutation'], axis=1)
              .rename(columns={'protein_chain': 'chain',
                               'protein_site': 'site'})
              .drop_duplicates()
              .dropna()
              .query('condition == @antibody')
              )

        # download this structure; a separate name is used so the notebook-level
        # `pdb_url` (the 5IRE entry) is not overwritten by this cell
        structure_url = 'https://files.rcsb.org/download/' + pdb + '.pdb'
        r = requests.get(structure_url, timeout=60)
        # stop here on an HTTP error rather than writing an error page out as a PDB file
        r.raise_for_status()

        # write the downloaded PDB to a scratch directory, reassign its B factors,
        # and read back the lines of the reassigned file
        with tempfile.TemporaryDirectory() as tmpdir:
            original_pdbfile = os.path.join(tmpdir, 'original.pdb')
            with open(original_pdbfile, 'wb') as f:
                f.write(r.content)
            reassigned_pdbfile = os.path.join(tmpdir, 'reassigned.pdb')
            pdb_utils.reassign_b_factor(input_pdbfile=original_pdbfile,
                                        output_pdbfile=reassigned_pdbfile,
                                        df=df,
                                        metric_col='site_median_positive_diffsel',
                                        missing_metric=missing_metric)
            with open(reassigned_pdbfile) as f:
                pdb_text = f.readlines()

        # save the reassigned PDB file; the lines returned by readlines() already end
        # in a newline, so write them unchanged (adding another "\n" would put a
        # blank line after every record)
        outfile = os.path.join(reassigneddir, f"{antibody}_{pdb}.pdb")
        with open(outfile, 'w') as f:
            f.writelines(pdb_text)

Generating reassigned B-factor PDB file for EDE1_C10...
Generating reassigned B-factor PDB file for EDE1_C10...


In [18]:
# for MZ4, we'd also like to generate a reassigned B-factor file for one additional structure
pdb_list = ['6NIU']

# generate files for this antibody
antibody = 'MZ4'

# B-factor written for residues that have no measurement:
# -1 for sites in chain C, 0 for sites in every other chain
missing_metric = collections.defaultdict(lambda: 0)
missing_metric['C'] = -1

# NOTE(review): the recorded run of this cell stopped with
#   ValueError: `df` has chains not in PDB: {'C'}
# i.e. the data rows are labelled chain 'C' but 6NIU has no chain with that ID.
# The chain labels in `df` (and the key set in `missing_metric`) presumably need
# remapping to the matching chain of 6NIU before reassign_b_factor can succeed --
# TODO confirm the correct chain ID for this structure.

for pdb in pdb_list:
    print(f'Generating reassigned B-factor PDB file for {antibody}...')
    for data in polyclonal_datafiles:
        # load the table, drop the per-mutation columns, and keep the rows for this
        # antibody; 'protein_chain'/'protein_site' become the 'chain'/'site' columns
        # that are handed to reassign_b_factor
        dms_data = os.path.join(polyclonaldir, data)
        df = (pd.read_csv(dms_data, index_col=False)
              .drop(['label_site', 'site', 'wildtype', 'mutation'], axis=1)
              .rename(columns={'protein_chain': 'chain',
                               'protein_site': 'site'})
              .drop_duplicates()
              .dropna()
              .query('condition == @antibody')
              )

        # download this structure; a separate name is used so the notebook-level
        # `pdb_url` (the 5IRE entry) is not overwritten by this cell
        structure_url = 'https://files.rcsb.org/download/' + pdb + '.pdb'
        r = requests.get(structure_url, timeout=60)
        # stop here on an HTTP error rather than writing an error page out as a PDB file
        r.raise_for_status()

        # write the downloaded PDB to a scratch directory, reassign its B factors,
        # and read back the lines of the reassigned file
        with tempfile.TemporaryDirectory() as tmpdir:
            original_pdbfile = os.path.join(tmpdir, 'original.pdb')
            with open(original_pdbfile, 'wb') as f:
                f.write(r.content)
            reassigned_pdbfile = os.path.join(tmpdir, 'reassigned.pdb')
            pdb_utils.reassign_b_factor(input_pdbfile=original_pdbfile,
                                        output_pdbfile=reassigned_pdbfile,
                                        df=df,
                                        metric_col='site_median_positive_diffsel',
                                        missing_metric=missing_metric)
            with open(reassigned_pdbfile) as f:
                pdb_text = f.readlines()

        # save the reassigned PDB file; the lines returned by readlines() already end
        # in a newline, so write them unchanged (adding another "\n" would put a
        # blank line after every record)
        outfile = os.path.join(reassigneddir, f"{antibody}_{pdb}.pdb")
        with open(outfile, 'w') as f:
            f.writelines(pdb_text)

Generating reassigned B-factor PDB file for MZ4...


ValueError: `df` has chains not in PDB: {'C'}

In [19]:
# for SiGN-3C, we'd also like to generate reassigned B-factor files for 2 other cryo-EM structures
pdb_list = ['7BUA',
            '7BU8']

# generate files for this antibody
antibody = 'SiGN_3C'

# B-factor written for residues that have no measurement:
# -1 for sites in chain C, 0 for sites in every other chain
missing_metric = collections.defaultdict(lambda: 0)
missing_metric['C'] = -1

for pdb in pdb_list:
    print(f'Generating reassigned B-factor PDB file for {antibody}...')
    for data in polyclonal_datafiles:
        # load the table, drop the per-mutation columns, and keep the rows for this
        # antibody; 'protein_chain'/'protein_site' become the 'chain'/'site' columns
        # that are handed to reassign_b_factor
        dms_data = os.path.join(polyclonaldir, data)
        df = (pd.read_csv(dms_data, index_col=False)
              .drop(['label_site', 'site', 'wildtype', 'mutation'], axis=1)
              .rename(columns={'protein_chain': 'chain',
                               'protein_site': 'site'})
              .drop_duplicates()
              .dropna()
              .query('condition == @antibody')
              )

        # download this structure; a separate name is used so the notebook-level
        # `pdb_url` (the 5IRE entry) is not overwritten by this cell
        structure_url = 'https://files.rcsb.org/download/' + pdb + '.pdb'
        r = requests.get(structure_url, timeout=60)
        # stop here on an HTTP error rather than writing an error page out as a PDB file
        r.raise_for_status()

        # write the downloaded PDB to a scratch directory, reassign its B factors,
        # and read back the lines of the reassigned file
        with tempfile.TemporaryDirectory() as tmpdir:
            original_pdbfile = os.path.join(tmpdir, 'original.pdb')
            with open(original_pdbfile, 'wb') as f:
                f.write(r.content)
            reassigned_pdbfile = os.path.join(tmpdir, 'reassigned.pdb')
            pdb_utils.reassign_b_factor(input_pdbfile=original_pdbfile,
                                        output_pdbfile=reassigned_pdbfile,
                                        df=df,
                                        metric_col='site_median_positive_diffsel',
                                        missing_metric=missing_metric)
            with open(reassigned_pdbfile) as f:
                pdb_text = f.readlines()

        # save the reassigned PDB file; the lines returned by readlines() already end
        # in a newline, so write them unchanged (adding another "\n" would put a
        # blank line after every record)
        outfile = os.path.join(reassigneddir, f"{antibody}_{pdb}.pdb")
        with open(outfile, 'w') as f:
            f.writelines(pdb_text)

Generating reassigned B-factor PDB file for SiGN_3C...
Generating reassigned B-factor PDB file for SiGN_3C...
