In [16]:
import os
import Bio.PDB
from Bio.SeqUtils import seq1
import pandas as pd
import requests
import warnings
from io import StringIO

# Format variant effects for visualization

In this notebook I prepare and format the predicted variant fitness of SARS-CoV-2 mutants in every gene for [dms-viz](https://dms-viz.github.io/).

In [17]:
# Resolve the input/output locations used by the rest of the notebook.
# Interactive session: no `snakemake` object is defined, so use repo-relative defaults.
if "snakemake" not in locals() and "snakemake" not in globals():
    fitness_effects = "../results/aa_fitness/aa_fitness.csv"
    wildtype_residues = "../results/clade_founder_nts/clade_founder_aas.csv"
    structure_info = "../data/proteins.csv"
    output_dir = "../results/dms-viz/"
    # Only the interactive branch creates the output directory itself
    os.makedirs(output_dir, exist_ok=True)
# Pipeline run: every path comes from the Snakemake rule
else:
    fitness_effects = snakemake.input.aa_fitness
    wildtype_residues = snakemake.input.clade_founder_aas
    structure_info = snakemake.input.structure_info
    output_dir = snakemake.output.outdir
    

Get the fitness effects for all SARS-CoV-2 genes.

In [18]:
# Load the per-mutation fitness estimates for every gene, discard the
# bookkeeping columns, and switch to the column names used downstream
all_gene_fitness_df = (
    pd.read_csv(fitness_effects)
    .drop(columns=['aa_differs_among_clade_founders', 'subset_of_ORF1ab'])
    .rename(columns={'aa_site': 'site', 'aa': 'mutant'})
)

all_gene_fitness_df.head()

Unnamed: 0,gene,site,mutant,fitness,expected_count
0,E,1,I,-6.933,512.29
1,E,1,K,-3.3793,14.176
2,E,1,L,-4.0995,29.655
3,E,1,M,0.0,705.35
4,E,1,R,-2.9282,8.8466


Get the 'wildtype' residues which correspond to clade 19A.

In [19]:
# Reference ('wildtype') residue at each site, taken from the clade 19A (B) founder
wildtype_aa_df = (
    pd.read_csv(wildtype_residues)
    .loc[lambda founders: founders['clade'] == '19A (B)', ['gene', 'site', 'amino acid']]
    .drop_duplicates()
    .rename(columns={'amino acid': 'wildtype'})
    .sort_values(by=['gene', 'site'])
    .reset_index(drop=True)
)
wildtype_aa_df

Unnamed: 0,gene,site,wildtype
0,E,1,M
1,E,2,Y
2,E,3,S
3,E,4,F
4,E,5,V
...,...,...,...
16932,nsp9,109,T
16933,nsp9,110,V
16934,nsp9,111,R
16935,nsp9,112,L


Join the wildtype with the fitness data.

In [20]:
# Annotate every mutation row with the wildtype residue at its (gene, site);
# a left join keeps all fitness rows even where no wildtype is recorded
gene_fitness_df = pd.merge(
    all_gene_fitness_df,
    wildtype_aa_df,
    on=['gene', 'site'],
    how='left',
)
gene_fitness_df

Unnamed: 0,gene,site,mutant,fitness,expected_count,wildtype
0,E,1,I,-6.9330,512.2900,M
1,E,1,K,-3.3793,14.1760,M
2,E,1,L,-4.0995,29.6550,M
3,E,1,M,0.0000,705.3500,M
4,E,1,R,-2.9282,8.8466,M
...,...,...,...,...,...,...
121855,nsp9,113,K,-4.3772,39.3080,Q
121856,nsp9,113,L,-2.6445,20.6140,Q
121857,nsp9,113,P,-2.9487,9.0407,Q
121858,nsp9,113,Q,0.0000,760.0600,Q


Make a separate fitness dataset for each gene; each gene is saved as its own 'experiment'.

In [21]:
# Loop through each gene and save as a separate 'experiment'
for gene, df in gene_fitness_df.groupby('gene'):
    # Drop the gene column. Rebinding to a new frame (instead of inplace=True)
    # avoids mutating a groupby slice and the SettingWithCopyWarning that comes with it.
    df = df.drop(columns=['gene'])
    # Make the 'sitemap'. Sort BEFORE resetting the index so the 0..n-1 index,
    # and therefore sequential_site, follows the sorted reference_site order
    # even when the input rows are not already ordered by site.
    sitemap = (df[['site']]
               .drop_duplicates()
               .rename(columns={'site': 'reference_site'})
               .sort_values(['reference_site'])
               .reset_index(drop=True)
              )
    sitemap['sequential_site'] = sitemap.index + 1
    # Save the data frames for each gene in a per-gene subdirectory
    gene_dir = os.path.join(output_dir, gene)
    os.makedirs(gene_dir, exist_ok=True)
    df.to_csv(os.path.join(gene_dir, f'{gene}_fitness.csv'), index=False)
    sitemap.to_csv(os.path.join(gene_dir, f'{gene}_sitemap.csv'), index=False)
    

## Align the structures

Not all available structures are numbered consistently with the reference sequence. These are the offending structures and how their residue numbering is offset from the index of the reference sequence.

- E: Position 1 in the structure is equivalent to position 8 in the data
- nsp1: Position 5 in the structure is equivalent to position 14 in the data
- nsp3: Position 1 in the structure is equivalent to position 746 in the data
- nsp15: Position 2 in the structure is equivalent to position 1 in the data
- nsp16: Position 6799 in the structure is equivalent to position 1 in the data

In [22]:
# Fitness table for each protein, keyed by the name of its sub-directory in output_dir
proteins = {
    entry.name: pd.read_csv(
        os.path.join(output_dir, entry.name, f"{entry.name}_fitness.csv")
    )
    for entry in os.scandir(output_dir)
    if entry.is_dir()
}
# Structure identifier (the 'pdb' column) for each protein (the 'selection' column)
structures_df = pd.read_csv(structure_info)
structures = structures_df.set_index('selection')['pdb'].to_dict()

In [23]:
def get_structure(pdb_input):
    """
    Fetch a structure from the RCSB PDB web service or load it from a local file.

    This function takes a string as input, which should either be a 4-character PDB ID or
    a path to a local PDB file. A local ``.pdb`` file is parsed with ``PDBParser``; a PDB ID
    is downloaded from RCSB in mmCIF format and parsed with ``MMCIFParser``. In both cases
    ``PDBConstructionWarning`` is suppressed while parsing.

    Parameters
    ----------
    pdb_input : str
        A string that is either a 4-character PDB ID or a path to a local .pdb file.

    Returns
    -------
    structure : Bio.PDB.Structure.Structure
        A Bio.PDB structure object.

    Raises
    ------
    ValueError
        If the pdb_input is not a string, or is neither a valid PDB ID nor a local PDB file path.
        If there was an error reading the local PDB file or parsing the downloaded content.
        If the download from the RCSB PDB web service failed (network error, timeout,
        or a non-200 status code).

    """
    # Seconds to wait for RCSB before giving up, so a stalled connection cannot hang the notebook
    download_timeout = 30

    # Anything that is not a string would otherwise surface as a TypeError/AttributeError
    # from the checks below instead of the documented ValueError
    if not isinstance(pdb_input, str):
        raise ValueError(
            f"Invalid input: {pdb_input}. Please provide a valid PDB ID or a local PDB file path."
        )

    # Check if the input is a local file path
    if os.path.isfile(pdb_input) and pdb_input.endswith(".pdb"):
        try:
            # Ignore warnings about discontinuous chains
            with warnings.catch_warnings():
                warnings.simplefilter(
                    "ignore", category=Bio.PDB.PDBExceptions.PDBConstructionWarning
                )
                # The structure id is the path without its ".pdb" suffix
                structure = Bio.PDB.PDBParser().get_structure(pdb_input[:-4], pdb_input)
        except Exception as e:
            raise ValueError(f"Error reading PDB file {pdb_input}: {e}") from e
    elif len(pdb_input) == 4 and pdb_input.isalnum():  # Check for a valid PDB ID format
        # Try to fetch the structure from RCSB PDB
        try:
            response = requests.get(
                f"https://files.rcsb.org/download/{pdb_input}.cif",
                timeout=download_timeout,
            )
        except requests.RequestException as e:
            # Connection errors and timeouts are reported the same way as bad status codes
            raise ValueError(
                f"Failed to download {pdb_input} from the RCSB database: {e}"
            ) from e
        if response.status_code != 200:
            raise ValueError(
                f"Failed to download {pdb_input} from the RCSB database. Status code: {response.status_code}"
            )
        try:
            # The parser expects a file handle, so wrap the downloaded text
            pdb_file_content = StringIO(response.text)
            # Ignore warnings about discontinuous chains
            with warnings.catch_warnings():
                warnings.simplefilter(
                    "ignore", category=Bio.PDB.PDBExceptions.PDBConstructionWarning
                )
                structure = Bio.PDB.MMCIFParser().get_structure(
                    pdb_input, pdb_file_content
                )
        except Exception as e:
            raise ValueError(f"Error parsing PDB content for {pdb_input}: {e}") from e
    else:
        raise ValueError(
            f"Invalid input: {pdb_input}. Please provide a valid PDB ID or a local PDB file path."
        )

    return structure

In [24]:
def get_structure_sequence(structure):
    """
    Collect the one-letter sequence and residue numbers of every chain.

    Parameters
    ----------
    structure : iterable of models
        A structure whose models yield chains, which in turn yield residues.

    Returns
    -------
    dict
        Maps each chain id to ``[sequence, indices]``: the one-letter amino-acid
        string and the matching list of residue numbers (``residue.id[1]``).
        Every model is visited, so when several models share a chain id the
        entry from the last model is the one kept.
    """
    chain_records = {}
    for model in structure:
        for chain in model:
            # Keep only amino-acid residues (skips waters, ligands, etc.)
            aa_residues = [res for res in chain if Bio.PDB.is_aa(res)]
            one_letter = "".join(seq1(res.get_resname()) for res in aa_residues)
            numbering = [res.id[1] for res in aa_residues]
            chain_records[chain.id] = [one_letter, numbering]

    return chain_records


In [29]:
def make_aligned_sitemap(protein, offset, sitemap, outpath, chains = None):
    """
    Offset the reference numbering onto a structure and record it in the sitemap.

    For every site in the fitness data of `protein`, the structure residue number is
    computed as ``site + offset``. The result is written to a ``protein_site`` column
    of the sitemap, and the percentage of mapped sites whose wildtype residue matches
    the structure residue is printed as a sanity check.

    Parameters
    ----------
    protein : str
        Key into the notebook-level `structures` and `proteins` dictionaries.
    offset : int
        Value added to a data site to obtain the structure residue number.
    sitemap : str
        Path to the sitemap CSV to read.
    outpath : str
        Path the updated sitemap CSV is written to (may equal `sitemap`).
    chains : list of str, optional
        Chain ids to consider. With ``None`` every chain is eligible. When several
        chains are eligible, the last one in the structure is used.

    Raises
    ------
    ValueError
        If no chain in the structure passes the `chains` filter, or if no data site
        lands on a structure residue after applying `offset`.
    """
    # Value written to protein_site for data sites with no residue in the structure
    missing_site = -1000

    # Load the structure into a BioPython object
    structure = get_structure(structures[protein])
    # Get the sequence and index for each chain in the PDB
    structure_sequences = get_structure_sequence(structure)
    # Get the current wildtype residue at each site
    protein_df = proteins[protein][['site', 'wildtype']].drop_duplicates().reset_index(drop=True)
    # Load in the sitemap for this protein
    sitemap_df = pd.read_csv(sitemap)

    # Only get the sequence for the included chains
    eligible = [
        values for chain, values in structure_sequences.items()
        if chains is None or chain in chains
    ]
    if not eligible:
        raise ValueError(
            f"None of the requested chains {chains} were found in the structure for {protein}; "
            f"available chains: {list(structure_sequences)}"
        )
    # Keep the last eligible chain, which is what the original loop ended up using
    sequence, indices = eligible[-1]

    # Get the indices and sequence for the data and structure
    structure_seq = dict(zip(indices, sequence))
    data_seq = dict(zip(protein_df['site'], protein_df['wildtype']))

    site_to_protein_site = {}
    after_offset = []
    for site, res in data_seq.items():
        index = site + offset
        if index in structure_seq:
            after_offset.append(res == structure_seq[index])
            site_to_protein_site[site] = index
        else:
            # Site is not resolved in the structure
            site_to_protein_site[site] = missing_site

    if not after_offset:
        raise ValueError(
            f"No sites for {protein} map onto the structure with offset {offset}."
        )
    print(f"{sum(after_offset) / len(after_offset) * 100}% match after offset.")

    if 'reference_site' in sitemap_df.columns:
        # Join on the site value rather than row position, so a sitemap whose row
        # order differs from the fitness table still gets the right protein_site
        sitemap_df['protein_site'] = (
            sitemap_df['reference_site']
            .map(site_to_protein_site)
            .fillna(missing_site)
            .astype(int)
        )
    else:
        # No reference_site column to join on: fall back to positional assignment
        sitemap_df['protein_site'] = list(site_to_protein_site.values())

    sitemap_df.to_csv(outpath, index=False)



### Envelope (E)

Position 1 in the structure is equivalent to position 8 in the data. 

In [30]:
# Build the path from output_dir so this also works when Snakemake supplies the output directory
e_sitemap_path = os.path.join(output_dir, "E", "E_sitemap.csv")
make_aligned_sitemap("E", offset = -7, sitemap = e_sitemap_path, outpath = e_sitemap_path, chains = None)


100.0% match after offset.


### Nsp1

Position 5 in the structure is equivalent to position 14 in the data.

In [31]:
# Build the path from output_dir so this also works when Snakemake supplies the output directory
nsp1_sitemap_path = os.path.join(output_dir, "nsp1", "nsp1_sitemap.csv")
make_aligned_sitemap("nsp1", offset = -9, sitemap = nsp1_sitemap_path, outpath = nsp1_sitemap_path, chains = None)


100.0% match after offset.


### Nsp 3

Position 1 in the structure is equivalent to position 746 in the data.

In [32]:
# Build the path from output_dir so this also works when Snakemake supplies the output directory
nsp3_sitemap_path = os.path.join(output_dir, "nsp3", "nsp3_sitemap.csv")
make_aligned_sitemap("nsp3", offset = -745, sitemap = nsp3_sitemap_path, outpath = nsp3_sitemap_path, chains = ["A"])


99.68051118210862% match after offset.


### Nsp 15

Position 2 in the structure is equivalent to position 1 in the data

In [35]:
# Build the path from output_dir so this also works when Snakemake supplies the output directory
nsp15_sitemap_path = os.path.join(output_dir, "nsp15", "nsp15_sitemap.csv")
make_aligned_sitemap("nsp15", offset = 1, sitemap = nsp15_sitemap_path, outpath = nsp15_sitemap_path, chains = ["A"])


100.0% match after offset.


### Nsp 16

Position 6799 in the structure is equivalent to position 1 in the data

In [36]:
# Build the path from output_dir so this also works when Snakemake supplies the output directory
nsp16_sitemap_path = os.path.join(output_dir, "nsp16", "nsp16_sitemap.csv")
make_aligned_sitemap("nsp16", offset = 6798, sitemap = nsp16_sitemap_path, outpath = nsp16_sitemap_path, chains = ["A"])


100.0% match after offset.
