In [1]:
# Optional environment setup for the nglview widget imported in the last cell.
# Everything is commented out; uncomment only the lines you need.
#%pip install nglview
#%pip install nglview==3.0.8
#%pip install -U ipywidgets==7.7.1
# NOTE(review): the next command names no package to upgrade — it looks incomplete.
#%pip install --upgrade
#!jupyter nbextension enable --py widgetsnbextension


**Note**: This code depends on the SBG workspace and will not work on other machines. Please contact Leandro Radusky if you want to install the dependencies and run this script yourself.

Tested in python 3.8


# PERM analysis

Sample pipeline to study the stability, aggregability and other structural features of the PERM protein. To be discussed and then replicated on the other biomarkers.

In [2]:
# Make the SBG workspace importable.
# The location defaults to the original author's checkout but can be overridden
# with the SBG_WORKSPACE_PATH environment variable, so the notebook can run on
# another machine without editing this cell.
import os
import sys

SBG_WORKSPACE_PATH = os.environ.get('SBG_WORKSPACE_PATH', '/home/leandro/Dropbox/workspacesbg/sbg/')
# Guard against duplicate entries when the cell is re-run in the same kernel.
if SBG_WORKSPACE_PATH not in sys.path:
    sys.path.append(SBG_WORKSPACE_PATH)

In [3]:
# Imports
from sbg.scripts.foldx.TangoHandler import TangoHandler
from sbg.structure.Structure import Structure
from sbg.structure.AlphaFoldHandler import AlphaFoldHandler
from sbg.orf.Orf import OnlineOrf
from sbg.orf.FastaHandler import FastaHandler
from sbg.pdbtools.pdb_bfactor import pdbBfactor
from sbg.common.FileHandler import FileHandler

from pyfoldx.structure import Structure as PyFoldxStructure


In [4]:
# Notebook-wide constants; change these to replicate the analysis on another protein.
# Key of the protein of interest (matches the 'PERM' record looked up in the antigens FASTA).
PROTEIN_CODE = 'PERM'
# UniProt accession of the protein. The name is spelled "ACCESION" and is
# referenced under that spelling by later cells, so it is kept as-is.
PERM_UNIPROT_ACCESION = 'P05164'
# Relative path to the FASTA file holding the MiMARK antigen sequences.
MIMARK_ANTIGENS_SEQUENCES_PATH = './data/antigens.fasta'


## Step 1: Compare the sequences of the MiMARK PERM antigen vs the UniProt entry

In [5]:
# Build the UniProt entry object for the accession set in the constants cell.
# NOTE(review): the class name suggests the record is fetched online — confirm
# whether this cell needs network access.
perm_up_obj = OnlineOrf(PERM_UNIPROT_ACCESION)

In [6]:
# Pull the amino-acid sequence text out of the UniProt entry object
# and echo it as the cell output for visual inspection.
perm_up_seq = perm_up_obj.orf.sequence.text
perm_up_seq

'MGVPFFSSLRCMVDLGPCWAGGLTAEMKLLLALAGLLAILATPQPSEGAAPAVLGEVDTSLVLSSMEEAKQLVDKAYKERRESIKQRLRSGSASPMELLSYFKQPVAATRTAVRAADYLHVALDLLERKLRSLWRRPFNVTDVLTPAQLNVLSKSSGCAYQDVGVTCPEQDKYRTITGMCNNRRSPTLGASNRAFVRWLPAEYEDGFSLPYGWTPGVKRNGFPVALARAVSNEIVRFPTDQLTPDQERSLMFMQWGQLLDHDLDFTPEPAARASFVTGVNCETSCVQQPPCFPLKIPPNDPRIKNQADCIPFFRSCPACPGSNITIRNQINALTSFVDASMVYGSEEPLARNLRNMSNQLGLLAVNQRFQDNGRALLPFDNLHDDPCLLTNRSARIPCFLAGDTRSSEMPELTSMHTLLLREHNRLATELKSLNPRWDGERLYQEARKIVGAMVQIITYRDYLPLVLGPTAMRKYLPTYRSYNDSVDPRIANVFTNAFRYGHTLIQPFMFRLDNRYQPMEPNPRVPLSRVFFASWRVVLEGGIDPILRGLMATPAKLNRQNQIAVDEIRERLFEQVMRIGLDLPALNMQRSRDHGLPGYNAWRRFCGLPQPETVGQLGTVLRNLKLARKLMEQYGTPNNIDIWMGGVSEPLKRKGRVGPLLACIIGTQFRKLRDGDRFWWENEGVFSMQQRQALAQISLPRIICDNTGITTVSKNNIFMSNSYPRDFVNCSTLPALNLASWREAS'

In [7]:
# Load the MiMARK antigen sequences from the FASTA file set in the constants cell.
# Later cells read individual records through `fh.orfs[<protein code>]`.
fh = FastaHandler(MIMARK_ANTIGENS_SEQUENCES_PATH)

In [8]:
# Show the MiMARK antigen sequence as a plain string.
# The record is looked up through PROTEIN_CODE rather than a repeated 'PERM'
# literal, so switching protein only requires editing the constants cell.
str(fh.orfs[PROTEIN_CODE].seq)

'MKWVTFISLLFLFSSAYSRGVFRREAHKSEIAHRFNDVGEEHFIGLVLITFSQYLQKAPYEEHAKLVKEVTDLAKACVADESAANCDKSLHDIFGDKICALPSLRDTYGDVADCCEKKEPERNECFLHHKDDKPDLPPFARPEADVLCKAFHDDEKAFFGHYLYEVARRHPYFYAPELLYYAQKYKAILTECCEAADKGACLTPKLDGGGGSGGGGSGGGASAAPAVLGEVDTSLVLSSMEEAKQLVDKAYKERRESIKQRLRSGSASPMELLSYFKQPVAATRTAVRAADYLHVALDLLERKLRSLWRRPFNVTDVLTPAQLNVLSKSSGCAYQDVGVTCPEQDKYRTITGMCNNRRSPTLGASNRAFVRWLPAEYEDGFSLPYGWTPGVKRNGFPVALARAVSNEIVRFPTDQLTPDQERSLMFMQWGQLLDHDLDFTPEPAARASFVTGVNCETSCVQQPPCFPLKIPPNDPRIKNQADCIPFFRSCPACPGSNITIRNQINALTSFVDASMVYGSEEPLARNLRNMSNQLGLLAVNQRFQDNGRALLPFDNLHDDPCLLTNRSARIPCFLAGDTRSSEMPELTSMHTLLLREHNRLATELKSLNPRWDGERLYQEARKIVGAMVQIITYRDYLPLVLGPTAMRKYLPTYRSYNDSVDPRIANVFTNAFRYGHTLIQPFMFRLDNRYQPMEPNPRVPLSRVFFASWRVVLEGGIDPILRGLMATPAKLNRQNQIAVDEIRERLFEQVMRIGLDLPALNMQRSRDHGLPGYNAWRRFCGLPQPETVGQLGTVLRNLKLARKLMEQYGTPNNIDIWMGGVSEPLKRKGRVGPLLACIIGTQFRKLRDGDRFWWENEGVFSMQQRQALAQISLPRIICDNTGITTVSKNNIFMSNSYPRDFVNCSTLPALNLASWREASGSGHHHHHH'

In [9]:
# Globally align the UniProt sequence against the MiMARK antigen sequence with
# Biopython's pairwise2. `globalxx` scores 1 per identical position and applies
# no mismatch or gap penalties.
# NOTE(review): recent Biopython releases deprecate Bio.pairwise2 in favour of
# Bio.Align.PairwiseAligner — consider migrating once results are validated.
from Bio import pairwise2
from Bio.pairwise2 import format_alignment

# The antigen record is looked up through PROTEIN_CODE instead of a hard-coded
# 'PERM' literal, keeping this cell in sync with the constants cell.
alignments = pairwise2.align.globalxx(perm_up_seq, fh.orfs[PROTEIN_CODE].seq)



In [10]:
# Show the first (best) alignment, labelling which row belongs to which protein
print('PERM Uniprot vs PERM MiMARK')
print("********************************************")
print()
# Unpack the two aligned rows once, then emit them in windows of 80 columns
best_alignment = alignments[0]
uniprot_row, mimark_row = best_alignment[0], best_alignment[1]
line_width = 80
for offset in range(0, len(uniprot_row), line_width):
    window = slice(offset, offset + line_width)
    print("UniProt -> ", uniprot_row[window])
    print("MiMARK  -> ", mimark_row[window])
    print()

PERM Uniprot vs PERM MiMARK
********************************************

UniProt ->  MG--VP-F----F--SSL---RCM-V---------------DL-G-------------------P---------------
MiMARK  ->  M-KWV-TFISLLFLFSS-AYSR--GVFRREAHKSEIAHRFND-VGEEHFIGLVLITFSQYLQKAPYEEHAKLVKEVTDLA

UniProt ->  --CW-AG---------------G-----L-----T----A---EMKL---------L--------L---A------L--A
MiMARK  ->  KAC-VA-DESAANCDKSLHDIFGDKICALPSLRDTYGDVADCCE-K-KEPERNECFLHHKDDKPDLPPFARPEADVLCKA

UniProt ->  ---------G--L--------------L---A-----IL-----A-------TPQP-------SEG-------A-A-PAV
MiMARK  ->  FHDDEKAFFGHYLYEVARRHPYFYAPELLYYAQKYKAILTECCEAADKGACLT--PKLDGGGGS-GGGGSGGGASAAPAV

UniProt ->  LGEVDTSLVLSSMEEAKQLVDKAYKERRESIKQRLRSGSASPMELLSYFKQPVAATRTAVRAADYLHVALDLLERKLRSL
MiMARK  ->  LGEVDTSLVLSSMEEAKQLVDKAYKERRESIKQRLRSGSASPMELLSYFKQPVAATRTAVRAADYLHVALDLLERKLRSL

UniProt ->  WRRPFNVTDVLTPAQLNVLSKSSGCAYQDVGVTCPEQDKYRTITGMCNNRRSPTLGASNRAFVRWLPAEYEDGFSLPYGW
MiMARK  ->  WRRPFNVTDVLTPAQLNVLSKSSGCAYQDVGVTCPEQDKYRTITGMCNNRRSPTLGASNRAFVRWLPAEYEDG

In [11]:
# The long identical stretch shared by both sequences runs from the motif
# 'PAVLGEVDT' to the C-terminal motif 'PALNLASWREAS' of the UniProt sequence.
# Locate it on the UniProt sequence for the later structural analysis.
#
# Coordinates follow Python slicing: `up_match_start` is a 0-based index and
# `up_match_end` is exclusive, i.e. the match is perm_up_seq[up_match_start:up_match_end].
# Add 1 to the start if 1-based residue numbering is needed.
#
# str.index is used instead of str.find so that a missing motif raises
# ValueError instead of silently producing -1-based (wrong) coordinates.
#
# NOTE(review): both sequences actually share 'AAPAVLGEVDT', so the identical
# stretch appears to begin two residues before 'PAVLGEVDT' — confirm which
# start is intended.
MATCH_START_MOTIF = 'PAVLGEVDT'
MATCH_END_MOTIF = 'PALNLASWREAS'
up_match_start = perm_up_seq.index(MATCH_START_MOTIF)
up_match_end = perm_up_seq.index(MATCH_END_MOTIF) + len(MATCH_END_MOTIF)

print( f'Uniprot match start: {up_match_start} Uniprot match end: {up_match_end}')

Uniprot match start: 50 Uniprot match end: 745


## Step 2: Computing protein structure features

In [12]:
# Obtain the structural model for the UniProt accession through AlphaFoldHandler.
# NOTE(review): presumably this downloads the AlphaFold model — confirm whether
# network access or a local cache is required.
perm_pdb_structure = AlphaFoldHandler.get_model(PERM_UNIPROT_ACCESION)

In [13]:
# Compute aggregability scores for the model with the Tango wrapper.
# The result is indexed by "A" in the next cell (presumably a chain ID — confirm).
# NOTE(review): "aggregation/" looks like a working/output directory for Tango — confirm.
perm_agg_dict = TangoHandler.getAggregation(perm_pdb_structure,"aggregation/")

In [14]:
# Map the aggregation scores onto the structure's PDB lines.
# Only the "A" entry of the score dictionary is used (presumably chain A — confirm).
# NOTE(review): the helper name suggests the scores are written into the
# B-factor column of each PDB record — confirm against pdbBfactor.
perm_mapped_agg_file_lines = pdbBfactor(perm_pdb_structure.fileLines, perm_agg_dict["A"])

In [15]:
# Write the aggregation-annotated PDB lines to ./data/<accession>_mapped_agg.pdb.
# The recorded cell output shows the call returning True.
FileHandler.writeLines(f'./data/{PERM_UNIPROT_ACCESION}_mapped_agg.pdb', perm_mapped_agg_file_lines)

True

In [16]:
# Load the aggregation-annotated PDB file written in the previous cell as a
# pyfoldx Structure; the accession is passed as the first argument
# (presumably the structure's code/label — confirm).
perm_pyfoldx_structure = PyFoldxStructure(PERM_UNIPROT_ACCESION, f'./data/{PERM_UNIPROT_ACCESION}_mapped_agg.pdb')

In [17]:
# Run an alanine scan on the structure. The recorded output shows this shelling
# out to a FoldX `AlaScan` command inside a temporary `.foldx_*` directory, so a
# local FoldX installation is required.
perm_alascan_results = perm_pyfoldx_structure.alanineScan()

Performing Alanine Scan...
cd ./.foldx_202471784613831428/; /home/leandro/Dropbox/raduspostdoc/code/foldxEnterprise/build/Release/foldx --command=AlaScan --pdb=tmp.pdb > /dev/null 2> /dev/null
Alanine Scan finished.


In [18]:
# Display the alanine-scan table: one row per residue (e.g. MET_1) with its ddG_ala value.
perm_alascan_results

Unnamed: 0_level_0,ddG_ala
Residue,Unnamed: 1_level_1
MET_1,-0.0182932
GLY_2,3.18904
VAL_3,0.128108
PRO_4,-1.8469
PHE_5,-0.0133675
...,...
TRP_741,5.76942
ARG_742,-0.0577126
GLU_743,-1.29899
ALA_744,0


In [19]:
# Turn the alanine-scan DataFrame into {residue_number: ddG string} for B-factor mapping.

# Take the 'ddG_ala' column as a dict keyed by residue label (e.g. 'MET_1').
ddg_by_label = perm_alascan_results.to_dict()['ddG_ala']

# Drop the three-letter residue name and the underscore so only the residue
# number remains, and coerce every value to a float rounded to 3 decimals.
ddg_by_resnum = {int(label[4:]): round(float(ddg), 3) for label, ddg in ddg_by_label.items()}

# Shift all values so the smallest becomes 0 (non-negative scale).
# The minimum is computed once here rather than once per item inside the
# comprehension; `default` keeps an empty table mapping to an empty dict.
min_ddg = min(ddg_by_resnum.values(), default=0.0)

# Round again after the subtraction: float arithmetic can leave artefacts such
# as 1.9749999..., which plain string truncation would turn into '1.974'.
# Values are kept as strings of at most 5 characters, as before.
# NOTE(review): the 5-character limit is presumably what pdbBfactor expects
# for the B-factor column — confirm.
perm_alascan_dict = {resnum: str(round(ddg - min_ddg, 3))[0:5] for resnum, ddg in ddg_by_resnum.items()}


In [20]:
# Map the normalised alanine-scan values onto the original model's PDB lines
# (perm_pdb_structure.fileLines, not the aggregation-annotated lines).
# NOTE(review): presumably the values are written into the B-factor column,
# keyed by residue number — confirm against pdbBfactor.
perm_mapped_alascan_file_lines = pdbBfactor(perm_pdb_structure.fileLines, perm_alascan_dict)

In [21]:
# Write the alanine-scan-annotated PDB lines to ./data/<accession>_mapped_alascan.pdb.
# The recorded cell output shows the call returning True.
FileHandler.writeLines(f'./data/{PERM_UNIPROT_ACCESION}_mapped_alascan.pdb', perm_mapped_alascan_file_lines)

True

In [22]:
import nglview

# Open the alanine-scan-annotated model in an interactive NGL widget.
# The path is built from PERM_UNIPROT_ACCESION (same file name the notebook
# writes for the alanine-scan mapping) instead of a hard-coded accession, so
# the viewer keeps pointing at the right file when the accession changes.
view = nglview.show_file(f"data/{PERM_UNIPROT_ACCESION}_mapped_alascan.pdb")
view



NGLWidget()