In this notebook, we will be generating features to use for PIPE.

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [13]:
masks_path = "data/yeast_masks_singlesite_domains_only.pkl"
uniprot_file = "data/uniprot-proteome UP000002311.tab"# reference proteome at https://www.uniprot.org/proteomes/UP000002311
pssm_path = "data/yeast_pssms/"

# Load data and masks

## Load masks

In [9]:
ppi_masks = pd.read_pickle(masks_path)
ppi_masks.head()

Unnamed: 0,Uniprot ID A,Uniprot ID B,Domain_id_a,Domain_id_b,Domain positions A,Domain positions B,Sites Masks
0,P53854,P33334,[PF10197],[PF10596],"[(11, 47)]","[(1514, 1672)]","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
1,P38789,Q08235,[PF04427],[PF04427],"[(34, 341)]","[(38, 224)]","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
2,P38789,P38805,[PF04427],[PF04427],"[(34, 341)]","[(101, 269)]","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
3,P38789,P36160,[PF04427],[PF04427],"[(34, 341)]","[(35, 236)]","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
4,P38789,P38789,[PF04427],[PF04427],"[(34, 341)]","[(34, 341)]","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [10]:
ppi_masks.shape

(12917, 7)

## Load sequences

In [11]:
uniprot_df = pd.read_csv(uniprot_file,
                        sep = "\t", index_col='Entry')
print("Loaded UniProt proteome")
uniprot_df

Loaded UniProt proteome


Unnamed: 0_level_0,Entry name,Status,Protein names,Gene names,Organism,Length,Sequence
Entry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
P21192,ACE2_YEAST,reviewed,Metallothionein expression activator,ACE2 YLR131C L3123 L9606.10,Saccharomyces cerevisiae (strain ATCC 204508 /...,770,MDNVVDPWYINPSGFAKDTQDEEYVQHHDNVNPTIPPPDNYILNNE...
P46993,ASG7_YEAST,reviewed,Protein ASG7 (A-specific gene 7 protein),ASG7 YJL170C J0514,Saccharomyces cerevisiae (strain ATCC 204508 /...,209,MTTLASSIEHKTKHLAAPFENDENPWMKKYCCQCKSCKMSVPVQPW...
P47117,ARP3_YEAST,reviewed,Actin-related protein 3 (Actin-like protein AR...,ARP3 ACT4 YJR065C J1760,Saccharomyces cerevisiae (strain ATCC 204508 /...,449,MSYLNNPAVVMDNGTGLTKLGFAGNDSPSWVFPTAIATAAPSNTKK...
P22768,ASSY_YEAST,reviewed,Argininosuccinate synthase (EC 6.3.4.5) (Citru...,ARG1 YOL058W O1228,Saccharomyces cerevisiae (strain ATCC 204508 /...,420,MSKGKVCLAYSGGLDTSVILAWLLDQGYEVVAFMANVGQEEDFDAA...
P29311,BMH1_YEAST,reviewed,Protein BMH1,BMH1 YER177W,Saccharomyces cerevisiae (strain ATCC 204508 /...,267,MSTSREDSVYLAKLAEQAERYEEMVENMKTVASSGQELSVEERNLL...
...,...,...,...,...,...,...,...
P47049,UBX6_YEAST,reviewed,UBX domain-containing protein 6,UBX6 YJL048C J1164,Saccharomyces cerevisiae (strain ATCC 204508 /...,396,MYEMSGIDSLFHDRVVHDYSHTSEQVIVVYISSAAGDNSWLHQWFK...
P53142,VPS73_YEAST,reviewed,Vacuolar protein sorting-associated protein 73,VPS73 YGL104C G3090,Saccharomyces cerevisiae (strain ATCC 204508 /...,486,MNRILSSASLLSNVSMPRQNKHKITKALCYAIIVASIGSIQFGYHL...
Q05919,VPS38_YEAST,reviewed,Vacuolar protein sorting-associated protein 38,VPS38 VPL17 YLR360W L8039.11,Saccharomyces cerevisiae (strain ATCC 204508 /...,439,MKRFLLSRRQRHLRMICFHNISLFRANGDSKLIKEYGDGFIPCFFI...
Q04170,YD391_YEAST,reviewed,Uncharacterized protein YDR391C,YDR391C,Saccharomyces cerevisiae (strain ATCC 204508 /...,232,MSFENKLPTPLENNDAKGHMVCTLNKTTDARRAAETLSIAFSNSPA...


## ProtDCal

Generated using https://protdcal.zmb.uni-due.de/pages/form.php

## PSI blast

In [6]:
# psiblast -query .\output_sequences.fasta -db nr -out yeast_filtered_psiblast_out -evalue 0.001 -num_iterations 3 -out_pssm yeast_filtered_pssm_checkpoint -out_ascii_pssm yeast_filtered_pssm

In [23]:
AA = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']

In [47]:
pssm_A = pd.read_csv(pssm_path + "A0A023PYF4.pssm", sep = "\s+", skiprows=2).reset_index()[AA].dropna()
pssm_A.to_numpy()

array([[-2., -3., -2., ...,  1.,  0.,  0.],
       [-2., -2.,  0., ...,  0.,  0.,  0.],
       [-3., -3., -1., ...,  3.,  0.,  0.],
       ...,
       [ 0., -1., -3., ..., -3.,  0.,  0.],
       [ 1.,  0., -1., ..., -2.,  0.,  0.],
       [ 0., -1., -3., ..., -3.,  0.,  0.]])

## Position

In [7]:
position_landscape = []
print("Generating position feature landscape")
for UA, UB, SM in zip(ppi_masks['Uniprot ID A'], ppi_masks['Uniprot ID B'], ppi_masks['Sites Masks']):
    try:
        lengths = (uniprot_df['Length'].loc[UA], uniprot_df['Length'].loc[UB])
        
        # number according to position    
        seqA = np.arange(1, lengths[0]+1)[np.newaxis]
        seqB = np.arange(1, lengths[1]+1)[np.newaxis]
        
        # normalize
        seqA = np.divide(seqA, lengths[0])
        seqB = np.divide(seqB, lengths[1])
        
        mask = np.matmul(seqA.T, seqB)
        
        assert SM.shape == mask.shape
        
        position_landscape.append(mask)
    except KeyError as inst:
        print(UA, UB)
        print(f"No uniprot entry found for protein {inst.args}")
        position_landscape.append(np.NaN)
        
position_landscape = np.asarray(position_landscape)

Generating position feature landscape


  return array(a, dtype, copy=False, order=order)


In [8]:
position_landscape[:2]

array([array([[2.31520604e-06, 4.63041208e-06, 6.94561813e-06, ...,
        5.58196177e-03, 5.58427697e-03, 5.58659218e-03],
       [4.63041208e-06, 9.26082417e-06, 1.38912363e-05, ...,
        1.11639235e-02, 1.11685539e-02, 1.11731844e-02],
       [6.94561813e-06, 1.38912363e-05, 2.08368544e-05, ...,
        1.67458853e-02, 1.67528309e-02, 1.67597765e-02],
       ...,
       [4.09791469e-04, 8.19582939e-04, 1.22937441e-03, ...,
        9.88007233e-01, 9.88417024e-01, 9.88826816e-01],
       [4.12106675e-04, 8.24213351e-04, 1.23632003e-03, ...,
        9.93589194e-01, 9.94001301e-01, 9.94413408e-01],
       [4.14421881e-04, 8.28843763e-04, 1.24326564e-03, ...,
        9.99171156e-01, 9.99585578e-01, 1.00000000e+00]]),
       array([[7.58592962e-06, 1.51718592e-05, 2.27577889e-05, ...,
        2.19233366e-03, 2.19991959e-03, 2.20750552e-03],
       [1.51718592e-05, 3.03437185e-05, 4.55155777e-05, ...,
        4.38466732e-03, 4.39983918e-03, 4.41501104e-03],
       [2.27577889e-05, 4.55

In [10]:
ppi_masks['position_landscape'] = position_landscape
ppi_masks

Unnamed: 0,Uniprot ID A,Uniprot ID B,Domain_id_a,Domain_id_b,Domain positions A,Domain positions B,Sites Masks,position_landscape
0,P53854,P33334,[PF10197],[PF10596],"[(11, 47)]","[(1514, 1672)]","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[2.3152060417616864e-06, 4.630412083523373e-0..."
1,P38789,Q08235,[PF04427],[PF04427],"[(34, 341)]","[(38, 224)]","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[7.585929617745007e-06, 1.5171859235490014e-0..."
2,P38789,P38805,[PF04427],[PF04427],"[(34, 341)]","[(101, 269)]","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[7.483069555131515e-06, 1.496613911026303e-05..."
3,P38789,P36160,[PF04427],[PF04427],"[(34, 341)]","[(35, 236)]","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[6.417167205708712e-06, 1.2834334411417424e-0..."
4,P38789,P38789,[PF04427],[PF04427],"[(34, 341)]","[(34, 341)]","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[4.8730806153726205e-06, 9.746161230745241e-0..."
...,...,...,...,...,...,...,...,...
12912,Q06188,P02293,[PF00855],[PF00125],"[(5, 97)]","[(1, 105)]","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[2.511048613901165e-05, 5.02209722780233e-05,..."
12913,Q06188,P04911,[PF00855],[PF00125],"[(5, 97)]","[(4, 92)]","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[2.492025518341308e-05, 4.984051036682616e-05..."
12914,Q03330,P35177,[PF00439],[PF00439],"[(336, 419)]","[(449, 530)]","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[1.7101383843980653e-06, 3.4202767687961306e-..."
12915,Q03330,Q03330,[PF00439],[PF00439],"[(336, 419)]","[(336, 419)]","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[5.188848127604153e-06, 1.0377696255208306e-0..."
