In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# from http://www.ebi.ac.uk/Tools/webservices/psicquic/mint/webservices/current/search/query/species:yeast

# yeast_intra = pd.read_csv('data/species_yeast.txt', sep='\t', header=None);
# yeast_intra

## Load PFAM domains for yeast

In [3]:
import re

# http://pfam.xfam.org/proteome/559292#tabview=tab2
# The third line of the file is the header row; column names sit between angle brackets.
# Read that line directly instead of going through pd.read_csv(sep='\n'), which recent
# pandas releases reject (a line terminator is not accepted as a separator).
with open('data/pfam_yeast_domains.tsv') as pfam_file:
    pfam_preamble = [next(pfam_file) for _ in range(3)]
yeast_pfam_header = re.findall(r'<(.*?)>', pfam_preamble[2])

yeast_pfam_header

['seq id',
 'alignment start',
 'alignment end',
 'envelope start',
 'envelope end',
 'hmm acc',
 'hmm name',
 'type',
 'hmm start',
 'hmm end',
 'hmm length',
 'bit score',
 'E-value',
 'clan']

In [4]:
# Load the tab-separated Pfam table, using the column names parsed from the header line.
# The '#'-prefixed preamble lines of the file are read as ordinary rows here; they are
# removed in a later cell.
yeast_pfam = pd.read_csv('data/pfam_yeast_domains.tsv', sep='\t', names=yeast_pfam_header);

In [5]:
yeast_pfam.head()

Unnamed: 0,seq id,alignment start,alignment end,envelope start,envelope end,hmm acc,hmm name,type,hmm start,hmm end,hmm length,bit score,E-value,clan
0,#Pfam-A regions from Pfam version 33.1 for ncb...,,,,,,,,,,,,,
1,#Total number of proteins in proteome: 6048,,,,,,,,,,,,,
2,#<seq id> <alignment start> <alignment end> <e...,,,,,,,,,,,,,
3,A0A0B7P3V8,621.0,721.0,620.0,724.0,PF00665,rve,Domain,2.0,99.0,102.0,52.0,1.8e-10,CL0219
4,A2P2R3,79.0,204.0,54.0,215.0,PF13522,GATase_6,Domain,10.0,121.0,132.0,54.7,3e-11,CL0052


In [6]:
# Remove the '#'-prefixed preamble lines that were parsed as data rows.
# Filtering on the marker (instead of dropping the first three labels) keeps this cell
# safe to re-run: a second execution finds nothing left to remove, whereas
# drop(range(3)) would silently discard three real domain rows each time.
is_preamble_row = yeast_pfam['seq id'].astype(str).str.startswith('#')
yeast_pfam = yeast_pfam[~is_preamble_row].reset_index(drop=True)
yeast_pfam.head(20)

Unnamed: 0,seq id,alignment start,alignment end,envelope start,envelope end,hmm acc,hmm name,type,hmm start,hmm end,hmm length,bit score,E-value,clan
0,A0A0B7P3V8,621.0,721.0,620.0,724.0,PF00665,rve,Domain,2.0,99.0,102.0,52.0,1.8e-10,CL0219
1,A2P2R3,79.0,204.0,54.0,215.0,PF13522,GATase_6,Domain,10.0,121.0,132.0,54.7,3e-11,CL0052
2,D6VPM8,24.0,117.0,14.0,120.0,PF00674,DUP,Family,9.0,100.0,103.0,115.8,2.2e-30,No_clan
3,D6VTK4,17.0,296.0,17.0,297.0,PF02116,STE2,Family,1.0,279.0,280.0,304.9,1.3e-87,No_clan
4,D6W196,231.0,335.0,225.0,337.0,PF00153,Mito_carr,Family,9.0,95.0,97.0,64.3,2e-14,No_clan
5,D6W196,345.0,402.0,343.0,436.0,PF00153,Mito_carr,Family,3.0,60.0,97.0,32.7,0.00014,No_clan
6,D6W196,89.0,107.0,87.0,110.0,PF13202,EF-hand_5,Domain,5.0,23.0,25.0,17.4,5.7,CL0220
7,D6W196,17.0,78.0,13.0,79.0,PF13499,EF-hand_7,Domain,5.0,69.0,70.0,35.0,3.7e-05,CL0220
8,O13297,279.0,497.0,278.0,497.0,PF02940,mRNA_triPase,Domain,2.0,221.0,221.0,253.8,3.5e-72,CL0273
9,O13516,107.0,150.0,107.0,153.0,PF01479,S4,Domain,1.0,44.0,48.0,45.8,9.5e-09,CL0492


In [7]:
yeast_pfam.shape

(7907, 14)

In [8]:
yeast_pfam['seq id'].unique().size

4930

In [9]:
yeast_pfam['type'].unique()

array(['Domain', 'Family', 'Coiled-coil', 'Repeat', 'Motif', 'Disordered'],
      dtype=object)

We are choosing to drop all types that are not explicitly domains

In [10]:
# Keep only the rows whose Pfam entry type is 'Domain', ordered by protein accession
is_domain = yeast_pfam['type'].eq('Domain')
yeast_pfam_domains = yeast_pfam.loc[is_domain].sort_values(by='seq id')
# yeast_pfam_domains = yeast_pfam
yeast_pfam_domains

Unnamed: 0,seq id,alignment start,alignment end,envelope start,envelope end,hmm acc,hmm name,type,hmm start,hmm end,hmm length,bit score,E-value,clan
0,A0A0B7P3V8,621.0,721.0,620.0,724.0,PF00665,rve,Domain,2.0,99.0,102.0,52.0,1.800000e-10,CL0219
1,A2P2R3,79.0,204.0,54.0,215.0,PF13522,GATase_6,Domain,10.0,121.0,132.0,54.7,3.000000e-11,CL0052
6,D6W196,89.0,107.0,87.0,110.0,PF13202,EF-hand_5,Domain,5.0,23.0,25.0,17.4,5.700000e+00,CL0220
7,D6W196,17.0,78.0,13.0,79.0,PF13499,EF-hand_7,Domain,5.0,69.0,70.0,35.0,3.700000e-05,CL0220
8,O13297,279.0,497.0,278.0,497.0,PF02940,mRNA_triPase,Domain,2.0,221.0,221.0,253.8,3.500000e-72,CL0273
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7893,Q99383,162.0,223.0,161.0,229.0,PF00076,RRM_1,Domain,2.0,64.0,70.0,60.8,2.200000e-13,CL0221
7902,Q9ZZW7,269.0,367.0,268.0,369.0,PF00961,LAGLIDADG_1,Domain,2.0,100.0,102.0,77.7,2.800000e-18,CL0324
7903,Q9ZZW7,405.0,499.0,405.0,501.0,PF00961,LAGLIDADG_1,Domain,1.0,100.0,102.0,40.8,8.900000e-07,CL0324
7901,Q9ZZW7,18.0,199.0,17.0,208.0,PF00033,Cytochrome_B,Domain,2.0,183.0,189.0,198.7,2.100000e-55,CL0328


In [11]:
# Renumber the rows 0..n-1 so that row labels equal row positions after the sort
yeast_pfam_domains = yeast_pfam_domains.reset_index(drop=True)
yeast_pfam_domains

Unnamed: 0,seq id,alignment start,alignment end,envelope start,envelope end,hmm acc,hmm name,type,hmm start,hmm end,hmm length,bit score,E-value,clan
0,A0A0B7P3V8,621.0,721.0,620.0,724.0,PF00665,rve,Domain,2.0,99.0,102.0,52.0,1.800000e-10,CL0219
1,A2P2R3,79.0,204.0,54.0,215.0,PF13522,GATase_6,Domain,10.0,121.0,132.0,54.7,3.000000e-11,CL0052
2,D6W196,89.0,107.0,87.0,110.0,PF13202,EF-hand_5,Domain,5.0,23.0,25.0,17.4,5.700000e+00,CL0220
3,D6W196,17.0,78.0,13.0,79.0,PF13499,EF-hand_7,Domain,5.0,69.0,70.0,35.0,3.700000e-05,CL0220
4,O13297,279.0,497.0,278.0,497.0,PF02940,mRNA_triPase,Domain,2.0,221.0,221.0,253.8,3.500000e-72,CL0273
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3873,Q99383,162.0,223.0,161.0,229.0,PF00076,RRM_1,Domain,2.0,64.0,70.0,60.8,2.200000e-13,CL0221
3874,Q9ZZW7,269.0,367.0,268.0,369.0,PF00961,LAGLIDADG_1,Domain,2.0,100.0,102.0,77.7,2.800000e-18,CL0324
3875,Q9ZZW7,405.0,499.0,405.0,501.0,PF00961,LAGLIDADG_1,Domain,1.0,100.0,102.0,40.8,8.900000e-07,CL0324
3876,Q9ZZW7,18.0,199.0,17.0,208.0,PF00033,Cytochrome_B,Domain,2.0,183.0,189.0,198.7,2.100000e-55,CL0328


### Create map of sequence IDs and their indices

In [12]:
yeast_pfam_domains['seq id'].unique().size

2599

In [13]:
# Make unique uniprot IDs as index
# Values start out as NaN placeholders (dtype=object); a later cell fills each entry
# with the row labels of that protein's domains in yeast_pfam_domains.
yeast_pfam_indices = pd.Series(index=yeast_pfam_domains['seq id'].unique(), dtype=object)
yeast_pfam_indices

A0A0B7P3V8    NaN
A2P2R3        NaN
D6W196        NaN
O13297        NaN
O13516        NaN
             ... 
Q99359        NaN
Q99369        NaN
Q99383        NaN
Q9ZZW7        NaN
Q9ZZX0        NaN
Length: 2599, dtype: object

In [14]:
# Verify type of indices
print(type(yeast_pfam_indices.index[0]))

<class 'str'>


Find all locations in PFAM dataframe where index occurs and add it to map

In [15]:
# Group once by accession instead of rescanning the whole 'seq id' column for every ID.
# `.groups` maps each accession to the Index of row labels where it occurs, which is
# the same value the per-ID boolean scan produced.
for seq_id, row_labels in yeast_pfam_domains.groupby('seq id').groups.items():
    yeast_pfam_indices.loc[seq_id] = row_labels

In [16]:
yeast_pfam_indices

A0A0B7P3V8                   Int64Index([0], dtype='int64')
A2P2R3                       Int64Index([1], dtype='int64')
D6W196                    Int64Index([2, 3], dtype='int64')
O13297                       Int64Index([4], dtype='int64')
O13516                       Int64Index([5], dtype='int64')
                                  ...                      
Q99359                    Int64Index([3870], dtype='int64')
Q99369                    Int64Index([3871], dtype='int64')
Q99383              Int64Index([3872, 3873], dtype='int64')
Q9ZZW7        Int64Index([3874, 3875, 3876], dtype='int64')
Q9ZZX0                    Int64Index([3877], dtype='int64')
Length: 2599, dtype: object

Confirm that the map works

In [17]:
yeast_pfam_domains.loc[yeast_pfam_indices.iloc[0]]

Unnamed: 0,seq id,alignment start,alignment end,envelope start,envelope end,hmm acc,hmm name,type,hmm start,hmm end,hmm length,bit score,E-value,clan
0,A0A0B7P3V8,621.0,721.0,620.0,724.0,PF00665,rve,Domain,2.0,99.0,102.0,52.0,1.8e-10,CL0219


In [18]:
# http://pfam.xfam.org/protein/TUP1_YEAST
'P16649' in yeast_pfam_domains['seq id'].to_list()

True

In [19]:
yeast_pfam_domains.loc[yeast_pfam_indices['P16649']]

Unnamed: 0,seq id,alignment start,alignment end,envelope start,envelope end,hmm acc,hmm name,type,hmm start,hmm end,hmm length,bit score,E-value,clan
611,P16649,11.0,88.0,11.0,88.0,PF08581,Tup_N,Domain,1.0,77.0,77.0,98.6,4.8e-25,No_clan


In [20]:
# http://pfam.xfam.org/protein/CDC4_YEAST
yeast_pfam_domains.loc[yeast_pfam_domains['seq id']=='P07834']

Unnamed: 0,seq id,alignment start,alignment end,envelope start,envelope end,hmm acc,hmm name,type,hmm start,hmm end,hmm length,bit score,E-value,clan
277,P07834,275.0,312.0,273.0,320.0,PF00646,F-box,Domain,3.0,40.0,48.0,20.9,0.62,CL0271
278,P07834,227.0,272.0,224.0,272.0,PF16856,CDC4_D,Domain,6.0,51.0,51.0,65.5,7.4e-15,No_clan


In [21]:
'PF00400' in yeast_pfam_domains['hmm acc'].to_list()

False

For some reason WD40 isn't in the downloaded data
EDIT: This is because we dropped all non-domains from the pfam database

In [22]:
# http://pfam.xfam.org/protein/MSI1_YEAST
'P13712' in yeast_pfam_domains['seq id'].to_list()

False

In [23]:
# `in` applied to a Series tests its index labels, not its values, so the check must
# run against the values explicitly (as the PF00400 check above already does).
'PF12265' in yeast_pfam_domains['hmm acc'].to_list()

False

Nor are some proteins

## Load BioGRID Interactome for yeast

In [24]:
# downloaded from https://downloads.thebiogrid.org/File/BioGRID/Release-Archive/BIOGRID-4.2.191/BIOGRID-ORGANISM-4.2.191.tab3.zip

# low_memory=False makes pandas infer each column's dtype from the whole file rather than
# chunk by chunk, which avoids mixed-type columns (presumably the cause of the warning
# this load emitted — TODO confirm against the full warning text).
yeast_df = pd.read_csv("data/BIOGRID-ORGANISM-Saccharomyces_cerevisiae_S288c-4.2.191.tab3.txt",
                       sep="\t", low_memory=False)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [25]:
yeast_df.shape

(756225, 37)

In [26]:
yeast_df.head()

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,TREMBL Accessions Interactor B,REFSEQ Accessions Interactor B,Ontology Term IDs,Ontology Term Names,Ontology Term Categories,Ontology Term Qualifier IDs,Ontology Term Qualifier Names,Ontology Term Types,Organism Name Interactor A,Organism Name Interactor B
0,68770,851136,854020,31676,34272,YLR418C,YOL145C,CDC73,CTR9,L000002792,...,-,NP_014496,-,-,-,-,-,-,Saccharomyces cerevisiae (S288c),Saccharomyces cerevisiae (S288c)
1,68771,854020,851136,34272,31676,YOL145C,YLR418C,CTR9,CDC73,CDP1|L000003477,...,-,NP_013522,-,-,-,-,-,-,Saccharomyces cerevisiae (S288c),Saccharomyces cerevisiae (S288c)
2,68774,851136,854290,31676,34518,YLR418C,YOR123C,CDC73,LEO1,L000002792,...,-,NP_014766,-,-,-,-,-,-,Saccharomyces cerevisiae (S288c),Saccharomyces cerevisiae (S288c)
3,68775,854290,851136,34518,31676,YOR123C,YLR418C,LEO1,CDC73,L000000936,...,-,NP_013522,-,-,-,-,-,-,Saccharomyces cerevisiae (S288c),Saccharomyces cerevisiae (S288c)
4,68778,851136,852582,31676,32973,YLR418C,YBR279W,CDC73,PAF1,L000002792,...,-,NP_009838,-,-,-,-,-,-,Saccharomyces cerevisiae (S288c),Saccharomyces cerevisiae (S288c)


In [27]:
yeast_df.columns

Index(['#BioGRID Interaction ID', 'Entrez Gene Interactor A',
       'Entrez Gene Interactor B', 'BioGRID ID Interactor A',
       'BioGRID ID Interactor B', 'Systematic Name Interactor A',
       'Systematic Name Interactor B', 'Official Symbol Interactor A',
       'Official Symbol Interactor B', 'Synonyms Interactor A',
       'Synonyms Interactor B', 'Experimental System',
       'Experimental System Type', 'Author', 'Publication Source',
       'Organism ID Interactor A', 'Organism ID Interactor B', 'Throughput',
       'Score', 'Modification', 'Qualifications', 'Tags', 'Source Database',
       'SWISS-PROT Accessions Interactor A', 'TREMBL Accessions Interactor A',
       'REFSEQ Accessions Interactor A', 'SWISS-PROT Accessions Interactor B',
       'TREMBL Accessions Interactor B', 'REFSEQ Accessions Interactor B',
       'Ontology Term IDs', 'Ontology Term Names', 'Ontology Term Categories',
       'Ontology Term Qualifier IDs', 'Ontology Term Qualifier Names',
       'Ontology

In [28]:
yeast_df.iloc[0]

#BioGRID Interaction ID                                          68770
Entrez Gene Interactor A                                        851136
Entrez Gene Interactor B                                        854020
BioGRID ID Interactor A                                          31676
BioGRID ID Interactor B                                          34272
Systematic Name Interactor A                                   YLR418C
Systematic Name Interactor B                                   YOL145C
Official Symbol Interactor A                                     CDC73
Official Symbol Interactor B                                      CTR9
Synonyms Interactor A                                       L000002792
Synonyms Interactor B                                  CDP1|L000003477
Experimental System                                Affinity Capture-MS
Experimental System Type                                      physical
Author                                                Krogan NJ (2004)
Public

Some accession fields list several sequence IDs on one line, separated by '|'; split them so that each row carries a single ID per interactor

In [29]:
# Turn every '|'-joined accession field into a list, then expand the lists so that each
# row holds exactly one accession for interactor A and one for interactor B.
accession_columns = ('SWISS-PROT Accessions Interactor A', 'SWISS-PROT Accessions Interactor B')
for column in accession_columns:
    yeast_df[column] = yeast_df[column].map(lambda accessions: accessions.split('|'))
for column in accession_columns:
    yeast_df = yeast_df.explode(column, ignore_index=True)

In [30]:
yeast_df.shape

(769110, 37)

Drop all rows for which a SWISS-PROT ID is not available

In [31]:
# '-' marks a missing accession; keep only the rows where both interactors have one.
# Row labels are left as they are (no re-indexing).
has_accession_a = yeast_df['SWISS-PROT Accessions Interactor A'] != '-'
has_accession_b = yeast_df['SWISS-PROT Accessions Interactor B'] != '-'
yeast_df = yeast_df[has_accession_a & has_accession_b].copy()
yeast_df.shape

(766596, 37)

## Load domain-domain interactions

In [32]:
# from https://3did.irbbarcelona.org/download.php
# The flat file has no header row; read it tab-separated into seven columns labelled 0-6.
yeast_3DID = pd.read_csv("data/3did_flat_Apr_10_2020.dat",
                        sep = "\t", header = None,
                        names=range(7))

In [33]:
yeast_3DID.head()

Unnamed: 0,0,1,2,3,4,5,6
0,#=ID,1-cysPrx_C,1-cysPrx_C,(PF10417.9@Pfam,PF10417.9@Pfam),,
1,#=3D,5jcg,A:157-192,H:157-192,1.1,1.50646,2:2
2,Q,E,159,162,sm,,
3,Q,T,159,163,sm,,
4,E,Q,162,159,ms,,


In [34]:
# this is a flat database; the '#=ID' records hold the domain-domain pairs we are
# interested in, so collect the row labels of those records
is_id_record = yeast_3DID[0].eq('#=ID')
id_indices_3did = yeast_3DID.index[is_id_record.to_numpy()]
id_indices_3did

Int64Index([       0,      489,    17346,    17505,    17551,    17560,
               24675,    24777,    25600,    25610,
            ...
            21980373, 21981657, 21981733, 21981833, 21981862, 21982157,
            21982178, 21983337, 21983498, 21983742],
           dtype='int64', length=14278)

In [35]:
# keep only the '#=ID' rows, and of those only columns 3 and 4 (the two Pfam accessions)
yeast_3DID_pfam = yeast_3DID.loc[id_indices_3did, [3, 4]]
yeast_3DID_pfam

Unnamed: 0,3,4
0,(PF10417.9@Pfam,PF10417.9@Pfam)
489,(PF10417.9@Pfam,PF00578.21@Pfam)
17346,(PF10417.9@Pfam,PF02195.18@Pfam)
17505,(PF10417.9@Pfam,PF00085.20@Pfam)
17551,(PF12574.8@Pfam,PF12574.8@Pfam)
...,...,...
21982157,(PF05707.12@Pfam,PF05707.12@Pfam)
21982178,(PF16916.5@Pfam,PF16916.5@Pfam)
21983337,(PF00791.20@Pfam,PF00791.20@Pfam)
21983498,(PF09817.9@Pfam,PF09817.9@Pfam)


In [36]:
# Reformat so it's just pfam IDs
yeast_3DID_pfam.columns = ["Pfam ID A", "Pfam ID B"]
yeast_3DID_pfam = yeast_3DID_pfam.reset_index(drop=True)

# Raw fields look like '(PF10417.9@Pfam' and 'PF10417.9@Pfam)'. Keep the accession, i.e.
# the text before the version dot, ignoring an optional leading '('. One shared pattern
# treats both columns the same way and leaves already-cleaned values unchanged, so this
# cell can be re-run without raising IndexError.
accession_pattern = r'\(?([^.()@]+)'
for column in ("Pfam ID A", "Pfam ID B"):
    yeast_3DID_pfam[column] = yeast_3DID_pfam[column].str.extract(accession_pattern, expand=False)

yeast_3DID_pfam.head()

Unnamed: 0,Pfam ID A,Pfam ID B
0,PF10417,PF10417
1,PF10417,PF00578
2,PF10417,PF02195
3,PF10417,PF00085
4,PF12574,PF12574


In [37]:
yeast_3DID_pfam

Unnamed: 0,Pfam ID A,Pfam ID B
0,PF10417,PF10417
1,PF10417,PF00578
2,PF10417,PF02195
3,PF10417,PF00085
4,PF12574,PF12574
...,...,...
14273,PF05707,PF05707
14274,PF16916,PF16916
14275,PF00791,PF00791
14276,PF09817,PF09817


In [38]:
yeast_3DID_pfam['Pfam ID A'].unique().size

7173

In [39]:
yeast_3DID_pfam['Pfam ID B'].unique().size

7167

In [40]:
yeast_3DID_pfam[yeast_3DID_pfam['Pfam ID A'] == yeast_3DID_pfam['Pfam ID B'].iloc[1]]

Unnamed: 0,Pfam ID A,Pfam ID B
749,PF00578,PF00578
750,PF00578,PF00881
751,PF00578,PF02195
752,PF00578,PF00085
753,PF00578,PF17991
754,PF00578,PF13905


In [41]:
yeast_3DID_pfam[yeast_3DID_pfam['Pfam ID B'] == yeast_3DID_pfam['Pfam ID B'].iloc[1]]

Unnamed: 0,Pfam ID A,Pfam ID B
1,PF10417,PF00578
749,PF00578,PF00578


### Create dictionary of domain pairs

In [72]:
# Every Pfam ID that appears on either side of a 3DID pair becomes one row of the lookup table
all_pfam_ids = pd.concat([yeast_3DID_pfam['Pfam ID A'], yeast_3DID_pfam['Pfam ID B']]).unique()
pfam_pairs = pd.DataFrame(columns=['pair values'], index=all_pfam_ids)

In [43]:
# Give every row its own empty list (a fresh object per row, not one shared list)
pfam_pairs['pair values'] = [list() for _ in range(len(pfam_pairs))]

In [44]:
pfam_pairs

Unnamed: 0,pair values
PF10417,[]
PF12574,[]
PF00244,[]
PF00389,[]
PF02826,[]
...,...
PF13894,[]
PF18366,[]
PF14835,[]
PF18586,[]


In [45]:

# Forward direction: record B as a partner of A for every (A, B) pair, skipping duplicates
partner_lists = pfam_pairs['pair values']
for A, B in zip(yeast_3DID_pfam['Pfam ID A'], yeast_3DID_pfam['Pfam ID B']):
    partners_of_a = partner_lists.loc[A]
    if B in partners_of_a:
        continue
    partners_of_a.append(B)

In [46]:
pfam_pairs

Unnamed: 0,pair values
PF10417,"[PF10417, PF00578, PF02195, PF00085]"
PF12574,[PF12574]
PF00244,"[PF00244, PF00583, PF03496, PF00130, PF00525, ..."
PF00389,"[PF00389, PF01842, PF11890]"
PF02826,"[PF02826, PF02222, PF17769]"
...,...
PF13894,[]
PF18366,[]
PF14835,[]
PF18586,[]


In [47]:
# Reverse direction: record A as a partner of B as well, so lookups work from either side
partner_lists = pfam_pairs['pair values']
for A, B in zip(yeast_3DID_pfam['Pfam ID A'], yeast_3DID_pfam['Pfam ID B']):
    partners_of_b = partner_lists.loc[B]
    if A in partners_of_b:
        continue
    partners_of_b.append(A)

In [48]:
pfam_pairs

Unnamed: 0,pair values
PF10417,"[PF10417, PF00578, PF02195, PF00085]"
PF12574,[PF12574]
PF00244,"[PF00244, PF00583, PF03496, PF00130, PF00525, ..."
PF00389,"[PF00389, PF01842, PF11890]"
PF02826,"[PF02826, PF02222, PF17769]"
...,...
PF13894,[PF00096]
PF18366,[PF00096]
PF14835,[PF00097]
PF18586,[PF18585]


In [49]:
yeast_3DID_pfam[yeast_3DID_pfam['Pfam ID B'] == 'PF17979']

Unnamed: 0,Pfam ID A,Pfam ID B
14231,PF10283,PF17979


In [50]:
pfam_pairs.loc['PF10283']

pair values    [PF17979]
Name: PF10283, dtype: object

## Find indices of sites for pairs

Create columns in yeast dataframe for domain pairs

In [51]:
# Add four result columns, each holding a separate empty list per interaction row
for column in ('domain_a', 'domain_b', 'domain_seq_a', 'domain_seq_b'):
    yeast_df[column] = [[] for _ in range(len(yeast_df))]

In [52]:
# Envelope coordinates were parsed as floats; round and convert them to integers.
# The builtin `int` replaces the `np.int` alias, which NumPy deprecated and later removed.
for column in ('envelope start', 'envelope end'):
    yeast_pfam_domains[column] = yeast_pfam_domains[column].round(0).astype(int)
yeast_pfam_domains['envelope start'], yeast_pfam_domains['envelope end']

(0       620
 1        54
 2        87
 3        13
 4       278
        ... 
 3873    161
 3874    268
 3875    405
 3876     17
 3877    201
 Name: envelope start, Length: 3878, dtype: int32,
 0       724
 1       215
 2       110
 3        79
 4       497
        ... 
 3873    229
 3874    369
 3875    501
 3876    208
 3877    300
 Name: envelope end, Length: 3878, dtype: int32)

In [53]:
# Counters: matching domain pairs over all interactions, and interactions with at least one match
total_domains_found = 0
total_found = 0

# Loop-invariant column lookups, hoisted out of the row loop
hmm_acc = yeast_pfam_domains['hmm acc']
envelope_start = yeast_pfam_domains['envelope start']
envelope_end = yeast_pfam_domains['envelope end']
pair_values = pfam_pairs['pair values']

for PA, PB, DA, DB, DSA, DSB in zip(yeast_df['SWISS-PROT Accessions Interactor A'], yeast_df['SWISS-PROT Accessions Interactor B'], yeast_df['domain_a'], yeast_df['domain_b'], yeast_df['domain_seq_a'], yeast_df['domain_seq_b']):
    found = False

    # The per-row lists are filled in place; empty them first so that re-running this
    # cell does not append every match a second time.
    for row_matches in (DA, DB, DSA, DSB):
        row_matches.clear()

    try:
        # Find locations in PFAM with relevant protein sequences
        pfam_indices_A = yeast_pfam_indices.loc[PA]
        pfam_indices_B = yeast_pfam_indices.loc[PB]
        # Get the PFAM IDs of the domains of each protein (row label -> accession)
        pfam_ids_A = hmm_acc.loc[pfam_indices_A]
        pfam_ids_B = hmm_acc.loc[pfam_indices_B]
    except KeyError:
        # No Pfam domain entry for at least one of the two proteins
        continue

    # for each domain PFAM id in A
    # (.items() replaces .iteritems(), which pandas 2.0 removed)
    for pfam_index_A, pfam_id_A in pfam_ids_A.items():
        # for each domain PFAM id in B
        for pfam_index_B, pfam_id_B in pfam_ids_B.items():
            try:
                # get list of all domain interactions with B
                pfam_id_B_pairs = pair_values.loc[pfam_id_B]
            except KeyError:
                # Domain B has no known interaction partners
                continue

            # if domain A is in the list of interactions for domain B, we have a match
            if pfam_id_A in pfam_id_B_pairs:

                # save domain starts and ends as tuples
                DA.append(pfam_id_A)
                DB.append(pfam_id_B)
                DSA.append((envelope_start.loc[pfam_index_A], envelope_end.loc[pfam_index_A]))
                DSB.append((envelope_start.loc[pfam_index_B], envelope_end.loc[pfam_index_B]))

                total_domains_found += 1
                found = True
    if found:
        total_found += 1

total_domains_found, total_found

(43651, 20419)

In [54]:
yeast_df

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,Ontology Term Categories,Ontology Term Qualifier IDs,Ontology Term Qualifier Names,Ontology Term Types,Organism Name Interactor A,Organism Name Interactor B,domain_a,domain_b,domain_seq_a,domain_seq_b
0,68770,851136,854020,31676,34272,YLR418C,YOL145C,CDC73,CTR9,L000002792,...,-,-,-,-,Saccharomyces cerevisiae (S288c),Saccharomyces cerevisiae (S288c),[],[],[],[]
1,68771,854020,851136,34272,31676,YOL145C,YLR418C,CTR9,CDC73,CDP1|L000003477,...,-,-,-,-,Saccharomyces cerevisiae (S288c),Saccharomyces cerevisiae (S288c),[],[],[],[]
2,68774,851136,854290,31676,34518,YLR418C,YOR123C,CDC73,LEO1,L000002792,...,-,-,-,-,Saccharomyces cerevisiae (S288c),Saccharomyces cerevisiae (S288c),[],[],[],[]
3,68775,854290,851136,34518,31676,YOR123C,YLR418C,LEO1,CDC73,L000000936,...,-,-,-,-,Saccharomyces cerevisiae (S288c),Saccharomyces cerevisiae (S288c),[],[],[],[]
4,68778,851136,852582,31676,32973,YLR418C,YBR279W,CDC73,PAF1,L000002792,...,-,-,-,-,Saccharomyces cerevisiae (S288c),Saccharomyces cerevisiae (S288c),[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
769105,2874488,854450,855907,34665,35989,YOR276W,YPL194W,CAF20,DDC1,CAF2|CAP20|p20|L000000208|L000003291,...,-,-,-,-,Saccharomyces cerevisiae (S288c),Saccharomyces cerevisiae (S288c),[],[],[],[]
769106,2874489,854450,856136,34665,36202,YOR276W,YPR025C,CAF20,CCL1,CAF2|CAP20|p20|L000000208|L000003291,...,-,-,-,-,Saccharomyces cerevisiae (S288c),Saccharomyces cerevisiae (S288c),[],[],[],[]
769107,2874490,854450,856158,34665,36221,YOR276W,YPR045C,CAF20,THP3,CAF2|CAP20|p20|L000000208|L000003291,...,-,-,-,-,Saccharomyces cerevisiae (S288c),Saccharomyces cerevisiae (S288c),[],[],[],[]
769108,2874491,854450,856184,34665,36243,YOR276W,YPR071W,CAF20,YPR071W,CAF2|CAP20|p20|L000000208|L000003291,...,-,-,-,-,Saccharomyces cerevisiae (S288c),Saccharomyces cerevisiae (S288c),[],[],[],[]


## Generate masks

Filter for ones where we only found a single domain interaction pair

In [55]:
# Interactions with exactly one matched domain pair; reset_index() keeps the old row
# labels in an 'index' column
has_single_pair = yeast_df['domain_a'].str.len().eq(1)
yeast_single_site = yeast_df.loc[has_single_pair].reset_index()
yeast_single_site

Unnamed: 0,index,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,...,Ontology Term Categories,Ontology Term Qualifier IDs,Ontology Term Qualifier Names,Ontology Term Types,Organism Name Interactor A,Organism Name Interactor B,domain_a,domain_b,domain_seq_a,domain_seq_b
0,58,68964,855476,856570,35594,36599,YNL245C,YHR165C,CWC25,PRP8,...,-,-,-,-,Saccharomyces cerevisiae (S288c),Saccharomyces cerevisiae (S288c),[PF10197],[PF10596],"[(11, 47)]","[(1514, 1672)]"
1,153,71292,856463,854078,36498,34325,YHR066W,YOL077C,SSF1,BRX1,...,-,-,-,-,Saccharomyces cerevisiae (S288c),Saccharomyces cerevisiae (S288c),[PF04427],[PF04427],"[(34, 341)]","[(38, 224)]"
2,167,71306,856463,856488,36498,36523,YHR066W,YHR088W,SSF1,RPF1,...,-,-,-,-,Saccharomyces cerevisiae (S288c),Saccharomyces cerevisiae (S288c),[PF04427],[PF04427],"[(34, 341)]","[(101, 269)]"
3,168,71307,856463,853956,36498,34212,YHR066W,YKR081C,SSF1,RPF2,...,-,-,-,-,Saccharomyces cerevisiae (S288c),Saccharomyces cerevisiae (S288c),[PF04427],[PF04427],"[(34, 341)]","[(35, 236)]"
4,179,71318,856463,856463,36498,36498,YHR066W,YHR066W,SSF1,SSF1,...,-,-,-,-,Saccharomyces cerevisiae (S288c),Saccharomyces cerevisiae (S288c),[PF04427],[PF04427],"[(34, 341)]","[(34, 341)]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12912,768188,2822383,851177,851810,31714,32276,YLR455W,YDR224C,PDP3,HTB1,...,-,-,-,-,Saccharomyces cerevisiae (S288c),Saccharomyces cerevisiae (S288c),[PF00855],[PF00125],"[(5, 97)]","[(1, 105)]"
12913,768189,2822384,851177,851811,31714,32277,YLR455W,YDR225W,PDP3,HTA1,...,-,-,-,-,Saccharomyces cerevisiae (S288c),Saccharomyces cerevisiae (S288c),[PF00855],[PF00125],"[(5, 97)]","[(4, 92)]"
12914,768220,2822415,853167,852373,33503,32783,YGR252W,YBR081C,GCN5,SPT7,...,-,-,-,-,Saccharomyces cerevisiae (S288c),Saccharomyces cerevisiae (S288c),[PF00439],[PF00439],"[(336, 419)]","[(449, 530)]"
12915,768222,2822417,853167,853167,33503,33503,YGR252W,YGR252W,GCN5,GCN5,...,-,-,-,-,Saccharomyces cerevisiae (S288c),Saccharomyces cerevisiae (S288c),[PF00439],[PF00439],"[(336, 419)]","[(336, 419)]"


In [56]:
# Split out into numpy arrays, one per column of interest (same order as the names on the left)
(Uniprot_id_A, Uniprot_id_B,
 domain_pfam_a, domain_pfam_b,
 positions_a, positions_b) = (
    yeast_single_site[column].to_numpy()
    for column in (
        'SWISS-PROT Accessions Interactor A',
        'SWISS-PROT Accessions Interactor B',
        'domain_a',
        'domain_b',
        'domain_seq_a',
        'domain_seq_b',
    )
)

Load reference proteome for yeast

In [11]:
# reference proteome at https://www.uniprot.org/proteomes/UP000002311
# general proteome (reviewed S. cerevisiae) at https://www.uniprot.org/uniprot/?query=taxonomy:%22Saccharomyces%20cerevisiae%20(strain%20ATCC%20204508%20/%20S288c)%20(Baker%27s%20yeast)%20[559292]%22&fil=organism%3A%22Saccharomyces+cerevisiae+%28strain+ATCC+204508+%2F+S288c%29+%28Baker%27s+yeast%29+%5B559292%5D%22+AND+reviewed%3Ayes
# Tab-separated table indexed by the 'Entry' column, so rows can be looked up by accession;
# the 'Length' column is used further down to size the masks.
uniprot_df = pd.read_csv("data/uniprot-proteome UP000002311.tab",
                        sep = "\t", index_col='Entry')

NameError: name 'pd' is not defined

In [12]:
uniprot_df.head()

NameError: name 'uniprot_df' is not defined

In [13]:
# One entry per single-site interaction: a 2-D 0/1 mask (rows = residues of protein A,
# columns = residues of protein B) or NaN when no usable mask could be built.
site_masks = []

for UA, UB, DA, DB, PA, PB in zip(Uniprot_id_A, Uniprot_id_B, domain_pfam_a, domain_pfam_b, positions_a, positions_b):
    try:
        # initialize mask with dimensions of the two protein sequences
        mask = np.zeros((uniprot_df['Length'].loc[UA], uniprot_df['Length'].loc[UB]), dtype=int)
    except KeyError as inst:
        print(UA, UB)
        print("No uniprot entry found for protein")
        print(inst.args)
        site_masks.append(np.nan)
        continue

    interaction_area = 0
    for pos_A, pos_B in zip(PA, PB):
        # Envelope (start, end) coordinates are 1-based with an inclusive end, so a domain
        # spans end - start + 1 residues.
        interaction_area += (pos_A[1] - pos_A[0] + 1) * (pos_B[1] - pos_B[0] + 1)
        # set area of interaction to 1: shift the start by -1 for 0-based indexing and
        # keep the end as is, because a Python slice end is already exclusive
        mask[(pos_A[0] - 1):pos_A[1], (pos_B[0] - 1):pos_B[1]] = 1

    # filter for smaller interaction areas - we don't want to do the whole protein
    if interaction_area <= 0.5 * mask.shape[0] * mask.shape[1]:
        site_masks.append(mask)
    else:
        site_masks.append(np.nan)

NameError: name 'Uniprot_id_A' is not defined

In [14]:
# Store the masks in a 1-D object array. Filling it element by element stops NumPy from
# trying to stack differently-shaped masks (and NaN placeholders) into one regular array,
# which raises for ragged input in recent NumPy releases.
mask_entries = list(site_masks)
site_masks = np.empty(len(mask_entries), dtype=object)
for position, entry in enumerate(mask_entries):
    site_masks[position] = entry

NameError: name 'np' is not defined

In [15]:
# Turn back into pandas dataframe, then drop every row containing NaN (masks for proteins
# we could not find / whose area was > 50%) and renumber the remaining rows
masks_domainsOnly = (
    pd.DataFrame({
        'Uniprot ID A': Uniprot_id_A,
        'Uniprot ID B': Uniprot_id_B,
        'Domain_id_a': domain_pfam_a,
        'Domain_id_b': domain_pfam_b,
        'Domain positions A': positions_a,
        'Domain positions B': positions_b,
        'Sites Masks': site_masks,
    })
    .dropna()
    .reset_index(drop=True)
)
masks_domainsOnly

NameError: name 'pd' is not defined

In [16]:
masks_domainsOnly['Sites Masks'].iloc[0].sum()

NameError: name 'masks_domainsOnly' is not defined

### Plot sample of masks

In [17]:
# Show the first 100 masks on a 10 x 10 grid of panels with tight layout enabled
fig, axs = plt.subplots(10, 10, figsize=(20, 20), tight_layout=True)
sample_masks = masks_domainsOnly['Sites Masks']
for position, panel in enumerate(axs.flat):
    panel.set_aspect('auto', adjustable='box')
    panel.matshow(sample_masks.iloc[position])
    

NameError: name 'plt' is not defined

In [18]:
masks_domainsOnly.to_pickle('data/masks_singlesite_domains_only.pkl')

NameError: name 'masks_domainsOnly' is not defined