## Retrieve interactions from BioGrid

In [161]:
import numpy as np
import pandas as pd
import os
import sys
import datetime
import time
# import idconversion as idc
from idconversion import idconversion as idc
from urllib.error import HTTPError

## Helper Functions


In [162]:
def time_stamper(tdate=datetime.datetime.now()):
    fmtdate = tdate.strftime('%Y-%m-%d.%H:%M:%S')
    return fmtdate

In [166]:
time_stamper(datetime.datetime.now())

'2018-12-19.06:52:27'

In [167]:
dir(idc)


['HTTPError',
 'IDLIST',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'built_id_list',
 'chromepath',
 'csv',
 'datetime',
 'from_csv',
 'generate_mapping_table',
 'idmapping',
 'main',
 'os',
 'show_mapping_table',
 'time_stamper',
 'url',
 'urllib',
 'webdriver']

In [7]:
!ls 

BioGridDataDec2018.ipynb mapping_table.csv        result_mapped.txt


In [8]:
os.getcwd()

'/Users/ken/Projects/Dec2018Notebooks'

In [168]:
!cat result_mapped.txt | cut -f 1,2 | head

yourlist:M201812188471C63D39733769F8E060B506551E1201F3EBJ	Entry
131072	Q86XK7
131079	Q5VW00
131082	Q6ZR62
131091	Q5GFL6
131092	Q8WY41
131098	A6NHT5
131110	Q6ZRI0
131112	P59894
131113	Q8IXP5


In [5]:
idc.show_mapping_table()

Scraping the web..
+----------------------------+------------------------+-----------+
|            Name            |      Abbreviation      | Direction |
+----------------------------+------------------------+-----------+
|      UniProtKB AC/ID       |         ACC+ID         |    from   |
|        UniProtKB AC        |          ACC           |    both   |
|        UniProtKB ID        |           ID           |    both   |
|          UniParc           |         UPARC          |    both   |
|          UniRef50          |          NF50          |    both   |
|          UniRef90          |          NF90          |    both   |
|         UniRef100          |         NF100          |    both   |
|         Gene name          |        GENENAME        |    both   |
|     EMBL/GenBank/DDBJ      |        EMBL_ID         |    both   |
|   EMBL/GenBank/DDBJ CDS    |          EMBL          |    both   |
|    Entrez Gene (GeneID)    |     P_ENTREZGENEID     |    both   |
|         GI number          

In [169]:
Datadir = '/Users/ken/Documents/From_KEN_MAC/2018Data'

In [170]:
biogridfile = os.path.join(Datadir, "BIOGRID-ALL-3.5.165.tab2.txt")

In [171]:
df = pd.read_csv(biogridfile, sep='\t')

  interactivity=interactivity, compiler=compiler, result=result)


In [9]:
df.head(2)

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,Pubmed ID,Organism Interactor A,Organism Interactor B,Throughput,Score,Modification,Phenotypes,Qualifications,Tags,Source Database
0,103,6416,2318,112315,108607,-,-,MAP2K4,FLNC,JNKK|JNKK1|MAPKK4|MEK4|MKK4|PRKMK4|SAPKK-1|SAP...,...,9006895,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
1,117,84665,88,124185,106603,-,-,MYPN,ACTN2,CMD1DD|CMH22|MYOP|RCM4,...,11309420,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID


## Get interactors

In [10]:
df.columns

Index(['#BioGRID Interaction ID', 'Entrez Gene Interactor A',
       'Entrez Gene Interactor B', 'BioGRID ID Interactor A',
       'BioGRID ID Interactor B', 'Systematic Name Interactor A',
       'Systematic Name Interactor B', 'Official Symbol Interactor A',
       'Official Symbol Interactor B', 'Synonyms Interactor A',
       'Synonyms Interactor B', 'Experimental System',
       'Experimental System Type', 'Author', 'Pubmed ID',
       'Organism Interactor A', 'Organism Interactor B', 'Throughput', 'Score',
       'Modification', 'Phenotypes', 'Qualifications', 'Tags',
       'Source Database'],
      dtype='object')

In [18]:
len(df['Organism Interactor A'].unique().tolist())

62

In [19]:
df.shape

(1583787, 24)

## Get Human Interactions

In [172]:
human_mask = (df['Organism Interactor A'] == 9606) & (df['Organism Interactor B'] == 9606)

In [173]:
df_human = df.loc[human_mask].reset_index()

In [174]:
df_human.shape

(409173, 25)

In [14]:
df_human.head(2)

Unnamed: 0,index,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,...,Pubmed ID,Organism Interactor A,Organism Interactor B,Throughput,Score,Modification,Phenotypes,Qualifications,Tags,Source Database
0,0,103,6416,2318,112315,108607,-,-,MAP2K4,FLNC,...,9006895,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
1,1,117,84665,88,124185,106603,-,-,MYPN,ACTN2,...,11309420,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID


In [15]:
df_human['Experimental System'].unique()

array(['Two-hybrid', 'Affinity Capture-Western', 'Reconstituted Complex',
       'FRET', 'Co-purification', 'Protein-peptide', 'Co-localization',
       'Affinity Capture-MS', 'Biochemical Activity',
       'Affinity Capture-RNA', 'Co-crystal Structure', 'Far Western',
       'Phenotypic Enhancement', 'Phenotypic Suppression',
       'Co-fractionation', 'Protein-RNA', 'Synthetic Rescue',
       'Affinity Capture-Luminescence', 'PCA', 'Dosage Rescue',
       'Negative Genetic', 'Dosage Lethality', 'Synthetic Growth Defect',
       'Proximity Label-MS', 'Synthetic Lethality', 'Positive Genetic'],
      dtype=object)

In [175]:
sub_mask = ['BioGRID ID Interactor A','BioGRID ID Interactor B', 'Experimental System Type']

In [176]:
df_human_exp = df_human.loc[:,sub_mask]

In [177]:
lsa = df_human_exp['BioGRID ID Interactor A'].unique().tolist()
lsb = df_human_exp['BioGRID ID Interactor B'].unique().tolist()

In [178]:
lsfull = set(lsa).union(set(lsb))

In [21]:
len(lsfull)

17299

In [179]:
lsboth = set(lsa).intersection(set(lsb))

In [180]:
lsBiogridProts = list(lsfull)[:]

In [181]:
len(lsboth)

12268

In [182]:
lsBiogridProts.__len__()

17299

## Break Long Lists into Chunks

In [183]:
def chunks(l, n=1000):
    for i in range(0, len(l), n):
        yield l[i:i+n]

In [192]:
lschunks = chunks(lsBiogridProts)
topchunks = (i[:10] for i in lschunks)

try:
    mapped = (idc.idmapping('BIOGRID_ID', 'ACC', lsids=j) for j in topchunks)
except KeyboardInterrupt as ke:
    print("Interrupted by the user")


In [193]:
next(mapped)

In [194]:
!ls

2018-12-18.21:00:00_result_mapped.txt mapping_table.csv
BioGridDataDec2018.ipynb              result_mapped.txt


## Process chunks

In [67]:
%%time
def process_chunks(idfro, idto, lstchunks):
    count = 0
    for chunk in lstchunks:
        print(len(chunk))
#         time.sleep(2)
        count +=1
#         yield len(chunk)
#         print("processing chunk_{}...".format(count))
# #         yield chunk[:10]
#         yield idc.idmapping(idfro,idto, lsids=chunk)
        

CPU times: user 6 µs, sys: 1 µs, total: 7 µs
Wall time: 14.1 µs


In [68]:
%%time
my_chunks =  process_chunks('BIOGRID_ID', 'ACC', lsChunks)

CPU times: user 6 µs, sys: 0 ns, total: 6 µs
Wall time: 11 µs


In [70]:
my_chunks

In [57]:
list(p)

[]

In [40]:
!ls

2018-12-18.17:21:23_result_mapped.txt mapping_table.csv
2018-12-18.21:00:00_result_mapped.txt result_mapped.txt
BioGridDataDec2018.ipynb


In [85]:
%%time
idc.idmapping('BIOGRID_ID','ACC', lsids=lsBiogridProts)

CPU times: user 27.5 ms, sys: 6.32 ms, total: 33.8 ms
Wall time: 4.05 s


In [73]:
!ls

BioGridDataDec2018.ipynb mapping_table.csv        result_mapped.txt


In [65]:
!cut -f 1,2 result_mapped.txt | head 

yourlist:M20181218E579DAD5D4BBAF98E82B06D27107170A049E1FK	Entry
131072	Q86XK7
131079	Q5VW00
131082	Q6ZR62
131091	Q5GFL6
131092	Q8WY41
131098	A6NHT5
131110	Q6ZRI0
131112	P59894
131113	Q8IXP5


In [109]:
" ".join(list(map(lambda x: str(x), IDLIST)))

'P13368 P20806 Q9UM73 P97793'

In [139]:
idc.idconversion.IDLIST

['P13368', 'P20806', 'Q9UM73', 'P97793']

In [217]:
la = (i for i in range(10))

In [211]:
next(la)

3

In [218]:
squares = (i**2 for i in la)

In [224]:
next(squares)

25