In [1]:
import string
import os, sys
from pathlib import Path
import urllib
# import urllib2
import datetime
import time
import shutil
from glob import glob
import gzip
import numpy as np
import pandas as pd
import re

import json
import pickle

from collections import defaultdict

In [2]:
data_root = Path('/home/yl986/data/HINT/')
source_root = data_root / 'uniprot_source/release_202401/knowledgebase'
output_root = data_root / 'outputs_2023'

### Continue to fill UniProt IDs

**Based on data sources from raw_interactions.txt**
```{python}
target_ids = ['BioGRID',
                'ChEMBL',
                'ComplexPortal',
                'DIP',
                'EMBL',
                'Ensembl',
                'EnsemblGenome',
                'GeneID',
                'PDB',
                'Reactome',
                'RefSeq']
```

**Prior steps**

* Run `parse_source_data.py` to process
    * FASTA files (`uniprot_sprot.fasta.gz`, `uniprot_sprot_varsplic.fasta.gz`, `uniprot_trembl.fasta.gz`) - to extract protein meta information
    * species file `docs/speclist.txt` to extract species taxonomy information
    * secondary-to-primary accession mapping file `docs/sec_ac.txt`
* Run `prepare_dataset.py` to prepare protein interaction data from sources of interest

* Run `create_idmapping.py`: parse `idmapping.dat.gz` file obtained from UniProt FTP site
    * output 1: `target_type_to_uprot.json` dictionary of ID mapping organized by ID type (if an ID is mapped to multiple UniProt IDs, all UniProt IDs will be kept & concatenated by `'|'`)
    * output 2: `prot_gene_info.tsv` descriptions for each UniProt ID columns: `(uprot | UniProtKB-ID | Gene_Name | Gene_ORFName | NCBI_TaxID)`
    * `target_result.json` mapping of target IDs to UniProt when available  
    Format: `"ID_TYPE|SOURCE_ID": "UNIPROT_ID1(|UNIPROT_ID2|UNIPROT_ID3...)"` (Example: '"DIP|DIP-17064N": "Q9TW27"')

    
    
<!-- * Run `map_ids.py`: map target IDs to UniProt IDs
    * Inputs: 
        * `accepted_id_dict.json`: mapping of ID names from original database to supported ID type keywords in `idmapping.dat.gz`
        * `target_type_to_uprot.json` reference for ID mapping organized by ID type
        * `mapping_targets_by_type.json`: target IDs to map organized by ID type
    * Outputs:
        * `target_result.json` mapping of target IDs to UniProt when available  
        
          Format: `"ID_TYPE|SOURCE_ID": "UNIPROT_ID1(|UNIPROT_ID2|UNIPROT_ID3...)"` (Example: '"DIP|DIP-17064N": "Q9TW27"') -->

* Load cache interaction file

In [None]:
raw_interactions1 = pd.read_csv(output_root / 'cache/raw_interactions_filled_partial.txt', sep='\t', dtype={'method': str})
raw_interactions1['method'] = raw_interactions1['method'].str.zfill(4)

* Load accepted ID to keyword mapping

In [None]:
with open(data_root / 'outputs_2023/cache/accepted_id_dict.json', 'r') as f:
    accept_id_map = json.load(f)

In [None]:
# Load FTP mapping result
with open(output_root / 'cache/target_result1.json') as f:
    target_map_dict = json.load(f)

In [None]:
# Load REST-API mapping result (consider to discard this step and only use FTP result)
df_map2 = pd.read_csv(output_root / 'cache/api_id_map_result.txt', sep='\t', names=['source_id', 'uprot'])
df_map2 = df_map2.groupby('source_id').agg({'uprot': lambda x: '|'.join(sorted(set(x)))}).reset_index()
df_map2['source_id'] = df_map2['source_id'].str.replace('RefSeq_Protein', 'RefSeq')
df_map2['source'] = df_map2['source_id'].apply(lambda x: x.split('|')[0])

# update target_map_dict
target_map_dict.update(dict(zip(df_map2['source_id'], df_map2['uprot'])))

In [None]:
# check remaining IDs to be mapped
# raw_interactions1[raw_interactions1['UniProt_A'].isna() | raw_interactions1['UniProt_B'].isna()]['source'].value_counts()

#### Map source ID to UniProt (multiple mapping exists)

In [None]:
def map_to_uniprot(itype, source_id, accept_id_map, target_map_dict, default_val=np.nan):
    """
    Look up the UniProt accession(s) mapped to a source identifier.

    The lookup key has the form ``"<ID_TYPE>|<source_id>"``. ``itype`` may be either a
    database-specific ID name (a key of ``accept_id_map``) or an already supported
    ID type keyword (a value of ``accept_id_map``).

    Args:
        itype: ID type of the source identifier; converted to ``str`` before lookup.
        source_id: identifier to map; converted to ``str`` when building the key.
        accept_id_map (dict): original ID name -> supported ID type keyword.
        target_map_dict (dict): ``"ID_TYPE|SOURCE_ID"`` -> UniProt accession string.
        default_val: value returned when the ID type is missing/unsupported or the
            key is not present in ``target_map_dict``.

    Returns:
        The value stored in ``target_map_dict`` for the key, or ``default_val``.
    """
    # Missing ID type (None / NaN): nothing to look up
    if itype is None or (isinstance(itype, float) and itype != itype):
        return default_val

    # Normalize once so the membership test and the lookup use the same key
    itype_key = str(itype)
    if itype_key in accept_id_map:
        id_type = str(accept_id_map[itype_key])
    elif itype_key in accept_id_map.values():
        id_type = itype_key
    else:
        return default_val
    return target_map_dict.get('|'.join([id_type, str(source_id)]), default_val)

In [None]:
# Fill still-missing UniProt IDs for interactor A, then interactor B, from the mapping dictionary
for side in ('A', 'B'):
    uprot_col = 'UniProt_' + side
    idtype_col = 'idtype_' + side
    id_col = 'id' + side

    idx = raw_interactions1[uprot_col].isna()  # rows that still lack a UniProt ID on this side
    raw_interactions1.loc[idx, uprot_col] = raw_interactions1.loc[idx].apply(
        lambda x: map_to_uniprot(x[idtype_col], x[id_col], accept_id_map, target_map_dict, x[uprot_col]),
        axis=1,
    )

In [None]:
# save cache if necessary
# raw_interactions1.to_csv(output_root / 'cache/raw_interactions_filled_all.txt', sep='\t', index=False)

In [None]:
# proceed with filled entries
raw_ppi_filled = raw_interactions1[raw_interactions1['UniProt_A'].notna() & raw_interactions1['UniProt_B'].notna()].reset_index(drop=True)

In [None]:
# Additional cleaning steps
# - taxa: drop a float-style suffix (keep the text before the first '.')
# - idA / idB: keep only the first whitespace-separated token
# The regex is a raw string: '\.' in a normal string literal is an invalid escape sequence.
if raw_ppi_filled['taxa'].str.contains(r'\.').any():
    raw_ppi_filled['taxa'] = raw_ppi_filled['taxa'].apply(lambda x: str(x).split('.')[0])
if raw_ppi_filled['idA'].str.contains(' ').any():
    raw_ppi_filled['idA'] = raw_ppi_filled['idA'].apply(lambda x: str(x).split(' ')[0].strip())
if raw_ppi_filled['idB'].str.contains(' ').any():
    raw_ppi_filled['idB'] = raw_ppi_filled['idB'].apply(lambda x: str(x).split(' ')[0].strip())

* save to cache

In [None]:
raw_ppi_filled.to_csv(output_root / 'cache/raw_interactions_filled_all.txt', index=False, sep='\t')

#### Re-format ID-to-UniProt mapping file (one protein ID each row)

In [None]:
df_prot = pd.concat([raw_interactions1[['idtype_A', 'idA', 'taxa', 'UniProt_A']].rename(columns={'idtype_A': 'idtype', 'idA': 'id', 'UniProt_A': 'UniProt'}), 
                     raw_interactions1[['idtype_B', 'idB', 'taxa', 'UniProt_B']].rename(columns={'idtype_B': 'idtype', 'idB': 'id', 'UniProt_B': 'UniProt'})])
df_prot = df_prot.dropna().drop_duplicates().reset_index(drop=True)

In [None]:
# Manual cleaning
# Coerce every taxonomy ID to str BEFORE building the '-' mask: `.str.contains` yields NaN
# for non-string values, and a mask containing NaN cannot be used with `.loc`.
taxa_str = df_prot['taxa'].astype(str)
has_dash = taxa_str.str.contains('-')

# clean up float format taxa ID (result from NaN in the column)
df_prot['taxa'] = taxa_str.apply(lambda x: x.split('.')[0])

# checked manually: P31939, Q9UBE0 are both human protein (reviewed)
df_prot.loc[has_dash, 'taxa'] = '9606'

* Split entries with joined UniProt IDs into multiple rows

In [None]:
# One row per (source ID, UniProt accession): split the '|'-joined accessions into a list,
# explode the list into rows, then restore the original column name.
df_prot = (
    df_prot
    .assign(uprot_lst=df_prot['UniProt'].str.split('|'))
    .explode('uprot_lst')
    .drop('UniProt', axis=1)
    .drop_duplicates()
    .rename(columns={'uprot_lst': 'UniProt'})
    .reset_index(drop=True)
)

In [None]:
# sanity check
df_prot.describe()

In [20]:
cleanup_idx = df_prot['UniProt'].str.contains(' ')
df_prot.loc[cleanup_idx, 'id'] = df_prot.loc[cleanup_idx, 'id'].apply(lambda x: x.strip().split(' ')[0])
df_prot.loc[cleanup_idx, 'UniProt'] = df_prot.loc[cleanup_idx, 'UniProt'].apply(lambda x: x.strip().split(' ')[0])

In [33]:
# save cache file
# df_prot.to_csv(output_root / 'cache/id_to_uniprot.txt', sep='\t', index=False)

#### Secondary accession file

In [34]:
# df_prot = pd.read_csv(output_root / 'cache/id_to_uniprot.txt', sep='\t')

In [24]:
df_prot['UniProt'].nunique()

199892

In [3]:
sec_merged = pd.read_csv(source_root / 'docs/cache/sec_acc_parsed.txt', sep='\t')

In [4]:
sec_merged.head()

Unnamed: 0,secondary,primary
0,A0A011PKA5,C7RW80
1,A0A011PPS3,C7RW54
2,A0A011Q4P3,C7RWC7
3,A0A011QYZ9,C7RW92
4,A0A016SR66,A0A0D6L478


In [12]:
sec_uprots = df_prot[df_prot['UniProt'].isin(sec_merged['secondary'])]['UniProt'].unique()
len(sec_uprots)

2741

In [26]:
# Attach the primary accession for every UniProt ID listed as a secondary accession;
# IDs without an entry in the secondary-accession table keep their own ID as `primary`.
sec_lookup = sec_merged.rename(columns={'secondary': 'UniProt'})
df_prot = df_prot.merge(sec_lookup, how='left').drop_duplicates()
df_prot['primary'] = df_prot['primary'].fillna(df_prot['UniProt'])

In [27]:
df_prot['primary'].nunique()

198432

In [37]:
df_prot.dtypes

idtype     object
id         object
taxa        int64
UniProt    object
primary    object
dtype: object

In [49]:
df_prot['primary_short'] = df_prot['primary'].apply(lambda x: x.split('-')[0])
df_prot['primary_short'].nunique()

185840

* save source ID to UniProt mapping file

In [50]:
df_prot.to_csv(data_root / 'outputs_2023/cache/id_to_uniprot.txt', sep='\t', index=False)

In [201]:
df_prot = pd.read_csv(data_root / 'outputs_2023/cache/id_to_uniprot.txt', sep='\t')

### Merge with protein description data

**Preparations**

* Generate `uniprot_descriptions.txt` (description for selected protein) from protein meta files (`query_uprot_desc.py`)
  * Input: 
    * `id_to_uniprot.txt`
    * protein meta files under $UNIPROT_KNOWLEDGEBASE/complete/meta

**Either run the following chunks or the python script**


In [None]:
def extract_uprot_meta_info(df_prot, source_path, out_path, pid_col='UniProt', meta_pid_col='UniProt', 
                            meta_prefix=['sprot', 'sprot_varsplic', 'trembl'], chunksize=1e6, 
                            output_name='uniprot_descriptions.txt'):
    """
    Extract description for selected protein IDs

    Scans the tab-separated meta files ``<prefix>_meta.txt`` under ``source_path`` in
    chunks, keeps the rows whose ``meta_pid_col`` is one of the requested IDs and writes
    them to ``out_path / output_name`` (header written once, later chunks appended).
    Scanning stops as soon as every requested ID has been found.

    Args:
        df_prot (pd.DataFrame): table holding the requested protein IDs.
        source_path (str | Path): directory containing the ``<prefix>_meta.txt`` files.
        out_path (str | Path): output directory (created if missing).
        pid_col (str): column of ``df_prot`` holding the requested IDs.
        meta_pid_col (str): ID column of the meta files.
        meta_prefix (list[str]): meta file prefixes, scanned in this order.
        chunksize (int | float): rows per chunk passed to ``pd.read_csv``.
        output_name (str): name of the output file.

    Returns:
        None. Progress is printed; results are written to disk.
    """
    source_path = Path(source_path)
    out_path = Path(out_path)
    out_path.mkdir(parents=True, exist_ok=True)

    prot_remain = set(df_prot[pid_col])
    print('# UniProt IDs:', len(prot_remain))

    out_file = out_path / output_name
    header = True
    mode = 'w'
    all_found = False
    for prefix in meta_prefix:
        meta_file = source_path / f'{prefix}_meta.txt'
        if not meta_file.exists():
            print('{} not found!'.format(str(meta_file)))
            continue

        # Context manager closes the underlying file even when we stop reading early
        with pd.read_csv(meta_file, sep='\t', chunksize=int(chunksize)) as chunks:
            for df_meta in chunks:
                records = df_meta[df_meta[meta_pid_col].isin(prot_remain)]
                records.to_csv(out_file, mode=mode, header=header, sep='\t', index=False)
                # Switch to append mode right after the first write so that no later
                # chunk or file can overwrite what has already been written
                header = False
                mode = 'a'
                prot_remain = prot_remain - set(records[meta_pid_col])
                print('# UniProt IDs remain:', len(prot_remain))
                if len(prot_remain) == 0:
                    all_found = True
                    break

        # Leave the outer loop too: the remaining meta files have nothing left to contribute
        if all_found:
            break

In [None]:
# Uncomment to run

# prot_meta_root = Path('/home/yl986/data/HINT/uniprot_source/release_202401/knowledgebase/complete/meta')
# df_prot = pd.read_csv(output_root / 'cache/id_to_uniprot.txt', sep='\t')
# output_cache_root = output_root / 'cache'

# extract_uprot_meta_info(df_prot, prot_meta_root, out_path=output_cache_root, pid_col='primary_short')

* Continue with processed uniprot description file

In [38]:
df_desc = pd.read_csv(output_root / 'cache/uniprot_descriptions.txt', sep='\t')

In [39]:
df_desc.head()

Unnamed: 0,tag,UniProt,name,species,taxa,gene_name,description
0,sp,P32234,128UP_DROME,Drosophila melanogaster,7227,128up,GTP-binding protein 128up
1,sp,Q8GBW6,12S_PROFR,Propionibacterium freudenreichii subsp. shermanii,1752,,Methylmalonyl-CoA carboxyltransferase 12S subunit
2,sp,P81928,140U_DROME,Drosophila melanogaster,7227,140up,RPII140-upstream gene protein
3,sp,P48347,14310_ARATH,Arabidopsis thaliana,3702,GRF10,14-3-3-like protein GF14 epsilon
4,sp,Q9S9Z8,14311_ARATH,Arabidopsis thaliana,3702,GRF11,14-3-3-like protein GF14 omicron


#### Curate species information

**Note**

Parsed species information from `parse_source_data.py` needs further processing to merge "redundant" taxonomy IDs

In [57]:
df_species = pd.read_csv(source_root / 'docs/cache/species_parsed.txt', sep='\t')

In [58]:
df_species['taxa'] = df_species['taxa'].astype(str)

In [61]:
df_species['taxa'].isna().any()

False

In [62]:
# Rule-based curation (consider revising later)
# Default rule: short name = scientific name up to the first '(' (whitespace trimmed)
df_species['scientific_name_short'] = df_species['scientific_name'].map(
    lambda name: name.partition('(')[0].strip()
)

# Manual overrides: collapse all E. coli entries and all entries whose common name is "Rice"
ecoli_idx = df_species['scientific_name'].str.startswith('Escherichia coli')
rice_idx = df_species['common_name'] == 'Rice'
df_species.loc[ecoli_idx, 'scientific_name_short'] = 'Escherichia coli'
df_species.loc[rice_idx, 'scientific_name_short'] = 'Oryza sativa'

In [67]:
# Build the aggregated taxonomy-ID mapping: every taxonomy ID that shares a short
# scientific name with other IDs is mapped to the smallest ID of that group.

# sorted list of integer taxonomy IDs per short scientific name
df_spe_agg = (
    df_species
    .astype({'taxa': int})
    .groupby('scientific_name_short')
    .agg({'taxa': lambda ids: sorted(ids)})
    .reset_index()
)
df_spe_agg['n_taxa'] = df_spe_agg['taxa'].map(len)

# keep only names with more than one taxonomy ID; the first (smallest) ID is the unified one
df_multi = df_spe_agg[df_spe_agg['n_taxa'] > 1].reset_index(drop=True)
df_multi['taxa_unified'] = df_multi['taxa'].map(lambda ids: ids[0])
df_multi = df_multi.explode('taxa').reset_index(drop=True)

# string -> string mapping so it can be serialized to JSON as-is
taxa_map_dict = {
    str(tid): str(unified)
    for tid, unified in zip(df_multi['taxa'], df_multi['taxa_unified'])
}

In [68]:
len(taxa_map_dict)

4493

* save taxonomy mapping file

In [None]:
with open(output_root / 'cache/taxa_map_dict.json', 'w') as f:
    json.dump(taxa_map_dict, f, indent=2)

#### Continue to clean up protein mapping & description

In [209]:
df_prot['taxa'].dtype

dtype('int64')

In [210]:
df_prot['taxa'].nunique()

1687

In [214]:
# Revise taxa ID first: replace each taxonomy ID (as a string) by its unified ID when
# one exists in `taxa_map_dict`, otherwise keep it. Applied to the ID mapping table and
# to the protein descriptions; the number of distinct revised IDs is printed for each.
for frame in (df_prot, df_desc):
    frame['taxa_rev'] = frame['taxa'].astype(str).map(lambda tid: taxa_map_dict.get(tid, tid))
    print(frame['taxa_rev'].nunique())

1418
1370


In [227]:
# Merge by UniProt AND unified taxa

pid_col = 'primary_short'

df_prot_info = df_prot[['idtype', 'id', 'taxa_rev', 'UniProt', 'primary_short']].rename(columns={'UniProt': 'UniProt_orig'}).\
                merge(df_desc, left_on=['primary_short', 'taxa_rev'], right_on=['UniProt', 'taxa_rev'], how='left').drop(columns=['UniProt', 'taxa']).\
                rename(columns={'taxa_rev': 'taxa'}).drop_duplicates().reset_index(drop=True)

In [239]:
df_prot_info['taxa'].nunique()

1418

In [251]:
df_prot_info.shape

(301635, 11)

In [333]:
df_prot_info_fil = df_prot_info.dropna(subset='name').drop_duplicates().reset_index(drop=True)
df_prot_info_fil['taxa'].nunique()

1300

In [334]:
df_prot_info_fil.shape

(296276, 11)

In [335]:
# Goal: filtering to keep only mappings with consistent taxonomy
# taxa_x: taxonomy ID in source (reference)
# taxa_y: taxonomy ID from id mapping file generated by `merge` operation
#         (may not be accurate because some secondary IDs map to multiple primary IDs regardless of taxonomy information)
# df_prot_info['taxa_x'] = df_prot_info['taxa_x'].apply(lambda x: taxa_map_dict[str(x)] if x in taxa_map_dict else x)
# df_prot_info['taxa_y'] = df_prot_info['taxa_y'].apply(lambda x: taxa_map_dict[str(x)] if x in taxa_map_dict else x)

In [336]:
# remove inconsistent taxa entries
# df_prot_info_fil = df_prot_info.query('taxa_x == taxa_y')
# df_prot_info_fil = df_prot_info_fil.rename(columns={'taxa_x': 'taxa'}).drop('taxa_y', axis=1)

# df_prot_info_fil['primary_short'].nunique()

In [337]:
df_prot_info_fil['is_reviewed'] = (df_prot_info_fil['tag'] == 'sp').astype(int)

In [259]:
df_prot_info_fil.head()

Unnamed: 0,idtype,id,taxa,UniProt_orig,primary_short,tag,name,species,gene_name,description,is_reviewed
0,DIP,DIP-445N,10090,P46414,P46414,sp,CDN1B_MOUSE,Mus musculus,Cdkn1b,Cyclin-dependent kinase inhibitor 1B,1
1,DIP,DIP-617N,9606,P01730,P01730,sp,CD4_HUMAN,Homo sapiens,CD4,T-cell surface glycoprotein CD4,1
2,DIP,DIP-1025N,562,P00968,P00968,sp,CARB_ECOLI,Escherichia coli (strain K12),carB,Carbamoyl phosphate synthase large chain,1
3,DIP,DIP-19N,7227,P07713,P07713,sp,DECA_DROME,Drosophila melanogaster,dpp,Protein decapentaplegic,1
4,DIP,DIP-25N,4932,P09798,P09798,sp,CDC16_YEAST,Saccharomyces cerevisiae (strain ATCC 204508 /...,CDC16,Anaphase-promoting complex subunit CDC16,1


In [None]:
# (accession, reviewed flag) pair for each mapping row.
# The description-side 'UniProt' column is dropped right after the merge that builds
# `df_prot_info`; 'primary_short' is the key it was joined on, so it holds the same accession.
df_prot_info_fil['info_tup'] = df_prot_info_fil.apply(lambda x: (x['primary_short'], x['is_reviewed']), axis=1)

In [338]:
df_prot_info_fil[df_prot_info_fil['gene_name'].isna()]['UniProt_orig'].nunique()

5916

In [339]:
df_prot_info_fil[~df_prot_info_fil['UniProt_orig'].isin(sec_merged['secondary']) & 
                 (df_prot_info_fil['UniProt_orig'] != df_prot_info_fil['primary_short']) & 
                 ~df_prot_info_fil['UniProt_orig'].str.contains('-')].drop_duplicates('UniProt_orig')

Unnamed: 0,idtype,id,taxa,UniProt_orig,primary_short,tag,name,species,gene_name,description,is_reviewed


In [340]:
na_gene_idx = df_prot_info_fil['gene_name'].isna()
is_sec = df_prot_info_fil['UniProt_orig'].isin(sec_merged['secondary'])
is_isoform = df_prot_info_fil['UniProt_orig'].str.contains('-')

In [264]:
# sanity check
# df_prot_info_fil[na_gene_idx & is_sec]

In [266]:
# sanity check
# df_prot_info_fil[na_gene_idx & is_isoform]

#### Reduce to best UniProt IDs by gene

##### Fill empty gene name with UniProt ID

From sanity check:
* UniProt isoforms without gene info are all viral proteins (maybe keep original UniProt ID with suffix later) -- use prefix for now
* secondary accessions without gene info: use primary accession
* others: use primary accessions

**Note**
For UniProt IDs with suffix ('-PRO', isoform...), now we fill empty gene name with only the prefix (may want to use the full UniProt name in final display)

In [341]:
# Gene name with gaps filled: rows flagged by `na_gene_idx` take the (suffix-free) primary accession
df_prot_info_fil['gene_name_fil'] = df_prot_info_fil['gene_name'].mask(
    na_gene_idx, df_prot_info_fil['primary_short']
)
# df_prot_info_fil.loc[na_gene_idx & is_isoform, 'gene_name_fil'] = df_prot_info_fil.loc[na_gene_idx, 'UniProt_orig']

##### Find best UniProt ID for each gene

In [342]:
df_prot_info_fil['gene_name_fil'].nunique()

123841

In [343]:
df_prot_info_fil['name'].isna().any()

False

In [344]:
df_prot_info_fil['id'].nunique()

232903

In [332]:
# df_prot_info_fil = df_prot_info_fil.sort_values(['gene_name_fil', 'taxa', 'is_reviewed'], ascending=[True, True, False])

In [154]:
# best_by_id = df_prot_info_fil.sort_values(['id','taxa', 'is_reviewed', 'gene_name_fil'], ascending=[True, True, False, True]).drop_duplicates(['id', 'taxa'], keep='first')

In [345]:
# One representative row per (filled gene name, taxon): rows are ordered by gene name and
# taxon with reviewed entries first, and the first row of each group in that order is kept.
# Note the sort uses `gene_name` while the de-duplication key uses `gene_name_fil`.
best_by_gene = (
    df_prot_info_fil
    .sort_values(by=['gene_name', 'taxa', 'is_reviewed'], ascending=[True, True, False])
    .drop_duplicates(subset=['gene_name_fil', 'taxa'], keep='first')
)

In [346]:
best_by_gene['is_reviewed'].value_counts()

1    74023
0    65285
Name: is_reviewed, dtype: int64

In [277]:
best_by_gene[['taxa', 'gene_name_fil', 'primary_short']].astype({'taxa': str}).describe()

Unnamed: 0,taxa,gene_name_fil,primary_short
count,139308,139308,139308
unique,1300,123841,139308
top,9606,N,P03775
freq,24231,21,1


In [281]:
# Nested lookup: taxon -> {filled gene name -> best (suffix-free) primary accession}
gene2best_uprot_bytaxa = defaultdict(dict)

# sort=False keeps taxa in order of first appearance, as with iterating over `.unique()`
for cur_tid, df_taxa in best_by_gene.groupby('taxa', sort=False):
    gene2best_uprot_bytaxa[cur_tid] = dict(zip(df_taxa['gene_name_fil'], df_taxa['primary_short']))

In [288]:
with open(output_root / 'cache/gene2best_uprot_bytaxa1.json', 'w') as f:
    json.dump(gene2best_uprot_bytaxa, f, indent=2)

In [289]:
len(set(taxa_map_dict.values()))

1262

In [347]:
df_prot_info_fil['best_uprot'] = df_prot_info_fil.apply(lambda x: gene2best_uprot_bytaxa[x['taxa']][x['gene_name_fil']], axis=1)

In [348]:
df_prot_info_fil.drop_duplicates().to_csv(output_root / 'cache/mapped_to_best_uprot.txt', sep='\t', index=False)

### Apply best UniProt mapping to interactions

In [None]:
# Load supporting documents from cache (if necessary)
# Everything the remaining sections read is loaded here so they can be run from a fresh kernel.

# taxa ID mapping
with open(output_root / 'cache/taxa_map_dict.json', 'r') as f:
    taxa_map_dict = json.load(f)

# best UniProt ID mapping (by gene and taxa)
with open(output_root / 'cache/gene2best_uprot_bytaxa1.json', 'r') as f:
    gene2best_uprot_bytaxa = json.load(f)

# species information
df_species = pd.read_csv(source_root / 'docs/cache/species_parsed.txt', sep='\t')

# secondary-to-primary accession mapping (used below to flag secondary accessions)
sec_merged = pd.read_csv(source_root / 'docs/cache/sec_acc_parsed.txt', sep='\t')

# protein descriptions
df_desc = pd.read_csv(output_root / 'cache/uniprot_descriptions.txt', sep='\t')

# ID mapping (to best UniProt)
df_prot_info_fil = pd.read_csv(output_root / 'cache/mapped_to_best_uprot.txt', sep='\t', 
                            dtype={'id': str, 'taxa': str, 'gene_name': str, 'gene_name_fil': str})

# Raw interactions with filled UniProt (orig)
# idA / idB are read as str as well: the cleaning steps below use the `.str` accessor on them
raw_ppi_filled = pd.read_csv(output_root / 'cache/raw_interactions_filled_all.txt', sep='\t',
                             dtype={'method': str, 'idA': str, 'idB': str, 'taxa': str, 'publ': str})

In [319]:
df_prot_info_fil = pd.read_csv(output_root / 'cache/mapped_to_best_uprot.txt', sep='\t', 
                            dtype={'id': str, 'taxa': str, 'gene_name': str, 'gene_name_fil': str})

In [301]:
dtype_dict = {'method': str, 'idA': str, 'idB': str, 'taxa': str, 'publ': str}

raw_ppi_filled = pd.read_csv(output_root / 'cache/raw_interactions_filled_all.txt', sep='\t', dtype=dtype_dict)

In [292]:
raw_ppi_filled['taxa'].isna().any()

False

In [349]:
df_prot_info_fil['best_uprot'].nunique()

139308

In [None]:
# Additional cleaning steps
# - taxa: drop a float-style suffix (keep the text before the first '.')
# - idA / idB: keep only the first whitespace-separated token
# The regex is a raw string: '\.' in a normal string literal is an invalid escape sequence.
if raw_ppi_filled['taxa'].str.contains(r'\.').any():
    raw_ppi_filled['taxa'] = raw_ppi_filled['taxa'].apply(lambda x: str(x).split('.')[0])
if raw_ppi_filled['idA'].str.contains(' ').any():
    raw_ppi_filled['idA'] = raw_ppi_filled['idA'].apply(lambda x: str(x).split(' ')[0].strip())
if raw_ppi_filled['idB'].str.contains(' ').any():
    raw_ppi_filled['idB'] = raw_ppi_filled['idB'].apply(lambda x: str(x).split(' ')[0].strip())

Unnamed: 0,source,row_number,idA,idB,method,publ,taxa,idtype_A,idtype_B,UniProt_A,UniProt_B


In [295]:
# raw_ppi_filled.drop('row_number', axis=1).describe()

In [305]:
# check secondary accessions
# is_sec = df_prot_info_fil['UniProt_orig'].isin(sec_merged['secondary'])
# df_prot_info_fil[is_sec]

#### Revise for isoform

In [350]:
# Row flags: the original accession carries a '-' suffix / is listed as a secondary accession
is_isoform = df_prot_info_fil['UniProt_orig'].str.contains('-')
is_sec = df_prot_info_fil['UniProt_orig'].isin(sec_merged['secondary'])

# Start from the gene-level best accession, then restore the full original ID for suffixed entries
df_prot_info_fil['best_uprot_full'] = df_prot_info_fil['best_uprot']
df_prot_info_fil.loc[is_isoform, 'best_uprot_full'] = df_prot_info_fil.loc[is_isoform, 'UniProt_orig']

# use original UniProt ID when given in source
uprot_source_idx = df_prot_info_fil['idtype'].str.contains('uniprot') & is_isoform
# Same mask on both sides of the assignment instead of relying on index alignment.
# NOTE(review): this selection is a subset of `is_isoform`, so it currently repeats the
# assignment above -- confirm whether secondary accessions were meant to be handled differently.
keep_orig_idx = uprot_source_idx & ~is_sec
df_prot_info_fil.loc[keep_orig_idx, 'best_uprot_full'] = df_prot_info_fil.loc[keep_orig_idx, 'UniProt_orig']

In [351]:
df_prot_info_fil.query('id == "NP_536350.2"')

Unnamed: 0,idtype,id,taxa,UniProt_orig,primary_short,tag,name,species,gene_name,description,is_reviewed,gene_name_fil,best_uprot,best_uprot_full
206255,RefSeq,NP_536350.2,9606,P63092,P63092,sp,GNAS2_HUMAN,Homo sapiens,GNAS,Guanine nucleotide-binding protein G(s) subuni...,1,GNAS,P84996,P84996
206256,RefSeq,NP_536350.2,9606,Q5JWF2-1,Q5JWF2,sp,GNAS1_HUMAN,Homo sapiens,GNAS,Guanine nucleotide-binding protein G(s) subuni...,1,GNAS,P84996,Q5JWF2-1


* reduce to unique ID mapping (some raw IDs map to multiple gene names --> multiple UniProt IDs)

In [354]:
# Ranking flags used when reducing to one mapping per source ID:
# - same_uprot: 1 when the suffix-free primary accession equals the selected full accession
#   NOTE(review): `primary_short` never carries a '-' suffix, so suffixed `best_uprot_full`
#   values always get 0 here -- confirm this is the intended comparison.
# - is_canonical: True when the selected accession has no '-' suffix
df_prot_info_fil['same_uprot'] = (
    df_prot_info_fil['primary_short'].eq(df_prot_info_fil['best_uprot_full']).astype(int)
)
has_suffix = df_prot_info_fil['best_uprot_full'].str.contains('-')
df_prot_info_fil['is_canonical'] = ~has_suffix

In [356]:
to_best_uprot = df_prot_info_fil.sort_values(['id', 'taxa', 'is_reviewed', 'same_uprot', 'is_canonical', 'best_uprot_full'],
                                          ascending=[True, True, False, False, False, False]).drop_duplicates(['idtype', 'id', 'taxa'], keep='first').reset_index(drop=True)

* save cache file

In [361]:
to_best_uprot.to_csv(output_root / 'cache/id_to_best_uprot_uniq.txt', index=False, sep='\t')

In [None]:
all_taxa = to_best_uprot['taxa'].unique()

# taxon -> {source ID -> best UniProt accession}
# NOTE(review): values come from `best_uprot` (not `best_uprot_full`) and keys ignore `idtype`
# -- confirm this is what consumers of this dictionary expect.
uniq_id_mapping_by_taxa = dict()

# sort=False keeps taxa in order of first appearance, matching `all_taxa`
for taxa, df_cur in to_best_uprot.groupby('taxa', sort=False):
    uniq_id_mapping_by_taxa[taxa] = dict(zip(df_cur['id'], df_cur['best_uprot']))

#### Revise taxonomy ID in raw interaction data

In [379]:
print(raw_ppi_filled[raw_ppi_filled['taxa'] == '-']['row_number'].nunique())
raw_ppi_filled = raw_ppi_filled.query('taxa != "-"').reset_index(drop=True)
raw_ppi_filled['taxa_rev'] = raw_ppi_filled['taxa'].apply(lambda x: int(taxa_map_dict.get(str(x), x))).astype(str)

0


In [369]:
to_best_uprot.query('best_uprot_full == "X2JAU8"').iloc[0]['taxa']

'7227'

In [371]:
raw_ppi_filled.query('row_number == 1533613').iloc[0]['taxa_rev']

7227

In [380]:
raw_ppi_filled['taxa_rev'].dtype

dtype('O')

In [381]:
cols = ['idtype', 'id', 'taxa', 'gene_name', 'name', 'best_uprot_full']
ppi_merge = raw_ppi_filled.drop('taxa', axis=1).rename(columns={'taxa_rev': 'taxa'}).\
                merge(to_best_uprot[cols], left_on=['idtype_A', 'idA', 'taxa'], right_on=['idtype', 'id', 'taxa'], how='left').\
                rename(columns={'best_uprot_full': 'best_uprotA', 'gene_name': 'gene_name_A', 'name': 'entry_name_A'}).drop(columns=['idtype', 'id'])
ppi_merge = ppi_merge.merge(to_best_uprot[cols], left_on=['idtype_B', 'idB', 'taxa'], right_on=['idtype', 'id', 'taxa'], how='left').\
                rename(columns={'best_uprot_full': 'best_uprotB', 'gene_name': 'gene_name_B', 'name': 'entry_name_B'}).drop(columns=['idtype', 'id'])

In [382]:
ppi_merge[ppi_merge['best_uprotA'].notna() & ppi_merge['best_uprotB'].notna()].drop_duplicates(['best_uprotA', 'best_uprotB']).shape

(2460651, 17)

### Apply filtering conditions

In [None]:
# dtype_dict = {'method': str, 'idA': str, 'idB': str, 'gene_name_A': str, 'gene_name_B': str}
# ppi_merge = pd.read_csv(output_root / 'cache/raw_interactome_draft.txt', sep='\t', dtype=dtype_dict)
# print(ppi_merge.shape[0])

In [383]:
ppi_merge.shape

(4827853, 17)

In [384]:
print("Missing raw ID:", ppi_merge[ppi_merge['idA'].isna() | ppi_merge['idB'].isna()].shape[0])
ppi_merge = ppi_merge[ppi_merge['idA'].notna() & ppi_merge['idB'].notna()].reset_index(drop=True)
ppi_merge.shape

Missing raw ID: 26


(4827827, 17)

In [389]:
ppi_merge.to_csv(output_root / 'cache/raw_interactome_draft.txt', index=False, sep='\t')

In [385]:
# sanity check for missing taxonomy ID
ppi_merge['taxa'].isna().any()

False

#### Proceed with interactions having complete UniProt information

In [386]:
# interactions with complete UniProt information
ppi_complete = ppi_merge[ppi_merge['best_uprotA'].notna() & ppi_merge['best_uprotB'].notna()].reset_index(drop=True)
print(ppi_complete.shape[0])

4415609


In [387]:
# Unique interaction ID: the two accessions joined by ':' in sorted order, so that
# A:B and B:A collapse to the same key.
uprot_a = ppi_complete['best_uprotA']
uprot_b = ppi_complete['best_uprotB']
ppi_complete['ppi'] = np.where(uprot_a <= uprot_b, uprot_a + ':' + uprot_b, uprot_b + ':' + uprot_a)

In [388]:
ppi_complete['ppi'].nunique()

2117113

* Clean up evidence codes (remove punctuation)

In [391]:
pat = re.compile("[" + re.escape(string.punctuation) + "]")

In [392]:
# sanity check
ppi_complete['method'].str.contains(pat).any()

True

In [393]:
ppi_complete['method'] = ppi_complete['method'].apply(lambda x: re.sub(pat, '', x))

In [406]:
# some contains space and description
ppi_complete[ppi_complete['method'].str.len() > 4]['method'].unique()

array(['0055 fluore', '0012 biolumine'], dtype=object)

In [407]:
ppi_complete.loc[ppi_complete['method'].str.len() > 4, 'method'] = ppi_complete.loc[ppi_complete['method'].str.len() > 4, 'method'].apply(lambda x: x.split(' ')[0])

* Cleanup publication ID

In [395]:
ppi_complete['publ'].str.contains(':').any()

True

In [398]:
# additional cleaning for some entries (from DIP, e.g. rtd:11250202, these are pubmed ID)
ppi_complete.loc[ppi_complete['publ'].str.contains(':'), 'publ'] = ppi_complete.loc[ppi_complete['publ'].str.contains(':'), 'publ'].apply(lambda x: x.split(':')[1])

#### Unique interactions

In [408]:
ppi_uniq = ppi_complete.drop_duplicates(['source', 'ppi', 'taxa', 'publ', 'method']).reset_index(drop=True)
ppi_uniq.shape

(3843352, 18)

In [400]:
# sanity check: any interactions with multiple taxa ID?
ppi_uniq.groupby('ppi')['taxa'].nunique().max()

1

In [409]:
ppi_uniq['ppi'].nunique()

2117113

In [410]:
ppi_uniq['pmid_method'] = ppi_uniq.apply(lambda x: '{}:{}'.format(x['publ'], x['method']), axis=1)

In [411]:
ppi_uniq.to_csv(output_root / 'cache/raw_interactome_merge_uniq.txt', index=False, sep='\t')

### Prepare for output

In [412]:
# One row per interaction: distinct sources and distinct "pmid:method" evidence strings,
# each sorted and joined with '|'
join_sorted_unique = lambda values: '|'.join(sorted(set(values)))

ppi_agg = (
    ppi_uniq
    .groupby('ppi')
    .agg({'source': join_sorted_unique, 'pmid_method': join_sorted_unique})
    .reset_index()
)

In [413]:
ppi_agg.head()

Unnamed: 0,ppi,source,pmid_method
0,A0A021WW32:A0A021WW32,BioGrid,27996020:0004
1,A0A021WW32:A1Z8S6,BioGrid,25588834:0004
2,A0A021WW32:D6W4X5,IntAct,37061542:0397|37061542:1112
3,A0A021WW32:M9PEY7,BioGrid,27590505:0254
4,A0A021WW32:P05205,BioGrid,24990964:0004


In [414]:
ppi_agg = ppi_agg.merge(ppi_uniq[['ppi', 'taxa']].drop_duplicates()).drop_duplicates().reset_index(drop=True)
ppi_agg.shape

(2117113, 4)

In [415]:
ppi_agg['UniProt_A'], ppi_agg['UniProt_B'] = zip(*ppi_agg['ppi'].apply(lambda x: x.split(':')))

In [416]:
# raw interactome data
cols = ['source', 'UniProt_A', 'UniProt_B', 'taxa', 'pmid_method']
ppi_agg[cols].sort_values(['UniProt_A', 'UniProt_B']).to_csv(output_root / 'raw_interactome.txt', index=False, sep='\t')

#### Assign interaction type and quality

##### Assign binary / co-complex by publication and evidence code

In [417]:
ev_ref = pd.read_csv(data_root / 'data/evidence_code_20231120.txt', sep='\t', names=['code', 'name', 'group'], dtype={'code': str})
ev_ref['code'] = ev_ref['code'].str.zfill(4)

* Generate evidence code summary file
--- number of interactions per publication per evidence code

In [418]:
# Label every record with the group ("binary" / "co-complex") of its evidence code
binary_idx = ppi_uniq['method'].isin(ev_ref.loc[ev_ref['group'] == 'binary', 'code'])
cocomp_idx = ppi_uniq['method'].isin(ev_ref.loc[ev_ref['group'] == 'co-complex', 'code'])

for label, label_idx in (('binary', binary_idx), ('co-complex', cocomp_idx)):
    ppi_uniq.loc[label_idx, 'type'] = label

# Per group: number of distinct interactions for each (publication:method, taxon),
# largest first, written to one summary file per group
for grp in ['binary', 'co-complex']:
    ppi_grp = ppi_uniq[ppi_uniq['type'] == grp]
    pub_stats = (
        ppi_grp
        .groupby(['pmid_method', 'taxa'])['ppi']
        .nunique()
        .sort_values(ascending=False)
        .reset_index()
    )
    # split "pmid:method" back into its last two ':'-separated fields
    pmid_parts = pub_stats['pmid_method'].str.split(':')
    pub_stats['publ'] = pmid_parts.str[-2]
    pub_stats['method'] = pmid_parts.str[-1]

    out_name = 'cache/publication_count_{}_with_evi_code.txt'.format(grp.replace('-', ''))
    pub_stats.to_csv(output_root / out_name, sep='\t', index=False)

* Some evidence codes can be either binary or co-complex - evaluate by publication

In [419]:
binary_pub_stats = pd.read_csv(output_root / 'cache/publication_count_binary_with_evi_code.txt', sep='\t', dtype={'method': str})

In [421]:
# Evidence codes that can denote either a binary or a co-complex interaction
ev_code_both = ['{:04d}'.format(code) for code in (6, 7, 19, 59, 61, 75, 96)]
# A publication/method pair with more interactions than this is treated as co-complex
max_ppi = 25

to_cocomp = binary_pub_stats['method'].isin(ev_code_both) & binary_pub_stats['ppi'].gt(max_ppi)
# Note: the entries are "pmid:method" strings (the `pmid_method` column), not bare publication IDs
to_cocomp_publ = binary_pub_stats.loc[to_cocomp, 'pmid_method'].drop_duplicates().tolist()

**Curate publications**

* Binary: (1 OR 2) AND 3
  1. evidence code is binary
  2. evidence code = "0492" or "0493" for HPRD source
  3. publication NOT in blacklist and NOT forced to co-complex

* Co-complex: 1 AND 2
  1. evidence code is co-complex or publication forced as co-complex
  2. publication NOT in blacklist

In [24]:
# Collect the tab-separated records of every publication "bouncer" list
flist = sorted((data_root / 'data/bouncer').glob('*.txt'))  # sorted for a reproducible order

publication_bouncer = []
for bouncer_file in flist:
    # read_text() opens and closes the file, so no handle is left dangling
    for line in bouncer_file.read_text().strip().split('\n'):
        line = line.strip()
        if line:  # skip blank lines (and empty files) instead of emitting a [''] record
            publication_bouncer.append(line.split('\t'))

In [26]:
# Status code (third field of a bouncer record) -> name of the publication set it belongs to
key_map = {
    '1': 'forced_binary',
    '-1': 'pub_blacklist',
    '*': 'pub_blacklist',
    '0': 'forced_cocomp',
    '2': 'general_forced_hq',
    '-2': 'binary_blacklist_forced_cocomplex',
}

# set name -> publication IDs; records with an unrecognized status are ignored
special_cases = defaultdict(set)
for pub, _, status in publication_bouncer:
    status = status.strip()
    category = key_map.get(status)
    if category is None and '*' in status:
        # any other status containing '*' is also blacklisted
        category = 'pub_blacklist'
    if category is not None:
        special_cases[category].add(pub.strip())

In [None]:
def is_binary_pub(pub_id, ev_code, source, exclude_list, binary_methods):
    """
    Whether a (publication, evidence code, source) record counts as binary evidence.

    A record is binary when its publication is not in `exclude_list` and either
    its zero-padded evidence code is in `binary_methods`, or the code is
    "0492"/"0493" and the source is HPRD (case-insensitive).
    """
    if pub_id in exclude_list:
        return False

    code = str(ev_code).zfill(4)
    # `source` is only inspected for the HPRD-specific codes.
    return code in binary_methods or (code in ['0492', '0493'] and source.upper() == 'HPRD')

def is_cocomp_pub(pub_id, ev_code, blacklist, bin_forced_cocomp, cocomp_methods):
    """
    Whether a (publication, evidence code) record counts as co-complex evidence.

    A record is co-complex when its publication is not in `blacklist` and either
    its evidence code is in `cocomp_methods` or the publication is in
    `bin_forced_cocomp`.

    The evidence code is matched both as given and zero-padded to 4 characters
    (the normalization `is_binary_pub` applies), so an integer or unpadded code
    such as 4 / "4" matches a "0004" entry.
    """
    if pub_id in blacklist:
        return False

    padded_code = str(ev_code).zfill(4)
    if ev_code in cocomp_methods or padded_code in cocomp_methods:
        return True
    return pub_id in bin_forced_cocomp

In [None]:
# Evidence codes per interaction type, taken from the evidence-code reference table `ev_ref`.
binary_code = ev_ref.query('group == "binary"')['code'].tolist()
cocomp_code = ev_ref.query('group == "co-complex"')['code'].tolist()

# Publications that must not count as binary: blacklisted ones plus those forced to co-complex.
binary_exclude = list(special_cases['pub_blacklist']) + list(special_cases['binary_blacklist_forced_cocomplex'])

# The classifiers are called once per row, so give them sets (constant-time
# membership) instead of scanning the lists above for every record.
binary_exclude_set = set(binary_exclude)
binary_code_set = set(binary_code)
cocomp_code_set = set(cocomp_code)

ppi_uniq['is_binary_pub'] = ppi_uniq.apply(
    lambda x: is_binary_pub(x['publ'], x['method'], x['source'], binary_exclude_set, binary_code_set), axis=1)
ppi_uniq['is_cocomp_pub'] = ppi_uniq.apply(
    lambda x: is_cocomp_pub(x['publ'], x['method'], special_cases['pub_blacklist'],
                            special_cases['binary_blacklist_forced_cocomplex'], cocomp_code_set), axis=1)

##### High-throughput (HT) vs literature-curated (LC)

**Proceed with only binary or co-complex interactions**

* high-throughput
  * binary
    * **binary** publication AND publication in `forced_binary`
  * co-complex
    * **co-complex** publication AND (publication in `forced_cocomp` OR publication selected as cocomplex by cutoff)
* literature curated otherwise

In [None]:
def assign_quality(record, cocomp_pub_by_cutoff, forced_binary, forced_cocomp):
    """
    Label one record as high-throughput ('HT') or literature-curated ('LC').

    * binary record: 'HT' if its publication is in `forced_binary`, else 'LC'
    * co-complex record: 'HT' if its publication is in `forced_cocomp` or in
      `cocomp_pub_by_cutoff`, else 'LC'
    * neither binary nor co-complex: NaN

    The binary flag is checked first, so it wins if both flags are set.
    """
    if record['is_binary_pub']:
        return 'HT' if record['publ'] in forced_binary else 'LC'

    if not record['is_cocomp_pub']:
        return np.nan

    is_high_throughput = record['publ'] in forced_cocomp or record['publ'] in cocomp_pub_by_cutoff
    return 'HT' if is_high_throughput else 'LC'

In [None]:
# ppi_uniq['quality'] = ppi_uniq.apply(lambda x: assign_quality(x, to_cocomp_publ, special_cases['forced_binary'], special_cases['forced_cocomp']), axis=1)
# ppi_uniq.groupby(['type', 'quality'])['ppi'].nunique()

In [None]:
# proceed with binary / co-complex interactions
ppi_uniq_bincocomp = ppi_uniq[ppi_uniq['is_binary_pub'] | ppi_uniq['is_cocomp_pub']].reset_index(drop=True)
ppi_uniq_bincocomp['ppi'].nunique()

In [None]:
# Label every record as 'HT' or 'LC' (see `assign_quality`), using the cutoff-selected
# co-complex publications and the curated forced_binary / forced_cocomp lists.
ppi_uniq_bincocomp['quality'] = ppi_uniq_bincocomp.apply(lambda x: assign_quality(x, to_cocomp_publ, special_cases['forced_binary'], special_cases['forced_cocomp']), axis=1)
# Distinct interactions per evidence type and quality label.
ppi_uniq_bincocomp.groupby(['type', 'quality'])['ppi'].nunique()

In [None]:
# Build the "publication:evidence code:quality" triplet for each record.
# NOTE(review): str.join needs all three fields to be strings — assumes no NaN in
# `publ`, `method` or `quality` at this point; confirm.
ppi_uniq_bincocomp['pmid:method:quality'] = ppi_uniq_bincocomp.apply(lambda x: ':'.join([x['publ'], x['method'], x['quality']]), axis=1)

##### High-quality

An interaction is high-quality if it meets any of the following conditions:

* Any publication is binary and is in `forced_binary`
* Any publication is co-complex and is in `forced_cocomp` or `forced_cocomp_by_cutoff`
* more than one publication NOT in the blacklist
* source is PDB

In [None]:
def is_high_quality(record, forced_binary, forced_cocomp_all):
    """
    Decide whether a single record (one publication + method) is high-quality by itself.

    True when any of these holds (checked in this order):
      * the record is binary and its publication is in `forced_binary`
      * the record is co-complex and its publication is in `forced_cocomp_all`
      * the record's source is PDB (case-insensitive)
    """
    return bool(
        (record['is_binary_pub'] and record['publ'] in forced_binary)
        or (record['is_cocomp_pub'] and record['publ'] in forced_cocomp_all)
        or record['source'].upper() == 'PDB'
    )

In [None]:
# All publications treated as forced co-complex: cutoff-selected ones plus the curated list.
forced_cocomp_all = [*to_cocomp_publ, *special_cases['forced_cocomp']]
# Per-record high-quality flag (see `is_high_quality`).
ppi_uniq_bincocomp['is_hq'] = ppi_uniq_bincocomp.apply(
    is_high_quality,
    axis=1,
    args=(special_cases['forced_binary'], forced_cocomp_all),
)

In [None]:
ppi_uniq_bincocomp.groupby(['type', 'is_hq'])['ppi'].nunique()

In [None]:
ppi_uniq_bincocomp.groupby(['type', 'quality', 'is_hq'])['ppi'].nunique()

[revision 2024.4.16] --- add interaction type to `pmid:method:quality` triplet

In [14]:
# revision 2024.4.16 --- load from cache
# NOTE(review): this replaces the in-memory `ppi_uniq_bincocomp` with the table stored in
# `bincocomp_interactome.txt`. That file is written by a later cell of this notebook, so a
# fresh top-to-bottom run needs the file to exist already.
# ID-like columns are read as str so they are not parsed as numbers.
dtype_dict = {'idA': str, 'idB': str, 'gene_name_A': str, 'gene_name_B': str, 'method': str, 'publ': str}
ppi_uniq_bincocomp = pd.read_csv(output_root / 'bincocomp_interactome.txt', sep='\t', dtype=dtype_dict)

In [16]:
ppi_uniq_bincocomp.columns

Index(['source', 'row_number', 'idA', 'idB', 'method', 'publ', 'idtype_A',
       'idtype_B', 'UniProt_A', 'UniProt_B', 'taxa', 'gene_name_A',
       'entry_name_A', 'best_uprotA', 'gene_name_B', 'entry_name_B',
       'best_uprotB', 'ppi', 'pmid_method', 'type', 'is_binary_pub',
       'is_cocomp_pub', 'quality', 'is_hq', 'pmid:method:quality'],
      dtype='object')

In [15]:
ppi_uniq_bincocomp['type'].unique()

array(['binary', 'co-complex', nan], dtype=object)

In [43]:
# ppi_uniq_bincocomp[ppi_uniq_bincocomp['type'].isna()].drop_duplicates('pmid_method')['publ'].isin(special_cases['binary_blacklist_forced_cocomplex']).all()

In [45]:
# revise column name to avoid ambiguity (type --> method_type, not finalized)
ppi_uniq_bincocomp = ppi_uniq_bincocomp.rename(columns={'type': 'method_type'})

In [47]:
# sanity check
# each `pmid:method:quality` triplet should be exactly one of binary or co-complex
assert not (ppi_uniq_bincocomp['is_binary_pub'] & ppi_uniq_bincocomp['is_cocomp_pub']).any()
assert (ppi_uniq_bincocomp['is_binary_pub'] | ppi_uniq_bincocomp['is_cocomp_pub']).all()

In [46]:
ppi_uniq_bincocomp.columns

Index(['source', 'row_number', 'idA', 'idB', 'method', 'publ', 'idtype_A',
       'idtype_B', 'UniProt_A', 'UniProt_B', 'taxa', 'gene_name_A',
       'entry_name_A', 'best_uprotA', 'gene_name_B', 'entry_name_B',
       'best_uprotB', 'ppi', 'pmid_method', 'method_type', 'is_binary_pub',
       'is_cocomp_pub', 'quality', 'is_hq', 'pmid:method:quality'],
      dtype='object')

In [53]:
# ppi_type --- consider both method_type (based on evidence code) and publication
# A record is 'binary' when flagged as a binary publication, otherwise 'co-complex'.
ppi_uniq_bincocomp['ppi_type'] = np.where(ppi_uniq_bincocomp['is_binary_pub'], 'binary', 'co-complex')
# Append the type to the existing triplet: "pmid:method:quality" -> "pmid:method:quality:type".
ppi_uniq_bincocomp['pmid:method:quality:type'] = (
    ppi_uniq_bincocomp['pmid:method:quality'] + ':' + ppi_uniq_bincocomp['ppi_type']
)

In [54]:
ppi_uniq_bincocomp['ppi_type'].value_counts()

binary        1402267
co-complex    1286181
Name: ppi_type, dtype: int64

In [55]:
ppi_uniq_bincocomp['is_cocomp_pub'].sum()

1286181

In [56]:
ppi_uniq_bincocomp.head()

Unnamed: 0,source,row_number,idA,idB,method,publ,idtype_A,idtype_B,UniProt_A,UniProt_B,...,ppi,pmid_method,method_type,is_binary_pub,is_cocomp_pub,quality,is_hq,pmid:method:quality,ppi_type,pmid:method:quality:type
0,DIP,2,DIP-617N,DIP-617N,114,9168119,DIP,DIP,P01730,P01730,...,P01730:P01730,9168119:0114,binary,True,False,LC,False,9168119:0114:LC,binary,9168119:0114:LC:binary
1,DIP,2,DIP-617N,DIP-617N,31,9168119,DIP,DIP,P01730,P01730,...,P01730:P01730,9168119:0031,binary,True,False,LC,False,9168119:0031:LC,binary,9168119:0031:LC:binary
2,DIP,3,DIP-1025N,DIP-1026N,114,9174345,DIP,DIP,P00968,P00907,...,P00968:P0A6F1,9174345:0114,binary,True,False,LC,False,9174345:0114:LC,binary,9174345:0114:LC:binary
3,DIP,3,DIP-1025N,DIP-1026N,676,15690043,DIP,DIP,P00968,P00907,...,P00968:P0A6F1,15690043:0676,co-complex,False,True,HT,True,15690043:0676:HT,co-complex,15690043:0676:HT:co-complex
4,DIP,3,DIP-1025N,DIP-1026N,27,11551199,DIP,DIP,P00968,P00907,...,P00968:P0A6F1,11551199:0027,binary,True,False,LC,False,11551199:0027:LC,binary,11551199:0027:LC:binary


#### Fill gene information

In [59]:
# curate uniprot-to-gene mapping from current interaction data
# Side A is processed first and side B afterwards, so a gene name seen on side B
# overrides the side-A name for the same UniProt ID.
uprot2gene = {}
for side in ('A', 'B'):
    idx = ppi_uniq_bincocomp[f'gene_name_{side}'].notna()
    uprot2gene.update(zip(ppi_uniq_bincocomp.loc[idx, f'best_uprot{side}'],
                          ppi_uniq_bincocomp.loc[idx, f'gene_name_{side}']))
    # running size of the mapping after each side
    print(len(uprot2gene))

90289
121320


#### Aggregate by interaction

--- each row becomes a unique interaction with all publication+method+quality information concatenated

**Aggregating criteria**
* An interaction is considered as **binary** if **any** publication + evidence code suggests it is binary
* An interaction is considered as **co-complex** if **any** publication + evidence code suggests it is co-complex
* An interaction is high quality if **any** single record meets the **high-quality** condition **or** it has more than one publication that is not in the blacklist

* [revision 2024.4.16] change `pmid:method:quality` into `pmid:method:quality:type` (add interaction type)
* [revision 2024.4.16] add `high_quality` column to HINT format output

In [57]:
# Collapse to one row per (interaction, taxon).
# The boolean flags are reduced with the named 'any' aggregation instead of the Python
# builtin `any`: on boolean columns the result is the same, but pandas can use its own
# groupby reduction rather than calling a Python function for every group.
ppi_agg_bincomp = ppi_uniq_bincocomp.groupby(['ppi', 'taxa']).agg({
    'source': set,
    'pmid:method:quality:type': set,
    'publ': set,
    'is_hq': 'any',
    'is_binary_pub': 'any',
    'is_cocomp_pub': 'any',
}).reset_index()
# Flatten the collected sets into sorted, '|'-separated strings.
ppi_agg_bincomp['source'] = ppi_agg_bincomp['source'].apply(lambda x: '|'.join(sorted(x)))
ppi_agg_bincomp['pmid:method:quality:type'] = ppi_agg_bincomp['pmid:method:quality:type'].apply(lambda x: '|'.join(sorted(x)))
# Number of distinct publications supporting the interaction.
ppi_agg_bincomp['n_valid_pub'] = ppi_agg_bincomp['publ'].apply(len)
ppi_agg_bincomp = ppi_agg_bincomp.rename(columns={'is_binary_pub': 'is_binary', 'is_cocomp_pub': 'is_cocomp'}).drop('publ', axis=1)
# high quality tag when aggregating information from all publications
ppi_agg_bincomp['is_hq_agg'] = ppi_agg_bincomp['is_hq'] | (ppi_agg_bincomp['n_valid_pub'] > 1)

In [60]:
# Fill back information
ppi_agg_bincomp['UniProt_A'], ppi_agg_bincomp['UniProt_B'] = zip(*ppi_agg_bincomp['ppi'].apply(lambda x: x.split(':')))
ppi_agg_bincomp['Gene_A'] = ppi_agg_bincomp['UniProt_A'].apply(lambda x: uprot2gene.get(x, np.nan))
ppi_agg_bincomp['Gene_B'] = ppi_agg_bincomp['UniProt_B'].apply(lambda x: uprot2gene.get(x, np.nan))

In [61]:
ppi_agg_bincomp.head()

Unnamed: 0,ppi,taxa,source,pmid:method:quality:type,is_hq,is_binary,is_cocomp,n_valid_pub,is_hq_agg,UniProt_A,UniProt_B,Gene_A,Gene_B
0,A0A021WW32:A0A021WW32,7227,BioGrid,27996020:0004:LC:co-complex,False,False,True,1,False,A0A021WW32,A0A021WW32,vtd,vtd
1,A0A021WW32:A1Z8S6,7227,BioGrid,25588834:0004:LC:co-complex,False,False,True,1,False,A0A021WW32,A1Z8S6,vtd,pds5
2,A0A021WW32:D6W4X5,7227,IntAct,37061542:0397:LC:binary|37061542:1112:LC:binary,False,True,False,1,False,A0A021WW32,D6W4X5,vtd,CG15250-RA
3,A0A021WW32:P05205,7227,BioGrid,24990964:0004:LC:co-complex,False,False,True,1,False,A0A021WW32,P05205,vtd,Su(var)205
4,A0A021WW32:P23696,7227,BioGrid,24086141:0004:LC:co-complex|31110215:0004:LC:c...,False,True,True,2,True,A0A021WW32,P23696,vtd,mts


In [62]:
output_root

PosixPath('/home/yl986/data/HINT/outputs_2023')

In [63]:
# save to cache if needed
ppi_uniq_bincocomp.to_csv(output_root / 'bincocomp_interactome.txt', index=False, sep='\t')  # binary / co-complex (keep separate rows for different publications/methods)
ppi_agg_bincomp.to_csv(output_root / 'bincocomp_agg_interactome.txt', index=False, sep='\t') # binary / co-complex (aggregated by interaction)

#### Generate HINT format output

In [64]:
# Directory that receives the HINT-format interactome files.
hint_output_root = output_root / 'HINT_format'

# exist_ok=True keeps the cell re-runnable without a separate existence check.
hint_output_root.mkdir(parents=True, exist_ok=True)

In [65]:
# HINT format with selected columns
cols = ["Uniprot_A", "Uniprot_B", "Gene_A", "Gene_B", "pmid:method:quality:type", "taxid", "high_quality"]
ppi_output = ppi_agg_bincomp.rename(columns={'UniProt_A': 'Uniprot_A', 'UniProt_B': 'Uniprot_B', 'taxa': 'taxid', 'is_hq_agg': 'high_quality'}).sort_values(cols)

# All interactions, then the binary-only and co-complex-only subsets.
ppi_output[cols].to_csv(hint_output_root / 'both_all.txt', index=False, sep='\t')
ppi_output.query('is_binary == True')[cols].to_csv(hint_output_root / 'binary_all.txt', index=False, sep='\t')
ppi_output.query('is_cocomp == True')[cols].to_csv(hint_output_root / 'cocomp_all.txt', index=False, sep='\t')

# High-quality subsets use the aggregated `high_quality` flag (any high-quality record
# OR more than one publication), i.e. the same flag that is exported as a column.
# Filtering on the per-record `is_hq` flag would leave out interactions that are
# high quality only because several publications support them.
ppi_output.query('high_quality == True')[cols].to_csv(hint_output_root / 'both_hq.txt', index=False, sep='\t')
ppi_output.query('high_quality == True & is_binary == True')[cols].to_csv(hint_output_root / 'binary_hq.txt', index=False, sep='\t')
ppi_output.query('high_quality == True & is_cocomp == True')[cols].to_csv(hint_output_root / 'cocomp_hq.txt', index=False, sep='\t')

#### HINT format interactome by species

* Load species supporting documents

In [66]:
# Parsed species table from the UniProt source documents.
df_species = pd.read_csv(source_root / 'docs/cache/species_parsed.txt', sep='\t')
# Taxonomy-ID mapping cached as JSON.
with open(output_root / 'cache/taxa_map_dict.json', 'r') as f:
    taxa_map_dict = json.load(f)

In [67]:
# Rule-based curation (consider to revise later)
# Rows that get a manually fixed short name.
ecoli_idx = df_species['scientific_name'].str.startswith('Escherichia coli')  # manual rule for E Coli
rice_idx = df_species['common_name'].eq('Rice')  # manual rule for Rice

# Default short name: the text before the first '(' with surrounding whitespace removed.
df_species['scientific_name_short'] = df_species['scientific_name'].map(
    lambda name: name.partition('(')[0].strip()
)
# Manual overrides (E. coli first, then Rice).
df_species.loc[ecoli_idx, 'scientific_name_short'] = 'Escherichia coli'
df_species.loc[rice_idx, 'scientific_name_short'] = 'Oryza sativa'

In [68]:
# Store taxonomy IDs as strings, then write the de-duplicated
# (taxa, short scientific name) table, sorted by taxa, to `taxid2name_short.txt`.
df_species['taxa'] = df_species['taxa'].astype(str)
df_species[['taxa', 'scientific_name_short']].drop_duplicates().sort_values('taxa').to_csv(output_root / 'taxid2name_short.txt', index=False, sep='\t')

* Target species

Create a separate directory for each species, named after the species in Pascal case (remove spaces and capitalize the first letter of each word).

In [69]:
# Species that get their own HINT-format files; names are lower-cased for matching.
target_species = [
    name.lower()
    for name in (
        'Homo sapiens',
        'Saccharomyces cerevisiae',
        'Caenorhabditis elegans',
        'Arabidopsis thaliana',
        'Mus musculus',
        'Escherichia coli',
        'Rattus norvegicus',
        'Oryza sativa',
        'Schizosaccharomyces pombe',
        'Drosophila melanogaster',
    )
]

In [70]:
# Keep the target species only and collect, per short scientific name,
# the list of taxonomy IDs that share that name.
df_spe_target = (
    df_species.loc[df_species['scientific_name_short'].str.lower().isin(target_species)]
    .reset_index(drop=True)
    .groupby('scientific_name_short')
    .agg({'taxa': list})
    .reset_index()
)
# short scientific name -> list of taxonomy IDs
select_name2id = df_spe_target.set_index('scientific_name_short')['taxa'].to_dict()

In [None]:
with open(output_root / 'select_species2id.json', 'w') as f:
    json.dump(select_name2id, f, indent=2)

In [73]:
# Split every HINT-format file into one file per target species.
flist = list(hint_output_root.glob('*txt'))

for fpath in flist:
    # skip protein description file
    if fpath.name.startswith('protein_meta'):
        continue

    suffix = fpath.name
    df = pd.read_csv(fpath, sep='\t', dtype=str)

    for species, taxa_list in select_name2id.items():
        # Pascal-case directory / file tag, e.g. "Homo sapiens" -> "HomoSapiens"
        species_tag = species.title().replace(' ', '')
        out_dir = hint_output_root / 'taxa' / species_tag
        if not out_dir.exists():
            out_dir.mkdir(parents=True)
        out_fpath = out_dir / f'{species_tag}_{suffix}'

        # Rows whose taxid belongs to this species; nothing is written when there are none.
        df_cur = df[df['taxid'].isin(taxa_list)]
        if not df_cur.empty:
            df_cur.to_csv(out_fpath, sep='\t', index=False)
        

### Generate protein meta data

Goal: fetch protein name and description that match UniProt IDs in each interaction 

(In the curated interactome data, some protein names and descriptions don't match the displayed UniProt ID because we selected the **best** UniProt ID by gene.)

In [None]:
# Cached ID -> best-UniProt table; `id` and `taxa` are read as str.
to_best_uprot = pd.read_csv(output_root / 'cache/id_to_best_uprot_uniq.txt', sep='\t', dtype={'id': str, 'taxa': str})
# UniProt descriptions, with every column read as str.
prot_desc = pd.read_csv(output_root / 'cache/uniprot_descriptions.txt', sep='\t', dtype=str)

In [None]:
# One row per distinct (best UniProt ID, taxa, gene) combination.
meta_cols = ['best_uprot_full', 'taxa', 'gene_name_fil']
prot_meta = (
    to_best_uprot[meta_cols]
    .rename(columns={'best_uprot_full': 'uniprot', 'gene_name_fil': 'gene'})
    .astype({'taxa': str})
    .drop_duplicates()
    .reset_index(drop=True)
)
# Part of the ID before the first '-'.
prot_meta['primary_short'] = prot_meta['uniprot'].map(lambda acc: acc.partition('-')[0])

In [None]:
# Load taxa ID supporting document
# NOTE(review): the same JSON file is already loaded into `taxa_map_dict` in the
# "Load species supporting documents" cell; this reload looks redundant when the
# notebook is run top to bottom — confirm before removing.
with open(output_root / 'cache/taxa_map_dict.json') as f:
    taxa_map_dict = json.load(f)

In [None]:
# curate taxonomy ID
prot_desc['taxa'] = prot_desc['taxa'].astype(str).apply(lambda x: taxa_map_dict.get(x, str(x)))

In [None]:
# Merge information
# Inner join on the two columns both tables share (taxa and primary accession),
# spelled out so the join keys are visible in the code.
prot_meta_complete = prot_meta.merge(
    prot_desc[['UniProt', 'name', 'taxa', 'description', 'tag']].rename(columns={'UniProt': 'primary_short'}),
    how='inner',
    on=['taxa', 'primary_short'],
)
# Entries whose tag is 'sp' are flagged as reviewed.
prot_meta_complete['is_reviewed'] = prot_meta_complete['tag'].eq('sp')

In [None]:
# save protein meta to file
# Sorted by UniProt ID; the raw `tag` column is dropped, leaving `is_reviewed` in its place.
prot_meta_complete.sort_values(['uniprot']).drop('tag', axis=1).to_csv(output_root / 'HINT_format/protein_meta.txt', index=False, sep='\t')