### Protein/Marker Meta Data Integration

This notebook should filter Protein Ontology (PRO) data in an application-specific manner. In this case, the most useful application of PRO is building a list of cell surface markers that augments the cytokines and transcription factors commonly measured in immunological studies.

In [1]:
import os.path as osp
import pandas as pd
import numpy as np
from tcre import lib
from tcre.lib import SPECIES_HUMAN_ID
from tcre.env import *

## Load and Filter

Load all PRO terms and filter to those for CD molecules.

In [2]:
# Load the raw PRO export; the columns used below are id, parent, syn and syn_typ
df_pro = pd.read_csv(osp.join(META_DATA_DIR, 'raw', 'pro.raw.csv.gz'))

# Get groups and parents for CD molecules: start from every term that has at
# least one synonym beginning with "CD", then add the parents of those terms
cd_ids = list(df_pro[df_pro['syn'].fillna('').str.startswith('CD')]['id'].unique())
# `parent` is frequently null; drop missing values so that NaN is not coerced
# into the literal string 'nan' when np.unique builds the id array
parent_ids = list(df_pro.set_index('id').loc[cd_ids]['parent'].dropna().unique())
tgt_ids = np.unique(parent_ids + cd_ids)

df_pro = (
    df_pro
    # Keep synonyms of at least 3 characters (rows with a null synonym are dropped here too)
    .pipe(lambda df: df[df['syn'].str.len() >= 3])
    # Keep multi-word synonyms only when they are the term's label
    .pipe(lambda df: df[(~df['syn'].str.contains(' ')) | (df['syn_typ'] == 'label')])
    # Restrict to the CD-related terms and their parents
    .pipe(lambda df: df[df['id'].isin(tgt_ids)])
    # Remove synonyms like "hCFAP299/iso:h1"
    .pipe(lambda df: df[~df['syn'].str.contains('/|:')])
)

# Hack in manual entries for now
manual_entries = pd.DataFrame([
    {
    'category': 'organism-gene', 'id': 'PR:000001017', 'label': 'CD45RO', 
    'parent': 'PR:000001017', 'syn': 'CD45RO', 'syn_typ': 'label', 'species': 'any'
    },
    {
    'category': 'organism-gene', 'id': 'PR:000001084', 'label': 'CD8', 
    'parent': 'PR:000001084', 'syn': 'CD8', 'syn_typ': 'exact', 'species': 'any'
    },
    {
    'category': 'organism-gene', 'id': 'PRM:001', 'label': 'CD57', 
    'parent': 'PRM:001', 'syn': 'CD57', 'syn_typ': 'exact', 'species': 'any'
    },
    {
    'category': 'organism-gene', 'id': 'PR:000001919', 'label': 'PD1', 
    'parent': 'PR:000001919', 'syn': 'PD1', 'syn_typ': 'exact', 'species': 'any'
    },
])
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent. sort=True keeps the columns in alphabetical order and the
# original row indexes are preserved (no ignore_index), as before.
df_pro = pd.concat([df_pro, manual_entries], sort=True)

df_pro.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4818 entries, 4874 to 3
Data columns (total 10 columns):
category      4818 non-null object
id            4818 non-null object
label         4818 non-null object
namespace     4814 non-null object
parent        575 non-null object
priority      4814 non-null float64
species       4818 non-null object
species_id    4814 non-null float64
syn           4818 non-null object
syn_typ       4818 non-null object
dtypes: float64(2), object(8)
memory usage: 414.0+ KB


In [3]:
# Re-map preferred label to CD marker syms

def get_preferred_label(g):
    """Choose the preferred synonym for one group of PRO rows.

    Rows are ranked by synonym length (shortest first). The shortest synonym
    containing the substring "CD" wins; if no synonym contains "CD", the
    shortest synonym overall is used. Note that the match is a plain substring
    test, so names such as "CDS103" also count as CD names.

    Args:
        g: DataFrame with at least the columns 'syn' and 'id'.

    Returns:
        dict with keys 'lbl' (the chosen synonym) and 'id' (the id found on
        the chosen row).
    """
    ranked = g.assign(syn_len=g['syn'].str.len()).sort_values('syn_len')
    cd_rows = ranked[ranked['syn'].str.contains('CD')]
    # Prefer a CD name when one exists, otherwise fall back to the shortest synonym
    best = cd_rows.iloc[0] if len(cd_rows) > 0 else ranked.iloc[0]
    return {'lbl': best['syn'], 'id': best['id']}

# Build one {lbl, id} record per PRO id. Iterating over the groups (rather than
# groupby.apply) always yields full sub-frames that still contain the 'id'
# column, which get_preferred_label reads; groupby.apply passing the grouping
# column to the function is deprecated in recent pandas releases.
pm_pro = pd.Series(
    {pro_id: get_preferred_label(grp) for pro_id, grp in df_pro.groupby('id')},
    dtype=object
).rename_axis('id')

# Broadcast the per-id preferred label / id back onto every synonym row
df_pro['pref_lbl'] = df_pro['id'].map(pm_pro.map(lambda m: m['lbl']))
df_pro['pref_id'] = df_pro['id'].map(pm_pro.map(lambda m: m['id']))

In [5]:
# Summarize columns and non-null counts now that pref_lbl / pref_id have been added
df_pro.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4818 entries, 4874 to 3
Data columns (total 12 columns):
category      4818 non-null object
id            4818 non-null object
label         4818 non-null object
namespace     4814 non-null object
parent        575 non-null object
priority      4814 non-null float64
species       4818 non-null object
species_id    4814 non-null float64
syn           4818 non-null object
syn_typ       4818 non-null object
pref_lbl      4818 non-null object
pref_id       4818 non-null object
dtypes: float64(2), object(10)
memory usage: 489.3+ KB


In [6]:
# Preview the first rows, including the pref_lbl / pref_id columns
df_pro.head()

Unnamed: 0,category,id,label,namespace,parent,priority,species,species_id,syn,syn_typ,pref_lbl,pref_id
4874,organism-gene,PR:P64554,7-carboxy-7-deazaguanine synthase (Escherichia...,protein,PR:000034945,2.0,any,3.0,ygcF,related,ygcF,PR:P64554
4880,organism-gene,PR:Q46893,2-C-methyl-D-erythritol 4-phosphate cytidylylt...,protein,PR:000023036,2.0,any,3.0,ygbP,related,ygbP,PR:Q46893
5540,gene,PR:000033765,uncharacterized protein YddB,protein,,4.0,any,3.0,yddB,exact,CDS103,PR:000033765
5541,gene,PR:000035174,inner membrane ABC transporter ATP-binding pro...,protein,,4.0,any,3.0,yddA,exact,CDS102,PR:000035174
6771,gene,PR:000005436,charged multivesicular body protein 4a,protein,,4.0,any,3.0,vps32-1,exact,Snf-1,PR:000005436


## Export

In [14]:
# Set enabled and external id for compatibility with other controlled vocab integrations
# (assign and rename each return a new frame, so df_pro itself is left untouched)
df = df_pro.assign(enabled=True).rename(columns={'id': 'extid'})

# Ensure synonyms are unique, and name the offenders when they are not so a
# failure can be diagnosed directly from the assertion message
dup_syns = df.loc[df['syn'].duplicated(keep=False), 'syn'].unique()
assert len(dup_syns) == 0, 'Duplicate synonyms found (showing up to 20): {}'.format(list(dup_syns[:20]))

In [15]:
# Summarize the export frame (id renamed to extid, enabled flag added)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4818 entries, 4874 to 3
Data columns (total 13 columns):
category      4818 non-null object
extid         4818 non-null object
label         4818 non-null object
namespace     4814 non-null object
parent        575 non-null object
priority      4814 non-null float64
species       4818 non-null object
species_id    4814 non-null float64
syn           4818 non-null object
syn_typ       4818 non-null object
pref_lbl      4818 non-null object
pref_id       4818 non-null object
enabled       4818 non-null bool
dtypes: bool(1), float64(2), object(10)
memory usage: 494.0+ KB


In [16]:
# Write the export frame to <META_DATA_DIR>/<SURFACE_PROTEINS>.csv without the
# row index, then display the destination path as the cell output
path = osp.join(META_DATA_DIR, '{}{}'.format(lib.SURFACE_PROTEINS, '.csv'))
df.to_csv(path, index=False)
path

'/lab/repos/t-cell-relation-extraction/data/meta/surface_proteins.csv'