### Protein/Marker Meta Data Integration

This should filter PRO data in an application-specific manner. In this case, the most useful application of PRO is building a list of cell surface markers that augments the cytokines and transcription factors commonly measured in immunological studies.

In [None]:
import os.path as osp
import pandas as pd
import numpy as np
from tcre import meta
from tcre import lib
from tcre.lib import SPECIES_HUMAN_ID
from tcre.env import *

In [None]:
# Symbols/synonyms that must be ignored, whichever meta data source they come from
# (all entries are lower-case)
bad_syms = [
    "ifi",
    "dif",
    "esp",
    "tc1",
    "til",
]

In [None]:
# Load the raw PRO export
df_pro = pd.read_csv(osp.join(META_DATA_DIR, 'raw', 'pro.raw.csv.gz'))

# Target ids: every id with at least one synonym starting with "CD",
# plus the parents of those ids
tgt_ids = list(df_pro.loc[df_pro['syn'].fillna('').str.startswith('CD'), 'id'].unique())
tgt_ids = np.unique(list(df_pro.set_index('id').loc[tgt_ids, 'parent'].unique()) + tgt_ids)

# Filters are applied one after another; the length filter goes first so that
# rows with a missing synonym are gone before the string tests below run.
# Keep synonyms of at least 3 characters
df_pro = df_pro[df_pro['syn'].str.len() >= 3]
# Keep synonyms without spaces, unless the synonym is the entry's label
df_pro = df_pro[(df_pro['syn_typ'] == 'label') | (~df_pro['syn'].str.contains(' '))]
# Keep only the CD-related target ids
df_pro = df_pro[df_pro['id'].isin(tgt_ids)]
# Remove synonyms like "hCFAP299/iso:h1"
df_pro = df_pro[~df_pro['syn'].str.contains('/|:')]

# Hack in manual entries for now.
# NOTE: DataFrame.append was removed in pandas 2.0; pd.concat with default
# arguments is the equivalent (existing index labels are kept, columns are
# aligned by name).
df_pro = pd.concat([
    df_pro,
    pd.DataFrame([
        {
            'category': 'organism-gene', 'id': 'PR:000001017', 'label': 'CD45RO',
            'parent': 'PR:000001017', 'syn': 'CD45RO', 'syn_typ': 'label', 'species': 'any'
        },
        {
            'category': 'organism-gene', 'id': 'PR:000001084', 'label': 'CD8',
            'parent': 'PR:000001084', 'syn': 'CD8', 'syn_typ': 'exact', 'species': 'any'
        },
        {
            'category': 'organism-gene', 'id': 'PRM:001', 'label': 'CD57',
            'parent': 'PRM:001', 'syn': 'CD57', 'syn_typ': 'exact', 'species': 'any'
        },
        {
            'category': 'organism-gene', 'id': 'PR:000001919', 'label': 'PD1',
            'parent': 'PR:000001919', 'syn': 'PD1', 'syn_typ': 'exact', 'species': 'any'
        },
    ]),
])

# Summarize columns, dtypes and non-null counts after the manual additions
df_pro.info()

In [None]:
# Show any protein synonyms that match the ignored symbols (case-insensitive).
# NOTE(review): this cell used `df_ck` / 'sym', which are not defined anywhere in
# this notebook; it now inspects `df_pro` / 'syn' -- confirm this is the intent.
df_pro[df_pro['syn'].str.lower().isin(bad_syms)]

In [None]:
# Re-map preferred label to CD marker syms

def get_preferred_label(g):
    """Choose the preferred label among the synonym rows of one group.

    The shortest synonym containing "CD" wins; if no synonym contains "CD",
    the shortest synonym overall is used. Ties on length are resolved in
    favour of the row that appears first in ``g``.

    Args:
        g: DataFrame with at least the columns 'syn' and 'id'. 'syn' values
            are assumed to be non-null strings (TODO confirm against the
            upstream filtering).

    Returns:
        dict with keys 'lbl' (the chosen synonym) and 'id' (the 'id' value
        of the chosen row).
    """
    # Stable sort: equal-length synonyms keep their input order, so the
    # result does not depend on the sort algorithm's tie handling.
    g = g.assign(syn_len=g['syn'].str.len()).sort_values('syn_len', kind='mergesort')
    # Plain substring test; 'CD' has no regex meaning
    mask = g['syn'].str.contains('CD', regex=False)
    # Use CD name if possible, otherwise the shortest possible synonym
    r = g[mask].iloc[0] if mask.any() else g.iloc[0]
    return dict(lbl=r['syn'], id=r['id'])

# Run get_preferred_label once per 'id' group and convert the result to a dict;
# expected to be keyed by id with a {'lbl': ..., 'id': ...} dict per entry
# (NOTE(review): depends on how groupby.apply wraps dict results -- confirm).
pm_pro = df_pro.groupby('id').apply(get_preferred_label).to_dict()

In [None]:
# write as proteins.csv