## Metadata Integration

In [1]:
import os.path as osp
import pandas as pd
import numpy as np
%run env.py
%run src/lib.py

## Cytokines

In [2]:
# Raw cytokine table, taken from:
# https://www.ncbi.nlm.nih.gov/books/NBK6294/table/A13506/?report=objectonly
SRC_ID = 1  # source id attached to every row derived from this table
dfr = pd.read_csv(
    osp.join(META_DATA_DIR, 'raw', 'cytokines.csv'),
    sep='\t',  # the file is parsed as tab-delimited
)
dfr.head()

Unnamed: 0,name,synonym,amino,chromosome_acids,mwt,receptor_form,receptor_location
0,IL-1α,hematopoietin-1,271,2q14,30606,"CD121a, CDw121b","2q12, 2q12-q22"
1,IL-1β,catabolin,269,2q14,20747,"CD121a, CDw121b","2q12,2q12-q22"
2,IL-1RA,IL-1 receptor antagonist,177,2q14.2,20055,CD121a,2q12
3,IL-18,interferon-γ inducing factor,193,11q22.2-q22.3,22326,"IL-18Rα, β",2q12
4,IL-2,T cell growth factor,153,4q26-q27,17628,"CD25, 122,132","10p15-p14, 22q13.1, Xq13.1"


In [3]:
# Build a long (sym, lbl) lookup table:
#   * every canonical cytokine name maps to itself, and
#   * every comma-separated synonym maps to its canonical name.
# The frames are built without `set_axis(..., inplace=False)` because the
# `inplace` argument was removed in pandas 2.0 and raises a TypeError there.
dft = (
    pd.concat([
        # Canonical names: symbol and label are the same string.
        pd.DataFrame({'sym': dfr['name'], 'lbl': dfr['name']}),
        # Synonyms: split on commas, one row per synonym (stack drops the
        # missing entries created by expand=True), whitespace trimmed.
        dfr.set_index('name')['synonym']
            .str.split(',', expand=True).stack().str.strip()
            .rename_axis(['lbl', 'i']).rename('sym')
            .reset_index()[['sym', 'lbl']],
    ])
    .assign(species_id=SPECIES_HUMAN_ID, src_id=SRC_ID)
)
dft.head()

Unnamed: 0,sym,lbl,species_id,src_id
0,IL-1α,IL-1α,1,1
1,IL-1β,IL-1β,1,1
2,IL-1RA,IL-1RA,1,1
3,IL-18,IL-18,1,1
4,IL-2,IL-2,1,1


In [4]:
def transform(df, mask, fn):
    """Return a copy of the masked rows of ``df`` with ``fn`` applied to ``sym``.

    Args:
        df: DataFrame containing a ``sym`` column.
        mask: Boolean Series selecting rows of ``df``. Only its ``.values``
            are used, so selection is positional and ignores index labels.
        fn: Callable applied to each value of the ``sym`` column.

    Returns:
        A new DataFrame holding only the selected rows; ``df`` is not modified.
    """
    return df.loc[mask.values].assign(sym=lambda rows: rows['sym'].apply(fn))
# Add hyphen-free aliases (e.g. 'IL-2' -> 'IL2') for symbols that start with
# IL-/TNF-/IFN-/TGF-. `str.match` anchors at the start of the string, and the
# `IL-[^ ]+$` branch skips multi-word entries such as 'IL-1 receptor antagonist'.
# `pd.concat` replaces `DataFrame.append`, which was removed in pandas 2.0.
df = pd.concat([
    dft,
    transform(dft, dft['sym'].str.match(r'IL-[^ ]+$|TNF-|IFN-|TGF-'), lambda v: v.replace('-', '')),
])
# Trim stray whitespace, then drop rows whose symbol is empty.
df['sym'] = df['sym'].str.strip()
df['lbl'] = df['lbl'].str.strip()
df = df[df['sym'].str.len() > 0]
df.head()

Unnamed: 0,sym,lbl,species_id,src_id
0,IL-1α,IL-1α,1,1
1,IL-1β,IL-1β,1,1
2,IL-1RA,IL-1RA,1,1
3,IL-18,IL-18,1,1
4,IL-2,IL-2,1,1


In [5]:
# Inspect the distinct symbols (canonical names, synonyms and hyphen-free aliases).
pd.unique(df['sym'])

array(['IL-1α', 'IL-1β', 'IL-1RA', 'IL-18', 'IL-2', 'IL-4', 'IL-7',
       'IL-9', 'IL-13', 'IL-15', 'IL-3', 'IL-5', 'GM-CSF', 'IL-6',
       'IL-11', 'G-CSF', 'IL-12', 'LIF', 'OSM', 'IL-10', 'IL-20', 'IL-14',
       'IL-16', 'IL-17', 'IFN-α', 'IFN-β', 'IFN-γ', 'CD154', 'LT-β',
       'TNF-α', 'TNF-β', '4-1BBL', 'APRIL', 'CD70', 'CD153', 'CD178',
       'GITRL', 'LIGHT', 'OX40L', 'TALL-1', 'TRAIL', 'TWEAK', 'TRANCE',
       'TGF-β1', 'TGF-β2', 'TGF-β3', 'hematopoietin-1', 'catabolin',
       'IL-1 receptor antagonist', 'interferon-γ inducing factor',
       'T cell growth factor', 'BSF-1', 'T cell growth factor P40',
       'P600', 'multipotential CSF', 'MCGF', 'BCDF-1', 'CSF-2', 'IFN-β2',
       'BSF-2', 'AGIF', 'CSF-3', 'NK cell stimulatory factor',
       'leukemia inhibitory factor', 'oncostatin M', 'CSIF', 'HMW-BCGF',
       'LCF', 'CTLA-8', 'CD40L', 'TRAP', 'cachectin', 'LT-α', 'TALL-2',
       'CD27L', 'CD30L', 'FasL', 'Apo2L', 'Apo3L', 'OPGL', 'TGF-β',
       'IL1α', 'IL1β', 'I

In [7]:
# Attach a 1-based sequential id to each row, in current row order.
df = df.assign(id=lambda frame: np.arange(len(frame)) + 1)
df.head()

Unnamed: 0,sym,lbl,species_id,src_id,id
0,IL-1α,IL-1α,1,1,1
1,IL-1β,IL-1β,1,1,2
2,IL-1RA,IL-1RA,1,1,3
3,IL-18,IL-18,1,1,4
4,IL-2,IL-2,1,1,5


In [8]:
# Write the cytokine symbol table (without the DataFrame index) and echo the destination.
path = osp.join(META_DATA_DIR, 'cytokines.csv')
df.to_csv(path_or_buf=path, index=False)
path

'/Users/eczech/repos/hammer/t-cell-relation-extraction/pm_subtype_protein_relations/data/meta/cytokines.csv'

## Transcription Factors

In [10]:
# Hand-curated transcription factors; symbol and label are identical.
# NOTE(review): this table names the column `species`, whereas the cytokine
# table uses `species_id` -- confirm which name downstream readers expect.
df = (
    pd.DataFrame({'sym': ['T-bet', 'Gata3', 'FoxP3']})
    .assign(lbl=lambda frame: frame['sym'])
    .assign(species=SPECIES_HUMAN_ID, src_id=1)
    .assign(id=lambda frame: np.arange(1, len(frame) + 1))  # 1-based row ids
)
df

Unnamed: 0,sym,lbl,species,src_id,id
0,T-bet,T-bet,1,1,1
1,Gata3,Gata3,1,1,2
2,FoxP3,FoxP3,1,1,3


In [11]:
# Write the transcription factor table (without the DataFrame index) and echo the destination.
path = osp.join(META_DATA_DIR, 'transcription_factors.csv')
df.to_csv(path_or_buf=path, index=False)
path

'/Users/eczech/repos/hammer/t-cell-relation-extraction/pm_subtype_protein_relations/data/meta/transcription_factors.csv'

## Cell Types

In [12]:
# T helper cell subtypes (Th0, Th1, Th2, Th17, Th22); symbol and label are identical.
# NOTE(review): this table names the column `species`, whereas the cytokine
# table uses `species_id` -- confirm which name downstream readers expect.
df = (
    pd.DataFrame({'sym': ['Th{}'.format(i) for i in (0, 1, 2, 17, 22)]})
    .assign(lbl=lambda frame: frame['sym'])
    .assign(species=SPECIES_HUMAN_ID, src_id=1)
    .assign(id=lambda frame: np.arange(1, len(frame) + 1))  # 1-based row ids
)
df

Unnamed: 0,sym,lbl,species,src_id,id
0,Th0,Th0,1,1,1
1,Th1,Th1,1,1,2
2,Th2,Th2,1,1,3
3,Th17,Th17,1,1,4
4,Th22,Th22,1,1,5


In [13]:
# Write the cell type table (without the DataFrame index) and echo the destination.
path = osp.join(META_DATA_DIR, 'cell_types.csv')
df.to_csv(path_or_buf=path, index=False)
path

'/Users/eczech/repos/hammer/t-cell-relation-extraction/pm_subtype_protein_relations/data/meta/cell_types.csv'