In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm
import re
import os
import os.path as osp
import dotenv
# Load variables from ../env.sh into the process environment
# (presumably where DATA_DIR / META_DATA_DIR read later are defined — TODO confirm).
dotenv.load_dotenv('../env.sh')
# Bare expression; has no effect on its own.
None
# Execute the tokenizer script inside this notebook's namespace
# (presumably what provides ProteinTokenizer used further down — verify).
%run src/protein_tokenization.py

In [20]:
# Tag table for corpus_01, located under the DATA_DIR environment variable.
tags_csv = osp.join(os.environ['DATA_DIR'], 'articles', 'corpus', 'corpus_01', 'tags-union.csv')
df_tag = pd.read_csv(tags_csv)

# Raise the row threshold so info() still computes per-column non-null counts
# for a frame of this size.
pd.set_option('display.max_info_rows', 10000000)
df_tag.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2311011 entries, 0 to 2311010
Data columns (total 11 columns):
id            2311011 non-null object
type          2311011 non-null object
ent_id        987781 non-null object
ent_lbl       987781 non-null object
ent_prefid    987781 non-null object
ent_src       2311011 non-null object
start_chr     2311011 non-null int64
end_chr       2311011 non-null int64
start_wrd     2311011 non-null int64
end_wrd       2311011 non-null int64
text          2311011 non-null object
dtypes: int64(4), object(7)
memory usage: 193.9+ MB


In [21]:
# Tally the tag rows per value of the `ent_src` column.
df_tag.loc[:, 'ent_src'].value_counts()

jnlpba    1349734
lkp        961277
Name: ent_src, dtype: int64

In [95]:
# First rows among tags whose source is 'jnlpba' and whose ent_id is populated.
(
    df_tag
    .loc[lambda df: df['ent_src'].eq('jnlpba') & df['ent_id'].notna()]
    .head()
)

Unnamed: 0,id,type,ent_id,ent_lbl,ent_prefid,ent_src,start_chr,end_chr,start_wrd,end_wrd,text
0,PMC5704053,CELL_TYPE,CTB574584AD019ABB8,Treg,CTB574584AD019ABB8,jnlpba,145,149,33,34,Treg
13,PMC5704053,CELL_TYPE,CT6DE81C14BBB65271,TCM,CT143F3E7366C9650A,jnlpba,3296,3299,595,596,TCM
14,PMC5704053,CELL_TYPE,CTB574584AD019ABB8,Treg,CTB574584AD019ABB8,jnlpba,3618,3622,655,656,Treg
22,PMC5704053,CELL_TYPE,CKE90961F4F0AD8ED1,CCL21,CK17DA99E1B3CC6A24,jnlpba,5068,5071,920,921,SLC
75,PMC5704053,CELL_TYPE,CT7AC25DF3B0DDF9BE,Treg,CTB574584AD019ABB8,jnlpba,13569,13574,2608,2609,Tregs


In [33]:
# Load the raw synonym table from META_DATA_DIR/raw/pro.raw.csv.
df_pro = pd.read_csv(osp.join(os.environ['META_DATA_DIR'], 'raw', 'pro.raw.csv'))

# Keep synonyms of at least 3 characters. The explicit .copy() makes the
# column assignment below operate on an independent frame instead of a
# slice of the frame returned by read_csv (avoids SettingWithCopyWarning).
df_pro = df_pro[df_pro['syn'].str.len() >= 3].copy()

# REMOVE after next import
# Strip "(<species>)" markers from the synonyms. Species names are escaped so
# they are always matched literally, and regex=True is passed explicitly:
# without it, pandas >= 2.0 treats the pattern as a literal string and the
# replacement silently does nothing.
# NOTE(review): the length filter above runs before this strip, so a synonym
# can end up shorter than 3 characters here — confirm that is acceptable.
species_pattern = r'\(({})\)'.format('|'.join(re.escape(s) for s in df_pro['species'].unique()))
df_pro['syn'] = df_pro['syn'].str.replace(species_pattern, '', regex=True)
df_pro.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 628794 entries, 0 to 629164
Data columns (total 11 columns):
category      624351 non-null object
id            628794 non-null object
label         628794 non-null object
name          628794 non-null object
namespace     628794 non-null object
parent        488030 non-null object
syn           628794 non-null object
syn_typ       628794 non-null object
species       628794 non-null object
syn_typ_id    628794 non-null int64
species_id    628794 non-null int64
dtypes: int64(2), object(9)
memory usage: 57.6+ MB


In [63]:
# Build the tokenizer vocabulary from df_pro: one row per distinct synonym.
df_vocab = (
    df_pro
    # Restrict to rows whose species is 'human' or 'any'
    .pipe(lambda df: df[df['species'].isin(['human', 'any'])])
    # Drop synonyms shorter than 3 characters
    .pipe(lambda df: df[df['syn'].str.len() >= 3])
    # Drop synonyms that contain a space
    .pipe(lambda df: df[~df['syn'].str.contains(' ')])
    # REMOVE after next import
    # Strip "(<species>)" markers. Species names are escaped and regex=True is
    # explicit: pandas >= 2.0 defaults to a literal match, which would turn
    # this step into a silent no-op.
    .assign(syn=lambda df: df['syn'].str.replace(
        r'\(({})\)'.format('|'.join(re.escape(s) for s in df['species'].unique())),
        '',
        regex=True,
    ))
    # Remove synonyms like "hCFAP299/iso:h1"
    .pipe(lambda df: df[~df['syn'].str.contains('/|:')])
    # Keep the first row for each synonym; `syn` stays a regular column here
    .drop_duplicates('syn')
)
df_vocab.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 174441 entries, 4 to 629078
Data columns (total 11 columns):
category      173712 non-null object
id            174441 non-null object
label         174441 non-null object
name          174441 non-null object
namespace     174441 non-null object
parent        109884 non-null object
syn           174441 non-null object
syn_typ       174441 non-null object
species       174441 non-null object
syn_typ_id    174441 non-null int64
species_id    174441 non-null int64
dtypes: int64(2), object(9)
memory usage: 16.0+ MB


In [68]:
# Preview the first five vocabulary rows.
df_vocab.head(n=5)

Unnamed: 0,category,id,label,name,namespace,parent,syn,syn_typ,species,syn_typ_id,species_id
4,gene,PR:000018242,zyxin,PR_000018242,obo,PR:000018242,zyxin-2,exact,any,2,3
11,gene,PR:000018242,zyxin,PR_000018242,obo,PR:000018242,zyxin,label,any,3,3
28,gene,PR:000007467,fasciculation and elongation protein zeta-2,PR_000007467,obo,PR:000007467,zygin-2,exact,any,2,3
31,gene,PR:000007466,fasciculation and elongation protein zeta-1,PR_000007466,obo,PR:000007466,zygin-1,exact,any,2,3
40,gene,PR:000018237,ZW10 interactor,PR_000018237,obo,PR:000018237,zwint-1,exact,any,2,3


In [71]:
# Lookup keyed by synonym; each value is a dict of that row's remaining columns.
vocab_by_syn = df_vocab.set_index('syn').to_dict(orient='index')
tokenizer = ProteinTokenizer(vocab_by_syn)

# Example string to tokenize; a longer alternative is kept commented out below.
string = 'CD4+Thy1.1+CD44hiLy6chiPSGL-1hi'
#string = 'CD4+CD45RA+CD45RO-4-1BB-CD62L+++CCR7loCD127posCD27positiveCD28hiCD95+CD122+'

In [72]:
# Print every token of `string` with its parsed term, sign, value and metadata.
for tok in tokenizer.tokenize(string):
    surface = f'{tok.text}'
    details = f'term={tok.token_text}, sign={tok.sign_text}, value={tok.sign_value}, metadata={tok.metadata}'
    print(f'{surface} [{details}]')

CD4+ [term=CD4, sign=+, value=1, metadata={'category': 'gene', 'id': 'PR:000001004', 'label': 'CD4 molecule', 'name': 'PR_000001004', 'namespace': 'obo', 'parent': 'PR:000001004', 'syn_typ': 'exact', 'species': 'any', 'syn_typ_id': 2, 'species_id': 3}]
Thy1.1 [term=Thy1.1, sign=None, value=0, metadata=None]
CD44hi [term=CD44, sign=hi, value=1, metadata={'category': 'gene', 'id': 'PR:000001307', 'label': 'CD44 molecule', 'name': 'PR_000001307', 'namespace': 'obo', 'parent': 'PR:000001307', 'syn_typ': 'exact', 'species': 'any', 'syn_typ_id': 2, 'species_id': 3}]
Ly6chi [term=Ly6c, sign=hi, value=1, metadata={'category': 'gene', 'id': 'PR:000002980', 'label': 'lymphocyte antigen 6C2', 'name': 'PR_000002980', 'namespace': 'obo', 'parent': 'PR:000002980', 'syn_typ': 'related', 'species': 'any', 'syn_typ_id': 1, 'species_id': 3}]
PSGL-1hi [term=PSGL-1, sign=hi, value=1, metadata={'category': 'gene', 'id': 'PR:000001830', 'label': 'P-selectin glycoprotein ligand 1', 'name': 'PR_000001830', 'n

In [74]:
# Subset of df_tag: 'jnlpba' tags typed CELL_TYPE or CELL_LINE whose text does
# not contain " and " / " or ". Filters are applied one after another.
df_ct = (
    df_tag
    .loc[lambda tags: tags['ent_src'].eq('jnlpba')]
    .loc[lambda tags: tags['type'].isin(['CELL_TYPE', 'CELL_LINE'])]
    .loc[lambda tags: ~tags['text'].str.contains(' (?:and|or) ')]
)
df_ct.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1300591 entries, 0 to 2310902
Data columns (total 11 columns):
id            1300591 non-null object
type          1300591 non-null object
ent_id        26504 non-null object
ent_lbl       26504 non-null object
ent_prefid    26504 non-null object
ent_src       1300591 non-null object
start_chr     1300591 non-null int64
end_chr       1300591 non-null int64
start_wrd     1300591 non-null int64
end_wrd       1300591 non-null int64
text          1300591 non-null object
dtypes: int64(4), object(7)
memory usage: 119.1+ MB


In [82]:
# Number of mentions whose text contains "CD" but neither "Th" nor "TH".
len(
    df_ct.loc[
        lambda df: df['text'].str.contains('CD') & ~df['text'].str.contains('T[hH]')
    ]
)

221799

In [81]:
# Number of mentions whose text contains both "CD" and "Th"/"TH".
len(
    df_ct.loc[
        lambda df: df['text'].str.contains('CD') & df['text'].str.contains('T[hH]')
    ]
)

3345

In [92]:
# Ten example mentions containing "CD" and "IL" but neither "Th" nor "TH".
# random_state pins the draw so re-running the notebook shows the same rows.
(
    df_ct[
        df_ct['text'].str.contains('CD')
        & ~df_ct['text'].str.contains('T[hH]')
        & df_ct['text'].str.contains('IL')
    ]
    .sample(10, random_state=0)
)

Unnamed: 0,id,type,ent_id,ent_lbl,ent_prefid,ent_src,start_chr,end_chr,start_wrd,end_wrd,text
1298064,PMC5543485,CELL_TYPE,,,,jnlpba,31791,31819,5498,5503,IL-17-secreting CD8+ T cells
939369,PMC3201195,CELL_LINE,,,,jnlpba,4915,4939,877,882,IL-4/IFN-γ+ CD4+ T cells
924882,PMC3925141,CELL_TYPE,,,,jnlpba,18207,18231,3468,3470,IL-10-producing CD1dhigh
1444266,PMC5940260,CELL_LINE,,,,jnlpba,33128,33155,6046,6050,Lin−CD127+KLRG1+ ILC2 cells
1946891,PMC2213172,CELL_LINE,,,,jnlpba,21965,22005,4052,4057,IL-18–induced CD83+/CCR7+/CD25+ NK cells
4524,PMC5294847,CELL_TYPE,,,,jnlpba,28702,28711,5572,5575,CD8+ TILs
1342839,PMC6116043,CELL_TYPE,,,,jnlpba,28124,28133,5023,5026,CD8+ TILs
2217532,PMC4424383,CELL_LINE,,,,jnlpba,15667,15710,2707,2712,intestinal-derived IL-18Rα+DR3+CD4+ T cells
1359,PMC5686050,CELL_LINE,,,,jnlpba,42520,42548,7171,7176,IL-4-responsive CD4+ T cells
733550,PMC6426975,CELL_TYPE,,,,jnlpba,34262,34278,6578,6581,CD90.1+CD8+ TILs
