In [5]:
%run utils.py
import os.path as osp
import pandas as pd
import numpy as np

In [2]:
# Load the raw symbol -> approved_symbol table and discard rows with a missing value
# in either column, so the string operations further down never see NaN.
df_hgnc = pd.read_csv(
    '/Users/eczech/repos/hammer/flowrepository-metadata-db/data/param_map_hgnc.csv'
).dropna()
df_hgnc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 85231 entries, 0 to 85231
Data columns (total 2 columns):
symbol             85231 non-null object
approved_symbol    85231 non-null object
dtypes: object(2)
memory usage: 2.0+ MB


In [3]:
def get_hgnc_map(g):
    """Map every normalized symbol of one group to the group's shortest CD symbol.

    Parameters
    ----------
    g : pandas.DataFrame
        Records of a single group; must provide string columns 'symbol' and
        'approved_symbol'.

    Returns
    -------
    pandas.Series or None
        Series whose index (named 'key') holds each unique normalized symbol of
        the group and whose values are all the shortest normalized symbol that
        starts with 'CD' (first one wins on ties). None when no normalized
        symbol in the group starts with 'CD'.
    """
    s = pd.Series(g['symbol'].tolist() + g['approved_symbol'].tolist())
    # Strip all non-word characters, convert to ucase, remove short symbols, and make unique.
    # The pattern is a raw string and regex=True is explicit: since pandas 2.0
    # Series.str.replace matches literally by default, which would silently
    # leave punctuation/whitespace inside the keys.
    s = s.str.replace(r'\W+', '', regex=True).str.upper().str.strip()
    s = s[s.str.len() >= 3]
    s = s.drop_duplicates()

    # Find the smallest symbol starting with CD and map all others to it
    scd = s[s.str.startswith('CD')]
    if len(scd) == 0:
        return None
    # min() returns the first of equally short candidates, like argmin did.
    sym = min(scd.tolist(), key=len)
    res = pd.Series({v: sym for v in s})
    res.index.name = 'key'
    return res

# Select every record whose approved symbol has at least one CD-prefixed
# entry, either as an alias ('symbol') or as the approved symbol itself.
has_cd_prefix = (
    df_hgnc['symbol'].str.startswith('CD')
    | df_hgnc['approved_symbol'].str.startswith('CD')
)
cd_approved_symbols = df_hgnc.loc[has_cd_prefix.values, 'approved_symbol'].unique()
mask = df_hgnc['approved_symbol'].isin(cd_approved_symbols)

# Per approved symbol, map each normalized alias to the shortest CD symbol
cd_groups = df_hgnc.loc[mask.values].groupby('approved_symbol')
m_hgnc = cd_groups.apply(get_hgnc_map)
m_hgnc = m_hgnc.rename('value').reset_index().dropna()

# A key that appears under more than one approved symbol is ambiguous:
# remove every occurrence of it, not just the repeats.
repeated_rows = m_hgnc['key'].duplicated()
dupe_keys = m_hgnc.loc[repeated_rows, 'key'].unique()
m_hgnc = m_hgnc[~m_hgnc['key'].isin(dupe_keys)]
assert m_hgnc['key'].is_unique

# Preview the resulting key -> value table
m_hgnc.head(25)

Unnamed: 0,approved_symbol,key,value
0,ABCB1,ABC20,CD243
1,ABCB1,CD243,CD243
2,ABCB1,CLCS,CD243
3,ABCB1,GP170,CD243
4,ABCB1,MDR1,CD243
5,ABCB1,PGP,CD243
6,ABCB1,PGY1,CD243
7,ABCB1,ABCB1,CD243
8,ABCG2,ABCP,CD338
9,ABCG2,BCRP,CD338


In [6]:
# Write the key -> CD symbol table under DATA_DIR (no index column) and
# echo the destination as the cell output.
path = osp.join(
    DATA_DIR,
    'hgnc_map.csv',
)
m_hgnc.to_csv(path, index=False)
path

'/Users/eczech/tmp/nlp/data/hgnc_map.csv'