In [1]:
import pandas as pd

In [2]:
from collections import Counter

In [3]:
# Column names applied to the concept tables loaded further down.
columns = [
    'CUI', 'LAT', 'TS', 'LUI', 'STT', 'SUI',
    'ISPREF', 'AUI', 'SAUI', 'SCUI', 'SDUI', 'SAB',
    'TTY', 'CODE', 'STR', 'SRL', 'SUPPRESS', 'CVF',
    # indicates the last version of the UMLS in which each atom appeared.
    'YEAR_OCCURENCE',
]

In [4]:
# MRSTY is read as a pipe-delimited file without a header row; every field stays text.
s_types = pd.read_csv(
    '../data/MRSTY.RRF',
    header=None,
    sep='|',
    dtype=str,
)
s_types.head(5)

Unnamed: 0,0,1,2,3,4,5,6
0,C0000005,T116,A1.4.1.2.1.7,"Amino Acid, Peptide, or Protein",AT17648347,256,
1,C0000005,T121,A1.4.1.1.1,Pharmacologic Substance,AT17575038,256,
2,C0000005,T130,A1.4.1.1.4,"Indicator, Reagent, or Diagnostic Aid",AT17634323,256,
3,C0000039,T109,A1.4.1.2.1,Organic Chemical,AT45562015,256,
4,C0000039,T121,A1.4.1.1.1,Pharmacologic Substance,AT17567371,256,


In [5]:
# Column 6 is empty in the preview above, so discard it.
s_types = s_types.drop(columns=[6])

In [6]:
# Give the six remaining MRSTY columns readable names.
s_types.columns = [
    'CUI', 'TUI', 'STN',
    'STY', 'ATUI', 'CVF',
]

In [7]:
# Only the distinct (CUI, TUI) pairs are used from here on.
s_types = s_types.loc[:, ['CUI', 'TUI']].drop_duplicates()

In [8]:
# TUI values used below to filter the semantic-type table.
UMLS_ST21pv_semantic_types_ids = {
    'T005', 'T007', 'T017', 'T022', 'T031', 'T033', 'T037',
    'T038', 'T058', 'T062', 'T074', 'T082', 'T091', 'T092',
    'T097', 'T098', 'T103', 'T168', 'T170', 'T201', 'T204',
}

In [9]:
# Keep only rows whose TUI is one of the selected semantic types.
s_types = s_types[s_types['TUI'].isin(UMLS_ST21pv_semantic_types_ids)]

In [10]:
from collections import Counter

# Histogram: how many CUIs carry 1, 2, ... of the retained semantic types.
Counter(s_types.groupby('CUI')['TUI'].count())

Counter({1: 1436096, 2: 53})

In [11]:
# Historical concept rows: pipe-delimited, no header, all fields read as text.
history_concepts = pd.read_csv(
    '../data/MRCONSO_HISTORY.txt',
    header=None,
    sep='|',
    dtype=str,
)
history_concepts.columns = columns

In [12]:
# Current concept rows: same layout as the history file, read as text.
current_concepts = pd.read_csv(
    '../data/MRCONSO.RRF',
    header=None,
    sep='|',
    dtype=str,
)
# The last parsed column is discarded; the remaining ones take every name
# except the final entry of `columns` (YEAR_OCCURENCE).
current_concepts = current_concepts.drop(columns=current_concepts.columns[-1])
current_concepts.columns = columns[:-1]

In [13]:
# Stack historical and current rows, then release the two inputs.
concepts = pd.concat([history_concepts, current_concepts], axis=0)
del history_concepts, current_concepts
# Rows coming from the current table have no YEAR_OCCURENCE; label them '2022AA'.
concepts['YEAR_OCCURENCE'] = concepts['YEAR_OCCURENCE'].fillna('2022AA')

In [14]:
# Keep rows whose YEAR_OCCURENCE starts with a year of 2017 or later.
# The vectorised string slice replaces a per-row Python lambda; the first four
# characters are parsed as an integer exactly as before.
concepts = concepts.loc[concepts['YEAR_OCCURENCE'].str[:4].astype(int) >= 2017]

In [15]:
# Restrict to rows whose LAT field is 'ENG'.
concepts = concepts[concepts['LAT'].eq('ENG')]

In [16]:
# Rows without a string value cannot be compared later, so drop them.
concepts = concepts.dropna(subset=['STR'])

In [17]:
# Preview the first five remaining rows.
concepts.head(5)

Unnamed: 0,CUI,LAT,TS,LUI,STT,SUI,ISPREF,AUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF,YEAR_OCCURENCE
31,C0000039,ENG,S,L0000038,PF,S0007563,N,A17876358,,N0000007747,,NDFRT,SY,N0000007747,"1,2-Dipalmitoyl-Glycerophosphocholine",0,N,,2019AA
32,C0000039,ENG,S,L0354989,PF,S0464922,N,A17900509,,N0000007747,,NDFRT,SY,N0000007747,"3,5,9-Trioxa-4-phosphapentacosan-1-aminium, 4-...",0,N,,2019AA
35,C0000039,ENG,S,L0000035,PF,S0007560,N,A17924851,,N0000007747,,NDFRT,SY,N0000007747,"1,2-Dihexadecyl-sn-Glycerophosphocholine",0,N,,2019AA
36,C0000039,ENG,S,L0012509,PF,S0033297,N,A17924852,,N0000007747,,NDFRT,SY,N0000007747,Dipalmitoyllecithin,0,N,256.0,2019AA
37,C0000039,ENG,P,L0000039,VC,S0007564,N,A17972823,,N0000007747,,NDFRT,PT,N0000007747,"1,2-Dipalmitoylphosphatidylcholine",0,N,,2019AA


In [18]:


# SAB values used below to filter the concept table.
UMLS_ST21pv_ontologies_ids = {
    'CPT', 'FMA', 'GO', 'HGNC', 'HPO', 'ICD10',
    'ICD10CM', 'ICD9CM', 'MDR', 'MSH', 'MTH', 'NCBI',
    'NCI', 'NDDF', 'NDFRT', 'OMIM', 'RXNORM', 'SNOMEDCT_US',
}

In [19]:
# Keep only rows whose SAB is one of the selected source vocabularies.
concepts = concepts[concepts['SAB'].isin(UMLS_ST21pv_ontologies_ids)]

In [20]:
# Show a single row as a sanity check after filtering.
concepts.head(n=1)

Unnamed: 0,CUI,LAT,TS,LUI,STT,SUI,ISPREF,AUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF,YEAR_OCCURENCE
31,C0000039,ENG,S,L0000038,PF,S0007563,N,A17876358,,N0000007747,,NDFRT,SY,N0000007747,"1,2-Dipalmitoyl-Glycerophosphocholine",0,N,,2019AA


In [21]:
# Reduce to the three columns still needed and remove exact repeats.
concepts = concepts.loc[:, ['CUI', 'STR', 'SAB']].drop_duplicates()

In [22]:
# Save the CUI/TUI pairs before s_types is reduced to CUI only.
s_types.to_csv(
    '../data/all_concepts_semantic_types_mapping.csv',
    index=False,
)

In [23]:
# One row per CUI that has at least one retained semantic type.
s_types = s_types.loc[:, ['CUI']].drop_duplicates()

In [24]:
# Inner join on the columns the two frames share, which keeps only concepts
# whose CUI is present in s_types.
concepts = pd.merge(concepts, s_types, how='inner')

In [25]:
from difflib import SequenceMatcher

def similarity(a, b):
    """Return the difflib ``SequenceMatcher`` ratio between ``a`` and ``b``.

    The matcher is built with no junk predicate; the result is a float in
    the range [0, 1].
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

In [26]:
import numpy as np
import tqdm
def choose_best_string(x, progress_bar=None):
    """Pick the most representative string among the ``STR`` values of ``x``.

    Every distinct string is scored against every distinct string (itself
    included) with ``similarity``; the string whose scores add up to the
    largest total wins. Ties go to the string that sorts first.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows to summarise (for example one ``groupby('CUI')`` group). Must
        contain a ``'STR'`` column.
    progress_bar : object, optional
        Anything with an ``update(n)`` method; it is advanced by one per call.
        When omitted, a global named ``bar`` is used if one exists, so
        existing ``.apply(choose_best_string)`` calls keep reporting progress.
        If neither is available, progress reporting is skipped.

    Returns
    -------
    pandas.Series
        Indexed by ``'best_string'`` (the chosen string) and ``'all_strings'``
        (the sorted array of distinct strings).
    """
    all_strings = np.unique(x['STR'].values)
    n_strings = len(all_strings)
    similarities = np.zeros((n_strings, n_strings))
    for i in range(n_strings):
        # Only the lower triangle (diagonal included) is evaluated and then
        # mirrored, halving the number of SequenceMatcher runs.
        # The argument order (later-sorted string first) is deliberate:
        # ratio() can depend on argument order.
        # The diagonal is computed rather than assumed to be 1.0 because
        # difflib's autojunk heuristic may score a long string below 1.0
        # against itself.
        for j in range(i + 1):
            similarity_val = similarity(all_strings[i], all_strings[j])
            similarities[i, j] = similarity_val
            similarities[j, i] = similarity_val

    best_string_idx = similarities.sum(axis=1).argmax()
    best_string = all_strings[best_string_idx]
    result = {
        'best_string': best_string,
        'all_strings': all_strings
    }

    if progress_bar is None:
        # Fall back to the notebook-level progress bar when it exists, instead
        # of raising NameError when it does not.
        progress_bar = globals().get('bar')
    if progress_bar is not None:
        progress_bar.update(1)
    return pd.Series(result, index=['best_string', 'all_strings'])
  
# The bar total is the number of distinct CUIs, i.e. the number of groups
# that choose_best_string will be applied to.
bar = tqdm.tqdm(total=len(set(concepts['CUI'])))
concepts = (
    concepts
    .groupby('CUI')
    .apply(choose_best_string)
    .reset_index()
    .drop(columns='all_strings')
)
bar.close()

100%|██████████████████████████████████████████████████████████████████████| 1073954/1073954 [12:47<00:00, 1399.81it/s]


In [49]:
# Load the ground-truth names and align their column names with `concepts`.
ground_truth_concepts = pd.read_csv('../data/concept_names_tmp.csv').rename(
    columns={'concept_id': 'CUI', 'concept_name': 'STR'}
)

In [51]:
# Collapse the ground-truth names to one best string per CUI.
ground_truth_concepts = (
    ground_truth_concepts
    .groupby('CUI')
    .apply(choose_best_string)
    .reset_index()
    .drop(columns='all_strings')
)

In [53]:
# Remove every CUI that the ground-truth table already covers.
concepts = concepts[~concepts['CUI'].isin(set(ground_truth_concepts['CUI']))]

In [54]:
# Append the ground-truth rows below the remaining concepts.
concepts = pd.concat([concepts, ground_truth_concepts], axis=0)

In [58]:
# Write the combined table to disk without the index column.
concepts.to_csv(
    '../data/concepts_strings_with_ids.csv',
    index=False,
)

In [60]:
# Read the combined table back from the CSV written above.
concepts = pd.read_csv('../data/concepts_strings_with_ids.csv', sep=',')

In [61]:
# Split the table into consecutive files of at most n rows each; the file
# suffix is the zero-based chunk number.
n = 100000
for i in range(0, len(concepts), n):
    print(i, i + n)
    concepts.iloc[i:i + n].to_csv(
        f'../data/concepts_strings_with_ids_{i//n}.csv',
        index=False,
    )

0 100000
100000 200000
200000 300000
300000 400000
400000 500000
500000 600000
600000 700000
700000 800000
800000 900000
900000 1000000
1000000 1100000
1100000 1200000
