In [1]:
import pandas as pd
import numpy as np
from numba import jit, njit, prange
import glob
import os

In [2]:
# Column labels '0' .. '767' (as strings); used below to select the embedding
# columns of the CSV-backed frames.
emb_cols = list(map(str, range(768)))

In [3]:
@njit
def cosine_sim(a, b):
    """Return the cosine similarity of vectors `a` and `b` (numba-compiled)."""
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return np.dot(a, b) / norm_product
# Call the jitted function on 20000 pairs of random 800-element vectors; the
# results are discarded (the trailing `;` suppresses the cell output).
[
    cosine_sim(np.random.normal(size=(800,)), np.random.normal(size=(800,)))
    for _ in range(20000)
];

In [4]:
@njit(parallel=True)
def return_similarities(embedding, concept_embeddings):
    """Cosine similarity between `embedding` and every row of `concept_embeddings`.

    Returns a 1-D array with one similarity per row of `concept_embeddings`;
    rows are processed in a numba `prange` loop.
    """
    n_concepts = concept_embeddings.shape[0]
    similarities = np.zeros(n_concepts)
    for row in prange(n_concepts):
        similarities[row] = cosine_sim(embedding, concept_embeddings[row])
    return similarities

In [5]:
# Environment-dependent configuration: when `google.colab` is importable the
# Google Drive paths are used (and the drive is mounted), otherwise the local
# relative paths are used.
try:
    import google.colab
    colab = True
    from google.colab import drive
    lda_input = '/content/drive/MyDrive/NLP_files/CRAFT/results/lda/*whole*'
    bertopic_input = '/content/drive/MyDrive/NLP_files/CRAFT/results/bertopic/bertopic_lemmatize_nostopwords_data_2023-01-02_10-04-20.csv'

    embeddings_filepath = '/content/drive/MyDrive/NLP_files/CRAFT/data/embeddings/'
    # The cells below read `embeddings_output_filepath`; it used to be defined
    # only in the local branch, so the notebook raised NameError on Colab.
    # `embeddings_filepath` is kept as well for backward compatibility.
    embeddings_output_filepath = embeddings_filepath
    emb_tagger_output = '/content/drive/MyDrive/NLP_files/CRAFT/results/emb_tagger'

    drive.mount('/content/drive')
    device_name = 'cuda'
except ImportError:
    colab = False
    lda_input = '../../0.RESULTS/lda/*lda*'
    bertopic_input = '../../0.RESULTS/bertopic/bertopic*.csv'

    embeddings_output_filepath = '../../0.RESULTS/embeddings/'
    emb_tagger_output = '../../0.RESULTS/emb_tagger/'
    device_name = 'cpu'


In [6]:
# Ontology prefixes accepted when filtering concept ids.
proper_onto_names = {'chebi', 'cl', 'go', 'mop', 'ncbitaxon', 'pr', 'so', 'uberon'}
def process_id(id_):
    """Split a concept id into a tuple of its parts.

    The id is split on ':' when it contains one, otherwise on '_'. The second
    part is converted to `int` when possible (which drops leading zeros);
    otherwise it is left as a string. Ids without a second part are returned
    as a one-element tuple.
    """
    separator = ':' if ':' in id_ else '_'
    parts = id_.split(separator)
    if len(parts) > 1:
        try:
            parts[1] = int(parts[1])
        except ValueError:
            # Non-numeric local part: keep the original string.
            pass
    return tuple(parts)

In [10]:
# Every path directly under the embeddings directory whose file name does not
# contain 'embeddings'; these are later passed to tag_keywords_with_onto.
emb_files = list(filter(
    lambda path: 'embeddings' not in os.path.basename(path),
    glob.glob(embeddings_output_filepath + '*'),
))

In [12]:
# Keyword embeddings table; later cells read its 'keyword' column and the
# `emb_cols` embedding columns.
keywords_emb_path = os.path.join(embeddings_output_filepath, 'keywords_embeddings.csv')
keywords_emb = pd.read_csv(keywords_emb_path)

In [13]:
def return_embedded_words(embedding, onto_emb, onto_emb_raw, n_words=1):
    """Find the concepts most similar to `embedding`.

    The window of top-ranked concepts is widened one row at a time until it
    holds `n_words` distinct `concept_name` values, so the result may contain
    more than `n_words` rows when several rows share a name.

    Parameters
    ----------
    embedding : 1-D array
        Query vector.
    onto_emb : pandas.DataFrame
        Concept metadata with a 'concept_name' column, row-aligned with
        `onto_emb_raw`.
    onto_emb_raw : 2-D array
        One concept embedding per row.
    n_words : int, default 1
        Number of distinct concept names to collect.

    Returns
    -------
    (indices, similarities)
        Positional row indices into `onto_emb` / `onto_emb_raw`, ordered by
        ascending cosine similarity, and the matching similarity values.
        Both are empty when `n_words <= 0` or there are no concepts. When
        fewer than `n_words` distinct names exist, all concepts are returned.
    """
    cosine_similarities = return_similarities(embedding, onto_emb_raw)
    argsort_indices = np.argsort(cosine_similarities)
    n_concepts = argsort_indices.shape[0]

    if n_words <= 0:
        # Nothing requested; the original code hit an unbound local here.
        no_indices = argsort_indices[:0]
        return no_indices, cosine_similarities[no_indices]

    # Pull the name column out once instead of re-indexing the frame on every pass.
    concept_names = onto_emb['concept_name'].to_numpy()

    window = min(n_words, n_concepts)
    while True:
        max_indices = argsort_indices[n_concepts - window:]
        # Stop once enough distinct names are collected, or when the window
        # already covers every concept (previously an endless loop).
        if window >= n_concepts or len(set(concept_names[max_indices])) >= n_words:
            break
        window += 1
    return max_indices, cosine_similarities[max_indices]

In [14]:
def tag_keywords_with_onto(onto_emb_file):
    """Tag every keyword in `keywords_emb` with its closest concepts from one ontology file.

    Parameters
    ----------
    onto_emb_file : str
        Path of a CSV with 'id', 'concept_name' and the `emb_cols` embedding
        columns.

    Returns
    -------
    pandas.DataFrame
        One row per keyword with columns 'keyword', 'concept_names', 'ids'
        and 'cosine_similarities'; the last three hold per-keyword sequences
        ordered by ascending similarity. The frame is empty when no concept
        id in the file has a prefix listed in `proper_onto_names`.
    """
    result_columns = ['keyword', 'concept_names', 'ids', 'cosine_similarities']

    onto_emb = pd.read_csv(onto_emb_file)
    # Normalise ids to (prefix, number, ...) tuples, keep accepted prefixes only,
    # then re-join them as 'prefix_number'.
    onto_emb['id'] = onto_emb['id'].apply(lambda x: process_id(str(x).lower()))
    # .copy() makes the filtered frame independent, so the assignment below does
    # not write into a slice of the original frame (SettingWithCopyWarning).
    onto_emb = onto_emb.loc[onto_emb['id'].apply(lambda x: x[0] in proper_onto_names)].copy()
    onto_emb['id'] = onto_emb['id'].apply(lambda x: '_'.join([str(elem) for elem in x]))

    if onto_emb.empty:
        # No usable concepts: searching an empty ontology for 5 distinct names
        # can never succeed, so return an empty result instead.
        return pd.DataFrame(columns=result_columns)

    onto_emb_raw = np.ascontiguousarray(onto_emb[emb_cols].values.astype(np.float32))
    onto_emb = onto_emb.drop(emb_cols, axis=1)

    # Convert all keyword embeddings once rather than selecting columns per row.
    keyword_matrix = np.ascontiguousarray(keywords_emb[emb_cols].values.astype(np.float32))

    tagged_keywords = []
    for keyword, keyword_emb in zip(keywords_emb['keyword'], keyword_matrix):
        indices, cosine_sims = return_embedded_words(keyword_emb, onto_emb, onto_emb_raw, n_words=5)
        best_rows = onto_emb.iloc[indices]
        tagged_keywords.append((
            keyword,
            best_rows['concept_name'].to_list(),
            best_rows['id'].to_list(),
            cosine_sims
        ))
    return pd.DataFrame(tagged_keywords, columns=result_columns)


In [16]:
# Display the output directory selected by the configuration cell.
emb_tagger_output

'../../0.RESULTS/emb_tagger/'

In [15]:
# Tag the keywords against every ontology file and stack the per-file results.
# A separate list is used so `all_tagged_keywords` is only ever a DataFrame.
tagged_frames = []
for onto_file in emb_files: 
    print(onto_file)
    tagged_keywords = tag_keywords_with_onto(onto_file)
    # Ontology label = file name up to '_labels'. Splitting (rather than
    # replacing the exact suffix '_labels.csv') also handles names such as
    # 'ncbitaxon_labels-001.csv', which previously kept the whole file name.
    tagged_keywords['ontology'] = os.path.basename(onto_file).split('_labels', 1)[0]
    tagged_frames.append(tagged_keywords)
all_tagged_keywords = pd.concat(tagged_frames)

../../0.RESULTS/embeddings\chebi_labels.csv
../../0.RESULTS/embeddings\cl_labels.csv
../../0.RESULTS/embeddings\go_labels.csv
../../0.RESULTS/embeddings\mondo_labels.csv


  onto_emb = pd.read_csv(onto_emb_file)


../../0.RESULTS/embeddings\mop_labels.csv
../../0.RESULTS/embeddings\ncbitaxon_labels-001.csv
../../0.RESULTS/embeddings\pr_labels-002.csv


  onto_emb = pd.read_csv(onto_emb_file)


../../0.RESULTS/embeddings\so_labels.csv
../../0.RESULTS/embeddings\uberon_labels.csv


In [None]:
../../0.RESULTS/emb_tagger/

In [15]:
# Persist the combined tagging results.
# NOTE(review): this path is hard-coded and differs from `emb_tagger_output`
# set in the configuration cell - confirm which location is intended.
all_tagged_keywords_csv = '../CRAFT/results/emb_tagger/all_tagged_keywords.csv'
all_tagged_keywords.to_csv(all_tagged_keywords_csv, index=False)

In [16]:
# Display the combined tagging results (the per-ontology frames concatenated above).
all_tagged_keywords

Unnamed: 0,keyword,concept_names,ids,cosine_similarities,ontology
0,ear,"[mixture, GDP, poison, insulin, pharmaceutical]","[chebi_60004, chebi_17552, chebi_64909, chebi_...","[0.9289607405662537, 0.9298566579818726, 0.933...",chebi
1,human,"[pharmaceutical, Argentine, GDP, GA, insulin]","[chebi_52217, chebi_2819, chebi_17552, chebi_5...","[0.9161764979362488, 0.9169188141822815, 0.916...",chebi
2,dopamine,"[dopamine receptor D2 antagonist, dopaminergic...","[chebi_131787, chebi_48560, chebi_48561, chebi...","[0.9408226609230042, 0.9472912549972534, 0.951...",chebi
3,volume,"[solvent, protein, role, mixture, drug]","[chebi_46787, chebi_36080, chebi_50906, chebi_...","[0.904695451259613, 0.907651424407959, 0.91214...",chebi
4,bone,"[food, cocaine, insulin, mineral, pharmaceutical]","[chebi_33290, chebi_27958, chebi_145810, chebi...","[0.9199972748756409, 0.9212406277656555, 0.924...",chebi
...,...,...,...,...,...
86,mouse,"[lung, cheek, skull, brain, cell]","[uberon_2048, uberon_1567, uberon_3129, uberon...","[0.9472812414169312, 0.9493264555931091, 0.949...",uberon
87,learning,"[snout, nipple, mating, regeneration, learning]","[uberon_6333, uberon_2030, go_7618, go_31099, ...","[0.9347430467605591, 0.9348322153091431, 0.939...",uberon
88,rna,"[obsolete gena, pons, brille, scute, tusk]","[uberon_3154, uberon_988, uberon_10515, uberon...","[0.900005578994751, 0.9024689793586731, 0.9045...",uberon
89,bmp2,"[obsolete MiV2, obsolete RoM2m, obsolete RoL2c...","[uberon_2005191, uberon_2005206, uberon_200520...","[0.9145151376724243, 0.915044903755188, 0.9179...",uberon


In [13]:
# Pick one keyword at random (unseeded) and list all of its candidate concepts,
# one row per concept, ordered by ascending similarity.
sampled_keyword = all_tagged_keywords['keyword'].sample(1).iloc[0]
example = all_tagged_keywords[all_tagged_keywords['keyword'] == sampled_keyword]
# All columns except the first and the last hold per-concept lists to explode.
example_list_columns = list(example.columns[1:-1])
example.explode(column=example_list_columns).sort_values('cosine_similarities')

Unnamed: 0,keyword,concept_names,ids,cosine_similarities,ontology
88,rna,nitrogen group,chebi_51144,0.861365,mop
88,rna,iodylation,mop_521,0.86448,mop
88,rna,defurylation,mop_1449,0.87154,mop
88,rna,bromylation,mop_518,0.872291,mop
88,rna,imination,mop_555,0.875122,mop
88,rna,obsolete insulin,go_16088,0.88908,go
88,rna,H plus,cl_2675,0.89038,cl
88,rna,rib,uberon_2228,0.892239,cl
88,rna,nasal hair,uberon_11931,0.892696,cl
88,rna,CMP salvage,go_6238,0.89411,go


In [29]:
def return_best_concept(x):
    """Return the single most similar concept within one keyword group.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of one keyword with 'concept_names' and 'cosine_similarities'
        columns (one concept per row).

    Returns
    -------
    pandas.Series
        Entries 'concept' and 'cosine_similarity' of the row with the highest
        similarity (the first such row on ties). NaN similarities are skipped.

    Raises
    ------
    ValueError
        If every similarity in the group is NaN.
    """
    # After `explode` the similarity column has object dtype. Work on a float
    # array so the positional argmax does not depend on pandas' Series.argmax,
    # which rejects object dtype in current pandas versions.
    similarities = x['cosine_similarities'].to_numpy(dtype=float)
    best_position = int(np.nanargmax(similarities))
    result = {
        'concept': x['concept_names'].to_list()[best_position],
        'cosine_similarity': x['cosine_similarities'].to_list()[best_position],
    }
    return pd.Series(result, index=['concept', 'cosine_similarity'])
# One row per (keyword, candidate concept), then the best concept per keyword.
exploded_keywords = all_tagged_keywords.explode(column=list(all_tagged_keywords.columns[1:-1]))
best_concepts = exploded_keywords.groupby('keyword').apply(return_best_concept).reset_index()
# Keep matches with similarity below 0.9999 and export the 20 highest-scoring
# of them as a sample.
below_threshold = best_concepts[best_concepts['cosine_similarity'] < 0.9999]
below_threshold.sort_values('cosine_similarity')[['keyword', 'concept']]\
    .tail(20).to_csv('sample_tagging.csv', index=False)

In [17]:
def return_best_concept(x):
    """Return every concept tied for the highest similarity in one keyword group.

    `x` holds the rows of one keyword (columns 'concept_names', 'ids',
    'cosine_similarities', 'ontology'). The result is a Series with
    'concept' and 'id' (arrays of the best-scoring rows' names and ids) and
    'ontology' (tuple of those rows' ontology labels).
    """
    similarities = x['cosine_similarities']
    # Boolean mask of the rows whose similarity equals the group maximum.
    is_best = similarities.values == similarities.max()
    return pd.Series(
        {
            'concept': x['concept_names'].values[is_best],
            'id': x['ids'].values[is_best],
            'ontology': tuple(x['ontology'].values[is_best]),
        },
        index=['concept', 'id', 'ontology'],
    )
# Best concept(s) per keyword across all ontologies: explode the per-concept
# list columns, reduce each keyword group, then give each tied concept/id pair
# its own row and drop exact duplicates.
per_concept_columns = list(all_tagged_keywords.columns[1:-1])
best_concepts = (
    all_tagged_keywords
    .explode(column=per_concept_columns)
    .groupby('keyword')
    .apply(return_best_concept)
    .reset_index()
    .explode(['concept', 'id'])
    .drop_duplicates()
)
best_concepts

Unnamed: 0,keyword,concept,id,ontology
0,Abstract,Absala,ncbitaxon_1430977,"(ncbitaxon,)"
1,D2,DG,chebi_190453,"(chebi,)"
2,DNA,DNA,so_352,"(so,)"
3,PAX6,protein PAXX,pr_60049,"(pr,)"
4,RanBP2,Ran-binding protein 9,pr_13716,"(pr,)"
...,...,...,...,...
86,taste,throat,uberon_341,"(mondo, uberon)"
87,tbx15,bacterium dtb45,ncbitaxon_155026,"(ncbitaxon,)"
88,transcription,promoter,so_167,"(so,)"
89,volume,drug,chebi_23888,"(chebi, cl, mondo)"


In [18]:
# Create the output directory. makedirs also creates missing parent
# directories (os.mkdir failed when e.g. the results root did not exist yet)
# and exist_ok=True makes the cell safe to re-run.
os.makedirs(emb_tagger_output, exist_ok=True)

In [19]:
# Attach the best-matching concept to every topic keyword produced by LDA and
# BERTopic, and write one tagged CSV per method.
for keyword_file in glob.glob(lda_input) + glob.glob(bertopic_input): 
    keywords_df = pd.read_csv(keyword_file)

    # NOTE(review): eval() executes whatever expression is stored in the CSV
    # cell; only safe for trusted result files.
    keywords_df['topic_keywords'] = keywords_df['topic_keywords'].apply(eval)
    # Each parsed entry is a sequence of (keyword, probability) pairs; keep the keywords.
    keywords_df['keyword'] = keywords_df['topic_keywords'].apply(
        lambda pairs: [word for word, _ in pairs]
    )
    keywords_df = (
        keywords_df
        .drop(['Unnamed: 0', 'topic_number', 'topic_probs', 'topic_keywords'], axis=1, errors='ignore')
        .explode('keyword')
        # No `on=` given: the join uses all columns shared with best_concepts.
        .merge(best_concepts, how='left')
    )

    method = 'bertopic' if 'bertopic' in keyword_file else 'lda'
    print(method)
    # NOTE(review): the output name depends only on `method`, so several input
    # files of the same method overwrite each other.
    tagged_output_path = os.path.join(emb_tagger_output, f'{method}_tagged.csv')
    keywords_df.to_csv(tagged_output_path, index=False)

lda
bertopic
