In [1]:
import pandas as pd
import numpy as np
from numba import jit, njit, prange
import glob
import os

In [2]:
# Column labels of the embedding dimensions: the strings '0' .. '767'.
emb_cols = list(map(str, range(768)))

In [3]:
# Environment-dependent configuration.
# When google.colab can be imported the notebook uses Google Drive paths,
# otherwise it falls back to paths relative to the local repository.
# Both branches define the same set of names so later cells work in either
# environment (the Colab branch previously never defined
# `embeddings_output_filepath`, which later cells read).
try:
    import google.colab
    colab = True
    from google.colab import drive
    lda_input = '/content/drive/MyDrive/NLP_files/CRAFT/results/lda/*whole*'
    bertopic_input = '/content/drive/MyDrive/NLP_files/CRAFT/results/bertopic/bertopic_lemmatize_nostopwords_data_2023-01-02_10-04-20.csv'

    embeddings_filepath = '/content/drive/MyDrive/NLP_files/CRAFT/data/embeddings/'
    # NOTE(review): later cells read `embeddings_output_filepath`; it is aliased
    # to the only embeddings directory configured for Colab - confirm that this
    # directory holds the ontology label files and keywords_embeddings.csv.
    embeddings_output_filepath = embeddings_filepath
    emb_tagger_output = '/content/drive/MyDrive/NLP_files/CRAFT/results/emb_tagger'

    drive.mount('/content/drive')
    device_name = 'cuda'
except ImportError:
    colab = False
    lda_input = '../../0.RESULTS/lda/*lda*'
    bertopic_input = '../../0.RESULTS/bertopic/bertopic*.csv'

    embeddings_output_filepath = '../../0.RESULTS/embeddings/'
    # Kept in sync with the Colab branch so both names exist everywhere.
    embeddings_filepath = embeddings_output_filepath
    emb_tagger_output = '../../0.RESULTS/emb_tagger/'
    device_name = 'cpu'

In [4]:
@njit
def cosine_sim(a, b):
    """Return the cosine similarity of `a` and `b`: dot(a, b) / (|a| * |b|)."""
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norm_product


# Exercise the jitted function on pairs of random 800-element vectors;
# the trailing ';' keeps the resulting list out of the cell output.
[
    cosine_sim(np.random.normal(size=(800,)), np.random.normal(size=(800,)))
    for _ in range(20000)
];

In [5]:
@njit(parallel=True)
def return_cos_similarities(embedding, concept_embeddings):
    """Cosine similarity between `embedding` and every row of `concept_embeddings`.

    Returns a 1-D array with one entry per row of `concept_embeddings`
    (indexed along its first axis), filled in a numba `prange` loop.
    """
    n_concepts = concept_embeddings.shape[0]
    scores = np.zeros((n_concepts, ))
    for row in prange(n_concepts):
        scores[row] = cosine_sim(embedding, concept_embeddings[row])
    return scores

In [6]:
@njit
def return_negative_distance(x, y):
    """Return the negated norm of `x - y`, so that closer vectors score higher."""
    difference = x - y
    return -np.linalg.norm(difference)


# Quick check on two unit vectors; the value is shown as the cell output.
return_negative_distance(np.array([1.0, 0.0]), np.array([0.0, 1.0]))

-1.4142135623730951

In [7]:
@njit(parallel=True)
def return_negative_distances(embedding, concept_embeddings):
    """Negated distance between `embedding` and every row of `concept_embeddings`.

    Returns a 1-D array with one entry per row of `concept_embeddings`
    (indexed along its first axis), filled in a numba `prange` loop.
    """
    n_concepts = concept_embeddings.shape[0]
    scores = np.zeros((n_concepts, ))
    for row in prange(n_concepts):
        scores[row] = return_negative_distance(embedding, concept_embeddings[row])
    return scores

In [8]:
# Ontology prefixes that are kept when filtering concept ids.
proper_onto_names = {
    'chebi', 'cl', 'go', 'mop',
    'ncbitaxon', 'pr', 'so', 'uberon',
}


def process_id(id_):
    """Split an identifier string into a tuple of its parts.

    The string is split on ':' when it contains one, otherwise on '_'.
    If a second part exists and `int()` accepts it, that part is replaced
    by the integer; otherwise the parts are returned unchanged.
    """
    separator = ':' if ':' in id_ else '_'
    parts = id_.split(separator)
    if len(parts) > 1:
        try:
            parts[1] = int(parts[1])
        except ValueError:
            # Non-numeric second part: keep it as a string.
            pass
    return tuple(parts)

In [9]:
# Files in the embeddings directory whose basename does not contain
# 'embeddings' (this skips e.g. keywords_embeddings.csv, which is read separately).
# os.path.join keeps the pattern valid even if the configured directory has no
# trailing separator; sorting makes the processing order independent of the
# order in which glob happens to return the files.
emb_files = sorted(
    path
    for path in glob.glob(os.path.join(embeddings_output_filepath, '*'))
    if 'embeddings' not in os.path.basename(path)
)

In [10]:
# Keyword embeddings table, read from the embeddings directory.
keywords_emb = pd.read_csv(
    os.path.join(embeddings_output_filepath, 'keywords_embeddings.csv')
)

In [11]:
def return_embedded_words(embedding, onto_emb, onto_emb_raw, similarity_func, n_words=1):
    """Pick the most similar concepts until `n_words` distinct names are covered.

    Parameters
    ----------
    embedding : array passed as first argument to `similarity_func`.
    onto_emb : DataFrame with a 'concept_name' column, positionally aligned
        with the scores returned by `similarity_func`.
    onto_emb_raw : array passed as second argument to `similarity_func`.
    similarity_func : callable returning one score per concept (higher = closer).
    n_words : number of distinct concept names to collect.

    Returns
    -------
    (indices, similarities) : positional indices of the selected concepts in
        ascending score order and their scores. More than `n_words` indices are
        returned when several top-scoring rows share a concept name. If fewer
        than `n_words` distinct names exist, every concept is returned instead
        of looping forever; for `n_words <= 0` or no concepts the result is empty.
    """
    similarities = similarity_func(embedding, onto_emb_raw)
    argsort_indices = np.argsort(similarities)
    n_candidates = len(argsort_indices)

    if n_words <= 0 or n_candidates == 0:
        no_indices = argsort_indices[:0]
        return no_indices, similarities[no_indices]

    concept_names = onto_emb['concept_name'].to_numpy()
    # Start with the top `n_words` rows and widen the window one row at a time
    # while duplicated names keep the distinct count below `n_words`.
    window = min(n_words, n_candidates)
    while True:
        max_indices = argsort_indices[-window:]
        n_distinct = len(set(concept_names[max_indices]))
        # Stop once enough distinct names are covered, or when the window
        # already spans every concept (nothing left to add).
        if n_distinct >= n_words or window == n_candidates:
            break
        window += 1
    return max_indices, similarities[max_indices]

In [12]:
# Similarity measures applied to every keyword, keyed by the label that is
# stored in the 'similarity_type' column of the tagging results.
similarity_funcs = dict(
    distance=return_negative_distances,
    cosine=return_cos_similarities,
)

In [13]:
def tag_keywords_with_onto(onto_emb_file, n_words=5):
    """Tag every keyword in `keywords_emb` with its closest ontology concepts.

    Parameters
    ----------
    onto_emb_file : path of a CSV with 'id', 'concept_name' and the embedding
        columns listed in `emb_cols`.
    n_words : number of distinct concept names requested per keyword and
        similarity measure (defaults to the previously hard-coded 5; capped
        at the number of distinct names available in the file).

    Returns
    -------
    DataFrame with columns 'keyword', 'similarity_type', 'concept_names',
    'ids', 'similarities' - one row per keyword and entry of `similarity_funcs`.
    Empty (same columns) when the file has no concept from `proper_onto_names`.
    """
    result_columns = ['keyword', 'similarity_type', 'concept_names', 'ids', 'similarities']

    # NOTE(review): the recorded run shows a warning raised on this read_csv
    # call for some files - presumably a mixed-dtype warning; confirm.
    onto_emb = pd.read_csv(onto_emb_file)

    # Normalise ids, keep only concepts whose prefix is a known ontology, and
    # store ids as '<prefix>_<number>'.
    parsed_ids = onto_emb['id'].apply(lambda raw_id: process_id(str(raw_id).lower()))
    in_proper_onto = parsed_ids.apply(lambda parts: parts[0] in proper_onto_names)
    # .copy() so the column assignment below targets an independent frame
    # rather than a slice of the frame read from disk.
    onto_emb = onto_emb.loc[in_proper_onto].copy()
    onto_emb['id'] = parsed_ids.loc[in_proper_onto].apply(
        lambda parts: '_'.join([str(elem) for elem in parts])
    )

    # Without any usable concept name there is nothing to tag; asking for more
    # distinct names than exist could never be satisfied, so cap the request.
    n_distinct_names = onto_emb['concept_name'].nunique()
    if n_words <= 0 or n_distinct_names == 0:
        return pd.DataFrame(columns=result_columns)
    n_words = min(n_words, n_distinct_names)

    onto_emb_raw = np.ascontiguousarray(onto_emb[emb_cols].values.astype(np.float32))
    onto_emb = onto_emb.drop(emb_cols, axis=1)
    concept_name_col = onto_emb['concept_name']
    concept_id_col = onto_emb['id']

    # Extract all keyword vectors once instead of re-slicing every row.
    keyword_vectors = np.ascontiguousarray(keywords_emb[emb_cols].values.astype(np.float32))

    tagged_keywords = []
    for keyword, keyword_emb in zip(keywords_emb['keyword'], keyword_vectors):
        for similarity_func_name, similarity_func in similarity_funcs.items():
            indices, sims = return_embedded_words(
                keyword_emb, onto_emb, onto_emb_raw, similarity_func, n_words=n_words
            )
            tagged_keywords.append((
                keyword,
                similarity_func_name,
                concept_name_col.iloc[indices].to_list(),
                concept_id_col.iloc[indices].to_list(),
                sims
            ))
    return pd.DataFrame(tagged_keywords, columns=result_columns)


In [14]:
# Tag the keywords against every ontology embedding file and stack the results.
# The 'ontology' column is the file's basename with '_labels.csv' removed.
tagged_frames = []
for onto_file in emb_files:
    print(onto_file)
    ontology_name = os.path.basename(onto_file).replace('_labels.csv', '')
    tagged_frames.append(
        tag_keywords_with_onto(onto_file).assign(ontology=ontology_name)
    )
all_tagged_keywords = pd.concat(tagged_frames)

../../0.RESULTS/embeddings\chebi_labels.csv
../../0.RESULTS/embeddings\cl_labels.csv
../../0.RESULTS/embeddings\go_labels.csv
../../0.RESULTS/embeddings\mondo_labels.csv


  onto_emb = pd.read_csv(onto_emb_file)


../../0.RESULTS/embeddings\mop_labels.csv
../../0.RESULTS/embeddings\ncbitaxon_labels.csv
../../0.RESULTS/embeddings\pr_labels.csv


  onto_emb = pd.read_csv(onto_emb_file)


../../0.RESULTS/embeddings\so_labels.csv
../../0.RESULTS/embeddings\uberon_labels.csv


In [15]:
# Persist the combined tagging results in the configured output directory.
# The directory is created first so this cell also works on a fresh checkout,
# and the configured path is used instead of a hard-coded local one so the
# Colab configuration is honoured as well.
os.makedirs(emb_tagger_output, exist_ok=True)
all_tagged_keywords.to_csv(os.path.join(emb_tagger_output, 'all_tagged_keywords.csv'), index=False)

In [13]:
# Bare expression as the last line of the cell: shows the combined tagging table.
all_tagged_keywords

Unnamed: 0,keyword,similarity_type,concept_names,ids,similarities,ontology
0,acdp,distance,"['fpreQ1', 'dApdA', 'mpreQ1', 'dpreQ1', 'dCpdA']","['chebi_190459', 'chebi_165807', 'chebi_190456...",[-4.84209204 -4.69494486 -4.68973875 -4.465097...,chebi
1,acdp,cosine,"['dTpdA', 'mpreQ1', 'dApdA', 'dpreQ1', 'dCpdA']","['chebi_165818', 'chebi_190456', 'chebi_165807...",[0.91483617 0.91908842 0.92033052 0.92650104 0...,chebi
2,show,distance,"['GA', 'pharmaceutical', 'R', 'mixture', 'insu...","['chebi_5206', 'chebi_52217', 'chebi_8735', 'c...",[-5.41739035 -5.38768053 -5.37716866 -5.363625...,chebi
3,show,cosine,"['GA', 'R', 'mixture', 'pharmaceutical', 'insu...","['chebi_5206', 'chebi_8735', 'chebi_60004', 'c...",[0.91658378 0.91901428 0.91903907 0.91993684 0...,chebi
4,repair,distance,"['drug', 'mixture', 'cocaine', 'poison', 'insu...","['chebi_23888', 'chebi_60004', 'chebi_27958', ...",[-5.47865343 -5.45871401 -5.40830135 -5.336658...,chebi
...,...,...,...,...,...,...
3271,export,cosine,"['ion transport', 'hormone transport', 'export...","['go_6811', 'go_9914', 'go_140352', 'go_15031'...",[0.87440723 0.877505 0.88236552 0.88591486 0...,uberon
3272,mutant,distance,"['envelope', 'throat', 'gene', 'mating', 'role']","['go_31975', 'uberon_341', 'so_704', 'go_7618'...",[-5.90664911 -5.9003458 -5.89379311 -5.832910...,uberon
3273,mutant,cosine,"['crop', 'envelope', 'throat', 'mating', 'role']","['uberon_7356', 'go_31975', 'uberon_341', 'go_...",[0.9023447 0.90235406 0.90331608 0.9065026 0...,uberon
3274,MSUD,distance,"['rictal barbel', 'obsolete UCoD', 'obsolete M...","['uberon_4300288', 'uberon_2005235', 'uberon_2...",[-5.8149786 -5.69393206 -5.68946886 -5.644691...,uberon


In [14]:
# Inspect all rows of one randomly drawn keyword (unseeded, so the keyword
# changes between runs). Columns 2..-2 are exploded together and the rows
# are ordered by 'similarities'.
sampled_keyword = all_tagged_keywords['keyword'].sample(1).iloc[0]
example = all_tagged_keywords[all_tagged_keywords['keyword'] == sampled_keyword]
(
    example
    .explode(column=list(example.columns[2:-1]))
    .sort_values('similarities')
)

Unnamed: 0,keyword,similarity_type,concept_names,ids,similarities,ontology
2468,pax6,distance,"['protein ECM19 (yeast)', 'protein ECM18 (yeas...","['pr_q06011', 'pr_q04623', 'pr_q8bhe3', 'pr_35...",[-5.28561354 -5.23975277 -5.04181767 -5.041636...,pr
2104,pax6,distance,"['Malaxella', 'Gordonia phage Horus', 'bacteri...","['ncbitaxon_871442', 'ncbitaxon_2301696', 'ncb...",[-5.28752041 -5.28111506 -5.2707324 -5.268652...,ncbitaxon
284,pax6,distance,"['PIM6', 'pipacycline', 'Aplotaxene', 'Calaxin...","['chebi_59636', 'chebi_75262', 'chebi_81167', ...",[-5.39859104 -5.33770037 -5.2891717 -5.231288...,chebi
1012,pax6,distance,"['MPF complex', 'Rix1 complex', 'XPC complex',...","['go_31387', 'go_97344', 'go_71942', 'go_2039'...",[-5.46608353 -5.46559954 -5.46224499 -5.400899...,go
2832,pax6,distance,"['DMv5_motif', 'UPD', 'Mat2P', 'priRNA', 'phag...","['so_1159', 'so_1744', 'so_2157', 'so_2022', '...",[-5.58035755 -5.57167339 -5.4996953 -5.420948...,so
3196,pax6,distance,"['obsolete MiD2i', 'adaxial cell', 'pes', 'pla...","['uberon_2005183', 'cl_7016', 'uberon_2387', '...",[-5.66894865 -5.66566706 -5.66474676 -5.604254...,uberon
648,pax6,distance,"['pes joint', 'pes nerve', 'adaxial cell', 'pe...","['uberon_1487', 'uberon_3445', 'cl_7016', 'ube...",[-5.68969393 -5.68506193 -5.66566706 -5.664746...,cl
1376,pax6,distance,"['pons', 'pes joint', 'pes nerve', 'pes', 'pes...","['uberon_988', 'uberon_1487', 'uberon_3445', '...",[-5.69360971 -5.68969393 -5.68506193 -5.664746...,mondo
1740,pax6,distance,"['C-plumbylation', 'N-plumbylation', 'plumbyla...","['mop_5348', 'mop_2348', 'mop_348', 'mop_3348'...",[-5.83728504 -5.81329107 -5.79836798 -5.774922...,mop
1741,pax6,cosine,"['C-plumbylation', 'N-plumbylation', 'plumbyla...","['mop_5348', 'mop_2348', 'mop_348', 'mop_3348'...",[0.87569839 0.87620288 0.87780893 0.87782884 0...,mop


In [36]:
def return_best_concept(x):
    """Return the concept name and score of the highest-'similarities' row of `x`.

    `x` is a frame with 'concept_names' and 'similarities' columns; the result
    is a Series indexed by 'concept' and 'similarity'.
    NOTE: a later cell defines another function under this same name.
    """
    best_position = np.argmax(x['similarities'])
    names = x['concept_names'].to_list()
    scores = x['similarities'].to_list()
    return pd.Series(
        {'concept': names[best_position], 'similarity': scores[best_position]},
        index=['concept', 'similarity'],
    )
# One best concept per (keyword, similarity_type): explode columns 2..-2 into
# one row per candidate concept, then reduce each group with return_best_concept.
best_concepts = (
    all_tagged_keywords
    .explode(column=list(all_tagged_keywords.columns[2:-1]))
    .groupby(['keyword', 'similarity_type'])
    .apply(return_best_concept)
    .reset_index()
)
# Show the 20 highest-scoring pairs below the 0.9999 cut-off
# (presumably meant to drop exact matches - TODO confirm).
(
    best_concepts[best_concepts['similarity'] < 0.9999]
    .sort_values('similarity')[['keyword', 'concept']]
    .tail(20)
)

Unnamed: 0,keyword,concept
0,A7,Aroa
180,hair,skull
94,cancer,Cancer
286,quantitative,Saga
196,individual,single
192,important,Phyllis
318,seizure,scalp
220,mammary,mammary lobe
282,pulmonary,chest
330,slow,direct


In [37]:
def return_best_concept(x):
    """Collect every row of `x` whose 'similarities' equals the group maximum.

    Returns a Series indexed by 'concept', 'id' and 'ontology': the first two
    hold arrays of the tied concept names / ids, the last a tuple of the
    corresponding 'ontology' values.
    """
    scores = x['similarities']
    is_best = scores.values == scores.max()
    best = {
        'concept': x['concept_names'].values[is_best],
        'id': x['ids'].values[is_best],
        'ontology': tuple(x['ontology'].values[is_best]),
    }
    return pd.Series(best, index=['concept', 'id', 'ontology'])
# Best concept(s) per (keyword, similarity_type): explode columns 2..-2 into one
# row per candidate, keep the top-scoring ones per group, then give every tied
# concept/id pair its own row and drop repeated rows.
best_concepts = (
    all_tagged_keywords
    .explode(column=list(all_tagged_keywords.columns[2:-1]))
    .groupby(['keyword', 'similarity_type'])
    .apply(return_best_concept)
    .reset_index()
    .explode(['concept', 'id'])
    .drop_duplicates()
)
best_concepts

Unnamed: 0,keyword,similarity_type,concept,id,ontology
0,A7,cosine,Aroa,ncbitaxon_1706530,"(ncbitaxon,)"
1,A7,distance,Aroa,ncbitaxon_1706530,"(ncbitaxon,)"
2,Abstract,cosine,Absala,ncbitaxon_1430977,"(ncbitaxon,)"
3,Abstract,distance,Absala,ncbitaxon_1430977,"(ncbitaxon,)"
4,Annexin,cosine,Auxin b,chebi_171925,"(chebi,)"
...,...,...,...,...,...
359,volume,distance,drug,chebi_23888,"(chebi, cl, mondo)"
360,vs,cosine,substitute,so_48,"(so,)"
361,vs,distance,substitute,so_48,"(so,)"
362,γ2,cosine,Gammarida,ncbitaxon_1732204,"(ncbitaxon,)"


In [38]:
# Make sure the output directory exists. makedirs with exist_ok=True also
# creates missing parent directories and does not fail if the directory
# appears between a check and the creation.
os.makedirs(emb_tagger_output, exist_ok=True)

In [39]:
# For every LDA / BERTopic result file, attach the best ontology concept to each
# topic keyword and write one CSV per similarity type.
for keyword_file in glob.glob(lda_input) + glob.glob(bertopic_input):
    basename = os.path.basename(keyword_file).replace('.csv', '')
    method = 'bertopic' if 'bertopic' in keyword_file else 'lda'

    # Reading and reshaping the file does not depend on the similarity type,
    # so it is done once per file instead of once per (file, sim_type).
    keywords_df = pd.read_csv(keyword_file)
    # SECURITY NOTE: eval executes whatever text is stored in 'topic_keywords'.
    # Only run this on result files you produced yourself; consider
    # ast.literal_eval if the column is known to hold plain literals.
    keywords_df['topic_keywords'] = keywords_df['topic_keywords'].apply(eval)
    # Each entry is iterated as (keyword, prob) pairs; keep only the keywords.
    keywords_df['keyword'] = keywords_df['topic_keywords'].apply(lambda x: [keyword for keyword, prob in x])
    keywords_df = keywords_df.drop(['Unnamed: 0', 'topic_number', 'topic_probs', 'topic_keywords'], axis=1, errors='ignore')
    keywords_df = keywords_df.explode('keyword')

    for sim_type in ['cosine', 'distance']:
        # Left merge (on the columns both frames share) keeps keywords that
        # have no tagged concept.
        tagged_df = keywords_df.merge(best_concepts[best_concepts['similarity_type'] == sim_type], how='left')
        print(method)
        tagged_df.to_csv(os.path.join(emb_tagger_output, f'{basename}_{method}_{sim_type}_tagged.csv'), index=False)

lda
lda
lda
lda
bertopic
bertopic
bertopic
bertopic
