In [117]:
import pandas as pd 

In [2]:
import numpy as np
import glob
import os

In [119]:
emb_glob_regex = '../../0.RESULTS/embeddings/*_labels.csv'

In [120]:
def precision(TP, FP): 
    return TP/(TP+FP)

def recall(TP, FN): 
    return TP/(TP+FN)

def F1(precision, recall):
    return 2/(1/precision + 1/recall)

In [121]:
def return_TP_FN_FP(keywords, ground_truth):
    keywords = set(keywords)
    ground_truth = set(ground_truth)
    TP = len(keywords.intersection(ground_truth) )
    FN = len(keywords.difference(ground_truth))
    FP = len(ground_truth.difference(keywords))
    return np.array([TP, FN, FP])

In [122]:
def explode_list(pmid, pairs):
    result = []
    for pair in pairs:
        result.append((pmid, pair[0], pair[1]))
    return result

def explode_df(df): 
    result = []
    for i, row in df.iterrows(): 
        result += explode_list(row['PMID'], eval(row['ncbo_annotations_pairs']))
    return pd.DataFrame(result, columns=['PMID', 'keyword', 'tag'])

In [123]:
def process_id(id_): 
    if ':' in id_:
        id_ = id_.split(':')
    else: 
        id_ = id_.split('_')
    try:
        id_[1] = int(id_[1])
    except (ValueError, IndexError) as e: 
        pass
    return tuple(id_)

In [124]:
annotations = []
for file in glob.glob('../0.RESULTS/preprocessing/*/annotations.csv'): 
    onto = os.path.basename(os.path.dirname(file))
    if 'GO_' in onto:
        onto = 'GO'
    tmp = pd.read_csv(file)
    tmp['ontology'] = onto.lower()
    annotations.append(tmp)
annotations = pd.concat(annotations).drop(['Unnamed: 0','StartIndex','EndIndex'],axis=1)\
    .rename({'EntityID': 'id'}, axis=1)
annotations['id'] = annotations['id'].apply(lambda x: x.lower()).apply(process_id)
annotations.head()

Unnamed: 0,MentionTextSegment,id,PMID,ontology
0,biological pigment,"(chebi, 26130)",11532192,chebi
1,cation,"(chebi, 36916)",11897010,chebi
2,ion,"(chebi, 24870)",11897010,chebi
3,chromogenic compound,"(chebi, 75050)",12079497,chebi
4,electron,"(chebi, 10545)",12546709,chebi


In [125]:
set(annotations['id'].apply(lambda x: x[0]).drop_duplicates())

{'chebi', 'cl', 'go', 'mop', 'ncbitaxon', 'pr', 'so', 'uberon'}

In [126]:
all_tags = pd.concat([pd.read_csv(file) for file in glob.glob('../CRAFT/results/emb_tagger/*')])
all_tags[all_tags['id'].isin([
'1058',
'8620',
'BFO_0000023',
'BFO_0000034',
'CARO_0000013',
'ENVO_01000739',
'FOODON_00003004',
'MGI:1890498',
'MONDO_0021140',
'MONDO_0021152',
'NCBITaxon_family', 
'UPHENO_0001001',
'WBGene00003937',
'WBGene00004950',
'WBGene00006560',
'WBGene00011828',
'main.html?id=1591873',
'main.html?id=2666',
'main.html?id=620991'])]

Unnamed: 0,PMID,keyword,concept,id,ontology


In [128]:
annotations_groupped = annotations.groupby('PMID').apply(lambda x: 
                                 pd.Series({'true_ids': set(x['id'])}, index=['true_ids'])).reset_index()

In [129]:
stats = []
for file in glob.glob('../0.RESULTS/emb_tagger/*'):
    tagged = pd.read_csv(file)
    tagged['id'] = tagged['id'].apply(lambda x: x.lower()).apply(process_id)
    tagged_groupped = tagged.groupby('PMID').apply(lambda x: 
                                 pd.Series({'pred_ids': set(x['id'])}, index=['pred_ids'])).reset_index()
    merged = tagged_groupped.merge(annotations_groupped,how='outer')
    
    
    TP_FN_FP_ids = np.zeros((3,))
    for i, row in merged.iterrows(): 
        TP_FN_FP_ids += return_TP_FN_FP(row['pred_ids'], row['true_ids'])
    
    precision_ = precision(TP_FN_FP_ids[0],TP_FN_FP_ids[2])
    recall_ = recall(TP_FN_FP_ids[0], TP_FN_FP_ids[1])
    stats.append((os.path.basename(file), precision_*100, recall_*100, 100*F1(precision_, recall_)))
pd.DataFrame(stats, columns=['file', 'precision', 'recall', 'F1'])

Unnamed: 0,file,precision,recall,F1
0,bertopic_tagged.csv,5.711264,11.933702,7.725322
1,lda_tagged.csv,4.019038,8.315098,5.418895


In [14]:
ontologies_mappings = pd.concat([pd.read_csv(file) for file in glob.glob('../CRAFT/data/ontologies_mappings/*')])

In [15]:
all_ncbo_concepts_used = pd.concat([pd.read_csv(file) for file in glob.glob('../CRAFT/results/disambiguation/*')])
text_to_id = dict()
for elem in sum(all_ncbo_concepts_used['ncbo_annotations'].apply(eval), []):
    id_ = elem['annotatedClass']['@id'].split('/')[-1]
    for annotation in elem['annotations']:
        text_to_id[annotation['text'].lower()] = id_
all_ncbo_concepts_used = {elem['annotatedClass']['@id'].split('/')[-1] for elem in sum(all_ncbo_concepts_used['ncbo_annotations'].apply(eval), [])}

In [16]:
filtered_concepts = ontologies_mappings[ontologies_mappings['id'].isin(all_ncbo_concepts_used)].drop_duplicates()

In [17]:
all_ncbo_concepts_used.difference(filtered_concepts['id'])

{'T016', 'T025', 'T192'}

In [72]:
ontologies_mappings['label'] = ontologies_mappings['label'].apply(lambda x: x.lower())
label_to_ids = ontologies_mappings.groupby('label').apply(lambda x: list(set(x['id']))).reset_index().rename({0: 'ids'}, axis=1)
label_to_ids = dict(zip(label_to_ids['label'], label_to_ids['ids']))


In [105]:
def process_id(id_): 
    if ':' in id_:
        id_ = id_.split(':')
    else: 
        id_ = id_.split('_')
    try:
        id_[1] = int(id_[1])
    except (ValueError, IndexError) as e: 
        pass
    return tuple(id_)

In [116]:
pd.read_csv(file)['disambiguation_best_concept'].apply(eval).iloc[0]

{'CELL': ['BIOLOGICAL ENTITY'],
 'MOUSE': ['MATERIAL ENTITY'],
 'EMBRYO': ['MATERIAL ENTITY'],
 'EMBRYONIC': ['MATERIAL ENTITY'],
 'DNA': ['MATERIAL ENTITY'],
 'GENE': ['BIOLOGICAL_REGION']}

In [114]:
stats = []
for file in glob.glob('../0.RESULTS/disambiguation/*'):
    dis = pd.read_csv(file) 
    dis['disambiguation_best_concept'] = dis['disambiguation_best_concept'].apply(eval).apply(lambda x: [v[0].lower() for v in x.values()])
    dis = dis[['PMID', 'disambiguation_best_concept']].explode('disambiguation_best_concept')\
        .rename({'disambiguation_best_concept': 'label'}, axis=1)
    
    dis['id'] = dis['label'].map(label_to_ids)
    print(dis.count())
    dis['id'] = dis['id'].apply(lambda x: x if x == x else [])

    dis['id'] = dis['id'].apply(lambda x: [process_id(elem.lower()) for elem in x])
    dis = dis.groupby('PMID').apply(lambda x: 
                     pd.Series({'pred_ids': set(sum(x['id'], []))}, index=['pred_ids'])).reset_index()
    merged = dis.merge(annotations_groupped,on=['PMID'], how='outer')
    
    TP_FN_FP_ids = np.zeros((3,))
    for i, row in merged.iterrows(): 
        TP_FN_FP_ids += return_TP_FN_FP(row['pred_ids'], row['true_ids'])
    
    precision_ = precision(TP_FN_FP_ids[0],TP_FN_FP_ids[2])
    recall_ = recall(TP_FN_FP_ids[0], TP_FN_FP_ids[1])
    stats.append((os.path.basename(file), precision_, recall_, F1(precision_, recall_)))
pd.set_option('display.max_colwidth', None)
pd.DataFrame(stats, columns=['file', 'precision', 'recall', 'F1'])

PMID     710
label    710
id       705
dtype: int64


  return 2/(1/precision + 1/recall)


PMID     710
label    710
id       705
dtype: int64
PMID     710
label    710
id       705
dtype: int64


  return 2/(1/precision + 1/recall)


PMID     710
label    710
id       705
dtype: int64


  return 2/(1/precision + 1/recall)


PMID     710
label    710
id       705
dtype: int64
PMID     589
label    589
id       559
dtype: int64


  return 2/(1/precision + 1/recall)


PMID     589
label    589
id       559
dtype: int64


  return 2/(1/precision + 1/recall)


PMID     589
label    589
id       559
dtype: int64


  return 2/(1/precision + 1/recall)


Unnamed: 0,file,precision,recall,F1
0,bertopic_ncbo_fulldata_lemmatize_nostopwords_data_2023_01_03_18_02_32_train_sorted.csv,0.0,0.0,0.0
1,bertopic_ncbo_fulldata_lemmatize_nostopwords_data_2023_01_03_18_02_32_weigthed_and_sorted.csv,0.001058,0.004193,0.001689
2,bertopic_ncbo_fulldata_lemmatize_nostopwords_data_2023_01_04_17_51_30_no_sorted_no_weighted.csv,0.0,0.0,0.0
3,bertopic_ncbo_fulldata_lemmatize_nostopwords_data_2023_01_04_17_51_30_sorted_no_weighted.csv,0.0,0.0,0.0
4,bertopic_ncbo_fulldata_lemmatize_nostopwords_data_2023_01_04_17_51_30_weighted_and_sorted.csv,0.001058,0.004193,0.001689
5,lda_ncbo_fulldata_lemmatize_nostopwords_data_2023_01_04_17_37_13_no_sorted_no_weighted.csv,0.0,0.0,0.0
6,lda_ncbo_fulldata_lemmatize_nostopwords_data_2023_01_04_17_37_13_sorted_no_weighted.csv,0.0,0.0,0.0
7,lda_ncbo_fulldata_lemmatize_nostopwords_data_2023_01_04_17_37_13_weighted_and_sorted.csv,0.0,0.0,0.0


In [24]:
stats = []
for file in glob.glob('../0.RESULTS/disambiguation/*'):
    dis = pd.read_csv(file) 
    dis['disambiguation_best_concept'] = dis['disambiguation_best_concept'].apply(eval).apply(lambda x: [k.lower() for k in x.keys()])
    dis = dis[['PMID', 'disambiguation_best_concept']].explode('disambiguation_best_concept')\
        .rename({'disambiguation_best_concept': 'label'}, axis=1)
    dis['id'] = dis['label'].map(text_to_id).apply(lambda x: process_id(x.lower()))
    dis = dis.groupby('PMID').apply(lambda x: 
                     pd.Series({'pred_ids': set(x['id'])}, index=['pred_ids'])).reset_index()
    merged = dis.merge(annotations_groupped,on=['PMID'], how='outer')
    
    TP_FN_FP_ids = np.zeros((3,))
    for i, row in merged.iterrows(): 
        TP_FN_FP_ids += return_TP_FN_FP(row['pred_ids'], row['true_ids'])
    
    precision_ = precision(TP_FN_FP_ids[0],TP_FN_FP_ids[2])
    recall_ = recall(TP_FN_FP_ids[0], TP_FN_FP_ids[1])
    stats.append((os.path.basename(file), precision_, recall_, F1(precision_, recall_)))
pd.set_option('display.max_colwidth', None)
pd.DataFrame(stats, columns=['file', 'precision', 'recall', 'F1'])

Unnamed: 0,file,precision,recall,F1
0,bertopic_ncbo_fulldata_lemmatize_nostopwords_data_2023_01_03_18_02_32_train_sorted.csv,0.052882,0.140845,0.076894
1,bertopic_ncbo_fulldata_lemmatize_nostopwords_data_2023_01_03_18_02_32_weigthed_and_sorted.csv,0.052882,0.140845,0.076894
2,bertopic_ncbo_fulldata_lemmatize_nostopwords_data_2023_01_04_17_51_30_no_sorted_no_weighted.csv,0.052882,0.140845,0.076894
3,bertopic_ncbo_fulldata_lemmatize_nostopwords_data_2023_01_04_17_51_30_sorted_no_weighted.csv,0.052882,0.140845,0.076894
4,bertopic_ncbo_fulldata_lemmatize_nostopwords_data_2023_01_04_17_51_30_weighted_and_sorted.csv,0.052882,0.140845,0.076894
5,lda_ncbo_fulldata_lemmatize_nostopwords_data_2023_01_04_17_37_13_no_sorted_no_weighted.csv,0.042834,0.137521,0.065323
6,lda_ncbo_fulldata_lemmatize_nostopwords_data_2023_01_04_17_37_13_sorted_no_weighted.csv,0.042834,0.137521,0.065323
7,lda_ncbo_fulldata_lemmatize_nostopwords_data_2023_01_04_17_37_13_weighted_and_sorted.csv,0.042834,0.137521,0.065323
