In [1]:
import pandas as pd 

In [2]:
import numpy as np
import glob
import os

In [3]:
emb_glob_regex = '../../0.RESULTS/embeddings/*_labels.csv'

In [4]:
def precision(TP, FP): 
    return TP/(TP+FP)

def recall(TP, FN): 
    return TP/(TP+FN)

def F1(precision, recall):
    return 2/(1/precision + 1/recall)

In [5]:
def return_TP_FN_FP(keywords, ground_truth):
    keywords = set(keywords)
    ground_truth = set(ground_truth)
    TP = len(keywords.intersection(ground_truth) )
    FN = len(keywords.difference(ground_truth))
    FP = len(ground_truth.difference(keywords))
    return np.array([TP, FN, FP])

In [6]:
def explode_list(pmid, pairs):
    result = []
    for pair in pairs:
        result.append((pmid, pair[0], pair[1]))
    return result

def explode_df(df): 
    result = []
    for i, row in df.iterrows(): 
        result += explode_list(row['PMID'], eval(row['ncbo_annotations_pairs']))
    return pd.DataFrame(result, columns=['PMID', 'keyword', 'tag'])

In [7]:
def process_id(id_): 
    if ':' in id_:
        id_ = id_.split(':')
    else: 
        id_ = id_.split('_')
    try:
        id_[1] = int(id_[1])
    except (ValueError, IndexError) as e: 
        pass
    return tuple(id_)

In [8]:
annotations = []
for file in glob.glob('../0.RESULTS/preprocessing/*/annotations.csv'): 
    onto = os.path.basename(os.path.dirname(file))
    if 'GO_' in onto:
        onto = 'GO'
    tmp = pd.read_csv(file)
    tmp['ontology'] = onto.lower()
    annotations.append(tmp)
annotations = pd.concat(annotations).drop(['Unnamed: 0','StartIndex','EndIndex'],axis=1)\
    .rename({'EntityID': 'id'}, axis=1)
annotations['id'] = annotations['id'].apply(lambda x: x.lower()).apply(process_id)
annotations.head()

Unnamed: 0,MentionTextSegment,id,PMID,ontology
0,biological pigment,"(chebi, 26130)",11532192,chebi
1,cation,"(chebi, 36916)",11897010,chebi
2,ion,"(chebi, 24870)",11897010,chebi
3,chromogenic compound,"(chebi, 75050)",12079497,chebi
4,electron,"(chebi, 10545)",12546709,chebi


In [9]:
set(annotations['id'].apply(lambda x: x[0]).drop_duplicates())

{'chebi', 'cl', 'go', 'mop', 'ncbitaxon', 'pr', 'so', 'uberon'}

In [10]:
all_tags = pd.concat([pd.read_csv(file) for file in glob.glob('../0.RESULTS/emb_tagger/*')[1:]])
all_tags[all_tags['id'].isin([
'1058',
'8620',
'BFO_0000023',
'BFO_0000034',
'CARO_0000013',
'ENVO_01000739',
'FOODON_00003004',
'MGI:1890498',
'MONDO_0021140',
'MONDO_0021152',
'NCBITaxon_family', 
'UPHENO_0001001',
'WBGene00003937',
'WBGene00004950',
'WBGene00006560',
'WBGene00011828',
'main.html?id=1591873',
'main.html?id=2666',
'main.html?id=620991'])]

Unnamed: 0,PMID,keyword,similarity_type,concept,id,ontology,keywords


In [11]:
annotations_groupped = annotations.groupby('PMID').apply(lambda x: 
                                 pd.Series({'true_ids': set(x['id'])}, index=['true_ids'])).reset_index()

In [12]:
pd.set_option('display.max_colwidth', None)

In [13]:
stats = []
for file in glob.glob('../0.RESULTS/emb_tagger/*'):
    tagged = pd.read_csv(file)
    tagged['id'] = tagged['id'].apply(lambda x: x.lower()).apply(process_id)
    tagged_groupped = tagged.groupby('PMID').apply(lambda x: 
                                 pd.Series({'pred_ids': set(x['id'])}, index=['pred_ids'])).reset_index()
    merged = tagged_groupped.merge(annotations_groupped,how='outer')
    
    
    TP_FN_FP_ids = np.zeros((3,))
    for i, row in merged.iterrows(): 
        TP_FN_FP_ids += return_TP_FN_FP(row['pred_ids'], row['true_ids'])
    
    precision_ = precision(TP_FN_FP_ids[0],TP_FN_FP_ids[2])
    recall_ = recall(TP_FN_FP_ids[0], TP_FN_FP_ids[1])
    stats.append((os.path.basename(file), precision_*100, recall_*100, 100*F1(precision_, recall_)))
pd.DataFrame(stats, columns=['file', 'precision', 'recall', 'F1'])

Unnamed: 0,file,precision,recall,F1
0,bertopic_lemmatize_nostopwords_data_2023-01-17_23-06-45_keywords_10_min_topic_size_3_bertopic_cosine_tagged.csv,4.124802,8.210526,5.491024
1,bertopic_lemmatize_nostopwords_data_2023-01-17_23-06-45_keywords_10_min_topic_size_3_bertopic_distance_tagged.csv,4.442094,8.493428,5.833333
2,bertopic_lemmatize_nostopwords_data_2023-01-19_20-23-22_keywords_22_min_topic_size_6_bertopic_cosine_tagged.csv,7.350608,6.857425,7.095457
3,bertopic_lemmatize_nostopwords_data_2023-01-19_20-23-22_keywords_22_min_topic_size_6_bertopic_distance_tagged.csv,7.350608,6.790425,7.059421
4,lda_results_2023-01-17_22-31-53_keywords_10_topics_9_lda_cosine_tagged.csv,4.918033,9.707724,6.528607
5,lda_results_2023-01-17_22-31-53_keywords_10_topics_9_lda_distance_tagged.csv,5.182443,9.89899,6.803193
6,lda_results_2023-01-19_20-01-44_keywords_22_topics_7_lda_cosine_tagged.csv,5.552618,5.381855,5.465903
7,lda_results_2023-01-19_20-01-44_keywords_22_topics_7_lda_distance_tagged.csv,5.817028,5.522088,5.665722


In [14]:
ontologies_mappings = pd.concat([pd.read_csv(file) for file in glob.glob('../0.RESULTS/preprocessing/_ontologies_mappings/*')])

In [15]:
all_ncbo_concepts_used = pd.concat([pd.read_csv(file) for file in glob.glob('../0.RESULTS/disambiguation/*')])
text_to_id = dict()
for elem in sum(all_ncbo_concepts_used['ncbo_annotations'].apply(eval), []):
    id_ = elem['annotatedClass']['@id'].split('/')[-1]
    for annotation in elem['annotations']:
        text_to_id[annotation['text'].lower()] = id_
all_ncbo_concepts_used = {elem['annotatedClass']['@id'].split('/')[-1] for elem in sum(all_ncbo_concepts_used['ncbo_annotations'].apply(eval), [])}

In [16]:
filtered_concepts = ontologies_mappings[ontologies_mappings['id'].isin(all_ncbo_concepts_used)].drop_duplicates()

In [17]:
all_ncbo_concepts_used.difference(filtered_concepts['id'])

{'T008', 'T016', 'T025', 'T126', 'T192'}

In [18]:
ontologies_mappings['label'] = ontologies_mappings['label'].apply(lambda x: x.lower())
label_to_ids = ontologies_mappings.groupby('label').apply(lambda x: list(set(x['id']))).reset_index().rename({0: 'ids'}, axis=1)
label_to_ids = dict(zip(label_to_ids['label'], label_to_ids['ids']))


In [19]:
def process_id(id_): 
    if ':' in id_:
        id_ = id_.split(':')
    else: 
        id_ = id_.split('_')
    try:
        id_[1] = int(id_[1])
    except (ValueError, IndexError) as e: 
        pass
    return tuple(id_)

In [20]:
stats = stats[:8]
for file in glob.glob('../0.RESULTS/disambiguation/*'):
    dis = pd.read_csv(file) 
    dis['disambiguation_best_concept'] = dis['disambiguation_best_concept'].apply(eval).apply(lambda x: [v[0].lower() for v in x.values()])
    dis = dis[['PMID', 'disambiguation_best_concept']].explode('disambiguation_best_concept')\
        .rename({'disambiguation_best_concept': 'label'}, axis=1)
    
    dis['id'] = dis['label'].map(label_to_ids)
    print(dis.count())
    dis['id'] = dis['id'].apply(lambda x: x if x == x else [])

    dis['id'] = dis['id'].apply(lambda x: [process_id(elem.lower()) for elem in x])
    dis = dis.groupby('PMID').apply(lambda x: 
                     pd.Series({'pred_ids': set(sum(x['id'], []))}, index=['pred_ids'])).reset_index()
    merged = dis.merge(annotations_groupped,on=['PMID'], how='outer')
    
    TP_FN_FP_ids = np.zeros((3,))
    for i, row in merged.iterrows(): 
        TP_FN_FP_ids += return_TP_FN_FP(row['pred_ids'], row['true_ids'])
    
    precision_ = precision(TP_FN_FP_ids[0],TP_FN_FP_ids[2])
    recall_ = recall(TP_FN_FP_ids[0], TP_FN_FP_ids[1])
    stats.append((os.path.basename(file), precision_, recall_, F1(precision_, recall_)))
pd.set_option('display.max_colwidth', None)
pd.DataFrame(stats, columns=['file', 'precision', 'recall', 'F1'])

PMID     1338
label    1338
id       1338
dtype: int64
PMID     1338
label    1338
id       1338
dtype: int64
PMID     1338
label    1338
id       1229
dtype: int64
PMID     1338
label    1338
id       1338
dtype: int64
PMID     1237
label    1237
id       1237
dtype: int64
PMID     1237
label    1237
id       1237
dtype: int64
PMID     1237
label    1237
id       1152
dtype: int64
PMID     1237
label    1237
id       1237
dtype: int64
PMID     710
label    710
id       705
dtype: int64


  return 2/(1/precision + 1/recall)


PMID     710
label    710
id       705
dtype: int64


  return 2/(1/precision + 1/recall)


PMID     710
label    710
id       705
dtype: int64
PMID     710
label    710
id       705
dtype: int64


  return 2/(1/precision + 1/recall)


PMID     677
label    677
id       677
dtype: int64


  return 2/(1/precision + 1/recall)


PMID     677
label    677
id       677
dtype: int64


  return 2/(1/precision + 1/recall)


PMID     677
label    677
id       607
dtype: int64
PMID     677
label    677
id       677
dtype: int64


  return 2/(1/precision + 1/recall)


Unnamed: 0,file,precision,recall,F1
0,bertopic_lemmatize_nostopwords_data_2023-01-17_23-06-45_keywords_10_min_topic_size_3_bertopic_cosine_tagged.csv,4.124802,8.210526,5.491024
1,bertopic_lemmatize_nostopwords_data_2023-01-17_23-06-45_keywords_10_min_topic_size_3_bertopic_distance_tagged.csv,4.442094,8.493428,5.833333
2,bertopic_lemmatize_nostopwords_data_2023-01-19_20-23-22_keywords_22_min_topic_size_6_bertopic_cosine_tagged.csv,7.350608,6.857425,7.095457
3,bertopic_lemmatize_nostopwords_data_2023-01-19_20-23-22_keywords_22_min_topic_size_6_bertopic_distance_tagged.csv,7.350608,6.790425,7.059421
4,lda_results_2023-01-17_22-31-53_keywords_10_topics_9_lda_cosine_tagged.csv,4.918033,9.707724,6.528607
5,lda_results_2023-01-17_22-31-53_keywords_10_topics_9_lda_distance_tagged.csv,5.182443,9.89899,6.803193
6,lda_results_2023-01-19_20-01-44_keywords_22_topics_7_lda_cosine_tagged.csv,5.552618,5.381855,5.465903
7,lda_results_2023-01-19_20-01-44_keywords_22_topics_7_lda_distance_tagged.csv,5.817028,5.522088,5.665722
8,bertopic_keywords_22_min_topic_size_6_ncbo_2023_01_19_22_02_16_no_sorting_no_weighting_no_forcing.csv,0.021153,0.099256,0.034874
9,bertopic_keywords_22_min_topic_size_6_ncbo_2023_01_19_22_02_16_sorting_no_weighting_no_forcing.csv,0.013749,0.067885,0.022867


In [24]:
results = pd.DataFrame(stats, columns=['file', 'precision', 'recall', 'F1'])

In [25]:
results.to_csv('../0.RESULTS/results.csv', index=False)

In [30]:
for col in results.columns[1:]: 
    results[col] = results[col].apply(lambda x: round(x, 2))

In [31]:
results[results['file'].apply(lambda x: 'no_forcing' not in x)].to_csv('../0.RESULTS/results_without_no_forcing.csv', index=False)