In [117]:
import pandas as pd 

In [118]:
import numpy as np
import glob
import os

In [119]:
emb_glob_regex = '../results/embeddings/concepts_embeddings_*_biobert.csv'

In [120]:
def precision(TP, FP): 
    return TP/(TP+FP)

def recall(TP, FN): 
    return TP/(TP+FN)

def F1(precision, recall):
    return 2/(1/precision + 1/recall)

In [121]:
def return_TP_FN_FP(keywords, ground_truth):
    keywords = set(keywords)
    ground_truth = set(ground_truth)
    TP = len(keywords.intersection(ground_truth) )
    FN = len(keywords.difference(ground_truth))
    FP = len(ground_truth.difference(keywords))
    return np.array([TP, FN, FP])

In [122]:
def explode_list(pmid, pairs):
    result = []
    for pair in pairs:
        result.append((pmid, pair[0], pair[1]))
    return result

def explode_df(df): 
    result = []
    for i, row in df.iterrows(): 
        result += explode_list(row['PMID'], eval(row['ncbo_annotations_pairs']))
    return pd.DataFrame(result, columns=['PMID', 'keyword', 'tag'])

In [123]:
def process_id(id_): 
    if ':' in id_:
        id_ = id_.split(':')
    else: 
        id_ = id_.split('_')
    try:
        id_[1] = int(id_[1])
    except (ValueError, IndexError) as e: 
        pass
    return tuple(id_)

In [124]:
annotations = []
for file in glob.glob('../CRAFT/data/*/annotations.csv'): 
    onto = os.path.basename(os.path.dirname(file))
    if 'GO_' in onto:
        onto = 'GO'
    tmp = pd.read_csv(file)
    tmp['ontology'] = onto.lower()
    annotations.append(tmp)
annotations = pd.concat(annotations).drop(['Unnamed: 0','StartIndex','EndIndex'],axis=1)\
    .rename({'EntityID': 'id'}, axis=1)
annotations['id'] = annotations['id'].apply(lambda x: x.lower()).apply(process_id)
annotations.head()

Unnamed: 0,MentionTextSegment,id,PMID,ontology
0,biological pigment,"(chebi, 26130)",11532192,chebi
1,cation,"(chebi, 36916)",11897010,chebi
2,ion,"(chebi, 24870)",11897010,chebi
3,chromogenic compound,"(chebi, 75050)",12079497,chebi
4,electron,"(chebi, 10545)",12546709,chebi


In [125]:
set(annotations['id'].apply(lambda x: x[0]).drop_duplicates())

{'chebi', 'cl', 'go', 'mop', 'ncbitaxon', 'pr', 'so', 'uberon'}

In [126]:
all_tags = pd.concat([pd.read_csv(file) for file in glob.glob('../CRAFT/results/emb_tagger/*')])
all_tags[all_tags['id'].isin([
'1058',
'8620',
'BFO_0000023',
'BFO_0000034',
'CARO_0000013',
'ENVO_01000739',
'FOODON_00003004',
'MGI:1890498',
'MONDO_0021140',
'MONDO_0021152',
'NCBITaxon_family', 
'UPHENO_0001001',
'WBGene00003937',
'WBGene00004950',
'WBGene00006560',
'WBGene00011828',
'main.html?id=1591873',
'main.html?id=2666',
'main.html?id=620991'])]

Unnamed: 0,PMID,keyword,concept,id,ontology


In [127]:
set(pd.concat([pd.read_csv(file) for file in glob.glob('../CRAFT/results/emb_tagger/*')])['id'])

{'chebi_134360',
 'chebi_15854',
 'chebi_18243',
 'chebi_190453',
 'chebi_190456',
 'chebi_194078',
 'chebi_22586',
 'chebi_23888',
 'chebi_24621',
 'chebi_36080',
 'chebi_48706',
 'chebi_50906',
 'chebi_52217',
 'chebi_5391',
 'chebi_60004',
 'chebi_75710',
 'chebi_86222',
 'cl_0',
 'cl_210',
 'go_120302',
 'go_23052',
 'go_30496',
 'go_31099',
 'go_36166',
 'go_40007',
 'go_46903',
 'go_5634',
 'go_5874',
 'go_725',
 'go_7612',
 'go_7613',
 'go_785',
 'ncbitaxon_1088410',
 'ncbitaxon_114188',
 'ncbitaxon_1265449',
 'ncbitaxon_1430977',
 'ncbitaxon_155026',
 'ncbitaxon_1655545',
 'ncbitaxon_1732204',
 'ncbitaxon_1768868',
 'ncbitaxon_1869227',
 'ncbitaxon_274080',
 'ncbitaxon_32644',
 'ncbitaxon_336906',
 'ncbitaxon_364787',
 'ncbitaxon_448188',
 'ncbitaxon_family',
 'pr_1',
 'pr_13716',
 'pr_15039',
 'pr_44676',
 'pr_50012',
 'pr_60049',
 'pr_q8vi36',
 'so_1023',
 'so_167',
 'so_2033',
 'so_352',
 'so_667',
 'so_704',
 'so_783',
 'so_856',
 'so_900',
 'so_984',
 'uberon_1021',
 'uber

In [128]:
annotations_groupped = annotations.groupby('PMID').apply(lambda x: 
                                 pd.Series({'true_ids': set(x['id'])}, index=['true_ids'])).reset_index()

In [129]:
stats = []
for file in glob.glob('../CRAFT/results/emb_tagger/*'):
    tagged = pd.read_csv(file)
    tagged['id'] = tagged['id'].apply(lambda x: x.lower()).apply(process_id)
    tagged_groupped = tagged.groupby('PMID').apply(lambda x: 
                                 pd.Series({'pred_ids': set(x['id'])}, index=['pred_ids'])).reset_index()
    merged = tagged_groupped.merge(annotations_groupped,how='outer')
    
    
    TP_FN_FP_ids = np.zeros((3,))
    for i, row in merged.iterrows(): 
        TP_FN_FP_ids += return_TP_FN_FP(row['pred_ids'], row['true_ids'])
    
    precision_ = precision(TP_FN_FP_ids[0],TP_FN_FP_ids[2])
    recall_ = recall(TP_FN_FP_ids[0], TP_FN_FP_ids[1])
    stats.append((os.path.basename(file), precision_*100, recall_*100, 100*F1(precision_, recall_)))
pd.DataFrame(stats, columns=['file', 'precision', 'recall', 'F1'])

Unnamed: 0,file,precision,recall,F1
0,bertopic_tagged.csv,5.711264,11.933702,7.725322
1,lda_tagged.csv,4.019038,8.315098,5.418895


In [14]:
ontologies_mappings = pd.concat([pd.read_csv(file) for file in glob.glob('../CRAFT/data/ontologies_mappings/*')])

In [15]:
all_ncbo_concepts_used = pd.concat([pd.read_csv(file) for file in glob.glob('../CRAFT/results/disambiguation/*')])
text_to_id = dict()
for elem in sum(all_ncbo_concepts_used['ncbo_annotations'].apply(eval), []):
    id_ = elem['annotatedClass']['@id'].split('/')[-1]
    for annotation in elem['annotations']:
        text_to_id[annotation['text'].lower()] = id_
all_ncbo_concepts_used = {elem['annotatedClass']['@id'].split('/')[-1] for elem in sum(all_ncbo_concepts_used['ncbo_annotations'].apply(eval), [])}

In [16]:
filtered_concepts = ontologies_mappings[ontologies_mappings['id'].isin(all_ncbo_concepts_used)].drop_duplicates()

In [17]:
all_ncbo_concepts_used.difference(filtered_concepts['id'])

{'T016', 'T025', 'T192'}

In [72]:
ontologies_mappings['label'] = ontologies_mappings['label'].apply(lambda x: x.lower())
label_to_ids = ontologies_mappings.groupby('label').apply(lambda x: list(set(x['id']))).reset_index().rename({0: 'ids'}, axis=1)
label_to_ids = dict(zip(label_to_ids['label'], label_to_ids['ids']))


In [105]:
def process_id(id_): 
    if ':' in id_:
        id_ = id_.split(':')
    else: 
        id_ = id_.split('_')
    try:
        id_[1] = int(id_[1])
    except (ValueError, IndexError) as e: 
        pass
    return tuple(id_)

In [116]:
pd.read_csv(file)['disambiguation_best_concept'].apply(eval).iloc[0]

{'CELL': ['BIOLOGICAL ENTITY'],
 'MOUSE': ['MATERIAL ENTITY'],
 'EMBRYO': ['MATERIAL ENTITY'],
 'EMBRYONIC': ['MATERIAL ENTITY'],
 'DNA': ['MATERIAL ENTITY'],
 'GENE': ['BIOLOGICAL_REGION']}

In [114]:
stats = []
for file in glob.glob('../CRAFT/results/disambiguation/*'):
    dis = pd.read_csv(file) 
    dis['disambiguation_best_concept'] = dis['disambiguation_best_concept'].apply(eval).apply(lambda x: [v[0].lower() for v in x.values()])
    dis = dis[['PMID', 'disambiguation_best_concept']].explode('disambiguation_best_concept')\
        .rename({'disambiguation_best_concept': 'label'}, axis=1)
    
    dis['id'] = dis['label'].map(label_to_ids)
    print(dis.count())
    dis['id'] = dis['id'].apply(lambda x: x if x == x else [])

    dis['id'] = dis['id'].apply(lambda x: [process_id(elem.lower()) for elem in x])
    dis = dis.groupby('PMID').apply(lambda x: 
                     pd.Series({'pred_ids': set(sum(x['id'], []))}, index=['pred_ids'])).reset_index()
    merged = dis.merge(annotations_groupped,on=['PMID'], how='outer')
    
    TP_FN_FP_ids = np.zeros((3,))
    for i, row in merged.iterrows(): 
        TP_FN_FP_ids += return_TP_FN_FP(row['pred_ids'], row['true_ids'])
    
    precision_ = precision(TP_FN_FP_ids[0],TP_FN_FP_ids[2])
    recall_ = recall(TP_FN_FP_ids[0], TP_FN_FP_ids[1])
    stats.append((os.path.basename(file), precision_, recall_, F1(precision_, recall_)))
pd.set_option('display.max_colwidth', None)
pd.DataFrame(stats, columns=['file', 'precision', 'recall', 'F1'])

PMID     710
label    710
id       705
dtype: int64


  return 2/(1/precision + 1/recall)


PMID     710
label    710
id       705
dtype: int64
PMID     710
label    710
id       705
dtype: int64


  return 2/(1/precision + 1/recall)


PMID     710
label    710
id       705
dtype: int64


  return 2/(1/precision + 1/recall)


PMID     710
label    710
id       705
dtype: int64
PMID     589
label    589
id       559
dtype: int64


  return 2/(1/precision + 1/recall)


PMID     589
label    589
id       559
dtype: int64


  return 2/(1/precision + 1/recall)


PMID     589
label    589
id       559
dtype: int64


  return 2/(1/precision + 1/recall)


Unnamed: 0,file,precision,recall,F1
0,bertopic_ncbo_fulldata_lemmatize_nostopwords_data_2023_01_03_18_02_32_train_sorted.csv,0.0,0.0,0.0
1,bertopic_ncbo_fulldata_lemmatize_nostopwords_data_2023_01_03_18_02_32_weigthed_and_sorted.csv,0.001058,0.004193,0.001689
2,bertopic_ncbo_fulldata_lemmatize_nostopwords_data_2023_01_04_17_51_30_no_sorted_no_weighted.csv,0.0,0.0,0.0
3,bertopic_ncbo_fulldata_lemmatize_nostopwords_data_2023_01_04_17_51_30_sorted_no_weighted.csv,0.0,0.0,0.0
4,bertopic_ncbo_fulldata_lemmatize_nostopwords_data_2023_01_04_17_51_30_weighted_and_sorted.csv,0.001058,0.004193,0.001689
5,lda_ncbo_fulldata_lemmatize_nostopwords_data_2023_01_04_17_37_13_no_sorted_no_weighted.csv,0.0,0.0,0.0
6,lda_ncbo_fulldata_lemmatize_nostopwords_data_2023_01_04_17_37_13_sorted_no_weighted.csv,0.0,0.0,0.0
7,lda_ncbo_fulldata_lemmatize_nostopwords_data_2023_01_04_17_37_13_weighted_and_sorted.csv,0.0,0.0,0.0


In [24]:
stats = []
for file in glob.glob('../CRAFT/results/disambiguation/*'):
    dis = pd.read_csv(file) 
    dis['disambiguation_best_concept'] = dis['disambiguation_best_concept'].apply(eval).apply(lambda x: [k.lower() for k in x.keys()])
    dis = dis[['PMID', 'disambiguation_best_concept']].explode('disambiguation_best_concept')\
        .rename({'disambiguation_best_concept': 'label'}, axis=1)
    dis['id'] = dis['label'].map(text_to_id).apply(lambda x: process_id(x.lower()))
    dis = dis.groupby('PMID').apply(lambda x: 
                     pd.Series({'pred_ids': set(x['id'])}, index=['pred_ids'])).reset_index()
    merged = dis.merge(annotations_groupped,on=['PMID'], how='outer')
    
    TP_FN_FP_ids = np.zeros((3,))
    for i, row in merged.iterrows(): 
        TP_FN_FP_ids += return_TP_FN_FP(row['pred_ids'], row['true_ids'])
    
    precision_ = precision(TP_FN_FP_ids[0],TP_FN_FP_ids[2])
    recall_ = recall(TP_FN_FP_ids[0], TP_FN_FP_ids[1])
    stats.append((os.path.basename(file), precision_, recall_, F1(precision_, recall_)))
pd.set_option('display.max_colwidth', None)
pd.DataFrame(stats, columns=['file', 'precision', 'recall', 'F1'])

Unnamed: 0,file,precision,recall,F1
0,bertopic_ncbo_fulldata_lemmatize_nostopwords_data_2023_01_03_18_02_32_train_sorted.csv,0.052882,0.140845,0.076894
1,bertopic_ncbo_fulldata_lemmatize_nostopwords_data_2023_01_03_18_02_32_weigthed_and_sorted.csv,0.052882,0.140845,0.076894
2,bertopic_ncbo_fulldata_lemmatize_nostopwords_data_2023_01_04_17_51_30_no_sorted_no_weighted.csv,0.052882,0.140845,0.076894
3,bertopic_ncbo_fulldata_lemmatize_nostopwords_data_2023_01_04_17_51_30_sorted_no_weighted.csv,0.052882,0.140845,0.076894
4,bertopic_ncbo_fulldata_lemmatize_nostopwords_data_2023_01_04_17_51_30_weighted_and_sorted.csv,0.052882,0.140845,0.076894
5,lda_ncbo_fulldata_lemmatize_nostopwords_data_2023_01_04_17_37_13_no_sorted_no_weighted.csv,0.042834,0.137521,0.065323
6,lda_ncbo_fulldata_lemmatize_nostopwords_data_2023_01_04_17_37_13_sorted_no_weighted.csv,0.042834,0.137521,0.065323
7,lda_ncbo_fulldata_lemmatize_nostopwords_data_2023_01_04_17_37_13_weighted_and_sorted.csv,0.042834,0.137521,0.065323


In [7]:
gt = pd.read_csv('../data/concepts_per_document.csv')\
    .rename({'Concepts': 'true_CUIs'}, axis=1)\
    .drop('Unique_concepts', axis=1)
gt['true_CUIs'] = gt['true_CUIs'].apply(eval).apply(set).apply(list).apply(lambda cuis: [cui[5:] for cui in cuis])
gt.head()

Unnamed: 0,PMID,true_CUIs
0,25763772,"[C0035204, C0033809, C0444245, C0010674, C0032..."
1,25847295,"[C0010749, C0007620, C0376669, C0219474, C0678..."
2,26316050,"[C1257792, C1849265, C2603343, C0442069, C1708..."
3,26406200,"[C0205143, C0037949, C0277814, C0679646, C0231..."
4,26424709,"[C0008819, C0018704, C0237096, C0679646, C0241..."


In [8]:
concepts_emb = pd.concat([pd.read_csv(file) for file in glob.glob(emb_glob_regex)]).reset_index(drop=True)

In [9]:
concepts_to_cuis = concepts_emb.groupby('concept_name').apply(lambda x: list(x['CUI'])).reset_index()

In [10]:
concepts_to_cuis = concepts_to_cuis.rename({0: 'CUIs'}, axis=1)

In [11]:
del concepts_emb

In [12]:
def concat_cuis(x): 
    result = {
        'pred_cuis': sum(x['CUIs'], []),
        'true_cuis': sum(x['true_CUIs'], [])
    }
    return pd.Series(result, index=['pred_cuis', 'true_cuis'])

In [13]:
UMLS_ST21pv_semantic_types_ids = {'T005', 'T007', 'T017', 'T022', 'T031', 'T033', 'T037', 'T038',
'T058', 'T062', 'T074', 'T082', 'T091', 'T092', 'T097', 'T098', 'T103', 'T168', 'T170', 'T201', 'T204'}

In [14]:
s_types = pd.read_csv('../data/MRSTY.RRF', sep='|', header=None, dtype=str)
s_types = s_types[[0, 1]].rename({0: 'CUI', 1: 'type'}, axis=1)
s_types = s_types.loc[s_types['type'].isin(UMLS_ST21pv_semantic_types_ids)]
s_types = s_types.groupby('CUI').apply(lambda x: list(x['type'])).reset_index()\
    .rename({0: 'types'}, axis=1)
s_types.head()

Unnamed: 0,CUI,types
0,C0000722,[T170]
1,C0000723,[T170]
2,C0000731,[T033]
3,C0000734,[T033]
4,C0000742,[T005]


In [15]:
s_types = dict(zip(s_types['CUI'], s_types['types']))


In [16]:
def change_cui_to_type(x):
    result = {}
    result['pred_type'] = [s_types[cui] for cui in set(x['pred_cuis']) if cui in s_types]
    result['true_type'] = [s_types[cui] for cui in set(x['true_cuis']) if cui in s_types]
    
    result['pred_type'] = sum(result['pred_type'], [])
    result['true_type'] = sum(result['true_type'], [])

    return pd.Series(result, index=['pred_type', 'true_type'])


In [17]:
import copy
stats = []
for file in glob.glob(r'..\results\emb_tagger\tagged_1*'):
    tagged = explode_df(pd.read_csv(file))
    tagged = tagged.merge(concepts_to_cuis, left_on='tag', right_on='concept_name', how='left')\
        .merge(gt, how='left')
    
    tagged = tagged.groupby('PMID').apply(concat_cuis).reset_index()
    
    tagged[['pred_type', 'true_type']] = tagged[['pred_cuis','true_cuis']].apply(change_cui_to_type, axis=1)
    
    TP_FN_FP_cuis = np.zeros((3,))
    TP_FN_FP_types = np.zeros((3,))
    for i, row in tagged.iterrows(): 
        TP_FN_FP_cuis += return_TP_FN_FP(row['pred_cuis'], row['true_cuis'])
        TP_FN_FP_types += return_TP_FN_FP(row['pred_type'], row['true_type'])
    
    precision_ = precision(TP_FN_FP_cuis[0], TP_FN_FP_cuis[2])
    recall_ = recall(TP_FN_FP_cuis[0], TP_FN_FP_cuis[1])
    stats.append(('cui', os.path.basename(file), precision_, recall_, F1(precision_, recall_)))
    
    precision_ = precision(TP_FN_FP_types[0], TP_FN_FP_types[2])
    recall_ = recall(TP_FN_FP_types[0], TP_FN_FP_types[1])
    stats.append(('type', os.path.basename(file), precision_, recall_, F1(precision_, recall_)))

pd.DataFrame(stats, columns=['agg_level', 'file', 'precision', 'recall', 'F1']).sort_values(['agg_level', 'file'])

Unnamed: 0,agg_level,file,precision,recall,F1
4,cui,tagged_1_word_LDA_test_2022-12-07_15-04-23_emb...,0.016789,0.026272,0.020486
6,cui,tagged_1_word_LDA_train_2022-12-07_15-04-23_em...,0.014899,0.022509,0.01793
0,cui,tagged_1_word_bertopic_lemmatize_nostopwords_d...,0.012911,0.019117,0.015413
2,cui,tagged_1_word_bertopic_lemmatize_nostopwords_d...,0.011797,0.017724,0.014166
5,type,tagged_1_word_LDA_test_2022-12-07_15-04-23_emb...,0.478951,0.443718,0.460662
7,type,tagged_1_word_LDA_train_2022-12-07_15-04-23_em...,0.423714,0.4022,0.412677
1,type,tagged_1_word_bertopic_lemmatize_nostopwords_d...,0.359718,0.390211,0.374344
3,type,tagged_1_word_bertopic_lemmatize_nostopwords_d...,0.361691,0.377584,0.369467


In [18]:
import copy
for file in glob.glob('../results/bertopic_nbco/disambiguation/*disambiguation.csv') + glob.glob('../results/lda_ncbo/disambiguation/*'):
    df = pd.read_csv(file)
    df = df[['PMID', 'disambiguation_fine_tuned_best_concept']]
    df['disambiguation_fine_tuned_best_concept'] = df['disambiguation_fine_tuned_best_concept']\
        .apply(eval).apply(lambda x: list(x.values()))
    df = df.explode('disambiguation_fine_tuned_best_concept')
    df = df.merge(concepts_to_cuis, left_on='disambiguation_fine_tuned_best_concept', right_on='concept_name', how='left')\
        .drop('disambiguation_fine_tuned_best_concept', axis=1)\
        .merge(gt, how='left')\
        .rename({'CUIs': 'pred_cuis', 'true_CUIs': 'true_cuis'}, axis=1)
    df['pred_cuis'] = df['pred_cuis'].apply(lambda x: [] if x != x else x)
    df[['pred_type', 'true_type']] = df[['pred_cuis','true_cuis']].apply(change_cui_to_type, axis=1)
    
    TP_FN_FP_cuis = np.zeros((3,))
    TP_FN_FP_types = np.zeros((3,))
    for i, row in df.iterrows(): 
        TP_FN_FP_cuis += return_TP_FN_FP(row['pred_cuis'], row['true_cuis'])
        TP_FN_FP_types += return_TP_FN_FP(row['pred_type'], row['true_type'])
    
    precision_ = precision(TP_FN_FP_cuis[0], TP_FN_FP_cuis[2])
    recall_ = recall(TP_FN_FP_cuis[0], TP_FN_FP_cuis[1])
    stats.append(('cui', os.path.basename(file), precision_, recall_, F1(precision_, recall_)))
    
    precision_ = precision(TP_FN_FP_types[0], TP_FN_FP_types[2])
    recall_ = recall(TP_FN_FP_types[0], TP_FN_FP_types[1])
    stats.append(('type', os.path.basename(file), precision_, recall_, F1(precision_, recall_)))
stats = pd.DataFrame(stats, columns=['agg_level', 'file', 'precision', 'recall', 'F1'])
stats.sort_values(['agg_level', 'file'])#.to_csv('../results')

  return 2/(1/precision + 1/recall)


Unnamed: 0,agg_level,file,precision,recall,F1
8,cui,bertopic_ncbo_fulldata_lemmatize_nostopwords_d...,0.000106,0.029283,0.000212
10,cui,bertopic_ncbo_fulldata_lemmatize_nostopwords_d...,0.000106,0.024451,0.000211
12,cui,ncbo_lda_test_21_results_2022-12-08.csv,0.0,0.0,0.0
14,cui,ncbo_lda_train_21_results_2022-12-08.csv,3.4e-05,0.000692,6.4e-05
4,cui,tagged_1_word_LDA_test_2022-12-07_15-04-23_emb...,0.016789,0.026272,0.020486
6,cui,tagged_1_word_LDA_train_2022-12-07_15-04-23_em...,0.014899,0.022509,0.01793
0,cui,tagged_1_word_bertopic_lemmatize_nostopwords_d...,0.012911,0.019117,0.015413
2,cui,tagged_1_word_bertopic_lemmatize_nostopwords_d...,0.011797,0.017724,0.014166
9,type,bertopic_ncbo_fulldata_lemmatize_nostopwords_d...,0.004637,0.574675,0.0092
11,type,bertopic_ncbo_fulldata_lemmatize_nostopwords_d...,0.004613,0.497829,0.009141


In [19]:
files_to_names = {'../results/bertopic_nbco/disambiguation\\bertopic_ncbo_fulldata_lemmatize_nostopwords_data_2022-11-30_12-09-18_test_disambiguation.csv': 'bertopic_ncbo_test',
 '../results/bertopic_nbco/disambiguation\\bertopic_ncbo_fulldata_lemmatize_nostopwords_data_2022-11-30_12-09-18_train_disambiguation.csv': 'bertopic_ncbo_train',
 '../results/lda_ncbo/disambiguation\\ncbo_lda_test_21_results_2022-12-08.csv': 'lda_ncbo_test',
 '../results/lda_ncbo/disambiguation\\ncbo_lda_train_21_results_2022-12-08.csv': 'lda_ncbo_train',
 '..\\results\\emb_tagger\\tagged_1_word_bertopic_lemmatize_nostopwords_data_2022-11-28_19-41-48_test_emb_tagger_biobert.csv':'bertopic_emb_test',
 '..\\results\\emb_tagger\\tagged_1_word_bertopic_lemmatize_nostopwords_data_2022-11-28_19-41-48_train_emb_tagger_biobert.csv': 'bertopic_emb_train',
 '..\\results\\emb_tagger\\tagged_1_word_LDA_test_2022-12-07_15-04-23_emb_tagger_biobert.csv': 'lda_emb_test',
 '..\\results\\emb_tagger\\tagged_1_word_LDA_train_2022-12-07_15-04-23_emb_tagger_biobert.csv': 'lda_emb_train'}
files_to_names = {os.path.basename(file): name for file, name in files_to_names.items()}

In [20]:
stats['file'] = stats['file'].map(files_to_names)

In [21]:
for col in ['precision','recall','F1']: 
    stats[col] = stats[col].apply(lambda x: round(100*x, 2))

In [24]:
stats[stats['agg_level'] != 'cui'].drop('agg_level',axis=1).sort_values(['file']).to_csv('../results/stats.csv', index=False)