In [2]:
def get_now_str():
    """Return the current local time formatted as ``YYYY-MM-DD_HH-MM-SS``.

    The format contains neither spaces nor colons, so the value can be embedded
    directly in file names.
    """
    from datetime import datetime

    current_time = datetime.now()
    return current_time.strftime("%Y-%m-%d_%H-%M-%S")

def transform_strings_to_arrays(df, col_names):
    """Turn stringified Python objects in the given columns back into objects.

    Every cell of each column named in ``col_names`` is passed through ``eval``
    and the column is overwritten in place; the same (mutated) ``df`` is returned.

    NOTE(review): ``eval`` executes whatever expression the cell contains, so this
    must only be applied to files produced by this pipeline. Consider
    ``ast.literal_eval`` if the CSV can come from an untrusted source.
    """
    for column in col_names:
        parsed_column = df[column].apply(eval)
        df[column] = parsed_column
    return df

In [3]:
def get_keywords_bertopic(data_path, models_path, results_path, timestamp):
    """Performs BERTopic keywords extraction for data after lemmatization.

    The token lists of the first two rows of every PMID are joined into one
    document, a BERTopic model is trained on those documents and saved, then
    the saved model is re-loaded and used to assign a topic to every document.

    Parameters
    ----------
    data_path : str
        Path to preprocessed dataset. Dataset must contain the columns 'PMID' and
        'tokenized_words_lemmatize' (token lists stored as strings).

    models_path : str
        Path to save the model to (folder must exist).

    results_path : str
        Path to save the results to (folder must exist). The CSV is written to
        its 'bertopic' subfolder, which is created when it is missing.

    timestamp : str
        Timestamp that will be added to filenames.

    Returns
    -------
    (result_path, model_save_name) : tuple[str]
        First element is the path to the created file with extracted keywords,
        second - path to the created model.
    """

    import os

    import pandas as pd
    from bertopic import BERTopic

    def train_transform_save(train_data, model_save_name, min_topic_size=10):
        """Fit BERTopic on ``train_data`` and save the fitted model."""
        topic_model = BERTopic(min_topic_size=min_topic_size)
        topics, probs = topic_model.fit_transform(train_data.values)
        topic_model.save(model_save_name)
        return topic_model, topics, probs

    def load_transform_save(data, model_save_name, results_path):
        """Load the saved model, assign topics to ``data`` and write them to CSV."""
        loaded_model = BERTopic.load(model_save_name)

        samples_topics, samples_probs = loaded_model.transform(data.values)
        res_df = pd.DataFrame({
            # Use the index that is aligned with ``data.values`` instead of
            # re-deriving the PMIDs, so a row can never be paired with another
            # document's topic.
            'PMID': data.index.to_numpy(),
            'topic_number': samples_topics,
            'topic_probs': samples_probs,
            "topic_keywords": [loaded_model.get_topic(topic_number) for topic_number in samples_topics]
        })
        # The 'bertopic' subfolder is not guaranteed to exist yet.
        os.makedirs(os.path.dirname(results_path) or '.', exist_ok=True)
        res_df.to_csv(results_path, index=False)
        return loaded_model, res_df

    def join_tokens(token_lists):
        """Join the tokens of the first two rows of one PMID into a single string."""
        # Slicing (rather than indexing [0] and [1]) keeps the result identical
        # for PMIDs with two or more rows and no longer fails when a PMID has
        # only one row.
        return ' '.join(token for tokens in token_lists.values[:2] for token in tokens)

    ##############################################################################################################################

    full_data = transform_strings_to_arrays(pd.read_csv(data_path), col_names=['tokenized_words_lemmatize'])

    data = full_data.groupby(by=['PMID'])['tokenized_words_lemmatize'].agg(join_tokens)

    model_name = f'bertopic_keywords_{timestamp}'
    model_save_name = os.path.join(models_path, model_name)
    result_path = os.path.join(results_path, 'bertopic', f'{model_name}.csv')

    train_transform_save(data, model_save_name, min_topic_size=3)
    load_transform_save(data, model_save_name, result_path)

    return result_path, model_save_name

In [4]:
def tag_ncbo(ontologies, keywords_extractor_name, extracted_keywords_path, results_path, timestamp):
    """Performs NCBO tagging for keywords extracted with get_keywords_bertopic or get_keywords_lda functions.

    Every distinct keyword text is sent once to the NCBO annotator; for each
    returned concept the class itself and its ancestors are fetched and turned
    into ``[annotated text, preferred label, 'DIRECT' | 'ANCESTOR', link]`` entries.

    Parameters
    ----------
    ontologies : list[str]
        List of ontology ids that will be queried in the tagging process.

    keywords_extractor_name : str
        Name of the algorithm used to extract keywords (for file/folder naming).

    extracted_keywords_path : str
        Path to the file returned by get_keywords_bertopic or get_keywords_lda functions.

    results_path : str
        Path to save the results to (folder must exist).

    timestamp : str
        Timestamp that will be added to filenames.

    Returns
    -------
    save_name : str
        Path to tagged words file.

    Raises
    ------
    Exception
        When the class/ancestors of a concept could not be fetched in 5 attempts.
    """

    import json
    import os
    import re
    import urllib.parse
    import urllib.request

    import pandas as pd

    REST_URL = "http://data.bioontology.org"
    # NOTE(review): a credential is committed in source. It can be overridden with
    # the NCBO_API_KEY environment variable; the literal fallback is kept only so
    # existing runs keep working and should be rotated and removed.
    API_KEY = os.environ.get("NCBO_API_KEY", "194c9635-ce67-4e70-81c5-898c3a2b30fb")
    MAX_TRIALS = 5

    def read_keywords_extraction_results(path):
        """Read the keywords CSV and build the comma-separated text sent to the annotator."""
        data = pd.read_csv(path, index_col=0)
        data = transform_strings_to_arrays(data, col_names = ['topic_keywords'])
        data['text_to_annotate'] = data.topic_keywords.apply(
            lambda row: re.sub(r"[\'\[\]]", "", str([keyword[0] for keyword in row]))
            )
        return data

    def get_json(url):
        """GET ``url`` with the API key header and return the decoded JSON body."""
        opener = urllib.request.build_opener()
        opener.addheaders = [('Authorization', 'apikey token=' + API_KEY)]
        # Close the response explicitly instead of leaking the connection.
        with opener.open(url) as response:
            return json.loads(response.read())

    def fetch_concept_details(concept):
        """Return (class, ancestors) of an annotated concept, retrying on failure."""
        last_error = None
        for _ in range(MAX_TRIALS):
            try:
                links = concept["annotatedClass"]["links"]
                return get_json(links["self"]), get_json(links["ancestors"])
            except Exception as error:  # not a bare except: Ctrl+C must still interrupt
                last_error = error
        # Chain the last failure so the root cause is visible in the traceback.
        raise Exception("number of unsuccessfull connection attempts is max_trials") from last_error

    def create_annotation_pairs(found_concepts):
        """Build the de-duplicated annotation entries for one annotator response."""
        res_ann_pairs = []
        for concept in found_concepts:
            concept_class, concept_class_ancestors = fetch_concept_details(concept)
            links = concept["annotatedClass"]["links"]
            annotations = concept['annotations']
            # annotations for this class
            for annot in annotations:
                res_ann_pairs.append((annot['text'], concept_class["prefLabel"], 'DIRECT', links["self"]))
            # annotations for ancestors
            for annot in annotations:
                for ancestor in concept_class_ancestors:
                    res_ann_pairs.append((annot['text'], ancestor["prefLabel"], 'ANCESTOR', links['ancestors']))
        # dict.fromkeys removes duplicates like a set would, but keeps a
        # deterministic (first-seen) order between runs.
        return [list(pair) for pair in dict.fromkeys(res_ann_pairs)]

    ##########################################################################################################################

    # read data
    data = read_keywords_extraction_results(extracted_keywords_path)

    # annotate data: many rows share the same keyword text, so query the
    # annotator once per distinct text instead of once per row
    ontologies_query = ','.join(ontologies)
    annotations_by_text = {
        text: get_json(REST_URL + f"/annotator?ontologies={ontologies_query}&text=" + urllib.parse.quote(text))
        for text in data.text_to_annotate.unique()
    }
    data['ncbo_annotations'] = data.text_to_annotate.apply(lambda text: annotations_by_text[text])

    data = data.reset_index()[['PMID', 'text_to_annotate', 'ncbo_annotations']]

    # create annotation pairs (again once per distinct text) and map them back to the rows
    text_to_annot_ncbo_pairs = {
        text: create_annotation_pairs(found_concepts)
        for text, found_concepts in annotations_by_text.items()
    }
    data['ncbo_annotations_pairs'] = data['text_to_annotate'].apply(lambda text: text_to_annot_ncbo_pairs[text])

    # save data
    res_folder = f'{results_path}/{keywords_extractor_name}_ncbo'
    os.makedirs(res_folder, exist_ok=True)
    save_name = f'{res_folder}/{keywords_extractor_name}_ncbo_{timestamp}.csv'
    data.to_csv(save_name, index=False)

    return save_name

In [5]:
def prepare_data(data_folder,results_folder,option):
    """Loads a corpus, tokenizes and lemmatizes it, and saves the processed table.

    For 'MedM' the gzipped PubTator file is parsed into texts and annotations
    (annotations are saved to ``results_folder``). For 'CRAFT' the article text
    files are read and the knowtator annotation files of every ontology are
    converted to CSV. In both cases the texts are split into sentences and
    words, stop words are removed, the remaining words are lemmatized, and only
    alphanumeric tokens are kept in 'tokenized_words_lemmatize'.

    Parameters
    ----------
    data_folder : str
        Path to folder with files with data (filenames are hardcoded)

    results_folder : str
        Path to folder where results will be saved (folder must exist)

    option : str
        MedM if MedMentions dataset, CRAFT if craft

    Returns
    -------
    result_path : str
        Path to the saved 'data_processed_whole.csv' file.

    Raises
    ------
    ValueError
        If ``option`` is neither 'MedM' nor 'CRAFT'.
    """
    import gzip
    import os
    import re
    import xml.etree.ElementTree as ET

    import nltk
    import pandas as pd
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    from tqdm import tqdm

    def parse(file_path:str, data_columns:list,annotations_columns:list) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """Split a gzipped PubTator file into text rows, annotation rows and unparsed lines."""
        # A header line ("<PMID>|t|<title>" or "<PMID>|a|<abstract>") carries content;
        # every other non-empty line is expected to be a tab-separated annotation.
        # The class is [ta]: inside a character class '|' is a literal, not an alternation.
        HEADER = re.compile(r"(?P<PMID>[0-9]*)\|(?P<Type>[ta])\|(?P<Content>.*)")
        # Collect plain records and build each frame once; concatenating a
        # DataFrame per line is quadratic in the number of lines.
        data_records = []
        annotation_records = []
        error_lines = []
        with gzip.open(file_path, 'rb') as f:
            for line in tqdm(f.readlines()):
                l = line.decode("utf-8")
                if l == '\n':
                    continue
                h = HEADER.match(l)
                if h:
                    data_records.append({k: h.group(k) for k in data_columns})
                else:
                    fields = l.split('\t')
                    if len(fields) == len(annotations_columns):
                        annotation_records.append(dict(zip(annotations_columns, fields)))
                    else:
                        error_lines.append(l)
        data = pd.DataFrame(data_records, columns=data_columns)
        annotations = pd.DataFrame(annotation_records, columns=annotations_columns)
        errors = pd.DataFrame(error_lines)
        return data, annotations, errors

    # Loaded once: calling stopwords.words() for every token re-reads the corpus
    # files each time. With no language argument it returns the stop words of
    # every language shipped with NLTK, which is what the per-token call did too.
    stop_words = set(stopwords.words())

    def process_lemma(lemmatizer,sentence: list) -> list:
        """
        takes list of tokens and returns lemmatized tokens without stopwords
        If handling a word raises TypeError it is appended unchanged to the final list
        """
        processed = []
        for word in sentence:
            try:
                if word.lower() not in stop_words:
                    processed.append(lemmatizer.lemmatize(word))
            except TypeError:
                processed.append(word)
        return processed

    def get_folder_for_ontology(ontology):
        """Relative path of the knowtator annotation folder of one ontology."""
        return os.path.join('concept-annotation',ontology,ontology,'knowtator')

    def get_data_from_file(root):
        """Collect span positions and mention classes from one knowtator XML tree."""
        annotations = pd.DataFrame(columns=['StartIndex', 'EndIndex','MentionTextSegment','EntityID'])
        for child in root:
            # annotation: links a mention id to its character span
            if child.tag=='annotation':
                tmp = {}
                id_name = None
                for c in child:
                    if c.tag == 'mention':
                        id_name = c.attrib['id']
                    elif c.tag == 'span':
                        tmp['StartIndex'] = c.attrib['start']
                        tmp['EndIndex'] = c.attrib['end']
                annotations.loc[id_name,['StartIndex','EndIndex']] = tmp

            # classmention: links the same id to the mention text and entity id
            else:
                id_name =child.attrib['id']
                tmp = {}
                for c in child:
                    if c.tag == 'mentionClass' and 'id' in c.attrib.keys():
                        tmp['MentionTextSegment'] = c.text
                        tmp['EntityID'] = c.attrib['id']
                        annotations.loc[id_name,['MentionTextSegment','EntityID']] = tmp
        return annotations

    def get_data(texts,ontology,file_name):
        """Convert all knowtator files of one ontology to a single CSV and return the frame."""
        folder = os.path.join(data_folder, get_folder_for_ontology(ontology))
        frames = []
        for file in os.listdir(folder):
            root = ET.parse(os.path.join(folder,file)).getroot()
            annotations = get_data_from_file(root)
            annotations['PMID'] = file[:8]
            frames.append(annotations)
        data = pd.concat(frames) if frames else pd.DataFrame()
        # NOTE(review): the output location is relative to the working directory
        # ('data/<ontology>/'), not to results_folder; kept unchanged because
        # other code may read the files from there.
        output_folder = os.path.join('data',ontology)
        os.makedirs(output_folder, exist_ok=True)
        data.to_csv(os.path.join(output_folder,file_name))
        return data

    data_columns = ['PMID', 'Type','Content']
    annotations_columns = ['PMID', 'StartIndex','EndIndex','MentionTextSegment','SemanticTypeID','EntityID']

    if option == 'MedM':
        data_21, annotations_21, _ = parse(os.path.join(data_folder,'corpus_pubtator.txt.gz'), data_columns, annotations_columns)
        semantic_mapping = pd.read_csv(os.path.join(data_folder, 'semantic_type_mapping.txt'), sep = '|', header=None)[[1,2]]
        semantic_mapper = dict(zip(semantic_mapping[1],semantic_mapping[2] ))
        # the last tab-separated field still carries the line break
        annotations_21['EntityID'] = annotations_21['EntityID'].apply(lambda x : x.replace('\n',''))
        annotations_21['SemanticMeaning'] = annotations_21['SemanticTypeID'].apply(lambda x : semantic_mapper[x])
        data_21 = data_21.reset_index()
        annotations_21 = annotations_21.reset_index()
        annotations_21.to_csv(os.path.join(results_folder,'annotations.csv'), index=False)

    elif option == 'CRAFT':
        # Read from data_folder: the annotation files are already resolved against
        # it in get_data, so the articles must come from the same place.
        articles_folder = os.path.join(data_folder,'articles','txt')
        article_records = []
        for file in os.listdir(articles_folder):
            if file.endswith('txt'):
                name = file[:-4]
                with open(os.path.join(articles_folder,file),'r',encoding='utf-8') as f:
                    lines = f.readlines()
                # first line is stored as the title ('t'), the rest as the body ('a')
                article_records.append({'PMID':name, 'Type':'t','Content':lines[0]})
                article_records.append({'PMID':name, 'Type':'a','Content':''.join(lines[1:])})
        data_21 = pd.DataFrame(article_records, columns=['PMID','Type','Content'])

        for ontology in os.listdir(os.path.join(data_folder,'concept-annotation')):
            get_data(data_21,ontology,'annotations.csv')

    else:
        # Previously an unknown option surfaced later as a NameError on data_21.
        raise ValueError(f"Unknown option {option!r}; expected 'MedM' or 'CRAFT'")

    lemmatizer = WordNetLemmatizer()

    # sentences
    data_21['tokenized_sentences'] = data_21['Content'].apply(lambda text : nltk.sent_tokenize(text))

    tokenized_words = []
    tokenized_words_lemmatize = []
    for sentences in tqdm(data_21['tokenized_sentences'], total = len(data_21)):
        sentence_tokens = [nltk.word_tokenize(sentence) for sentence in sentences]
        tokenized_words.append(sentence_tokens)
        # one flat list per text: lemmatized, stop words removed, alphanumeric tokens only
        tokenized_words_lemmatize.append([
            word
            for tokens in sentence_tokens
            for word in process_lemma(lemmatizer, tokens)
            if word.isalnum()
        ])

    # 'tokenized_words' used to be created but left empty although the tokens
    # were computed; store them (one token list per sentence).
    data_21['tokenized_words'] = pd.Series(tokenized_words, index=data_21.index, dtype=object)
    data_21['tokenized_words_lemmatize'] = pd.Series(tokenized_words_lemmatize, index=data_21.index, dtype=object)

    results_path = os.path.join(results_folder,'data_processed_whole.csv')
    data_21.to_csv(results_path, index=False)

    return results_path




In [30]:
def get_keywords_lda(data_path, models_path, results_path, timestamp, num_topics = 10, random_state = None):
    """Performs LDA keywords extraction for data after lemmatization.

    The token lists of the first two rows of every PMID are concatenated into
    one document, an LDA model is trained on those documents, and every document
    is assigned its most probable topic together with that topic's keywords.

    Parameters
    ----------
    data_path : str
        Path to preprocessed dataset. Dataset must contain the columns 'PMID' and
        'tokenized_words_lemmatize' (token lists stored as strings).

    models_path : str
        Path to save the model to (folder must exist).

    results_path : str
        Path to save the results to (folder must exist).

    timestamp : str
        Timestamp that will be added to filenames

    num_topics : int
        Number of desired topics

    random_state : int or None
        Seed forwarded to the LDA model; None (default) keeps the unseeded behaviour.

    Returns
    -------
    (result_path, model_save_name) : tuple
        First element is the path to the created file with extracted keywords,
        second - path to the created model.
    """
    import os

    import pandas as pd
    from gensim import corpora, models

    def get_topic_distribution(lda_model):
        """Map every topic id to its list of (keyword, weight) tuples."""
        topics_distrib = {}
        # num_topics=-1 returns all topics in id order, whatever num_topics was
        # used for training (a fixed limit would silently drop topics).
        for topic_id, formatted_topic in lda_model.show_topics(num_topics=-1):
            # formatted_topic looks like: 0.012*"word" + 0.010*"other" + ...
            topics_distrib[topic_id] = [
                (term.split('*')[1][1:-1], float(term.split('*')[0]))
                for term in formatted_topic.split(' + ')
            ]
        return topics_distrib

    train_data = pd.read_csv(data_path)
    # NOTE(review): eval executes whatever the CSV cell contains; only use on
    # files produced by this pipeline (ast.literal_eval is the safe alternative).
    train_data['tokenized_words_lemmatize'] = train_data['tokenized_words_lemmatize'].apply(eval)

    # One document per PMID. Slicing keeps the result identical for PMIDs with
    # two or more rows and no longer fails for a PMID with a single row.
    texts = train_data.groupby('PMID')['tokenized_words_lemmatize'].agg(
        lambda rows: [token for tokens in rows.iloc[:2] for token in tokens]
    )
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    lda_model = models.LdaMulticore(corpus=corpus,
                                    id2word=dictionary,
                                    num_topics=num_topics,
                                    passes=20,
                                    random_state=random_state)

    topic_distribution = get_topic_distribution(lda_model)
    topics_results = pd.DataFrame.from_records([topic_distribution]).T.reset_index().rename(columns = {'index':'topic_number',0:'topic_keywords'})
    topics_results.to_csv(os.path.join(results_path, f'topic_distribution_LDA_{timestamp}.csv'))

    docs_train = []
    # texts.index and corpus are in the same (groupby) order, so pairing them
    # here guarantees that every PMID gets the topic of its own document.
    for pmid, doc_topics in zip(texts.index, lda_model[corpus]):
        # doc_topics is ordered by topic id, so the first entry is not
        # necessarily the dominant topic; pick the most probable one.
        topic_number, topic_prob = max(doc_topics, key=lambda topic: topic[1])
        docs_train.append({
            'PMID': pmid,
            'topic_number': topic_number,
            'topic_probs': float(topic_prob),
            'topic_keywords': topic_distribution[topic_number],
        })
    train_results = pd.DataFrame.from_records(
        docs_train, columns=['PMID', 'topic_number', 'topic_probs', 'topic_keywords']
    )

    results_file = os.path.join(results_path, f'LDA_{timestamp}.csv')
    train_results.to_csv(results_file)

    model_file = os.path.join(models_path, f"lda_model_{timestamp}")
    lda_model.save(model_file)
    return results_file, model_file

In [28]:
def prepare_disambiguation(results_folder,data_path, tagger_path, embedings_path,timestamp,with_21 = False,weigthing=False,sorting=False):
    """Performs disambiguation of the concepts found for every keyword.

    For each keyword text the candidate concepts of a keyword are ranked by
    their mean (optionally weighted) embedding distance to the currently best
    concepts of the other keywords; the closest concept is reported as the best.

    Parameters
    ----------
    results_folder : str
        Path to save results (folder must exist).

    data_path : str
        Path to the CSV with tagged keywords. It is read with the columns
        'PMID', 'text_to_annotate' and 'ncbo_annotations_pairs'.

    tagger_path : str
        Path to the CSV with extracted keywords. It is read with the columns
        'PMID' and 'topic_keywords'.

    embedings_path : str
        Path to the CSV with concept embeddings. It must contain a 'words'
        column; the remaining columns are used as the embedding vector.

    timestamp : str
        Timestamp that will be added to filenames

    with_21: bool
        Is the column corresponding to 21 semantic types available

    weigthing: bool
        Should the weighted voting be performed

    sorting: bool
        Should the initial sorting be performed

    Returns
    -------
    result_path : str
        Path to results
    """
    import copy
    import math
    import os

    import numpy as np
    import pandas as pd

    MAX_ITER = 7

    def get_embeding(word,emb):
        return emb.loc[word]

    def create_tags_list_dict(row, with_sorting = False):
        """Build {keyword: {concept: 0}} from the annotation pairs of one row.

        With ``with_sorting`` the concepts of a keyword are ordered by how often
        they occur (most frequent first); otherwise first-seen order is kept.
        """
        result = {}
        for tag in row:
            key = tag[0].split(',')[0].upper()
            value = tag[1].upper()
            concept_counts = result.setdefault(key, {})
            concept_counts[value] = concept_counts.get(value, 0) + 1
        for key, concept_counts in result.items():
            if with_sorting:
                ordered = sorted(concept_counts, key=concept_counts.get, reverse=True)
            else:
                ordered = list(concept_counts)
            # counts are only used for the ordering; values are reset to 0
            result[key] = dict.fromkeys(ordered, 0)
        return result

    def disambiguation(current_selection,embedings, weigths):
        '''
        current_selection : dictionary keyword: list of all unique concepts
        weigths: the importance of given keyword

        '''
        # we iterate over the current_selection MAX_ITER times
        vis = []
        iterations = dict(zip(current_selection.keys(),[0]*len(current_selection)))
        new_current_selction  = copy.deepcopy(current_selection)
        for i in range(MAX_ITER):

            for keyword, concepts_list in new_current_selction.items():
                # NOTE(review): this stops processing ALL remaining keywords of the
                # pass as soon as one converged keyword is met; `continue` may have
                # been intended. Left unchanged because it alters the results.
                if iterations[keyword]>0:
                    break
                distances = {} # for each possible concept calculate the mean distance from other keywords (concepts of them)
                for concept in concepts_list.keys():
                    distances[concept] = []
                    for k, current_best_tags in new_current_selction.items():
                        # foreach keyword that is not a current one
                        if k!=keyword:
                            current_best_tag = list(current_best_tags.keys())[0] # the first out of list of concepts
                            try:
                                distances[concept].append(weigths[k]*math.dist(get_embeding(concept,embedings),get_embeding(current_best_tag,embedings))) # append distance from this concept
                            except Exception as e:
                                # best effort: a missing weight/embedding only skips this pair
                                print(e)
                    distances[concept] = np.mean(distances[concept]) # mean distance
                # debug trace of one hard-coded keyword (collected, not used by the caller)
                if keyword == 'COFFEE':
                    vis.append((i,distances))
                ranked = dict(sorted(distances.items(), key=lambda item: item[1]))
                # unchanged distances since the previous pass -> keyword converged
                if list(new_current_selction[keyword].values()) == list(ranked.values()):
                    iterations[keyword] = i
                new_current_selction[keyword] = ranked  # update the current selection of this keyword
        return new_current_selction,vis, iterations

    def keywords_importance(grouped_data, tagger_data):
        return grouped_data.reset_index().merge(tagger_data[['PMID','topic_keywords']] ,on = 'PMID').set_index('text_to_annotate')

    def get_n_best_tags(data, n = 1):
        return [{k:sorted(v, key=v.get)[:n] for k,v in dd.items()} for dd in data['after_disambiguation']]

    def run_disambiguation(data, tagger, embedings,column_name = 'ncbo_annotations_pairs' ,  weighting = False, sorting = False, take_best = 1):
        """Disambiguate every distinct keyword text and merge the result back into ``data``."""
        grouped = data.groupby('text_to_annotate').nth(0)
        # get importance for each keyword -> will be used if weighting True
        grouped = keywords_importance(grouped, tagger )
        grouped['possible_tags'] = grouped[column_name].apply(lambda r: create_tags_list_dict(r, sorting))

        # disambiguation
        res = []
        vis = []
        its = []
        for idx, row  in grouped.iterrows():
            current_selection = row['possible_tags']
            if not weighting:
                weigths = dict(zip(list(row['topic_keywords'].keys()),[1] * len(row['topic_keywords'])))
            else:
                weigths = row['topic_keywords']
            r,v, it= disambiguation(current_selection, embedings,weigths)
            res.append(r)
            vis.append(v)
            its.append(it)
        grouped['after_disambiguation'] = res
        data = data.merge(grouped['after_disambiguation'].reset_index(), on = 'text_to_annotate' )
        data['disambiguation_best_concept'] = get_n_best_tags(data, take_best)
        return data,vis,its

    def load_inputs(data_name, tagger_name, embedings_name,with_21 = True):
        """Read the tagged data, the extracted keywords and the embeddings."""
        # NOTE(review): eval executes whatever the CSV cells contain; only use on
        # files produced by this pipeline.
        data = pd.read_csv(data_name)
        data['ncbo_annotations_pairs'] = data['ncbo_annotations_pairs'].apply(eval)
        data['ncbo_annotations_pairs']  = data['ncbo_annotations_pairs'].apply(lambda x : [[a[0].upper(),a[1]] for a in x])
        if with_21:
            data['ncbo_annotations_ST21pv_semtypes_pairs'] = data['ncbo_annotations_ST21pv_semtypes_pairs'].apply(eval)
            data['ncbo_annotations_ST21pv_semtypes_pairs']  = data['ncbo_annotations_ST21pv_semtypes_pairs'].apply(lambda x : [[a[0].upper(),a[1]] for a in x])

        tagger = pd.read_csv(tagger_name)
        tagger['topic_keywords'] = tagger['topic_keywords'].apply(eval).apply(lambda x: {k.upper():v for k,v in dict(x).items()})

        embedings = pd.read_csv(embedings_name)
        embedings = embedings.set_index('words')
        embedings.index = embedings.index.str.upper()
        embedings = embedings[~embedings.index.duplicated(keep='first')]

        return  data, tagger, embedings

    data, tagger, embedings = load_inputs(data_path, tagger_path, embedings_path,with_21)

    # Forward the caller's options: they used to be accepted but silently ignored.
    data_res, vis, its = run_disambiguation(
        data, tagger, embedings, 'ncbo_annotations_pairs', weighting=weigthing, sorting=sorting
    )
    results_path = os.path.join(results_folder,f'disambiguation_res_{timestamp}.csv')
    data_res.to_csv(results_path)

    return results_path





In [29]:
# Pipeline driver: configuration followed by the five processing stages.
# All paths are relative to the directory the notebook is run from.
import os
data_folder = '../data'
data_preprocessing_results = '../results_preprocessing'
data_path = "data_processed_whole.csv"
models_path = "../models"
results_path = "../results"
dataset = 'CRAFT'
CRAFT_ONTOLOGIES = ['CHEBI', 'CL', 'GO', 'MONDO', 'MOP', 'NCBITAXON', 'PR', 'SO', 'UBERON']
# one timestamp is shared by every file written during this run
timestamp = get_now_str()

# 1. preprocessing -> returns the path of the processed CSV
prepared_data_path = prepare_data(data_folder,data_preprocessing_results,dataset)
# 2. keyword extraction
# NOTE(review): the LDA call rebinds extracted_keywords_path / model_path, so the
# BERTopic outputs returned on the previous line are discarded and every later
# stage works on the LDA file — while tag_ncbo is still told the extractor is
# 'bertopic' (this only affects the output folder/file names). Confirm which
# extractor is intended.
extracted_keywords_path, model_path = get_keywords_bertopic(prepared_data_path, models_path, results_path, timestamp)
extracted_keywords_path, model_path = get_keywords_lda(os.path.join(data_preprocessing_results,data_path), models_path, results_path, timestamp)
# 3. NCBO tagging of the extracted keywords
tagged_keywords_path = tag_ncbo(CRAFT_ONTOLOGIES, 'bertopic', extracted_keywords_path, results_path, timestamp)
# 4. disambiguation
# NOTE(review): the embeddings path is hard-coded and, unlike the other paths,
# not derived from the configuration variables above — verify it resolves from
# the current working directory.
dismbiguation_results_path = prepare_disambiguation(results_path,tagged_keywords_path,extracted_keywords_path,
'CRAFT/results/embedings/ncbo_embeddings.csv',timestamp)


'data/results\\disambiguation_res_2023-01-07_21-53-54.csv'