In [1]:
import pickle
import pandas as pd
import numpy as np
from collections import Counter
import random
import itertools

In [2]:
# Load the pickled cui_df frame. The context manager closes the file handle as
# soon as loading finishes instead of leaving it open until garbage collection.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('./cui_df.p', 'rb') as cui_file:
    cui_df = pickle.load(cui_file)

In [3]:
# doc_count: number of distinct values in each row's 'indexes' entry
# (duplicates are collapsed by set()).
lens = list(map(lambda row_indexes: len(set(row_indexes)), cui_df['indexes']))
cui_df['doc_count'] = lens

In [4]:
# cui_count: total number of values in each row's 'indexes' entry, duplicates included.
lens = list(map(len, cui_df['indexes']))
cui_df['cui_count'] = lens

In [5]:
# Order the rows from the highest doc_count to the lowest.
cui_df = cui_df.sort_values(by='doc_count', ascending=False)

In [6]:
# Load the pickled document frame. The context manager closes the file handle as
# soon as loading finishes instead of leaving it open until garbage collection.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('hpo_ret_df2.p', 'rb') as df_file:
    df = pickle.load(df_file)

In [11]:
def tfidf(df, cui_df):
    """Compute a TF-IDF score for every entity of every document.

    For each row of ``df`` the term frequency of an entity is its count in that
    row's ``hpo_ents`` list divided by the length of the list. The inverse
    document frequency is ``log(len(df) / doc_count)``, where ``doc_count`` is
    read from the first ``cui_df`` row whose ``hpo_str`` equals the entity.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an ``hpo_ents`` column. Values that are not lists (e.g.
        ``None``) are replaced by ``[]``; the column is rewritten on the frame
        that was passed in.
    cui_df : pandas.DataFrame
        Must have ``hpo_str`` and ``doc_count`` columns.

    Returns
    -------
    tuple
        ``(tfidf_d, df)``. ``tfidf_d`` maps each index label of ``df`` to a
        ``{entity: score}`` dict (empty for rows without entities); ``df`` is
        the input frame with the cleaned ``hpo_ents`` column.

    Raises
    ------
    ValueError
        If an entity has no matching ``hpo_str`` in ``cui_df``.
    """
    total_docs = len(df)

    # Get rid of None / non-list values and replace them with empty lists.
    df['hpo_ents'] = [ents if isinstance(ents, list) else [] for ents in df['hpo_ents']]

    # Build the entity -> doc_count lookup once instead of scanning the whole
    # hpo_str column for every entity occurrence. setdefault keeps the first
    # row for a repeated hpo_str, which is the row a linear search would find.
    doc_counts = {}
    for term, n_docs in zip(cui_df['hpo_str'], cui_df['doc_count']):
        doc_counts.setdefault(term, n_docs)

    tfidf_d = {}
    for doc_id, doc_ents in zip(df.index, df['hpo_ents']):
        n_ents = len(doc_ents)
        scores = {}
        # Counter yields each distinct entity once, in first-occurrence order,
        # so repeated entities are scored a single time and the key order of
        # the resulting dict follows the order of appearance in the document.
        for ent, ent_count in Counter(doc_ents).items():
            if ent not in doc_counts:
                raise ValueError(f'{ent!r} is not in cui_df["hpo_str"]')
            # tf = occurrences of the entity in the doc / total entities in the doc
            term_freq = ent_count / n_ents
            # idf = log_e(total number of documents / number of documents with the entity)
            # np.divide keeps numpy semantics (inf plus a warning) for a zero doc_count.
            inv_doc_freq = np.log(np.divide(total_docs, doc_counts[ent]))
            scores[ent] = term_freq * inv_doc_freq
        tfidf_d[doc_id] = scores

    return tfidf_d, df

In [12]:
# Apply the function to our dataframes; tfidf also returns df with a cleaned hpo_ents column.
tfidf_d, df = tfidf(df, cui_df)

# orient='index': each key of tfidf_d becomes a row label, each entity a column.
tfidf_df = pd.DataFrame.from_dict(tfidf_d, orient = 'index')

In [13]:
# For every document keep up to ten entities with the highest TF-IDF score.
# tfidf_df has one column per entity seen in any document, so a row holds NaN
# for entities that are absent from that document. sort_values() places NaN
# last, which let absent entities leak into the top ten of documents with fewer
# than ten entities; dropping NaN first restricts the ranking to entities that
# actually occur in the document.
top_terms = {}
for index, row in tfidf_df.iterrows():
    top_ten = row.dropna().sort_values(ascending=False).head(10)
    top_terms[index] = list(top_ten.index)
# Attach the ranked entities to df as a new column, matched on the index label.
df['top_hpo'] = df.index.to_series().map(top_terms)

In [14]:
import spacy
# scispacy is not referenced by name in the code shown here.
# NOTE(review): presumably imported for its side effects when loading the model -- confirm.
import scispacy
# Load the 'en_core_sci_lg' pipeline; tfidf_best_sents reads this module-level `nlp`.
nlp = spacy.load('en_core_sci_lg')

In [15]:
# Rank the sentences collected for each term by how informative they are:
# score each sentence with TF-IDF over its extracted entities, then normalise
# by the number of entities in the sentence.

def _rank_sentences(sents, ents, vectorizer, min_ents, top_n):
    """Return up to ``top_n`` of ``sents`` ordered by normalised TF-IDF sum."""
    ent_counts = np.array([len(ent_list) for ent_list in ents], dtype=int)
    # A sentence without entities has no defined score (0 / 0), so it is
    # excluded whatever min_ents is.
    eligible = ent_counts >= max(min_ents, 1)
    if not eligible.any():
        # Nothing can be ranked. This also covers "no sentences" and "no entity
        # in any sentence", for which fit_transform raises ValueError because
        # the vocabulary would be empty.
        return []

    # Fit on every sentence (not only the eligible ones) so that the IDF
    # weights reflect the full set of sentences for this term.
    tfidf_matrix = vectorizer.fit_transform(ents)
    # Sum each row on the sparse matrix; converting to a dense array first
    # allocates n_sentences x vocabulary floats just to add them up.
    sums = np.asarray(tfidf_matrix.sum(axis=1)).ravel()

    ranked = pd.DataFrame({
        'sents': [sent for sent, keep in zip(sents, eligible) if keep],
        # normalise the tfidf sum using the number of ents in that sentence
        'norm_sum': sums[eligible] / ent_counts[eligible],
    })
    ranked = ranked.sort_values(by='norm_sum', ascending=False).head(top_n)
    return list(ranked['sents'])


def tfidf_best_sents(cui_df, min_ents=4, top_n=10):
    """Add a ``top_sents`` column holding the best-scoring sentences per row.

    For every row, each sentence in ``row['sents']`` is run through the
    module-level ``nlp`` pipeline and its entities are collected. The entity
    lists are vectorised with TF-IDF (one vector per sentence); a sentence's
    score is the sum of its vector divided by its number of entities.

    Parameters
    ----------
    cui_df : pandas.DataFrame
        Must have a ``sents`` column containing an iterable of sentence
        strings per row. The frame is modified in place.
    min_ents : int, default 4
        Minimum number of entities a sentence needs to be considered.
    top_n : int, default 10
        Maximum number of sentences kept per row.

    Returns
    -------
    pandas.DataFrame
        The same ``cui_df`` object with the new ``top_sents`` column; a row
        with no eligible sentence gets an empty list.
    """
    # Imported here, as in the original cell, so sklearn is only required
    # when this function is actually used.
    from sklearn.feature_extraction.text import TfidfVectorizer

    # Pass-through used for both preprocessing and tokenisation: the input is
    # already a list of entity strings per sentence.
    def identity(ents):
        return ents

    # token_pattern=None: the pattern is unused once a tokenizer is supplied,
    # and saying so explicitly avoids scikit-learn's warning about it.
    vectorizer = TfidfVectorizer(
        preprocessor=identity,
        tokenizer=identity,
        token_pattern=None,
        ngram_range=(1, 1),
    )

    top_sents = []
    total = len(cui_df)
    for done, (_, row) in enumerate(cui_df.iterrows(), start=1):
        sents = row['sents']
        # Batch the sentences through the pipeline to make it run a bit quicker.
        ents = [[str(ent) for ent in doc.ents] for doc in nlp.pipe(sents, batch_size=500)]
        top_sents.append(_rank_sentences(sents, ents, vectorizer, min_ents, top_n))

        if done % 100 == 0:
            print(f'{done} of {total}')

    cui_df['top_sents'] = top_sents
    return cui_df

In [None]:
# Adds the 'top_sents' column to cui_df; the function runs nlp.pipe over the
# sentences of every row.
cui_df = tfidf_best_sents(cui_df)


In [85]:
# Persist cui_df. The context manager flushes and closes the file when the
# block exits; a bare open() left that to garbage collection.
# NOTE(review): this overwrites './cui_df.p', the same file the notebook loads
# at the top, so a second run starts from the already-processed frame.
with open('./cui_df.p', 'wb') as cui_file:
    pickle.dump(cui_df, cui_file)

In [None]:
# evaluating the output

In [80]:
# Print a per-term report: the term, the share of negated mentions, the trigger
# strings with their counts, and the selected top sentences.
# The loop reads 'hpo_str' and 'top_sents', which are columns of cui_df
# ('top_sents' is added by tfidf_best_sents). In this notebook `df` is the
# document frame, so iterating it only worked with leftover kernel state.
for index, row in cui_df.iterrows():
    print(row['hpo_str'])
    negations = row["negation"]
    # Guard against a term without any recorded mention (avoids dividing by zero).
    neg_rate = np.round(sum(negations) / len(negations), 3) if len(negations) else float('nan')
    print(f'negations = {neg_rate}')
    print(f'Trigger: {Counter(row["triggers"])}')
    for sent in row['top_sents']:
        print(sent)
    print('\n\n-------------------\n')

Severe
negations = 0.094
Trigger: Counter({'severe': 5878, 'Severe': 300, 'S. Severe': 3, 'Severe 7': 1, 'M. Severe': 1})
Severe and critical cases were indicated for intensive care unit (ICU) admission.
Severe complications of the disease were not observed in any of the patients.
Severe patients, that is, those requiring ICU, represented 44% of the cohort (n=7).
Severe dyspnea demanded the emergency use of an oxygen face mask.
Rhabdomyolysis in a Patient with Severe Hypothyroidism.
Severe and critical cases were , , , , and for the age groups, < 1, 1 to 5, 6 to 10, 11 to 15, and >= 16 years, respectively.
Severe cases were significantly more common among men than women.
Nevertheless, there were still two severe cases of COVID-19 among these five patients.
Severe COVID-19 in children is rare.
Severe SIADH has previously been observed in a few adults with SARS-CoV-2 pneumonia [].


-------------------

Fever
negations = 0.125
Trigger: Counter({'fever': 4116, 'Fever': 260, 'pyrexia': 22,