In [2]:
from collections import Counter
import pandas as pd
import stanza
import re

In [3]:
en_gt = pd.read_csv('../ACTER/en/htfl/annotations/htfl_en_terms.ann', sep='	', engine='python',header=None)
en_preds = pd.read_csv('../results/en/ann/roberta_ann_predictions_3035.csv')
preds = list(en_preds['predictions'])
gts = list(en_gt[0])

In [4]:
def count_freq(preds, gts):
    preds_len = [len(x.split(' ')) for x in preds]
    gts_len = [len(x.split(' ')) for x in gts]
    print(Counter(preds_len))
    print(Counter(gts_len))

In [5]:
preds_len = [len(x.split(' ')) for x in preds]
gts_len = [len(x.split(' ')) for x in gts]
print(Counter(preds_len))
print(Counter(gts_len))

Counter({2: 1059, 1: 718, 3: 50, 4: 2})
Counter({1: 1064, 2: 766, 3: 357, 4: 120, 5: 31, 6: 18, 7: 4, 8: 1})


In [6]:
max(gts, key=len)

'heart failure with preserved left ventricular ejection fraction'

In [7]:
# gts[gts.str.split(' ').len() == 3]

In [8]:
# P = TP/(TP+FP)
# R =  TP/(TP+FN)
def evaluation_metrics(pred, gt):
    TP = len(set(pred) & set(gt)) 
    FP = len(set(pred)-set(gt))
    FN = len(set(gt)-set(pred))
    print(TP, FP, FN)
    precision = round((TP/(TP+FP))*100, 2)
    recall = round((TP/(TP+FN))*100,2)
    f1_score = round((2 * precision * recall) / (precision + recall),2)
    return precision, recall, f1_score

precision, recall, f1_score = evaluation_metrics(preds, gts)
print(len(set(preds)))
print(len(set(gts)))
precision, recall, f1_score

636 1193 1725
1829
2361


(34.77, 26.94, 30.36)

Stanza

In [9]:
nlp = stanza.Pipeline(lang='en')# processors='tokenize,mwt,pos,lemma'

2021-04-22 00:06:20 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-04-22 00:06:20 INFO: Use device: cpu
2021-04-22 00:06:20 INFO: Loading: tokenize
2021-04-22 00:06:20 INFO: Loading: pos
2021-04-22 00:06:21 INFO: Loading: lemma
2021-04-22 00:06:21 INFO: Loading: depparse
2021-04-22 00:06:21 INFO: Loading: sentiment
2021-04-22 00:06:21 INFO: Loading: ner
2021-04-22 00:06:22 INFO: Done loading processors!


In [10]:
def lemma(li):
    new_list = []
    for t in li:
        doc = nlp(str(t))
        doc1 = ' '.join([word.lemma for sent in doc.sentences for word in sent.words])
        doc1 = re.sub('-',' ',doc1)
        doc1 = re.sub(' +', ' ',doc1)
        new_list.append(doc1)
    new_list = [s for s in new_list if len(s) >= 2]
    return new_list

In [11]:
preds_lem = lemma(preds)
gts_lem = lemma(gts)

print('After lemmatization')
print(len(set(preds_lem)))
print(len(set((gts_lem))))
print(evaluation_metrics(list(set(preds_lem)), list(set(gts_lem))))

After lemmatization
1776
2149
608 1168 1541
(34.23, 28.29, 30.98)


In [12]:
set(preds_lem)

{'haematopoietic cell',
 'crude',
 'cardiomyocyte',
 'shunt implant',
 'leave',
 'atrial',
 'case costing',
 'mechanical cardiac',
 'substrate metabolism',
 'for',
 'continuous flow',
 'breast',
 'administrative health',
 'hospitalize patient',
 'crude rate',
 'endothelial',
 'rct',
 'fragment',
 'type natriuretic peptide',
 'indeterminable anaerobic threshold',
 'metabolism',
 'infusion ,',
 'oxygen pulse',
 'interventricular',
 'myocardial insulin',
 'dynamic',
 'dopamine',
 'cytoskeletal',
 'respective molecular',
 'tricuspid',
 'transmitral flow',
 'signal transduction',
 'discrimination improvement',
 'oscillation',
 'transmitral blood',
 'neoplastic',
 'lateral',
 'lipoprotein',
 'specific monoclonal',
 'public reporting',
 'beagle',
 'myocardial fibrosis',
 'metabolic abnormality',
 'multicenter automatic',
 'cpet',
 'artery disease',
 'baroreflex sensitivity',
 'lysine',
 'ventricular pump',
 'comprehensive echocardiography',
 'percent relative',
 'remodeling',
 'urea nitrogen'

In [13]:
set(gts_lem)

{'cardiomyocyte',
 'psychosocial',
 'downregulation',
 'heart failure with preserve ejection fraction',
 'interleukin receptor',
 'pathophysiologic',
 'ser96',
 'atrial',
 'prognosis',
 'antagonism',
 'lad',
 'nfat',
 'atropine',
 'multivariate logistic regression analysis',
 'renal underperfusion',
 'nyha class',
 'continuous flow',
 'arni',
 'combine end point',
 'diastolic stiffness',
 'fully magnetically levitate left ventricular assist system',
 're hospitalization',
 'anabolic',
 'endothelial',
 'rct',
 'rheumatoid',
 'sts',
 'metabolism',
 'subclinical',
 'interventricular',
 'co primary outcome',
 'dopamine',
 'continuity equation',
 'energy substrate metabolism',
 'exercise induce pulmonary artery systolic pressure',
 'no',
 'tricuspid',
 'pcwp',
 'transmitral flow',
 'α myosin heavy chain',
 'signal transduction',
 'can ( 2 + ) sensitivity',
 'kidney infection',
 'pro bnp',
 'szeto schiller',
 'nyha class iii',
 'biosynthetic enzyme',
 'student t test',
 'lipoprotein',
 'end 

In [14]:
count_freq(set(preds_lem), set(gts_lem))

Counter({2: 1049, 1: 677, 3: 48, 4: 2})
Counter({1: 857, 2: 672, 3: 379, 4: 148, 5: 46, 6: 30, 7: 13, 8: 3, 10: 1})
