In [1]:
from collections import Counter
import pandas as pd
import stanza
import re

In [5]:
test = pd.read_csv('../training_corpus/en_partial_spacy_tokenizer.csv')
test.labels.value_counts()

O    125451
B     19648
I      4061
Name: labels, dtype: int64

In [2]:
en_gt = pd.read_csv('../ACTER/en/htfl/annotations/htfl_en_terms.ann', sep='	', engine='python',header=None)
en_preds = pd.read_csv('../results/en/ann/roberta_ann_lem_predictions_3143.csv')
preds = list(en_preds['predictions'])
gts = list(en_gt[0])

In [3]:
def count_freq(preds, gts):
    preds_len = [len(x.split(' ')) for x in preds]
    gts_len = [len(x.split(' ')) for x in gts]
    print(Counter(preds_len))
    print(Counter(gts_len))

In [4]:
preds_len = [len(x.split(' ')) for x in preds]
gts_len = [len(x.split(' ')) for x in gts]
print(Counter(preds_len))
print(Counter(gts_len))

Counter({2: 1088, 1: 740, 3: 63, 4: 4})
Counter({1: 1064, 2: 766, 3: 357, 4: 120, 5: 31, 6: 18, 7: 4, 8: 1})


In [5]:
max(gts, key=len)

'heart failure with preserved left ventricular ejection fraction'

In [6]:
# gts[gts.str.split(' ').len() == 3]

In [7]:
# P = TP/(TP+FP)
# R =  TP/(TP+FN)
def evaluation_metrics(pred, gt):
    TP = len(set(pred) & set(gt)) 
    FP = len(set(pred)-set(gt))
    FN = len(set(gt)-set(pred))
    print(TP, FP, FN)
    precision = round((TP/(TP+FP))*100, 2)
    recall = round((TP/(TP+FN))*100,2)
    f1_score = round((2 * precision * recall) / (precision + recall),2)
    return precision, recall, f1_score

precision, recall, f1_score = evaluation_metrics(set(preds), set(gts))
print(len(set(preds)))
print(len(set(gts)))
precision, recall, f1_score

669 1226 1692
1895
2361


(35.3, 28.34, 31.44)

In [8]:
# 669 1227 1692

Stanza

In [9]:
nlp = stanza.Pipeline(lang='en')# processors='tokenize,mwt,pos,lemma'

2021-04-23 14:00:48 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-04-23 14:00:48 INFO: Use device: cpu
2021-04-23 14:00:48 INFO: Loading: tokenize
2021-04-23 14:00:48 INFO: Loading: pos
2021-04-23 14:00:49 INFO: Loading: lemma
2021-04-23 14:00:49 INFO: Loading: depparse
2021-04-23 14:00:49 INFO: Loading: sentiment
2021-04-23 14:00:49 INFO: Loading: ner
2021-04-23 14:00:50 INFO: Done loading processors!


In [10]:
def lemma(li):
    new_list = []
    for t in li:
        doc = nlp(str(t))
        doc1 = ' '.join([word.lemma for sent in doc.sentences for word in sent.words])
        doc1 = re.sub('-',' ',doc1)
        doc1 = re.sub(' +', ' ',doc1)
        new_list.append(doc1)
    new_list = [s for s in new_list if len(s) >= 2]
    return new_list

In [11]:
preds_lem = lemma(preds)
gts_lem = lemma(gts)

print('After lemmatization')
print(len(set(preds_lem)))
print(len(set((gts_lem))))
print(evaluation_metrics(list(set(preds_lem)), list(set(gts_lem))))

After lemmatization
1830
2149
640 1190 1509
(34.97, 29.78, 32.17)


In [12]:
set(preds_lem)

{'fractional flow',
 'intermediate frailty',
 'myocardial insulin',
 'relaxation',
 'oxidase inhibition',
 'tethering',
 'coat',
 'post',
 'cardiac washout',
 'aortic',
 'insulin product',
 'depression',
 'antirheumatic drug',
 'physiological proteolysis',
 'the efficacy',
 'cardiac vagal',
 'rethink the',
 'term cost',
 'peptidase 4',
 'various transcatheter',
 'type littermate',
 'rtq',
 'catheter ablation',
 'potassium',
 'cyclase',
 'adenosine',
 'septal',
 'cardiogenic shock',
 'prognostic relevance',
 'shorten velocity',
 'cardiomyopathy cohort',
 'bradycardia',
 'muscle cell',
 'stem cell',
 'ejection',
 'salt intake',
 'taxonomy',
 'cardiac specialty',
 'vascular effect',
 'haemodynamic',
 'fetal',
 'biventricular pacing',
 'artery bypass',
 'emergency cardiac',
 'congestive heart failure',
 'elevation',
 'longitudinal systolic',
 'coronary revascularization',
 'reverse',
 'sodium',
 'the differential',
 'hyperlipidaemia',
 'absolute',
 'interval oscillation',
 'sinus rhythm',


In [13]:
set(gts_lem)

{'idiopathic',
 'ivabradine',
 'respiratory gas analysis',
 'peak oxygen consumption',
 'cyclic guanosine monophosphate',
 'mitral valve plasty',
 'arni',
 'congenital',
 'life expectancy',
 'cardiac washout',
 'lv end diastolic volume',
 'aortic',
 'protein kinase r like er kinase',
 'downregulation',
 'nyha functional class iv',
 'rvpo',
 'mdc',
 'metabolic exercise',
 'propranolol',
 'cox modeling',
 'if channel inhibition',
 'angiotensin receptor neprilysin inhibitor',
 'bradycardic',
 'lvad therapy',
 'sd of all qt interval',
 'outcome',
 'acute care surgery',
 'time up & go',
 'catheter ablation',
 'thalassaemia major',
 'potassium',
 'preserve leave ventricular ejection fraction',
 'cyclase',
 'leave atrial',
 'microarray',
 'adenosine',
 'eo cfus',
 'vascular endothelial growth factor blockade',
 'molecular pathway',
 'xanthine oxidase',
 'septal',
 'cardiogenic shock',
 'catheterisation',
 'prospective study',
 'bradycardia',
 'interatrial shunt',
 'stem cell',
 'downregulate'

In [14]:
count_freq(set(preds_lem), set(gts_lem))

Counter({2: 1068, 1: 698, 3: 62, 4: 2})
Counter({1: 857, 2: 672, 3: 379, 4: 148, 5: 46, 6: 30, 7: 13, 8: 3, 10: 1})


In [15]:
preds

['comprehensive echocardiography',
 'nris',
 'annular velocity',
 'malignant',
 'congestive hf',
 'home environment',
 'arterial vasculature',
 'fractional flow',
 'adenoviral vector',
 'pluripotent',
 'overload hypertrophy',
 'epidemiology',
 'foundation',
 'chm',
 'skeletal',
 '\u2009 diet',
 'oxidative',
 'cardiologists',
 'gdmt',
 'elastance',
 'protocol',
 'sarcoplasmic reticular',
 'ethically',
 'spectrometric analysis',
 'rate',
 'confidence bound',
 'angiotensin system',
 'impact pay',
 'cardiovascular',
 'cardiac diameters',
 'interventricular septum',
 'nitroxyl',
 'severity',
 'pharmacological',
 'cardiac fibrosis',
 'bridging therapy',
 'pilot project',
 'kidney',
 'simple ecg',
 'vasodilator effects',
 'urine',
 'medicaid services',
 'fibrillation for',
 'iron homeostasis',
 'subsequent heart',
 'arterial tonometry',
 'haematopoietic cell',
 'costing',
 'hemoglobin levels',
 'urine volume',
 'anticoagulation',
 'epidemic',
 'cardiomyopathy cohort',
 'transfusion thresholds

In [16]:
gts

['(123)i-meta-iodobenzylguanidine',
 '(123)i-mibg',
 '1-alkyl-2-acetyl-sn-glycerol cholinephosphotransferase',
 '2-vessel cad',
 '2-way anova',
 '3-vessel cad',
 '6 min walk test',
 '6-min walk test',
 '6-minute hall walk',
 '6-minute walk distance',
 '6-minute walk test',
 '6-minute walked distance',
 '6-minute walking distance',
 '6-mwt',
 '6mwt',
 '8-fluo-camp',
 'aac',
 'abcc2',
 'abcd classification',
 'abdominal',
 'abdominal aortic constriction',
 'ablation',
 'absorptiometry',
 'acetyl-lysine reader proteins',
 'acetylcholinesterase inhibition',
 'acetylcholinesterase inhibitor',
 'acf',
 'acr',
 'acs',
 'actin cytoskeleton',
 'acute',
 'acute care surgery',
 'acute coronary syndrome',
 'acute coronary syndromes',
 'acute dyspnea',
 'acute heart failure',
 'acute hf',
 'acute ischemic right heart failure',
 'acute kidney injury',
 'acute myocardial infarction',
 'acute right heart failure',
 'acute rv failure',
 'adenine nucleotides',
 'adenosine',
 'adenoviral',
 'adenovirus',

In [17]:
import stanza
stanza.download('en')
import en_core_web_sm
nlp = en_core_web_sm.load()
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.2.0.json: 128kB [00:00, 24.5MB/s]                    
2021-04-23 14:05:17 INFO: Downloading default packages for language: en (English)...
2021-04-23 14:05:18 INFO: File exists: /Users/hanhtran/stanza_resources/en/default.zip.
2021-04-23 14:05:22 INFO: Finished downloading models and saved to /Users/hanhtran/stanza_resources.
2021-04-23 14:05:23 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |

2021-04-23 14:05:23 INFO: Use device: cpu
2021-04-23 14:05:23 INFO: Loading: tokenize
2021-04-23 14:05:23 INFO: Loading: pos
2021-04-23 14:05:23 INFO: Loading: lemma
2021-04-23 14:05:23 INFO: Done loading processors!


In [18]:
nlp(str(' nay')).sentences[0].word[0]

AttributeError: 'Sentence' object has no attribute 'word'