# Analysis of ECCNP Resolution and Impact on NER and EL Performance

In [3]:
from xmen import load_config, load_kb
from xmen.data import from_spacy

conf = load_config('xmen_ggponc3.yaml')

**Note:** For this notebook, we only use a small subset of GGPONC 2.0 for demonstration purposes.
Change the version (`data_version` in the loaded config) if you want to use any other GGPONC release.

In [4]:
from ggponc import *

# Read the sentences of the GGPONC release selected by `conf.data_version`.
# 'data' is presumably the directory containing the corpus — TODO confirm against `read_sentences`.
sent_df = read_sentences(conf.data_version, 'data')

  0%|          | 0/14 [00:00<?, ?it/s]

In [5]:
resolver = ECCNPResolver(**conf.eccnp)



In [6]:
# Run the ECCNP resolver over the sentence frame.
# NOTE: this rebinds `sent_df` to the resolver's output, so re-running only this cell
# passes the previous output back in; re-run the loading cell first.
sent_df = resolver.set_df(sent_df)

  0%|          | 0/374 [00:00<?, ?it/s]

In [7]:
ner = NERTagger()

In [8]:
# NER on the column named 'sentence' (presumably the original, unresolved text — TODO confirm),
# then conversion of the spaCy results into a dataset keyed by file name.
ner_df_before = ner.set_df(sent_df, sent_col='sentence')
ds_ner_before = from_spacy(ner_df_before.spacy_ner, span_key='entities', doc_id_key='file_name')

  0%|          | 0/374 [00:00<?, ?it/s]

In [9]:
# Same as the "before" run, but with the tagger's default sentence column
# (presumably the ECCNP-resolved text — TODO confirm the default of `sent_col`).
ner_df_after = ner.set_df(sent_df)
ds_ner_after = from_spacy(ner_df_after.spacy_ner, span_key='entities', doc_id_key='file_name')

  0%|          | 0/374 [00:00<?, ?it/s]

In [10]:
from ggponc import EntityLinker
# Build the entity linker from the ranking and candidate-generation sections of the config.
# NOTE(review): the cell output links to scikit-learn's model-persistence security notes, which
# suggests a persisted model is loaded here — confirm it comes from a trusted source.
linker = EntityLinker(**conf.linker.ranking, candidate_generation_kwargs=conf.linker.candidate_generation)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations




In [11]:
# Link both NER variants twice each: once with the linker's default steps and once with an
# explicit step list (stored under the '*_no_semtype' keys; presumably this list omits the
# semantic type filter — TODO confirm against `EntityLinker.transform`).
linking_results = {}
for label, ner_dataset in (('before_eccnp', ds_ner_before), ('after_eccnp', ds_ner_after)):
    linking_results[label] = linker.transform(ner_dataset, silent=True)
    # A fresh list per call, exactly as when the literal is written out each time
    linking_results[label + '_no_semtype'] = linker.transform(
        ner_dataset, ['abbrv', 'candidates', 'reranking'], silent=True
    )

Map:   0%|          | 0/374 [00:00<?, ? examples/s]

  global_matches = self.global_matcher(doc)


Map:   0%|          | 0/374 [00:00<?, ? examples/s]

Map:   0%|          | 0/374 [00:00<?, ? examples/s]

Map:   0%|          | 0/374 [00:00<?, ? examples/s]

Context length: 128
Use NIL values: False


  0%|          | 0/1179 [00:00<?, ?it/s]

  0%|          | 0/1179 [00:00<?, ?it/s]

  0%|          | 0/1179 [00:00<?, ?it/s]

Batches:   0%|          | 0/1179 [00:00<?, ?it/s]

Map:   0%|          | 0/374 [00:00<?, ? examples/s]

Map:   0%|          | 0/374 [00:00<?, ? examples/s]

Map:   0%|          | 0/374 [00:00<?, ? examples/s]

  global_matches = self.global_matcher(doc)


Map:   0%|          | 0/374 [00:00<?, ? examples/s]

Map:   0%|          | 0/374 [00:00<?, ? examples/s]

Context length: 128
Use NIL values: False


  0%|          | 0/1179 [00:00<?, ?it/s]

  0%|          | 0/1179 [00:00<?, ?it/s]

  0%|          | 0/1179 [00:00<?, ?it/s]

Batches:   0%|          | 0/1179 [00:00<?, ?it/s]

Map:   0%|          | 0/374 [00:00<?, ? examples/s]

Map:   0%|          | 0/374 [00:00<?, ? examples/s]

Map:   0%|          | 0/374 [00:00<?, ? examples/s]

  global_matches = self.global_matcher(doc)


Map:   0%|          | 0/374 [00:00<?, ? examples/s]

Map:   0%|          | 0/374 [00:00<?, ? examples/s]

Map:   0%|          | 0/374 [00:00<?, ? examples/s]

Context length: 128
Use NIL values: False


  0%|          | 0/1177 [00:00<?, ?it/s]

  0%|          | 0/1177 [00:00<?, ?it/s]

  0%|          | 0/1177 [00:00<?, ?it/s]

Batches:   0%|          | 0/1177 [00:00<?, ?it/s]

Map:   0%|          | 0/374 [00:00<?, ? examples/s]

Map:   0%|          | 0/374 [00:00<?, ? examples/s]

Map:   0%|          | 0/374 [00:00<?, ? examples/s]

  global_matches = self.global_matcher(doc)


Map:   0%|          | 0/374 [00:00<?, ? examples/s]

Map:   0%|          | 0/374 [00:00<?, ? examples/s]

Context length: 128
Use NIL values: False


  0%|          | 0/1177 [00:00<?, ?it/s]

  0%|          | 0/1177 [00:00<?, ?it/s]

  0%|          | 0/1177 [00:00<?, ?it/s]

Batches:   0%|          | 0/1177 [00:00<?, ?it/s]

Map:   0%|          | 0/374 [00:00<?, ? examples/s]

Map:   0%|          | 0/374 [00:00<?, ? examples/s]

In [12]:
from nb_util import show_diff_single
from IPython.display import display
from nen_util import get_annotation_dataframe
from xmen.data import Sampler

def get_ent_texts(d):
    """Return the ``'text'`` value of every entry in ``d['entities']``, in order."""
    texts = []
    for entity in d['entities']:
        texts.append(entity['text'])
    return texts

# Impact of Semantic Type Filter

In [None]:
res_sem = linking_results['before_eccnp']['reranked']
res_no_sem = linking_results['before_eccnp_no_semtype']['reranked']

# The two runs are paired up positionally below, so they must have the same length
assert len(res_sem) == len(res_no_sem)
#s = Sampler(42, 10)
#l, r = s.transform_batch(res_sem, res_no_sem)

# Number of documents in which the two runs disagree on at least one `canonical` value
diff_sem = 0

for doc_no_filter, doc_filter in zip(res_no_sem, res_sem):
    sentence = doc_filter['passages'][0]['text'][0]
    anns_no_filter = get_annotation_dataframe([doc_no_filter], linker.kb, 1, 0.0)
    anns_filter = get_annotation_dataframe([doc_filter], linker.kb, 1, 0.0)
    # Nothing to compare when the unfiltered run produced no annotations
    if len(anns_no_filter) > 0:
        changed = anns_no_filter.canonical != anns_filter.canonical
        if changed.any():
            print(sentence + '\n')
            print('No Filter:')
            display(anns_no_filter[changed])
            print('Filter:')
            display(anns_filter[changed])
            diff_sem += 1
            print('####')

In [None]:
# Share of documents for which the runs with and without the filter differ in at least one `canonical` value
diff_sem / len(res_no_sem)

# Impact of ECCNPs

In [None]:
res_before = linking_results['before_eccnp']['reranked']
res_after = linking_results['after_eccnp']['reranked']

# zip() silently truncates to the shorter input, so check the two result sets that are
# actually paired below (the previous assert compared the semantic-type results instead).
assert len(res_before) == len(res_after)

diff_text = 0  # documents whose passages differ between the two runs
diff_ents = 0  # ...of those, documents whose entity mention texts differ as well
for d_b, d_a in zip(res_before, res_after):
    if d_b['passages'] != d_a['passages']:
        diff_text += 1
        if get_ent_texts(d_b) != get_ent_texts(d_a):
            diff_ents += 1
            t_a = d_a['passages'][0]['text'][0]
            t_b = d_b['passages'][0]['text'][0]
            print('Difference', diff_ents)
            # Show the text change, then the annotations before and after
            show_diff_single(t_b, t_a)
            display(get_annotation_dataframe([d_b], linker.kb, 1, 0.0))
            display(get_annotation_dataframe([d_a], linker.kb, 1, 0.0))

# Examples

In [None]:
# Annotation table for all documents of the "after ECCNP" run.
# The positional arguments 3 and 0.0 are presumably the number of candidates per mention and a
# confidence threshold — TODO confirm against `get_annotation_dataframe`.
df_analysis = get_annotation_dataframe(res_after, linker.kb, 3, 0.0, show_progress=False)

In [None]:
# Import `Path` explicitly instead of relying on it leaking in through `from ggponc import *`.
from pathlib import Path

# Load the knowledge base stored at <cache_dir>/german_umls/german_umls.jsonl
german_umls = load_kb(Path(conf.cache_dir) / 'german_umls' / 'german_umls.jsonl')

In [None]:
def get_example(i):
    """Print the passages of the document that row ``i`` of ``df_analysis`` refers to.

    The row's ``document`` value is matched against ``document_id`` in ``res_after``;
    the first matching record is used.
    """
    doc_id = df_analysis.loc[i].document
    matches = res_after.filter(lambda record: record['document_id'] == doc_id)
    print(matches[0]['passages'])

### Language Mismatch

In [None]:
# Rows whose CUI has no entry in `german_umls.cui_to_entity`, the 20 most confident first
cui_not_in_german_umls = df_analysis.cui.map(lambda cui: cui not in german_umls.cui_to_entity)
df_analysis[cui_not_in_german_umls].sort_values('confidence', ascending=False).head(20)

In [None]:
get_example(1404)

### Complex Entities

In [None]:
# Mentions of type 'Clinical_Drug' with more than five space-separated tokens, the 20 most confident first
has_many_tokens = df_analysis.text.map(lambda mention: len(mention.split(' ')) > 5)
is_clinical_drug = df_analysis.type == 'Clinical_Drug'
df_analysis[has_many_tokens & is_clinical_drug].sort_values('confidence', ascending=False).head(20)

In [None]:
get_example(1590)

### Ambiguity

In [None]:
# Ask the candidate generator for candidates for the bare mention 'M. Paget', without surrounding context
linker.candidate_generator.predict_no_context(['M. Paget'])