[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/jianlins/BMI_NLP_2024/blob/main/Module%201%20Overview.ipynb)

# Define demo functions

In [None]:
import spacy
from spacy import displacy
def vis_sentences(doc, color='#89C4F4'):
  """Highlight every sentence of ``doc`` using displaCy's entity renderer.

  Each sentence from ``doc.sents`` is re-created as a span labelled
  "Sentence" and the resulting list is assigned to ``doc.ents``, replacing
  whatever entities the doc held before, so the "ent" style can draw them.

  Args:
    doc: Document to visualize; its ``ents`` attribute is overwritten.
    color: Background colour applied to the "Sentence" label.
  """
  sentence_spans = []
  for sent in doc.sents:
    # Same character range as the sentence, but carrying the display label.
    sentence_spans.append(doc.char_span(sent.start_char, sent.end_char, label="Sentence"))
  doc.ents = sentence_spans
  render_options = {'colors': {'Sentence': color}}
  displacy.render(doc, jupyter=True, style="ent", options=render_options)
def vis_tokens(doc, color='#A9A4A4'):
  """Highlight every token of ``doc`` using displaCy's entity renderer.

  Each token is re-created as a span labelled "Tok" covering the token's
  own characters, and the resulting list is assigned to ``doc.ents``,
  replacing whatever entities the doc held before.

  Args:
    doc: Document to visualize; its ``ents`` attribute is overwritten.
    color: Background colour applied to the "Tok" label.
  """
  token_spans = []
  for tok in doc:
    # A token's character range is [idx, idx + len(token)).
    tok_start = tok.idx
    tok_end = tok_start + len(tok)
    token_spans.append(doc.char_span(tok_start, tok_end, label="Tok"))
  doc.ents = token_spans
  render_options = {'colors': {'Tok': color}}
  displacy.render(doc, jupyter=True, style="ent", options=render_options)

In [None]:
# Load the small English spaCy pipeline; later cells call `nlp` on the sample text.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Sample clinical note reused as the input of every demo cell below.
input_txt='''CHIEF COMPLAINT: Burn, right arm.

HISTORY OF PRESENT ILLNESS: This is a Workers' Compensation injury. This patient, a 41 year-old male, was at a coffee shop, where he works as a cook, and hot oil splashed onto his arm, burning from the elbow to the wrist on the medial aspect. He has had it cooled, and presents with his friend to the Emergency Department for care.'''

## Visualize sentences

In [None]:
# Run the pipeline on the sample note, then highlight each sentence it detected.
# Note: vis_sentences overwrites doc.ents with the sentence spans.
doc=nlp(input_txt)
vis_sentences(doc)

## Visualize tokens

In [None]:
# Parse the note again to get a fresh Doc, then highlight each token.
# Note: vis_tokens overwrites doc.ents with one span per token.
doc=nlp(input_txt)
vis_tokens(doc)

### Different tokenization methods

In [None]:
%%capture
# Load the tokenizer of the cased BERT checkpoint; `tokenizer` is passed to
# vis_subtokenizer in the following cells.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [None]:
def vis_subtokenizer(tokenizer, input_txt, color='#A9A4A4'):
  """Show how ``tokenizer`` splits ``input_txt`` into sub-word pieces.

  The text is encoded to ids, every id is decoded back to its own string,
  the non-empty pieces are joined with single spaces, and each piece is
  highlighted as a "Tok" entity by displaCy.

  Rendering goes through displaCy's manual mode (a plain dict of text plus
  character offsets) rather than through a spaCy ``Doc``. The offsets are
  already known from the join, so no spaCy pipeline has to be run and the
  function does not depend on whichever model the notebook-level ``nlp``
  currently points to. It also avoids ``Doc.char_span``, which returns
  ``None`` when offsets do not line up with spaCy's own token boundaries.

  Args:
    tokenizer: Object exposing ``encode(text)`` (iterable of ids) and
      ``decode(id)`` (string for a single id), e.g. a Hugging Face tokenizer.
    input_txt: Raw text to tokenize.
    color: Background colour applied to the "Tok" label.
  """
  token_ids = tokenizer.encode(input_txt)
  pieces = [tokenizer.decode(token_id) for token_id in token_ids]
  # Drop ids that decode to nothing; they would produce zero-width highlights.
  pieces = [piece for piece in pieces if piece]
  rendered_text = ' '.join(pieces)

  ents = []
  cursor = 0
  for piece in pieces:
    piece_end = cursor + len(piece)
    ents.append({'start': cursor, 'end': piece_end, 'label': 'Tok'})
    # Skip the single joining space before the next piece.
    cursor = piece_end + 1

  displacy.render({'text': rendered_text, 'ents': ents, 'title': None},
                  jupyter=True, style="ent", manual=True,
                  options={'colors': {'Tok': color}})


In [None]:
# Show the sub-word pieces the currently loaded tokenizer produces for the sample note.
vis_subtokenizer(tokenizer, input_txt)

In [None]:
# NOTE(review): this cell repeats the previous call unchanged; it looks like a
# leftover duplicate. Confirm before removing.
vis_subtokenizer(tokenizer, input_txt)

In [None]:
%%capture
# Rebind `tokenizer` to the roberta-base checkpoint so the next cell shows its segmentation.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

In [None]:
# Same sample note, now split by the tokenizer loaded in the previous cell.
vis_subtokenizer(tokenizer, input_txt)

In [None]:
%%capture
# Rebind `tokenizer` to the xlnet-large-cased checkpoint so the next cell shows its segmentation.
tokenizer = AutoTokenizer.from_pretrained("xlnet-large-cased")

In [None]:
# Same sample note, now split by the tokenizer loaded in the previous cell.
vis_subtokenizer(tokenizer, input_txt)

## Visualize named entities

In [None]:
# Parse the note again so doc.ents holds the pipeline's own named entities
# (the vis_* helpers overwrite doc.ents), then render them without custom colours.
doc=nlp(input_txt)
displacy.render(doc, jupyter=True, style="ent")

In [None]:
# Download the large English spaCy model so it can be loaded with spacy.load afterwards.
!python -m spacy download en_core_web_lg

In [None]:
# Rebind `nlp` to the large English pipeline; any Doc created before this line
# still carries the annotations of the pipeline that produced it.
nlp = spacy.load("en_core_web_lg")

In [None]:
# Re-parse the sample note with the pipeline currently bound to `nlp` before
# rendering; the `doc` left over from an earlier cell only holds that earlier
# pipeline's entities, so rendering it would not reflect the model just loaded.
doc=nlp(input_txt)
displacy.render(doc, jupyter=True, style="ent")