In [7]:
!pip install spacy-transformers


Collecting spacy-transformers
  Downloading spacy_transformers-1.3.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting transformers<4.37.0,>=3.4.0 (from spacy-transformers)
  Downloading transformers-4.36.2-py3-none-any.whl.metadata (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.8/126.8 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-alignments<1.0.0,>=0.7.2 (from spacy-transformers)
  Downloading spacy_alignments-0.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.8.0->spacy-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.8.0->spacy-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from to

In [8]:
import glob
import os
import pandas as pd
import re
import json

orig_text_dir = "/data"
annotate_text_dir = "/annotations"

files = glob.glob(os.path.join(orig_text_dir, "*.txt"))
ner_annotation = []

for file in files:
    base = os.path.basename(file)
    annotations = os.path.join(annotate_text_dir, base)
    annotations = pd.read_csv(annotations, sep='\t')
    with open(file, "r") as f:
        full_txt = f.read()
    chunks, chunk_size = len(full_txt), 256
    text = [full_txt[i:i + chunk_size] for i in range(0, chunks, chunk_size)]

    for txt in text:
        for k, col in enumerate(annotations.columns):
            if col.lower().strip() == 'tag':
                col_tag = col
            if col.lower().strip() == 'value':
                col_value = col

        entities = []
        st_covered = []
        end_covered = []

        for i in range(annotations.shape[0]):
            try:
                value = annotations[col_value].iloc[i].strip()
                value_escaped = re.escape(value)  # Escape the value for regex
                occurrences = [m.span() for m in re.finditer(value_escaped, txt)]
                tag = annotations[col_tag].iloc[i]
                for occurance in occurrences:
                    st = occurance[0]
                    end = occurance[1]
                    if st not in st_covered and end not in end_covered:
                        st_covered.append(st)
                        end_covered.append(end)
                        entities.append((st, end, tag))
            except Exception as ex:
                print(f"Exception occurred for value '{value}': {ex}")
                continue

        final_entities = []
        for entity in entities:
            st = entity[0]
            end = entity[1]
            overlap = False
            for entity_ in entities:
                st_ = entity_[0]
                end_ = entity_[1]
                if st_ < st and end_ > end:
                    overlap = True
            if not overlap:
                final_entities.append(entity)
        ner_annotation.append((txt, {'entities': final_entities}))

# Optionally, save the ner_annotation for later use
with open('ner_annotation.json', 'w') as f:
    json.dump(ner_annotation, f, indent=4)


In [9]:
ner_annotation

[('\nRecord 175:\n\n\n\nHISTORY OF PRESENT ILLNESS:  The patient is a 47-year-old female who returns to the clinic for a followup visit.  The patient has multiple medical problems including severe and extreme hypertension as well as high blood pressure and chronic',
  {'entities': []}),
 (' renal failure for which she is currently attending dialysis sessions 3 times per week.  The patient also is currently being treated for hot flashes with Prempro.  The patient states that she ran out of her blood pressure medications including the patch 2 ',
  {'entities': []}),
 ('days ago.  She also complains of frequent popping and loss of hearing in her right ear.\n\n\n\nPHYSICAL EXAMINATION:  A middle-aged female in no acute distress.  Blood pressure is 182/94, weight is 211 pounds, and pulse is 76.  Chest is clear bilaterally.  Hea',
  {'entities': []}),
 ('rt, regular rate and rhythm, positive S1, S2.  Negative S3, S4, no murmur.  Abdomen is soft, nontender, positive bowel sounds.  Lower extre

In [10]:
import random
import json

random.shuffle(ner_annotation)
train_index = int(0.7 * len(ner_annotation))
valid_index = int(0.85 * len(ner_annotation))

TRAIN_DATA = ner_annotation[:train_index]
VALID_DATA = ner_annotation[train_index:valid_index]
TEST_DATA = ner_annotation[valid_index:]

# Save the data to JSON files for use with SpaCy
def save_to_json(data, filepath):
    with open(filepath, 'w') as f:
        json.dump(data, f, indent=4)

save_to_json(TRAIN_DATA, "train_data.json")
save_to_json(VALID_DATA, "valid_data.json")
save_to_json(TEST_DATA, "test_data.json")


In [11]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

def convert_to_docbin(data, output_file):
    nlp = spacy.blank("en")  # load a new spacy model
    db = DocBin()  # create a DocBin object

    for text, annot in tqdm(data):
        doc = nlp.make_doc(text)  # create doc object from text
        ents = []
        for start, end, label in annot["entities"]:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                print("Skipping entity")
            else:
                ents.append(span)
        doc.ents = ents  # label the text with the ents
        db.add(doc)

    db.to_disk(output_file)  # save the docbin object

convert_to_docbin(TRAIN_DATA, "train.spacy")
convert_to_docbin(VALID_DATA, "valid.spacy")
convert_to_docbin(TEST_DATA, "test.spacy")


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  4%|▍         | 150/3477 [00:00<00:06, 508.57it/s]

Skipping entity
Skipping entity
Skipping entity


  8%|▊         | 269/3477 [00:00<00:05, 538.01it/s]

Skipping entity
Skipping entity
Skipping entity


 11%|█▏        | 398/3477 [00:00<00:05, 597.67it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity


 16%|█▌        | 540/3477 [00:00<00:04, 660.42it/s]

Skipping entity
Skipping entity
Skipping entity


 23%|██▎       | 802/3477 [00:01<00:03, 804.16it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity


 28%|██▊       | 970/3477 [00:01<00:03, 821.07it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity


 33%|███▎      | 1153/3477 [00:01<00:02, 860.00it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity


 38%|███▊      | 1316/3477 [00:02<00:03, 551.15it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity


 46%|████▌     | 1597/3477 [00:02<00:02, 631.60it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity


 51%|█████     | 1766/3477 [00:02<00:02, 706.45it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity


 57%|█████▋    | 1988/3477 [00:03<00:02, 566.76it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity


 68%|██████▊   | 2366/3477 [00:03<00:01, 798.23it/s]

Skipping entity
Skipping entity
Skipping entity


 74%|███████▍  | 2565/3477 [00:03<00:01, 876.81it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity


 79%|███████▉  | 2740/3477 [00:04<00:00, 786.70it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity


 81%|████████  | 2821/3477 [00:04<00:00, 745.18it/s]

Skipping entity
Skipping entity


 85%|████████▌ | 2963/3477 [00:04<00:00, 589.27it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity


 91%|█████████ | 3151/3477 [00:04<00:00, 552.97it/s]

Skipping entity
Skipping entity
Skipping entity


 94%|█████████▍| 3275/3477 [00:05<00:00, 552.69it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity


100%|██████████| 3477/3477 [00:05<00:00, 652.09it/s]
 13%|█▎        | 100/745 [00:00<00:02, 316.09it/s]

Skipping entity


 27%|██▋       | 199/745 [00:00<00:01, 290.49it/s]

Skipping entity
Skipping entity


 40%|████      | 300/745 [00:00<00:01, 317.71it/s]

Skipping entity


 52%|█████▏    | 386/745 [00:01<00:00, 373.72it/s]

Skipping entity
Skipping entity


 69%|██████▊   | 511/745 [00:01<00:00, 396.37it/s]

Skipping entity


 80%|███████▉  | 595/745 [00:01<00:00, 396.72it/s]

Skipping entity
Skipping entity


100%|██████████| 745/745 [00:02<00:00, 364.14it/s]

Skipping entity
Skipping entity



 14%|█▍        | 103/746 [00:00<00:01, 496.17it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity


 40%|████      | 302/746 [00:00<00:00, 643.24it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity


 71%|███████▏  | 532/746 [00:00<00:00, 729.15it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity


100%|██████████| 746/746 [00:01<00:00, 707.71it/s]


Skipping entity
Skipping entity
Skipping entity


In [12]:
!python -m spacy init fill-config base_config.cfg config.cfg

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [13]:
!python -m spacy debug config config.cfg

[1m
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
[1m
[1m
[38;5;2m✔ Config is valid[0m


In [14]:
!python -m spacy train config.cfg --output ./output --gpu-id 0

[38;5;2m✔ Created output directory: output[0m
[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
Some weights of BertModel were not initialized from the model checkpoint at bert_model_data 1 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TRANS...  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  -------------  --------  ------  ------  ------  ------
  0       0        1473.67    648.05    0.38    0.19   13.01    0.00
  1     200       88364.67  72688.49   33.89   61.36   23.41    0.34
  3     400       12598.56  54739.71   40.91   59.34   31.21 

In [16]:
!python -m spacy evaluate ./output/model-best ./test.spacy --gpu-id 0


[38;5;4mℹ Using GPU: 0[0m
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
[1m

TOK     100.00
NER P   57.93 
NER R   44.92 
NER F   50.60 
SPEED   6124  

[1m

                  P       R       F
RiskFactor    62.24   57.01   59.51
Macro         60.96   44.28   51.30
Micro         44.00   34.38   38.60
Preventions   33.33   20.59   25.45

