In [52]:
import stanza

# Download the English model with NER
stanza.download('en')

python(91632) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-04-10 20:32:58 INFO: Downloaded file to /Users/kowsalya/stanza_resources/resources.json
2024-04-10 20:32:58 INFO: Downloading default packages for language: en (English) ...
2024-04-10 20:32:59 INFO: File exists: /Users/kowsalya/stanza_resources/en/default.zip
2024-04-10 20:33:05 INFO: Finished downloading models and saved to /Users/kowsalya/stanza_resources


In [41]:
import json

# Path to your .jsonl file
file_path = '/Users/kowsalya/Downloads/admin.jsonl'

data_list = []

# Read the first line of the file
with open(file_path, 'r') as file:
    for line in file:  # This iterates over each line until the end of the file
        line = line.strip()
        if line:  # Check if line is not empty
            line_data = json.loads(line) # Parse the JSON content
            data_list.append(line_data)

In [42]:
import random

# Randomly shuffle the list
random.shuffle(data_list)

# Calculate split indices
first_split = int(len(data_list) * 0.6)
second_split = first_split + int(len(data_list) * 0.2)

# Split the list
train = data_list[:first_split]
dev = data_list[first_split:second_split]
test = data_list[second_split:]

In [45]:
def adjust_label_spans(text, labels):
    """
    Adjusts label spans to ensure they do not start or end with whitespace.
    """
    adjusted_labels = []
    for start, end, label_type in labels:
        # Adjust start index if it points to a space
        while text[start] == ' ' and start < end:
            start += 1
        # Adjust end index if it points to a space
        while text[end] == ' ' and end > start:
            end -= 1
        adjusted_labels.append((start, end, label_type))
    return adjusted_labels

def convert_to_bioes_adjusted(text, labels):
    tokens = word_tokenize(text)
    token_offsets = []
    cursor = 0

    # Tokenize and record the character offsets of each token
    for token in tokens:
        start = text.find(token, cursor)
        end = start + len(token) - 1  # Adjust for inclusive range
        token_offsets.append((start, end))
        cursor = end + 1

    # Initialize BIOES tags as 'O'
    bioes_tags = ['O'] * len(tokens)

    # Adjust labels to ensure they do not start or end on whitespace
    adjusted_labels = adjust_label_spans(text, labels)

    # Apply BIOES tagging
    for start, end, label_type in adjusted_labels:
        start_token = end_token = None
        for i, (t_start, t_end) in enumerate(token_offsets):
            if start >= t_start and start <= t_end:
                start_token = i
            if end >= t_start and end <= t_end:
                end_token = i
                break
        if start_token is not None and end_token is not None:
            if start_token == end_token:  # Single-token entity
                bioes_tags[start_token] = f"S-{label_type}"
            else:
                bioes_tags[start_token] = f"B-{label_type}"
                for j in range(start_token + 1, end_token):
                    bioes_tags[j] = f"I-{label_type}"
                bioes_tags[end_token] = f"E-{label_type}"

    return list(zip(tokens, bioes_tags))


def save_to_bioes_file(filename, tokens_tags):
    """
    Saves the tokens and their BIOES tags to a file in BIOES format.
    
    :param filename: Path to the output file.
    :param tokens_tags: A list of tuples, each containing a token and its BIOES tag.
    """
    # Ensure the directory exists
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    
    with open(filename, 'a', encoding='utf-8') as f:
        previous_tag = None
        for token, tag in tokens_tags:
            # Write token and tag separated by a space
            f.write(f"{token} {tag}\n")
            
            # Add a blank line if the current tag is the end of an entity or if the previous tag was 'O' and the current tag starts an entity
            if tag.startswith('E-') or tag.startswith('S-') or (previous_tag == 'O' and tag.startswith('B-')):
                f.write("\n")
            previous_tag = tag

In [46]:
for doc in train:
    output_filename = "data/ner/en_sample.train.bioes"
    tokens_bioes_adjusted = convert_to_bioes_adjusted(doc["text"], doc["label"])
    save_to_bioes_file(output_filename, tokens_bioes_adjusted)

In [47]:
for doc in dev:
    output_filename = "data/ner/en_sample.dev.bioes"
    tokens_bioes_adjusted = convert_to_bioes_adjusted(doc["text"], doc["label"])
    save_to_bioes_file(output_filename, tokens_bioes_adjusted)

In [48]:
for doc in test:
    output_filename = "data/ner/en_sample.test.bioes"
    tokens_bioes_adjusted = convert_to_bioes_adjusted(doc["text"], doc["label"])
    save_to_bioes_file(output_filename, tokens_bioes_adjusted)

In [49]:
import stanza.utils.datasets.ner.prepare_ner_file as prepare_ner_file

output_jsons = ['data/ner/en_sample.train.json', 'data/ner/en_sample.dev.json', 'data/ner/en_sample.test.json']
input_bioes = ['data/ner/en_sample.train.bioes', 'data/ner/en_sample.dev.bioes', 'data/ner/en_sample.test.bioes']
for i in range(3):
    prepare_ner_file.process_dataset(input_bioes[i], output_jsons[i])

2975 examples loaded from data/ner/en_sample.train.bioes
Generated json file data/ner/en_sample.train.json
1074 examples loaded from data/ner/en_sample.dev.bioes
Generated json file data/ner/en_sample.dev.json
1335 examples loaded from data/ner/en_sample.test.bioes
Generated json file data/ner/en_sample.test.json


In [50]:
!python3 -m stanza.utils.training.run_ner en_sample --max_steps 500 --word_emb_dim 5

python(60477) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


2024-04-10 20:01:09 INFO: Training program called with:
/Users/kowsalya/anaconda3/lib/python3.11/site-packages/stanza/utils/training/run_ner.py en_sample --max_steps 500 --word_emb_dim 5
2024-04-10 20:01:09 DEBUG: en_sample: en_sample
2024-04-10 20:01:09 INFO: Using model /Users/kowsalya/stanza_resources/en/forward_charlm/1billion.pt for forward charlm
2024-04-10 20:01:09 INFO: Using model /Users/kowsalya/stanza_resources/en/backward_charlm/1billion.pt for backward charlm
2024-04-10 20:01:09 INFO: Using default pretrain for en:sample, found in /Users/kowsalya/stanza_resources/en/pretrain/fasttextcrawl.pt  To use a different pretrain, specify --wordvec_pretrain_file
2024-04-10 20:01:09 INFO: en_sample: saved_models/ner/en_sample_charlm_nertagger.pt does not exist, training new model
2024-04-10 20:01:09 INFO: Using model /Users/kowsalya/stanza_resources/en/forward_charlm/1billion.pt for forward charlm
2024-04-10 20:01:09 INFO: Using model /Users/kowsalya/stanza_resources/en/backward_char

2024-04-10 20:17:17 INFO: 2024-04-10 20:17:17: step 320/500, loss = 2.683011 (1.951 sec/batch), lr: 0.100000
2024-04-10 20:18:22 INFO: 2024-04-10 20:18:22: step 340/500, loss = 2.047757 (8.031 sec/batch), lr: 0.100000
2024-04-10 20:19:35 INFO: 2024-04-10 20:19:35: step 360/500, loss = 3.501266 (1.940 sec/batch), lr: 0.100000
2024-04-10 20:20:36 INFO: 2024-04-10 20:20:36: step 380/500, loss = 2.389971 (1.549 sec/batch), lr: 0.100000
2024-04-10 20:22:02 INFO: 2024-04-10 20:22:02: step 400/500, loss = 2.475004 (2.972 sec/batch), lr: 0.100000
2024-04-10 20:23:21 INFO: 2024-04-10 20:23:21: step 420/500, loss = 2.207893 (1.146 sec/batch), lr: 0.100000
2024-04-10 20:24:09 INFO: 2024-04-10 20:24:09: step 440/500, loss = 2.058061 (2.294 sec/batch), lr: 0.100000
2024-04-10 20:25:17 INFO: 2024-04-10 20:25:17: step 460/500, loss = 3.483303 (4.824 sec/batch), lr: 0.100000
2024-04-10 20:26:36 INFO: 2024-04-10 20:26:36: step 480/500, loss = 2.783307 (10.910 sec/batch), lr: 0.100000
2024-04-10 20:27:5

2024-04-10 20:30:08 DEBUG: 42 batches created.
2024-04-10 20:30:08 INFO: Start evaluation...
2024-04-10 20:31:09 INFO: Score by entity:
Prec.	Rec.	F1
58.04	60.82	59.40
2024-04-10 20:31:09 INFO: Score by token:
Prec.	Rec.	F1
40.77	40.09	40.43
2024-04-10 20:31:09 INFO: Weighted f1 for non-O tokens: 0.391684
2024-04-10 20:31:09 INFO: NER tagger score: en_sample saved_models/ner/en_sample_charlm_nertagger.pt data/ner/en_sample.test.json 59.40
2024-04-10 20:31:09 INFO: NER Entity F1 scores:
  CASE_NUMBER: 77.27
  COURT: 66.67
  DATE: 97.66
  GPE: 52.94
  JUDGE: 23.33
  LAWYER: 30.53
  ORG: 75.14
  OTHER_PERSON: 24.12
  PETITIONER: 0.00
  PRECEDENT: 27.69
  PROVISION: 73.91
  RESPONDENT: 0.00
  STATUTE: 36.84
  WITNESS: 0.00
2024-04-10 20:31:09 INFO: NER token confusion matrix:
          t\p                   O    CASE_NUMBER          COURT           DATE            GPE          JUDGE         LAWYER            ORG   OTHER_PERSON     PETITIONER      PRECEDENT      PROVISION     RESPONDENT    

In [55]:
# Loading the trained NER model
nlp = stanza.Pipeline(lang='en', processors='tokenize,ner')

# Processing a new text
doc = nlp("Stanford University is located in California.")
for ent in doc.entities:
    print(ent.text, ent.type)


2024-04-10 20:58:39 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-04-10 20:58:39 INFO: Downloaded file to /Users/kowsalya/stanza_resources/resources.json
2024-04-10 20:58:40 INFO: Loading these models for language: en (English):
| Processor | Package                   |
-----------------------------------------
| tokenize  | combined                  |
| mwt       | combined                  |
| ner       | ontonotes-ww-multi_charlm |

2024-04-10 20:58:40 INFO: Using device: cpu
2024-04-10 20:58:40 INFO: Loading: tokenize
2024-04-10 20:58:41 INFO: Loading: mwt
2024-04-10 20:58:41 INFO: Loading: ner
2024-04-10 20:58:42 INFO: Done loading processors!


Stanford University ORG
California GPE
