In [2]:
import stanza

# Download the English model with NER
stanza.download('en')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-04-09 03:30:21 INFO: Downloaded file to /Users/kowsalya/stanza_resources/resources.json
2024-04-09 03:30:21 INFO: Downloading default packages for language: en (English) ...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.8.0/models/default.zip:   0%|          | 0…

2024-04-09 03:30:33 INFO: Downloaded file to /Users/kowsalya/stanza_resources/en/default.zip
2024-04-09 03:30:41 INFO: Finished downloading models and saved to /Users/kowsalya/stanza_resources


In [1]:
import json

# Path to your .jsonl file
file_path = '/Users/kowsalya/Downloads/admin.jsonl'

data_list = []

# Read the first line of the file
with open(file_path, 'r') as file:
    for line in file:  # This iterates over each line until the end of the file
        line = line.strip()
        if line:  # Check if line is not empty
            line_data = json.loads(line) # Parse the JSON content
            data_list.append(line_data)

In [2]:
# Function to accurately map entity spans and convert data to CoNLL format
def convert_to_conll(data):
    text = data["text"]
    entities = sorted(data["label"], key=lambda x: x[0])  # Ensure entities are processed in order
    
    # Tokenize the text and calculate start and end positions for each token
    tokens = text.split()
    token_positions = []
    offset = 0
    for token in tokens:
        start = text.index(token, offset)
        end = start + len(token)
        token_positions.append((start, end))
        offset = end
    
    conll_lines = []
    entity_idx = 0  # To keep track of which entity we are in
    for start, end in token_positions:
        token_label = "O"  # Default label
        
        # Adjust entity end index to not include trailing spaces
        while entity_idx < len(entities) and entities[entity_idx][1] <= start:
            entity_idx += 1  # Move to the next entity if we've passed the current one
        
        if entity_idx < len(entities):
            entity_start, entity_end, entity_type = entities[entity_idx]
            entity_end -= 1  # Adjust to exclude space by reducing the end index by 1
            
            if entity_start <= start <= entity_end:
                if start == entity_start:
                    token_label = f"B-{entity_type}"
                else:
                    token_label = f"I-{entity_type}"
        
        token = text[start:end]  # Extract token text using its start and end positions
        conll_lines.append(f"{token} {token_label}")
    
    return "\n".join(conll_lines)

In [3]:
import os
import json

# Assuming your convert_to_conll function is defined here

# Path to your .jsonl file
file_path = '/Users/kowsalya/Downloads/admin.jsonl'

# Folder paths
train_folder = '/conll/train'
test_folder = '/conll/test'
dev_folder = '/conll/dev'

# Ensure the folders exist
os.makedirs(train_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)
os.makedirs(dev_folder, exist_ok=True)


# Distribute documents
train_docs = data_list[:6]
test_docs = data_list[6:8]
dev_docs = data_list[8:]

# Function to save documents in CoNLL format
def save_docs_in_conll(docs, folder):
    for idx, doc in enumerate(docs):
        conll_text = convert_to_conll(doc)  # Assuming this function exists and works correctly
        file_path = os.path.join(folder, f"doc_{idx}.conll")
        with open(file_path, 'w') as file:
            file.write(conll_text)

# Save documents in respective folders
save_docs_in_conll(train_docs, train_folder)
save_docs_in_conll(test_docs, test_folder)
save_docs_in_conll(dev_docs, dev_folder)


OSError: [Errno 30] Read-only file system: '/conll'

In [None]:
from stanza.models import ner

# Replace 'path_to_training_data', 'path_to_dev_data', and 'path_to_output' with your paths
ner.train_model(lang='en', 
                train_file='path_to_training_data', 
                dev_file='path_to_dev_data', 
                save_dir='path_to_output', 
                batch_size=32, 
                epochs=50, 
                learning_rate=1e-3)


In [None]:
# Loading the trained NER model
nlp = stanza.Pipeline(lang='en', processors={'ner': {'model_path': 'path_to_your_trained_model'}})

# Processing a new text
doc = nlp("Stanford University is located in California.")
for ent in doc.entities:
    print(ent.text, ent.type)
