## **Dataset Generation** to train a spaCy NER model

In [None]:
!pip install Faker;

Collecting Faker
  Downloading Faker-24.2.0-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Faker
Successfully installed Faker-24.2.0


Set the Faker locale to English (India), `en_IN`, so that generated names, phone numbers and addresses follow Indian conventions.

In [None]:
import re
import json
from faker import Faker


# Faker instance bound to the Indian-English locale.
faker = Faker('en_IN')

# NOTE(review): each record built here stores a top-level "entities" list of
# {"start", "end", "label"} dicts, whereas the DocBin conversion cell later in
# this notebook only processes records that carry an "annotation" key.
paragraphs = []
for _ in range(500):
    # Draw one synthetic identity.
    name = faker.name()
    age = faker.random_int(min=18, max=100)
    phone_number = faker.phone_number()
    address = faker.address()

    # Scripted question/answer dialogue with the generated values embedded.
    paragraph = (
        f"Hello there! I hope you're having a good day. Could you please tell me your name? "
        f"Sure, my name is {name}. "
        f"Great! And how old are you? "
        f"I'm {age} years old. "
        f"Thanks! What's your phone number, in case we need to reach you? "
        f"My phone number is {phone_number}. "
        f"Perfect! And where do you live? "
        f"My address is {address}."
    )

    # Entity label -> literal text to locate inside the dialogue.
    patterns = {
        "NAME OF PATIENT": name,
        "AGE": str(age),
        "NUMBER": phone_number,
        "ADDRESS": address
    }

    # Keep the character span of the first literal occurrence of each value.
    entity_positions = {}
    for entity, value in patterns.items():
        spans = [hit.span() for hit in re.finditer(re.escape(value), paragraph)]
        entity_positions[entity] = spans[0]

    # One record = the text plus a list of labelled character spans.
    entity_records = []
    for entity, (start, end) in entity_positions.items():
        entity_records.append({"start": start, "end": end, "label": entity})
    annotation = {"paragraph": paragraph, "entities": entity_records}

    paragraphs.append(annotation)


# Persist all records as pretty-printed JSON in the working directory.
with open("generated_paragraphs.json", "w") as json_file:
    json.dump(paragraphs, json_file, indent=2)

print("Annotations saved to generated_paragraphs.json")

Annotations saved to generated_paragraphs.json


In [None]:
import os
import re
import json
from faker import Faker
from faker.providers import person, date_time
from google.colab import drive


# Expose Google Drive under /content/drive so later code can write the dataset there.
drive.mount('/content/drive')

# Indian-English Faker instance used by generate_paragraph() below.
faker = Faker('en_IN')

# Register the imported provider modules on the instance.
# NOTE(review): presumably redundant, since a locale-constructed Faker already
# offers name/date generators — confirm before removing.
for provider_module in (person, date_time):
    faker.add_provider(provider_module)


def generate_paragraph(fake=None):
    """Build one synthetic clinic-intake paragraph together with its entity offsets.

    Entity spans are recorded while the text is assembled instead of being
    searched for afterwards. Searching was lossy: the gender is written in
    lower case, so a search for "Male"/"Female"/"Other" never matched, and a
    first-match search for the weight landed on the age whenever both numbers
    were equal.

    Args:
        fake: Optional Faker-like object providing ``name``, ``random_element``,
            ``random_int``, ``phone_number``, ``address`` and ``date_this_year``.
            Defaults to the notebook-level ``faker`` instance.

    Returns:
        dict: ``{"paragraph": str, "annotation": {"entities": [[start, end, label], ...]}}``
        where ``start``/``end`` are character offsets into ``paragraph`` and the
        entities are listed in the order they appear in the text.
    """
    source = faker if fake is None else fake

    # Generate synthetic personal data (call order kept so seeded runs stay comparable).
    name = source.name()
    gender = source.random_element(elements=('Male', 'Female', 'Other'))
    age = source.random_int(min=18, max=100)
    weight = source.random_int(min=40, max=150)
    phone_number = source.phone_number()
    address = source.address()
    admission_date = source.date_this_year()
    admission_text = admission_date.strftime('%B %d, %Y')

    # (text, label) pieces in reading order; label None marks fixed template text.
    segments = [
        ("Good morning! Welcome to our clinic. ", None),
        ("Could you please provide me with your name? Sure, my name is ", None),
        (name, "NAME OF PATIENT"),
        (". I am ", None),
        (gender.lower(), "GENDER"),
        (", ", None),
        (str(age), "AGE"),
        (" years old. My weight is ", None),
        (str(weight), "WEIGHT"),
        (" kg. My phone number is ", None),
        (phone_number, "NUMBER"),
        (". My address is ", None),
        (address, "ADDRESS"),
        (". I visited the clinic today, ", None),
        (admission_text, "DATE OF ADMISSION"),
        (".", None),
    ]

    pieces = []
    entities = []
    cursor = 0  # character offset at which the next piece starts
    for text, label in segments:
        if label is not None and text:
            entities.append([cursor, cursor + len(text), label])
        pieces.append(text)
        cursor += len(text)

    # Return the paragraph and its annotation
    return {
        "paragraph": "".join(pieces),
        "annotation": {
            "entities": entities
        }
    }

# Build the 500-record dataset one paragraph at a time.
paragraphs_with_annotations = []
for _ in range(500):
    paragraphs_with_annotations.append(generate_paragraph())

# Target folder on the mounted Drive; create it on first use.
directory_path = '/content/drive/MyDrive/dataset_generator/'
if not os.path.exists(directory_path):
    os.makedirs(directory_path)

# Write every record to a single pretty-printed JSON file.
file_path = os.path.join(directory_path, 'generated_paragraphs.json')
with open(file_path, "w") as file:
    json.dump(paragraphs_with_annotations, file, indent=2)

print(f"500 paragraphs with annotations saved to {file_path}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
500 paragraphs with annotations saved to /content/drive/MyDrive/dataset_generator/generated_paragraphs.json


In [None]:
import os
import re
import json
from faker import Faker
from faker.providers import person, date_time, address
import string


# Indian-English Faker instance used by generate_paragraph() below.
faker = Faker('en_IN')

# Register the imported provider modules on the instance.
# NOTE(review): presumably redundant, since a locale-constructed Faker already
# offers name/date/address generators — confirm before removing.
for provider_module in (person, date_time, address):
    faker.add_provider(provider_module)


def generate_paragraph(fake=None):
    """Build one punctuation-free dialogue paragraph together with its entity offsets.

    The paragraph has all ``string.punctuation`` characters removed. Entity
    spans are recorded while the text is assembled, using the punctuation-free
    form of every value. Searching for the raw values afterwards was lossy:
    dates and addresses contain commas/slashes that no longer exist in the
    stripped text, the gender is written in lower case, and a first-match
    search for the weight landed on the age whenever both numbers were equal.

    Args:
        fake: Optional Faker-like object providing ``name``, ``random_element``,
            ``random_int``, ``date_this_year`` and ``address``. Defaults to the
            notebook-level ``faker`` instance.

    Returns:
        dict: ``{"paragraph": str, "annotation": {"entities": [[start, end, label], ...]}}``
        where ``start``/``end`` are character offsets into ``paragraph`` and the
        entities are listed in the order they appear in the text.
    """
    source = faker if fake is None else fake

    # Generate synthetic personal data (call order kept so seeded runs stay comparable).
    name = source.name()
    gender = source.random_element(elements=('Male', 'Female', 'Other'))
    age = source.random_int(min=18, max=100)
    weight = source.random_int(min=40, max=150)
    admission_date = source.date_this_year()
    address_line = source.address().replace('\n', ', ')

    # (text, label) pieces in reading order; label None marks fixed template text.
    segments = [
        ("Hi there! I hope you're doing well today. Can I ask for your name? "
         "Of course! My name is ", None),
        (name, "NAME OF PATIENT"),
        (". And what is your gender? I am ", None),
        (gender.lower(), "GENDER"),
        (". Thank you! And how old are you? I'm ", None),
        (str(age), "AGE"),
        (" years old. Great! Also, can you provide your weight? My weight is ", None),
        (str(weight), "WEIGHT"),
        (" kg. Perfect! Just one more thing, do you know today's date? Yes, today is ", None),
        (admission_date.strftime('%B %d, %Y'), "DATE OF ADMISSION"),
        (". By the way, where do you live? I live at ", None),
        (address_line, "ADDRESS"),
        (".", None),
    ]

    # Deleting characters piece by piece yields the same text as deleting them
    # from the joined paragraph, and keeps the running offsets exact.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    pieces = []
    entities = []
    cursor = 0  # character offset at which the next piece starts
    for raw_text, label in segments:
        text = raw_text.translate(strip_punctuation)
        if label is not None and text:
            entities.append([cursor, cursor + len(text), label])
        pieces.append(text)
        cursor += len(text)

    # Return the paragraph and its annotation
    return {
        "paragraph": "".join(pieces),
        "annotation": {
            "entities": entities
        }
    }

# Build the 500-record dataset one paragraph at a time.
paragraphs_with_annotations = []
for _ in range(500):
    paragraphs_with_annotations.append(generate_paragraph())

# Target folder on the mounted Drive; create it on first use.
directory_path = '/content/drive/MyDrive/dataset_generator/'
if not os.path.exists(directory_path):
    os.makedirs(directory_path)

# Write every record to a single pretty-printed JSON file.
file_path = os.path.join(directory_path, 'generated_paragraphs_with_annotations.json')
with open(file_path, "w") as file:
    json.dump(paragraphs_with_annotations, file, indent=2)

print(f"500 paragraphs with annotations saved to {file_path}")

500 paragraphs with annotations saved to /content/drive/MyDrive/dataset_generator/generated_paragraphs_with_annotations.json


Combine the three generated JSON files into one list and shuffle the records.

In [None]:
import os
import json
import random

# Define the paths to the JSON files
# Define the paths to the JSON files
# NOTE(review): no cell in this notebook writes generated_paragraphs2.json or
# generated_paragraphs3.json — presumably they were produced by renaming
# earlier outputs; confirm they exist before running.
file_paths = [
    '/content/drive/MyDrive/dataset_generator/generated_paragraphs.json',
    '/content/drive/MyDrive/dataset_generator/generated_paragraphs2.json',
    '/content/drive/MyDrive/dataset_generator/generated_paragraphs3.json'
]

# Fixed seed so the shuffle — and therefore any split sliced from the shuffled
# list afterwards — is identical on every run.
SHUFFLE_SEED = 42

# Load the content of each JSON file
merged_content = []
for file_path in file_paths:
    with open(file_path, 'r') as file:
        content = json.load(file)
        merged_content.extend(content)

# Shuffle whole records: each record holds both its paragraph and its
# annotation, so the two stay aligned. A dedicated Random instance is used so
# the global random state is left untouched.
random.Random(SHUFFLE_SEED).shuffle(merged_content)

# Define the path for the new merged and shuffled JSON file
merged_file_path = '/content/drive/MyDrive/dataset_generator/merged_paragraphs_shuffled.json'

# Save the shuffled merged content to a new JSON file
with open(merged_file_path, 'w') as file:
    json.dump(merged_content, file, indent=2)

print(f"Shuffled content saved to {merged_file_path}")

Shuffled content saved to /content/drive/MyDrive/dataset_generator/merged_paragraphs_shuffled.json


Split `merged_paragraphs_shuffled.json` into training and testing data.
*80% of the records are used for training and the remaining 20% for testing.*

In [None]:
import os
import json
import random

# Load the shuffled merged content
# Read the merged, already-shuffled records back from Drive.
merged_file_path = '/content/drive/MyDrive/dataset_generator/merged_paragraphs_shuffled.json'
with open(merged_file_path, 'r') as file:
    merged_content = json.load(file)

# Fraction of records that goes to the training split; the rest is the test split.
train_percent = 0.8  # 80% of the data for training, 20% for testing

# Sizes of the two splits (training size is truncated to an integer).
num_examples = len(merged_content)
num_train = int(train_percent * num_examples)
num_test = num_examples - num_train

# Leading slice -> training set, trailing slice -> testing set.
train_data, test_data = merged_content[:num_train], merged_content[num_train:]

# Output locations on Drive.
train_file_path = '/content/drive/MyDrive/dataset_generator/train_data.json'
test_file_path = '/content/drive/MyDrive/dataset_generator/test_data.json'

# Write the training split first, then the testing split, as pretty-printed JSON.
for destination, subset in ((train_file_path, train_data), (test_file_path, test_data)):
    with open(destination, 'w') as file:
        json.dump(subset, file, indent=2)

print(f"Training data saved to {train_file_path}")
print(f"Testing data saved to {test_file_path}")

Training data saved to /content/drive/MyDrive/dataset_generator/train_data.json
Testing data saved to /content/drive/MyDrive/dataset_generator/test_data.json


In [None]:
import spacy

In [None]:
# Fetch spaCy's large English pipeline into the runtime.
# NOTE(review): the DocBin cells below build on spacy.blank("en"); confirm this
# download is actually needed by the training config.
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import json
# Read the training split back in so individual records can be inspected.
train_json_path = '/content/drive/MyDrive/dataset_generator/train_data.json'
with open(train_json_path, 'r') as f:
    data = json.load(f)

In [None]:
# Peek at the first record's entity annotations ([start, end, label] triples).
data[0]['annotation']['entities']

[[89, 99, 'NAME OF PATIENT'],
 [174, 176, 'AGE'],
 [243, 245, 'WEIGHT'],
 [320, 337, 'DATE OF ADMISSION']]

In [None]:
from spacy.tokens import DocBin
from tqdm import tqdm

# Blank English pipeline and an empty DocBin container for serialised Docs.
# NOTE(review): the next cell re-creates both `nlp` and `doc_bin`, so this cell
# looks redundant.
nlp = spacy.blank("en") # load a new spacy model
doc_bin = DocBin()

In [None]:
from spacy.util import filter_spans
from spacy.tokens import DocBin
import spacy
import json

# Load the spaCy model
def _records_to_docbin(records, nlp):
    """Turn annotated records into a spaCy DocBin.

    Args:
        records: Iterable of dicts shaped like
            ``{"paragraph": str, "annotation": {"entities": [[start, end, label], ...]}}``.
            Records without an ``"annotation"`` key are skipped (and counted).
        nlp: spaCy pipeline whose tokenizer is used to build each Doc.

    Returns:
        tuple: ``(doc_bin, skipped_records, skipped_entities)`` where the two
        counters report how many records lacked annotations and how many
        entity offsets could not be aligned to token boundaries.
    """
    doc_bin = DocBin()
    skipped_records = 0
    skipped_entities = 0

    for record in records:
        # Check if the 'annotation' key exists in the current example
        if 'annotation' not in record:
            skipped_records += 1
            continue

        # Create a Doc object from the text
        doc = nlp.make_doc(record['paragraph'])

        ents = []
        for start, end, label in record['annotation']['entities']:
            # "contract" shrinks the span to whole tokens; None means nothing was left.
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                print("Skipping entity")
                skipped_entities += 1
            else:
                ents.append(span)

        # Filter the spans to ensure there are no overlaps
        doc.ents = filter_spans(ents)
        doc_bin.add(doc)

    return doc_bin, skipped_records, skipped_entities


# Load the spaCy model
nlp = spacy.blank("en")

# --- Training split ---------------------------------------------------------
with open('/content/drive/MyDrive/dataset_generator/train_data.json', 'r') as f:
    data = json.load(f)

doc_bin, skipped_records, skipped_entities = _records_to_docbin(data, nlp)

# Define the file path where you want to save the spaCy binary file
file_path = '/content/drive/MyDrive/dataset_generator/train.spacy'

# Save the processed documents to the specified file path
doc_bin.to_disk(file_path)

print(f"Processed documents saved to {file_path}")
print(f"Training split: {skipped_records} records without annotations, "
      f"{skipped_entities} unalignable entities skipped")

# --- Held-out split ---------------------------------------------------------
# Convert the 20% test split as well, so that `spacy train` can be given a
# --paths.dev file that differs from the training data; evaluating on the
# training file itself only measures memorisation.
with open('/content/drive/MyDrive/dataset_generator/test_data.json', 'r') as f:
    dev_data = json.load(f)

dev_doc_bin, dev_skipped_records, dev_skipped_entities = _records_to_docbin(dev_data, nlp)

dev_file_path = '/content/drive/MyDrive/dataset_generator/dev.spacy'
dev_doc_bin.to_disk(dev_file_path)

print(f"Held-out documents saved to {dev_file_path}")
print(f"Held-out split: {dev_skipped_records} records without annotations, "
      f"{dev_skipped_entities} unalignable entities skipped")

Processed documents saved to /content/drive/MyDrive/dataset_generator/train.spacy


In [None]:
# Expand base_config.cfg into a complete training config (config.cfg).
# NOTE(review): base_config.cfg is not created anywhere in this notebook — it
# presumably has to be placed on Drive beforehand.
!python -m spacy init fill-config /content/drive/MyDrive/dataset_generator/base_config.cfg /content/drive/MyDrive/dataset_generator/config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/content/drive/MyDrive/dataset_generator/config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
import spacy

# Download and install the large English model
# NOTE(review): en_core_web_lg was already downloaded by an earlier cell, and
# the `nlp` assigned here does not appear to be used by later cells (inference
# goes through `nlp_ner`) — TODO confirm whether this cell is still needed.
spacy.cli.download("en_core_web_lg")

# Load the model
nlp = spacy.load("en_core_web_lg")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Train with the filled config; the pipeline is saved under the --output directory.
# NOTE(review): --paths.dev points at the same train.spacy that is used for
# --paths.train, so the ENTS_* scores in the log are measured on the training
# data rather than on held-out examples.
!python -m spacy train /content/drive/MyDrive/dataset_generator/config.cfg --output /content/drive/MyDrive/dataset_generator --paths.train /content/drive/MyDrive/dataset_generator/train.spacy --paths.dev /content/drive/MyDrive/dataset_generator/train.spacy

[38;5;4mℹ Saving to output directory:
/content/drive/MyDrive/dataset_generator[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     34.92    0.00    0.00    0.00    0.00
  0     200         25.37    844.63   99.45   99.32   99.57    0.99
  0     400          4.45     33.34   99.45   99.32   99.57    0.99
  0     600          6.12     21.78   99.77   99.65   99.90    1.00
  1     800          4.32     15.79   99.77   99.65   99.90    1.00
  2    1000         39.81     93.56   99.45   99.32   99.57    0.99
  2    1200         17.57     41.52   99.82   99.75   99.90    1.00
  3    1400         22.27     27.16   99.77   99.65   99.90    1.00
[38;5;2m✔ Saved pipeline to output directory[0m
/content/drive/MyDrive/dataset

In [None]:
# Load the trained pipeline from the model-best folder inside the training output directory.
nlp_ner = spacy.load("/content/drive/MyDrive/dataset_generator/model-best")

In [None]:
# Run the trained pipeline on one hand-written dialogue.
sample_text = "Hello there! I hope you're having a good day. Could you please tell me your name? Sure, my name is Tanya Bhargava. Great! And how old are you? I'm 36 years old. Thanks! What's your phone number, in case we need to reach you? My phone number is 8992902281. Perfect! And where do you live? My address is 01/871, Shankar Road, Bhiwani 054861."
doc = nlp_ner(sample_text)

# Highlight colour per entity label for the displaCy rendering.
colors = {
    "NAME OF PATIENT": "#F67DE3",
    "AGE": "#7DF6D9",
    "WEIGHT": "#a6e22d",
    "NUMBER": "#FF5733",
    "ADDRESS": "#3498db",
    "DATE OF ADMISSION": "#f39c12"
}
options = {"colors": colors}

# Inline visualisation of the recognised entities.
spacy.displacy.render(doc, style="ent", options=options, jupyter=True)

# Plain text/label pairs for every recognised entity, in document order.
mapped = [{"text": ent.text, "label": ent.label_} for ent in doc.ents]

print(mapped)

[{'text': 'Tanya Bhargava', 'label': 'NAME OF PATIENT'}, {'text': '36', 'label': 'AGE'}, {'text': '8992902281', 'label': 'NUMBER'}, {'text': '01/871, Shankar Road, Bhiwani 054861', 'label': 'ADDRESS'}]


In [None]:
import spacy
import pickle

# Locations of the trained pipeline and of the pickle to produce.
trained_model_dir = "/content/drive/MyDrive/dataset_generator/model-best"
pickle_path = "/content/drive/MyDrive/dataset_generator/model.pkl"

# Load the trained pipeline from disk.
nlp_ner = spacy.load(trained_model_dir)

# Serialise the whole pipeline object with pickle.
# NOTE(review): a pickle should only ever be loaded from a trusted source;
# consider whether spaCy's own to_disk/from_disk format would serve here.
with open(pickle_path, "wb") as model_file:
    pickle.dump(nlp_ner, model_file)

In [None]:
# Define the epoch data
# Precision / recall / F-score rows copied by hand from the training log above.
epoch_data = [
    {"ENTS_P": 99.32, "ENTS_R": 99.57, "ENTS_F": 99.45},
    {"ENTS_P": 99.65, "ENTS_R": 99.90, "ENTS_F": 99.77},
    {"ENTS_P": 99.75, "ENTS_R": 99.90, "ENTS_F": 99.82},
]

# "Accuracy" here is the arithmetic mean of the entity F-scores of the rows
# above, used as an approximation rather than a true accuracy figure.
f_scores = [row["ENTS_F"] for row in epoch_data]
accuracy = sum(f_scores) / len(f_scores)

print("Accuracy:", accuracy)


Accuracy: 99.67999999999999
