#### Checking for matched entities whose length is less than 3. This helps verify that all the entities were matched accurately.

In [45]:
def print_short_matched_words(output_file, min_length=3):
    """Print the lines of a tab-separated match file whose matched word is short.

    Each line is stripped and split on tabs; the fifth column is treated as
    the matched word. Lines with fewer than five columns are ignored.

    Args:
        output_file: Path of the tab-separated file to scan (read as UTF-8).
        min_length: Matched words strictly shorter than this many characters
            are reported. Defaults to 3, the threshold originally hard-coded.

    Returns:
        None. Matching lines are printed; if the file cannot be read, an error
        message is printed instead of raising.
    """
    try:
        with open(output_file, "r", encoding='utf-8') as f:
            # Stream the file line by line rather than loading it all with
            # readlines(); the result is the same but memory use stays flat.
            for line in f:
                stripped = line.strip()
                parts = stripped.split('\t')
                if len(parts) > 4:  # Ensure there are enough parts in the line
                    matched_word = parts[4]  # Matched word is in the fifth column
                    if len(matched_word) < min_length:
                        print(stripped)
    except IOError as e:
        print(f"Error reading file {output_file}: {e}")

# Path of the tab-separated match file to inspect
output_file = "matched_entities.txt"

# Print every line whose matched word (fifth column) is shorter than 3 characters
print_short_matched_words(output_file)


DISO	333	335	27911423_en.txt	on
DISO	768	770	27911423_en.txt	on
ANATOMY	48	50	27911423_en.txt	in
ANATOMY	195	197	27911423_en.txt	in
ANATOMY	665	667	27911423_en.txt	in
ANATOMY	726	728	27911423_en.txt	in
ANATOMY	111	113	28556645_en.txt	in
ANATOMY	309	311	28556645_en.txt	in
ANATOMY	353	355	28556645_en.txt	in
ANATOMY	515	517	28556645_en.txt	in
ANATOMY	707	709	28556645_en.txt	in
ANATOMY	808	810	28556645_en.txt	in
ANATOMY	856	858	28556645_en.txt	in
ANATOMY	876	878	28556645_en.txt	in
ANATOMY	934	936	28556645_en.txt	in
ANATOMY	954	956	28556645_en.txt	in
ANATOMY	989	991	28556645_en.txt	in
ANATOMY	1006	1008	28556645_en.txt	in
ANATOMY	1190	1192	28556645_en.txt	in
ANATOMY	1341	1343	28556645_en.txt	in
DISO	263	265	28250530_en.txt	on
ANATOMY	38	40	28250530_en.txt	in
ANATOMY	130	132	28250530_en.txt	in
ANATOMY	206	208	28250530_en.txt	in
ANATOMY	657	659	28250530_en.txt	in
ANATOMY	772	774	28250530_en.txt	in
ANATOMY	840	842	28250530_en.txt	in
ANATOMY	920	922	28250530_en.txt	in
ANATOMY	1056	1058	28250530_

#### Matching the entities with the test file generated using PubMedBERT and changing the format of the file to match the required format

In [23]:
import os
import re
import json

def read_entity_dictionary(entity_folder):
    """Load one set of terms per ``.txt`` file found in ``entity_folder``.

    The entity type is the part of the file name before the first dot
    (``DISO.txt`` -> ``"DISO"``); every non-blank line of the file becomes one
    term. Files without a ``.txt`` suffix are ignored.

    Args:
        entity_folder: Directory containing the dictionary files.

    Returns:
        dict mapping entity type (str) to a set of term strings.
    """
    entity_dictionary = {}
    for filename in os.listdir(entity_folder):
        if not filename.endswith(".txt"):
            continue
        entity_type = filename.split(".")[0]
        # Read explicitly as UTF-8 so non-ASCII terms decode the same way on
        # every platform instead of depending on the locale default.
        with open(os.path.join(entity_folder, filename), "r", encoding="utf-8") as f:
            raw_terms = f.read().splitlines()
        # Trim surrounding whitespace and drop blank lines so that empty or
        # padded strings never end up as dictionary terms.
        entity_dictionary[entity_type] = {
            term.strip() for term in raw_terms if term.strip()
        }
    return entity_dictionary

def match_text_with_entity_dicts(test_folder, entity_folder, output_file):
    """Find dictionary terms in every test document and write them as JSONL.

    For each ``.txt`` file in ``test_folder`` the text is searched,
    case-insensitively and on ``\\b`` word boundaries, for every dictionary
    term of at least three characters. One JSON object per document is written
    to ``output_file`` with the keys ``entities`` (a list of
    ``[start, end, entity_type]``), ``id`` (file name without the ``.txt``
    suffix) and ``text``.

    Args:
        test_folder: Directory containing the documents to annotate.
        entity_folder: Directory of dictionary files, passed to
            ``read_entity_dictionary``.
        output_file: Path of the JSONL file to create or overwrite.

    Returns:
        None.
    """
    entity_dictionary = read_entity_dictionary(entity_folder)

    # Compile every pattern once instead of once per document: the patterns do
    # not depend on the document being scanned.
    # NOTE(review): \b only matches at a word/non-word transition, so a term
    # that starts or ends with punctuation needs an adjacent word character to
    # match -- confirm this is acceptable for the dictionaries in use.
    compiled_patterns = [
        (entity_type, re.compile(r'\b' + re.escape(term) + r'\b', flags=re.IGNORECASE))
        for entity_type, terms in entity_dictionary.items()
        for term in terms
        if len(term) >= 3  # terms shorter than 3 characters are skipped
    ]

    results = []
    # Sort the listing so the output order is reproducible across runs.
    for test_file_name in sorted(os.listdir(test_folder)):
        if not test_file_name.endswith(".txt"):
            continue
        test_file_path = os.path.join(test_folder, test_file_name)
        # Strip only the trailing extension; str.replace would also remove
        # ".txt" occurring in the middle of a file name.
        document_id = os.path.splitext(test_file_name)[0]

        # Decode as UTF-8 so the character offsets are the same on every
        # platform (the JSONL output is written as UTF-8 as well).
        with open(test_file_path, "r", encoding="utf-8") as f_test:
            test_text = f_test.read()

        entities = [
            [match.start(), match.end(), entity_type]
            for entity_type, pattern in compiled_patterns
            for match in pattern.finditer(test_text)
        ]

        # Order by start, then end, then type so ties are deterministic.
        entities.sort()

        results.append({
            "entities": entities,
            "id": document_id,
            "text": test_text
        })

    # Write results to a .jsonl file, one document per line
    with open(output_file, 'w', encoding='utf-8') as f_out:
        for result in results:
            json.dump(result, f_out)
            f_out.write('\n')

# Input/output locations for the dictionary matching step
test_folder = "test"  # folder scanned for the .txt documents to annotate
entity_folder = "dict"  # folder of dictionary files, one <ENTITY_TYPE>.txt per type
output_file = "filtered_matched_entities.jsonl"  # JSONL file that receives the matches

# Match text from test files with entity dictionary files and write to output file
match_text_with_entity_dicts(test_folder, entity_folder, output_file)

print(f"Matching completed. Results saved to {output_file}")


Matching completed. Results saved to filtered_matched_entities.jsonl


#### Merging the entities from the dictionary and the test file containing the duplicates

In [49]:
import json
import os

def convert_to_jsonl(input_file, output_file, test_folder):
    """Convert a tab-separated match file into one JSONL record per document.

    Each input row is expected to hold at least five tab-separated columns:
    entity type, start offset, end offset, document file name and matched
    word. Rows with fewer columns are skipped. Rows are grouped by document id
    (the file name without its ``.txt`` suffix), the document text is loaded
    from ``test_folder`` and one JSON object with the keys ``entities``,
    ``id`` and ``text`` is written per document.

    Args:
        input_file: Path of the tab-separated match file.
        output_file: Path of the JSONL file to create or overwrite.
        test_folder: Directory holding ``<doc_id>.txt`` for every document.

    Returns:
        None. A message is printed for each document whose text file is
        missing; that document is still written, with an empty ``text``.

    Raises:
        ValueError: If a start or end offset is not an integer.
    """
    data = {}
    # Read and aggregate entries by document ID
    with open(input_file, "r", encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split('\t')
            if len(parts) < 5:
                continue
            # Only the first four columns are used. Slicing (instead of a fixed
            # 5-way unpack) keeps rows with extra columns from raising.
            entity_type, start, end, doc_id = parts[:4]
            start, end = int(start), int(end)
            # Remove only a trailing ".txt"; str.replace would also strip the
            # substring from the middle of an id.
            if doc_id.endswith('.txt'):
                doc_id_modified = doc_id[:-len('.txt')]
            else:
                doc_id_modified = doc_id
            document = data.setdefault(doc_id_modified, {"entities": [], "text": ""})
            document["entities"].append([start, end, entity_type])

    # Load the text for each document ID from the test folder
    for doc_id, content in data.items():
        text_filename = f"{doc_id}.txt"  # Ensure the naming convention matches exactly
        try:
            with open(os.path.join(test_folder, text_filename), "r", encoding='utf-8') as text_file:
                content["text"] = text_file.read()
        except FileNotFoundError:
            print(f"Text file for {doc_id} not found in {test_folder}")

    # Write to a JSONL file with the desired order of keys
    with open(output_file, "w", encoding='utf-8') as outfile:
        for doc_id, content in data.items():
            # Sort entities by start position (stable, so ties keep file order)
            content["entities"].sort(key=lambda x: x[0])
            json_object = {
                "entities": content["entities"],
                "id": doc_id,
                "text": content["text"]
            }
            json.dump(json_object, outfile)
            outfile.write('\n')

# Input/output locations for the TSV -> JSONL conversion
input_file = "umls_may15/filtered_matched_entities.txt"  # tab-separated matches to convert
output_file = "umls_may15/matched_entities.jsonl"  # JSONL file to create
test_folder = "test"  # Ensure this is the correct path to the folder containing the text files

# Convert to JSONL
convert_to_jsonl(input_file, output_file, test_folder)

print(f"Conversion completed. Results saved to {output_file}")


Conversion completed. Results saved to umls_may15/matched_entities.jsonl


#### Checking the counts after merging

In [53]:
import json

# Function to count entities in a JSONL file for each id
def count_entities_for_each_id(file_path):
    """Count entities, in total and per entity type, for each record of a JSONL file.

    Every non-blank line must be a JSON object with an ``id`` key; its
    optional ``entities`` list holds ``[start, end, entity_type]`` triples.
    If an id occurs more than once, the last record wins.

    Args:
        file_path: Path of the JSONL file (read as UTF-8).

    Returns:
        A tuple ``(entity_count_per_id, type_count_per_id)`` where the first
        maps id -> number of entities and the second maps
        id -> {entity_type: count}.
    """
    entity_count_per_id = {}
    type_count_per_id = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Skip blank lines (e.g. a trailing newline) instead of letting
            # json.loads fail on an empty string.
            if not line.strip():
                continue
            data = json.loads(line)
            entities = data.get('entities', [])
            doc_id = data['id']
            entity_count_per_id[doc_id] = len(entities)
            type_counts = {}
            for start, end, entity_type in entities:
                type_counts[entity_type] = type_counts.get(entity_type, 0) + 1
            type_count_per_id[doc_id] = type_counts
    return entity_count_per_id, type_count_per_id

# Main function to print entity counts for each id
def print_entity_counts_for_each_id(file_path):
    """Print the total and per-type entity counts for every id in a JSONL file.

    The counts come from ``count_entities_for_each_id(file_path)``; ids and
    entity types are printed in the order that function returns them.
    """
    totals, per_type = count_entities_for_each_id(file_path)
    for doc_id in totals:
        print(f"ID: {doc_id}, Total Entity Count: {totals[doc_id]}")
        print("Entity Type Counts:")
        type_counts = per_type[doc_id]
        for label in type_counts:
            print(f"  {label}: {type_counts[label]}")

# Summarise the entity counts of the merged output
jsonl_file_path = 'umls_may15/merged_output.jsonl'  # Path of the JSONL file to summarise
print_entity_counts_for_each_id(jsonl_file_path)

ID: 27735898_en, Total Entity Count: 110
Entity Type Counts:
  CHEM: 9
  DISO: 14
  PHYS: 41
  ANATOMY: 34
  FINDING: 8
  LABPROC: 4
ID: 26978605_en, Total Entity Count: 142
Entity Type Counts:
  FINDING: 27
  PHYS: 28
  DISO: 54
  ANATOMY: 32
  LABPROC: 1
ID: 29403100_en, Total Entity Count: 37
Entity Type Counts:
  DISO: 21
  CHEM: 2
  FINDING: 7
  LABPROC: 1
  PHYS: 6
ID: 27911425_en, Total Entity Count: 88
Entity Type Counts:
  PHYS: 23
  ANATOMY: 36
  FINDING: 11
  DISO: 8
  LABPROC: 9
  DEVICE: 1
ID: 29953088_en, Total Entity Count: 57
Entity Type Counts:
  ANATOMY: 22
  DEVICE: 9
  DISO: 11
  FINDING: 9
  INJURY_POISONING: 4
  PHYS: 2
ID: 26977916_en, Total Entity Count: 122
Entity Type Counts:
  CHEM: 27
  DISO: 33
  PHYS: 29
  ANATOMY: 17
  FINDING: 16
ID: 27845314_en, Total Entity Count: 65
Entity Type Counts:
  FINDING: 10
  PHYS: 5
  ANATOMY: 4
  DISO: 39
  CHEM: 7
ID: 29695881_en, Total Entity Count: 108
Entity Type Counts:
  DISO: 49
  FINDING: 25
  PHYS: 34
ID: 29027524_

#### Merging the entities and removing the duplicates

In [54]:
import json

def read_jsonl_file(file_path):
    """Read a UTF-8 JSONL file and return its records as a list.

    Args:
        file_path: Path of the JSONL file.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank lines (such as a trailing newline at the end of the file) are
        # skipped; json.loads would raise on an empty string.
        return [json.loads(line) for line in file if line.strip()]

def merge_jsonl_files(filtered_path, test_path, output_path):
    """Add the entities of one JSONL file to the records of another, without duplicates.

    Records are matched on their ``id``. For every record of ``filtered_path``
    whose id also exists in ``test_path``, each entity not already present on
    the test record is appended to it. Records of ``filtered_path`` whose id
    is absent from ``test_path`` are ignored. Entities already on a test
    record are left untouched.

    Args:
        filtered_path: JSONL file providing the additional entities.
        test_path: JSONL file whose records are extended and written out.
        output_path: Path of the merged JSONL file to create or overwrite.

    Returns:
        None.
    """
    # Read both JSONL files
    filtered_data = read_jsonl_file(filtered_path)
    test_data = read_jsonl_file(test_path)

    # Create a dictionary to map IDs to test data for easy lookup
    test_data_dict = {item['id']: item for item in test_data}

    # Combine data and remove duplicates
    for item in filtered_data:
        target = test_data_dict.get(item['id'])
        if target is None:
            continue

        # Ensure there is an 'entities' key
        existing_entities = target.setdefault('entities', [])

        # Track what has been seen with a set of serialised entities. The set
        # is updated as entities are appended, so an entity repeated inside
        # item['entities'] is added only once, and each membership test is a
        # hash lookup instead of a scan of the whole list.
        seen = {json.dumps(entity, sort_keys=True) for entity in existing_entities}
        for entity in item['entities']:
            key = json.dumps(entity, sort_keys=True)
            if key not in seen:
                seen.add(key)
                existing_entities.append(entity)

    # Write results to the new JSONL file
    with open(output_path, 'w', encoding='utf-8') as f_out:
        for item in test_data_dict.values():
            json.dump(item, f_out)
            f_out.write('\n')

# Input/output locations for the merge step
filtered_path = 'umls_may15/matched_entities.jsonl'  # records supplying the additional entities
test_path = 'test.jsonl'  # records that are extended and written out
output_path = 'umls_may15/merged_output_test.jsonl'  # merged JSONL file to create

# Merge the JSONL files
merge_jsonl_files(filtered_path, test_path, output_path)

print(f"Data merged successfully without duplicates. Results saved to {output_path}")


Data merged successfully without duplicates. Results saved to umls_may15/merged_output_test.jsonl


In [55]:
import json

# Function to count entities in a JSONL file for each id
def count_entities_for_each_id(file_path):
    """Count entities, in total and per entity type, for each record of a JSONL file.

    Every non-blank line must be a JSON object with an ``id`` key; its
    optional ``entities`` list holds ``[start, end, entity_type]`` triples.
    If an id occurs more than once, the last record wins.

    Args:
        file_path: Path of the JSONL file (read as UTF-8).

    Returns:
        A tuple ``(entity_count_per_id, type_count_per_id)`` where the first
        maps id -> number of entities and the second maps
        id -> {entity_type: count}.
    """
    entity_count_per_id = {}
    type_count_per_id = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Skip blank lines (e.g. a trailing newline) instead of letting
            # json.loads fail on an empty string.
            if not line.strip():
                continue
            data = json.loads(line)
            entities = data.get('entities', [])
            doc_id = data['id']
            entity_count_per_id[doc_id] = len(entities)
            type_counts = {}
            for start, end, entity_type in entities:
                type_counts[entity_type] = type_counts.get(entity_type, 0) + 1
            type_count_per_id[doc_id] = type_counts
    return entity_count_per_id, type_count_per_id

# Main function to print entity counts for each id
def print_entity_counts_for_each_id(file_path):
    """Print the total and per-type entity counts for every id in a JSONL file.

    The counts come from ``count_entities_for_each_id(file_path)``; ids and
    entity types are printed in the order that function returns them.
    """
    totals, per_type = count_entities_for_each_id(file_path)
    for doc_id in totals:
        print(f"ID: {doc_id}, Total Entity Count: {totals[doc_id]}")
        print("Entity Type Counts:")
        type_counts = per_type[doc_id]
        for label in type_counts:
            print(f"  {label}: {type_counts[label]}")

# Summarise the entity counts of the de-duplicated merge output
jsonl_file_path = 'umls_may15/merged_output_test.jsonl'  # Path of the JSONL file to summarise
print_entity_counts_for_each_id(jsonl_file_path)

ID: 27735898_en, Total Entity Count: 85
Entity Type Counts:
  CHEM: 9
  DISO: 13
  PHYS: 33
  ANATOMY: 18
  FINDING: 8
  LABPROC: 4
ID: 26978605_en, Total Entity Count: 106
Entity Type Counts:
  FINDING: 23
  PHYS: 20
  DISO: 45
  ANATOMY: 17
  LABPROC: 1
ID: 29403100_en, Total Entity Count: 29
Entity Type Counts:
  DISO: 13
  CHEM: 2
  FINDING: 7
  LABPROC: 1
  PHYS: 6
ID: 27911425_en, Total Entity Count: 76
Entity Type Counts:
  PHYS: 23
  ANATOMY: 26
  FINDING: 10
  DISO: 7
  LABPROC: 9
  DEVICE: 1
ID: 29953088_en, Total Entity Count: 52
Entity Type Counts:
  ANATOMY: 19
  DEVICE: 9
  DISO: 9
  FINDING: 9
  INJURY_POISONING: 4
  PHYS: 2
ID: 26977916_en, Total Entity Count: 68
Entity Type Counts:
  CHEM: 14
  DISO: 18
  PHYS: 16
  ANATOMY: 10
  FINDING: 10
ID: 27845314_en, Total Entity Count: 46
Entity Type Counts:
  FINDING: 10
  PHYS: 3
  ANATOMY: 4
  DISO: 22
  CHEM: 7
ID: 29695881_en, Total Entity Count: 72
Entity Type Counts:
  DISO: 28
  FINDING: 25
  PHYS: 19
ID: 29027524_en, 

In [58]:
import json

# Function to count entities in a JSONL file for each id
def count_entities_for_each_id(file_path):
    """Return the number of entities for each record id of a JSONL file.

    Every non-blank line must be a JSON object with an ``id`` key; a record
    without an ``entities`` key counts as zero. If an id occurs more than
    once, the last record wins.

    NOTE(review): an earlier cell of this notebook defines a function with the
    same name that returns a ``(totals, per_type)`` tuple. Running this cell
    replaces it, so ``print_entity_counts_for_each_id`` needs that earlier
    cell re-run before it is used again.

    Args:
        file_path: Path of the JSONL file (read as UTF-8).

    Returns:
        dict mapping id -> number of entities.
    """
    entity_count_per_id = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Skip blank lines (e.g. a trailing newline) instead of letting
            # json.loads fail on an empty string.
            if not line.strip():
                continue
            data = json.loads(line)
            entity_count_per_id[data['id']] = len(data.get('entities', []))
    return entity_count_per_id

# Function to compare entity counts between two JSONL files for each id
def compare_entity_counts(jsonl_file_path_1, jsonl_file_path_2):
    """Print, for each id, the entity count of file 1 minus that of file 2.

    An id missing from one of the files is treated as having zero entities
    there. Ids of the first file are reported first, in file order, followed
    by the ids that occur only in the second file.

    Args:
        jsonl_file_path_1: Path of the first JSONL file.
        jsonl_file_path_2: Path of the second JSONL file.

    Returns:
        None.
    """
    entity_count_per_id_1 = count_entities_for_each_id(jsonl_file_path_1)
    entity_count_per_id_2 = count_entities_for_each_id(jsonl_file_path_2)

    for id_, count_1 in entity_count_per_id_1.items():
        count_2 = entity_count_per_id_2.get(id_, 0)
        difference = count_1 - count_2
        print(f"ID: {id_}, Difference in Entity Count: {difference}")

    # Ids present only in the second file would otherwise go unreported, which
    # hides documents that are missing from the first file entirely.
    for id_, count_2 in entity_count_per_id_2.items():
        if id_ not in entity_count_per_id_1:
            difference = 0 - count_2
            print(f"ID: {id_}, Difference in Entity Count: {difference}")

# Compare the per-id entity counts of two merge outputs (file 1 minus file 2)
#jsonl_file_path_1 = 'umls/merged_output_test.jsonl'
jsonl_file_path_1 = 'merged_output_no_duplicates.jsonl'  # First JSONL file (minuend of the difference)
jsonl_file_path_2 = 'umls_may15/merged_output_test.jsonl'  # Second JSONL file (subtrahend of the difference)
compare_entity_counts(jsonl_file_path_1, jsonl_file_path_2)

ID: 27735898_en, Difference in Entity Count: 0
ID: 26978605_en, Difference in Entity Count: 1
ID: 29403100_en, Difference in Entity Count: 0
ID: 27911425_en, Difference in Entity Count: 0
ID: 29953088_en, Difference in Entity Count: 0
ID: 26977916_en, Difference in Entity Count: 1
ID: 27845314_en, Difference in Entity Count: 0
ID: 29695881_en, Difference in Entity Count: 0
ID: 29027524_en, Difference in Entity Count: 0
ID: 26288209_en, Difference in Entity Count: 0
ID: 29795108_en, Difference in Entity Count: 2
ID: 29076467_en, Difference in Entity Count: 0
ID: 27600777_en, Difference in Entity Count: 2
ID: 29240055_en, Difference in Entity Count: 0
ID: 27843162_en, Difference in Entity Count: 1
ID: 27459622_en, Difference in Entity Count: 0
ID: 30199050_en, Difference in Entity Count: 0
ID: 29411741_en, Difference in Entity Count: 1
ID: 27459485_en, Difference in Entity Count: 0
ID: 27804929_en, Difference in Entity Count: 0
ID: 27213795_en, Difference in Entity Count: 3
ID: 27735917_