# Supervised Corpus Creation


### Process BENT Output
After using BENT to predict entities, the predicted entities were masked in the clinical notes in order to allow LLaMIC to extract entities a second time from the modified notes.

In [None]:
import json
import re
import pandas as pd

def blank_all_predicted_entities(document, entities, context_chars=20):
    """
    Replace every occurrence of the predicted entities with underscores.

    Each matched span is overwritten with one underscore per character, so the
    returned text has the same length as ``document`` and character offsets
    are preserved.

    Args:
        document: Text of the clinical note.
        entities: Iterable of entity strings to mask. Matching is
            case-insensitive; empty or ``None`` entries are ignored.
        context_chars: Unused. Retained so callers that pass it keep working
            (it previously sized a debug preview that was never printed).

    Returns:
        The document with all matched spans replaced by underscores.
    """
    masked = list(document)
    for entity in entities:
        if not entity:
            continue
        # Only guard an edge with a word-boundary check when that edge is a
        # word character. A plain \b next to "(" or ")" would demand a word
        # character on the outside and silently miss entities such as
        # "hepatitis (B)".
        prefix = r'(?<!\w)' if re.match(r'\w', entity[0]) else ''
        suffix = r'(?!\w)' if re.match(r'\w', entity[-1]) else ''
        pattern = re.compile(prefix + re.escape(entity) + suffix, flags=re.IGNORECASE)
        # Search the untouched text rather than the partially masked one:
        # masking "cancer" first must not stop "lung cancer" from matching.
        # Masking keeps the length, so offsets are valid in both.
        for match in pattern.finditer(document):
            start, end = match.span()
            masked[start:end] = '_' * (end - start)
    return ''.join(masked)

def extract_entities(data, entity_type='disease'):
    """
    Collect the distinct predicted entity strings of one type.

    Args:
        data: Iterable of note dictionaries; each may hold a
            'diseases_predicted' and/or 'drugs_predicted' list whose items
            carry an 'entity' key.
        entity_type: 'disease' or 'drug'; selects which list is read.

    Returns:
        Sorted list of unique entity strings.
    """
    field_by_type = {'disease': 'diseases_predicted', 'drug': 'drugs_predicted'}
    unique_names = {
        prediction['entity']
        for record in data
        for prediction in record.get(field_by_type[entity_type], [])
    }
    return sorted(unique_names)

if __name__ == "__main__":
    # Mask the BENT-predicted disease mentions in every note that has
    # predictions and write the masked notes to a new gzip-compressed CSV.
    input_file = "bent_predicts.json"  # BENT predicts JSON file
    input_document = "clinical_notes.csv.gz"  # Clinical notes CSV file
    output_document = "clinical_notes_blanked.csv.gz"

    # Explicit UTF-8 so the result does not depend on the platform default.
    with open(input_file, encoding="utf-8") as f:
        data = json.load(f)['results']

    # Load CSV data
    df = pd.read_csv(input_document, compression='gzip')

    # Keep only the rows where id is in the JSON
    df_filtered = df[df['id'].isin([note['id'] for note in data])].copy()

    for note in data:
        note_id = note['id']
        entities = extract_entities([note], entity_type='disease')

        # Compute the row selection once and reuse it for reading and writing.
        row_mask = df_filtered['id'] == note_id
        if not row_mask.any():
            # A prediction without a matching note (missing row or id type
            # mismatch) must not abort the whole run with an IndexError.
            print(f"Note {note_id} not found in {input_document}; skipping")
            continue

        # Mask each matching row from its own text, so rows that share an id
        # are not overwritten with the first row's content.
        df_filtered.loc[row_mask, 'documents'] = [
            blank_all_predicted_entities(text, entities)
            for text in df_filtered.loc[row_mask, 'documents']
        ]

    # Save only the filtered, blanked notes
    df_filtered.to_csv(output_document, index=False)


### Merge BENT and LLaMIC

In [None]:
import json

def merge_entity_jsons(json_file1, json_file2, output_file):
    """
    Merge two JSON prediction files note by note without duplicating entities.

    Both inputs must be JSON objects with a 'results' list of note
    dictionaries carrying an 'id'. The first occurrence of a note id is taken
    as the base; for later occurrences, entries of 'diseases_predicted' and
    'drugs_predicted' are appended unless an entry with the same
    ('entity', 'icd') or ('entity', 'mesh') pair is already present.

    Args:
        json_file1: Path of the first prediction file (its notes come first).
        json_file2: Path of the second prediction file.
        output_file: Path where the merged {'results': [...]} JSON is written.
    """
    # Read as UTF-8 to match the encoding used when writing the result.
    with open(json_file1, encoding='utf-8') as f1:
        data1 = json.load(f1)['results']

    with open(json_file2, encoding='utf-8') as f2:
        data2 = json.load(f2)['results']

    # Merge entities by note id
    merged_dict = {}

    for item in data1 + data2:
        note_id = item['id']
        if note_id not in merged_dict:
            merged_dict[note_id] = item.copy()
            continue

        merged_note = merged_dict[note_id]
        for key, id_field in [('diseases_predicted', 'icd'), ('drugs_predicted', 'mesh')]:
            if key not in item:
                continue

            # Work on a fresh list so the loaded input records are not
            # modified through the shallow copy above.
            existing_entities = list(merged_note.get(key, []))
            existing_set = {(e['entity'], e.get(id_field)) for e in existing_entities}

            for e in item[key]:
                signature = (e['entity'], e.get(id_field))
                if signature in existing_set:
                    continue
                print(f"Adding new entity to note {note_id}: {e['entity']} ({id_field})")
                existing_entities.append(e)
                # Remember what was just added; otherwise a pair repeated
                # within the same incoming list would be appended twice.
                existing_set.add(signature)

            merged_note[key] = existing_entities

    # Convert merged dictionary back to list
    merged_data = list(merged_dict.values())

    # Save merged JSON
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump({'results': merged_data}, f, indent=4, ensure_ascii=False)

    print(f"Merged JSON saved to: {output_file}")


if __name__ == "__main__":
    # Combine the BENT predictions with the LLaMIC predictions made on the
    # blanked notes and write the union to a single JSON file.
    bent_file = "bent_predicts.json"
    llamic_file = "llamic_predicts_on_blanked_notes.json"
    output_file = "entities_merged.json"

    merge_entity_jsons(
        json_file1=bent_file,
        json_file2=llamic_file,
        output_file=output_file,
    )


### Cluster Predicted Entities for Manual Correction
Cluster predicted entities for subsequent manual curation to remove incorrect entities

In [None]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import DBSCAN
import numpy as np

def cluster_entities(entities, eps=0.5, min_samples=2):
    """
    Group entity strings by lexical similarity using TF-IDF and DBSCAN.

    Entities are vectorized with TF-IDF, pairwise cosine distances
    (1 - cosine similarity, clipped to [0, 1]) are computed, and DBSCAN is
    run on that precomputed distance matrix.

    Args:
        entities: Iterable of entity strings.
        eps: DBSCAN neighbourhood radius, expressed as a cosine distance.
        min_samples: DBSCAN ``min_samples`` parameter.

    Returns:
        Dictionary mapping each integer cluster label to the list of entities
        assigned to it (DBSCAN reports noise points under label -1). An empty
        input yields an empty dictionary.
    """
    # Materialize once: the sequence is consumed by the vectorizer and again
    # when pairing labels with entities below.
    entities = list(entities)
    if not entities:
        # Nothing to vectorize; avoid handing an empty corpus to TF-IDF.
        return {}

    # TF-IDF vectorization
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(entities)

    # Compute cosine similarity and convert to distance; clipping removes
    # tiny negative values caused by floating-point rounding.
    cosine_sim = cosine_similarity(tfidf_matrix)
    distance_matrix = np.clip(1 - cosine_sim, 0, 1)

    # DBSCAN clustering
    dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed')
    labels = dbscan.fit_predict(distance_matrix)

    # Group entities by cluster, using plain ints as keys instead of NumPy
    # scalars.
    clusters = {}
    for label, entity in zip(labels, entities):
        clusters.setdefault(int(label), []).append(entity)

    return clusters


def save_clusters_to_file(clusters, filename):
    """
    Write the clusters to a text file for manual curation.

    Each cluster is written as a "Cluster <label>:" header followed by one
    " - <entity>" line per member and a blank separator line.

    Args:
        clusters: Dictionary mapping a cluster label to a list of entities.
        filename: Path of the text file to create or overwrite.
    """
    # Write UTF-8 explicitly: the curated file is read back as UTF-8, so the
    # platform default encoding must not be used here.
    with open(filename, 'w', encoding='utf-8') as f:
        for label, entity_list in clusters.items():
            f.write(f"Cluster {label}:\n")
            f.writelines(f" - {entity}\n" for entity in entity_list)
            f.write("\n")


if __name__ == "__main__":
    input_file = 'automatic_predictions.json'
    # Entity type to cluster: 'disease' or 'drug'. It was previously used
    # without being defined, which raised a NameError. With 'disease' the
    # output is clusters_diseases.txt, the file the curation step reads.
    entity_type = 'disease'

    with open(input_file, encoding='utf-8') as f:
        data = json.load(f)['results']

    # Extract entities (extract_entities comes from the BENT-processing cell
    # above and must already be defined in the session).
    entities = extract_entities(data, entity_type=entity_type)
    # Cluster entities
    clusters = cluster_entities(entities, eps=0.5, min_samples=2)
    save_clusters_to_file(clusters, f"clusters_{entity_type}s.txt")


Script to update the JSON file by removing entities that were excluded during cluster curation

In [None]:
import json
import re

def load_kept_entities(cluster_file):
    """
    Collect the entities that survived manual curation of a cluster file.

    Every line that, once stripped, starts with '- ' contributes the text
    after that marker (stripped again). All other lines, such as the
    "Cluster N:" headers and blank separators, are ignored.

    Args:
        cluster_file: Path of the manually edited cluster text file.

    Returns:
        Set of entity strings to keep.
    """
    with open(cluster_file, 'r', encoding='utf-8') as handle:
        stripped_lines = (raw.strip() for raw in handle)
        kept = {
            text[2:].strip()
            for text in stripped_lines
            if text.startswith('- ')
        }
    print(f"Kept entities loaded: {kept}")
    return kept

def filter_entities_in_json(input_json_file, output_json_file, kept_entities):
    """
    Drop predicted entities that are not listed in ``kept_entities``.

    For every note under 'results', the 'diseases_predicted' and
    'drugs_predicted' lists (when present) are both filtered against the same
    ``kept_entities`` collection. Each removal is reported on stdout and the
    filtered document is written to ``output_json_file``.

    Args:
        input_json_file: Path of the JSON file holding the predictions.
        output_json_file: Path where the filtered JSON is written.
        kept_entities: Collection of entity strings that must be preserved.
    """
    with open(input_json_file, 'r', encoding='utf-8') as source:
        payload = json.load(source)

    for record in payload['results']:
        # Drug filtering is active: drugs are checked against the same kept
        # set as diseases.
        for field in ('diseases_predicted', 'drugs_predicted'):
            if field not in record:
                continue
            survivors = []
            for prediction in record[field]:
                if prediction['entity'] not in kept_entities:
                    print(f"Removed entity: '{prediction['entity']}' from note id {record['id']}")
                    continue
                survivors.append(prediction)
            record[field] = survivors

    # Persist the filtered predictions
    with open(output_json_file, 'w', encoding='utf-8') as target:
        json.dump(payload, target, indent=4, ensure_ascii=False)

    print(f"Filtered JSON saved to: {output_json_file}")


if __name__ == "__main__":
    # Apply the manual cluster curation to the automatic predictions.
    # NOTE(review): only the disease cluster file is loaded here, while
    # filter_entities_in_json also filters 'drugs_predicted' against the same
    # set - confirm that dropping drugs absent from this file is intended.
    cluster_file = 'clusters_diseases.txt' # manually edited file
    input_json_file = 'automatic_predictions.json'
    output_json_file = 'automatic_predictions_filtered.json'

    kept_entities = load_kept_entities(cluster_file)
    filter_entities_in_json(
        input_json_file=input_json_file,
        output_json_file=output_json_file,
        kept_entities=kept_entities,
    )


### Correct Terminologies
Create a set of entities with their corresponding ICD or MeSH codes in a TXT file, one pair per line, to allow subsequent manual curation.

In [None]:
import json

def save_entities_to_txt(json_file, output_txt_file):
    """
    Export the distinct (entity, code) pairs of a prediction JSON to a TXT file.

    Diseases are paired with their 'icd' value and drugs with their 'mesh'
    value. Each output line holds the entity and the code separated by a tab;
    lines are de-duplicated and sorted. Pairs with an empty or missing entity
    or code are left out.

    Args:
        json_file: Path of the JSON file with a 'results' list of notes.
        output_txt_file: Path of the TXT file to create or overwrite.
    """
    def _clean(value):
        # A missing key or JSON null becomes an empty string instead of
        # failing on None.strip(); non-string codes are stringified.
        return '' if value is None else str(value).strip()

    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)['results']

    lines = set()  # Use set to avoid duplicates

    for note in data:
        for key, id_field in [('diseases_predicted', 'icd'), ('drugs_predicted', 'mesh')]:
            for e in note.get(key, []):
                entity_text = _clean(e.get('entity'))
                entity_id = _clean(e.get(id_field))
                if entity_text and entity_id:
                    lines.add(f"{entity_text}\t{entity_id}")

    # Save to TXT file, one entity per line
    with open(output_txt_file, 'w', encoding='utf-8') as f:
        f.writelines(line + '\n' for line in sorted(lines))

    print(f"Entities saved to {output_txt_file}")


if __name__ == "__main__":
    # Export the filtered predictions as entity/code pairs for manual review.
    input_json_file = "automatic_predictions_filtered.json"
    output_txt_file = "entities_for_manual_correction.txt"

    save_entities_to_txt(
        json_file=input_json_file,
        output_txt_file=output_txt_file,
    )


Script to update the JSON file by updating ICD or MeSH codes

In [None]:
import json

def load_kept_entities(txt_file):
    """
    Load manually corrected entity/code pairs from a TXT file.

    Each non-empty line is expected to hold exactly two tab-separated fields:
    the entity and its code. Lines with any other number of fields are
    skipped and reported on stdout, because entities missing from the
    returned mapping are treated as removed by the caller. If an entity
    appears on several lines, the last line wins.

    Args:
        txt_file: Path of the manually curated TXT file.

    Returns:
        Dictionary mapping each entity string to its code string.
    """
    kept_entities = {}
    with open(txt_file, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            parts = line.split('\t')
            if len(parts) != 2:
                # Report instead of dropping silently, so a curation slip
                # (e.g. spaces typed instead of a tab) is visible.
                print(f"Skipping malformed line {line_number} in {txt_file}: {line!r}")
                continue
            entity, entity_id = parts
            kept_entities[entity.strip()] = entity_id.strip()
    return kept_entities

def update_json_with_kept_entities(input_json, txt_file, output_json):
    """
    Apply the manually corrected codes to a prediction JSON.

    Entities listed in the TXT file are kept and their 'icd' (diseases) or
    'mesh' (drugs) value is overwritten with the code from the TXT. Entities
    not listed are removed and reported on stdout. Both prediction lists are
    written back for every note, so a note that lacked one of them ends up
    with an empty list under that key.

    Args:
        input_json: Path of the JSON file with the predictions.
        txt_file: Path of the curated TXT file with entity/code pairs.
        output_json: Path where the updated JSON is written.
    """
    code_by_entity = load_kept_entities(txt_file)

    with open(input_json, 'r', encoding='utf-8') as source:
        payload = json.load(source)

    field_pairs = (('diseases_predicted', 'icd'), ('drugs_predicted', 'mesh'))
    for record in payload['results']:
        for list_key, code_key in field_pairs:
            retained = []
            for prediction in record.get(list_key, []):
                name = prediction['entity']
                if name not in code_by_entity:
                    print(f"Removed entity '{name}' from note {record['id']}")
                    continue
                # Overwrite the code with the curated value from the TXT
                prediction[code_key] = code_by_entity[name]
                retained.append(prediction)
            record[list_key] = retained

    # Persist the corrected predictions
    with open(output_json, 'w', encoding='utf-8') as target:
        json.dump(payload, target, indent=4, ensure_ascii=False)

    print(f"Updated JSON saved to: {output_json}")


if __name__ == "__main__":
    # Apply the manually corrected ICD/MeSH codes to the filtered predictions.
    txt_file = "entities_for_manual_correction.txt"
    input_json = "automatic_predictions_filtered.json"
    output_json = "semi-automatic_predictions.json"

    update_json_with_kept_entities(
        input_json=input_json,
        txt_file=txt_file,
        output_json=output_json,
    )


After this manual pre-correction, run the script `note_level_entity_correction.py` on `semi-automatic_predictions.json` (the file written by the previous cell) to perform corrections at the note and entity levels.