# Supervised Relations Corpus Creation
### Merge CCD and CCDt Corpus

In [None]:
import json
import pandas as pd
import re

# ---- Load data ----
# Each corpus file is a JSON document; only its top-level 'results' entry
# is kept.
path_ccd = "ccd_corpus.json"
path_ccdt = "ccdt_corpus.json"

with open(path_ccd, encoding='utf-8') as f:
    ccd_payload = json.load(f)
data_ccd = ccd_payload['results']

with open(path_ccdt, encoding='utf-8') as f:
    ccdt_payload = json.load(f)
data_ccdt = ccdt_payload['results']

# Table holding the clinical note texts
df = pd.read_csv("clinical_notes_text.csv")

# ---- Function to apply entity tags using indices ----
def add_indexed_tags(text, drug_spans, disease_spans):
    """Wrap drug and disease spans of ``text`` in numbered tags.

    Every span is a mapping with ``'start'`` and ``'end'`` character offsets
    into ``text``. The n-th drug span (counting from 1) is rendered as
    ``<drugN>...</drugN>`` and the n-th disease span as
    ``<diseaseN>...</diseaseN>``.

    When spans overlap, only the longest one is kept; equal lengths are
    resolved by the smaller start offset and then by input order (drugs
    before diseases). Dropped spans keep their number reserved, so the
    numbering in the result may contain gaps.

    Returns the tagged text.
    """
    candidates = []
    for label, group in (("drug", drug_spans), ("disease", disease_spans)):
        for number, span in enumerate(group, start=1):
            begin, stop = span['start'], span['end']
            replacement = f"<{label}{number}>{text[begin:stop]}</{label}{number}>"
            candidates.append((begin, stop, replacement))

    # (begin - stop) is the negated length, so ascending order puts the
    # longest spans first; the start offset is the secondary key.
    ranked = sorted(candidates, key=lambda c: (c[0] - c[1], c[0]))

    # Greedy selection: a span survives only if it is disjoint from every
    # span that has already been accepted.
    kept = []
    for begin, stop, replacement in ranked:
        clashes = any(
            stop > kept_begin and begin < kept_stop
            for kept_begin, kept_stop, _ in kept
        )
        if clashes:
            continue
        kept.append((begin, stop, replacement))

    # Replace from the end of the text towards the beginning so that the
    # offsets of the spans still to be processed stay valid.
    kept.sort(key=lambda c: c[0], reverse=True)
    for begin, stop, replacement in kept:
        text = text[:begin] + replacement + text[stop:]
    return text


# ---- Process notes ----
# Index the CCDt entries by note id once, so the whole list is not rescanned
# for every CCD note.
ccdt_by_id = {}
for entry in data_ccdt:
    ccdt_by_id.setdefault(entry['id'], []).append(entry)

# Rows are collected in a plain list and converted to a DataFrame once at the
# end: growing a DataFrame with pd.concat inside the loop re-copies all
# previously collected rows on every iteration.
sentence_rows = []

for note in data_ccd:
    hadm_id = note['id']
    diseases = note['diseases_predicted']

    # Skip notes that have no text in the notes table
    note_df = df[df['id'] == hadm_id]
    if note_df.empty:
        continue

    # Extract note text (first matching row)
    text = note_df.iloc[0]['documents']

    # Flatten the drug predictions of every CCDt entry for this note:
    # [[...], [...]] → [...]
    drugs = [
        item
        for entry in ccdt_by_id.get(hadm_id, [])
        for item in entry['drugs_predicted']
    ]

    # Clean empty entries
    diseases = [d for d in diseases if d]
    drugs = [d for d in drugs if d]

    if not diseases and not drugs:
        continue

    # Apply tags
    text_tagged = add_indexed_tags(text, drugs, diseases)

    # Split by '#' and keep only the parts that contain at least one drug tag
    # and at least one disease tag. The part index is appended to the note id
    # so every kept part gets its own identifier.
    for i, part in enumerate(text_tagged.split('#')):
        if re.search(r"<drug\d+>", part) and re.search(r"<disease\d+>", part):
            sentence_rows.append({
                'id': f"{hadm_id}_{i}",
                'documents': part
            })

# Passing the columns explicitly keeps the header even when no row was kept
df_per_sentences = pd.DataFrame(sentence_rows, columns=['id', 'documents'])

# ---- Save ----
df_per_sentences.to_csv('df_per_sentences_filtered.csv', index=False)

### Cluster Predicted Relations to Create Relation Labels List
Using the LLaMIC output in flexible mode – meaning the model can generate relation labels freely – we cluster the predicted relations to derive consistent relation categories.

In [None]:
import json
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import numpy as np

# Global Matplotlib styling, applied one setting at a time.
for rc_key, rc_value in (
    # Axes: light gray background, gray frame, grid switched on
    ('axes.facecolor', '#eeeeee'),
    ('axes.edgecolor', 'gray'),
    ('axes.grid', True),
    # Grid: solid, fully opaque white lines
    ('grid.color', 'white'),
    ('grid.linestyle', '-'),
    ('grid.alpha', 1.0),
    # Font sizes for text, titles, axis labels, ticks and legends
    ('font.size', 9),
    ('axes.titlesize', 11),
    ('axes.labelsize', 9),
    ('xtick.labelsize', 8),
    ('ytick.labelsize', 8),
    ('legend.fontsize', 7),
    ('legend.title_fontsize', 8),
    # Figure resolution
    ('figure.dpi', 300),
):
    plt.rcParams[rc_key] = rc_value

# Load predicted biomedical relations.
# The encoding is stated explicitly so that reading the file does not depend
# on the platform's default text encoding.
with open(r"llamic_flexible_results.json", encoding='utf-8') as f:
    data = json.load(f)['results']

# Relation labels grouped by the entity types of their two arguments
categories = {
    'CCD-CCD': [],      # disease – disease
    'CCDt-CCDt': [],    # drug – drug
    'CCD-CCDt': []      # disease – drug, in either direction
}

for entry in data:
    # Entries whose 'relations' value is missing or empty contribute nothing
    for triplet in entry.get('relations') or []:
        # Skip anything that is not a [head, relation, tail] list
        if not isinstance(triplet, list) or len(triplet) != 3:
            continue
        # Skip triplets with non-string members: the substring tests below
        # and the later de-duplication both need plain strings
        if not all(isinstance(member, str) for member in triplet):
            continue

        head, rel, tail = triplet
        if '<disease' in head and '<disease' in tail:
            categories['CCD-CCD'].append(rel)
        elif '<drug' in head and '<drug' in tail:
            categories['CCDt-CCDt'].append(rel)
        elif ('<disease' in head and '<drug' in tail) or ('<drug' in head and '<disease' in tail):
            categories['CCD-CCDt'].append(rel)

# Remove duplicates. The labels are sorted because the iteration order of a
# set of strings can change from one interpreter run to the next, which would
# make everything computed from these lists non-reproducible.
categories = {k: sorted(set(v)) for k, v in categories.items()}
# Load language model for semantic embedding
model = SentenceTransformer('all-MiniLM-L6-v2')

# Prepare the figure with 3 subplots (vertical)
fig, axes = plt.subplots(nrows=3, figsize=(8, 14))
fig.subplots_adjust(hspace=0.4, right=0.75)

# Color palette
colors = plt.get_cmap("Set2").colors


def _style_axis(ax, cat):
    """Apply the title, axis labels, spine styling and legend shared by every panel."""
    ax.set_title(f"Semantic Clustering – {cat}", weight='bold')
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
    ax.spines[['top', 'right']].set_visible(False)
    # The legend is anchored just outside the right edge of the axes
    ax.legend(
        loc='upper left',
        bbox_to_anchor=(1.02, 1),
        borderaxespad=0.5,
        title="Representative Relation",
        frameon=True
    )


# Iterate over categories and axes
for ax, (cat, rels) in zip(axes, categories.items()):
    # Nothing to draw for an empty category: hide its panel
    if not rels:
        ax.set_visible(False)
        continue

    # A single relation is drawn at the origin without PCA or clustering
    if len(rels) == 1:
        ax.scatter(0, 0, s=80, color=colors[0], label=rels[0])
        _style_axis(ax, cat)
        continue

    # Embed the relation labels and project them onto two principal components
    embeddings = model.encode(rels)
    pca = PCA(n_components=2)
    reduced = pca.fit_transform(embeddings)

    # Cluster the 2-D projection into at most 10 groups
    k = min(10, len(rels))
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(reduced)

    # The first relation (in list order) assigned to a cluster becomes the
    # legend label of that cluster
    cluster_labels = {}
    for idx, label in enumerate(rels):
        cluster_labels.setdefault(clusters[idx], label)

    for cluster_id in range(k):
        # A cluster id can end up with no member (for example when several
        # relations share the same projected point); such a cluster has no
        # label and nothing to draw, so it is skipped instead of raising
        # a KeyError.
        if cluster_id not in cluster_labels:
            continue
        points = reduced[clusters == cluster_id]
        ax.scatter(points[:, 0], points[:, 1], s=50, alpha=0.85,
                   color=colors[cluster_id % len(colors)],
                   label=cluster_labels[cluster_id])

    _style_axis(ax, cat)


# Show the figure (it is not written to disk here)
plt.tight_layout()
plt.show()

## Input for BioLinkBERT
Converts CSVs with annotated clinical sentences into JSON for BioLinkBERT, marking entity pairs as [E1]/[E2] and assigning relation labels. The BioLinkBERT model is available at: https://github.com/michiyasunaga/LinkBERT/tree/main/src/seqcls

In [None]:
import re
import os
from itertools import combinations
import pandas as pd
import json

# Locations of the three splits of the supervised corpus
train_path = "../data/train.csv"
dev_path = "../data/dev.csv"
test_path = "../data/test.csv"

# Load CSVs
train_table, dev_table, test_table = (
    pd.read_csv(split_path) for split_path in (train_path, dev_path, test_path)
)


def _parse_relations(cell):
    """Turn one raw 'relations' cell into a Python object; missing cells become []."""
    # NOTE(review): eval() executes whatever expression the CSV cell contains.
    # Only run this on trusted files; if the cells are plain Python literals,
    # ast.literal_eval would be a safer parser - confirm before switching.
    if pd.notna(cell):
        return eval(cell)
    return []


# Convert the 'relations' strings into Python objects, in place.
# NOTE(review): later code indexes each relation as t[0], t[1], t[2], so the
# parsed value is presumably a list of (head, relation, tail) triplets - confirm.
for df in (train_table, dev_table, test_table):
    df['relations'] = df['relations'].apply(_parse_relations)

# Function to extract the entity ID
def extract_entity_id(text):
    """Return the id of the first opening entity tag in ``text``.

    An opening tag looks like ``<diseaseN>`` or ``<drugN>``; its id is the
    tag name followed by the number (``"disease3"``, ``"drug1"``).
    Returns ``None`` when ``text`` contains no such tag.
    """
    found = re.search(r'<(disease|drug)(\d+)>', text)
    if found is None:
        return None
    kind, number = found.groups()
    return kind + number

# Mark entities as [E1] and [E2]
def annotate_entities(text, e1_id, e2_id):
    """Mark two tagged entities as [E1]/[E2] and strip all other entity tags.

    Every ``<diseaseN>...</diseaseN>`` or ``<drugN>...</drugN>`` element whose
    id (tag name + number) equals ``e1_id`` becomes ``[E1] ... [/E1]``; one
    whose id equals ``e2_id`` becomes ``[E2] ... [/E2]``; any other element is
    replaced by its inner text. ``e1_id`` takes precedence if both ids are
    equal.

    Returns the rewritten text.
    """
    entity_pattern = r'<(disease|drug)(\d+)>(.*?)</\1\2>'

    def mark(found):
        # The pattern only admits the lowercase names 'disease' and 'drug',
        # so the id can be assembled from the captured groups directly.
        entity_id = found.group(1) + found.group(2)
        surface = found.group(3)
        if entity_id == e1_id:
            return "[E1] " + surface + " [/E1]"
        if entity_id == e2_id:
            return "[E2] " + surface + " [/E2]"
        return surface

    return re.sub(entity_pattern, mark, text)

# Find all entities in the sentence
def find_all_entities(text):
    """List every opening entity tag in ``text`` as a ``(name, number)`` pair.

    ``name`` is ``'disease'`` or ``'drug'`` and ``number`` is the digit string
    that follows it; pairs are returned in order of appearance.
    """
    opening_tag = r'<(disease|drug)(\d+)>'
    return [found.groups() for found in re.finditer(opening_tag, text)]

# Main function to process the annotations
def generate_annotated_pairs(df):
    """Build one classification example per unordered entity pair of each sentence.

    Args:
        df: DataFrame with a ``'documents'`` column (sentence text containing
            ``<diseaseN>``/``<drugN>`` tags) and a ``'relations'`` column
            holding, per row, an iterable of triplets indexed as
            ``t[0]`` (head), ``t[1]`` (relation label) and ``t[2]`` (tail).

    Returns:
        A list of ``{"text": ..., "label": ...}`` dicts. ``text`` is the
        sentence with the pair marked as ``[E1]``/``[E2]`` (in order of
        appearance) and all other entity tags removed. ``label`` is the
        relation of the first triplet that links the two entities in either
        direction, or ``"NA"`` when no triplet does.
    """
    output_dataset = []
    for _, row in df.iterrows():
        sentence = row['documents']
        triplets = row['relations']

        entity_ids = [f"{tag}{idx}" for tag, idx in find_all_entities(sentence)]
        if len(entity_ids) < 2:
            # No pair can be formed, so the triplets need not be inspected
            continue

        # Resolve the entity ids of every triplet once per sentence instead of
        # once per candidate pair. Keys are unordered, so the direction of the
        # relation is ignored; setdefault keeps the first triplet seen for a
        # given pair.
        label_by_pair = {}
        for t in triplets:
            pair_key = frozenset((extract_entity_id(t[0]), extract_entity_id(t[2])))
            label_by_pair.setdefault(pair_key, t[1])

        # Generate unique combinations of entity pairs
        for e1_id, e2_id in combinations(entity_ids, 2):
            output_dataset.append({
                "text": annotate_entities(sentence, e1_id, e2_id),
                "label": label_by_pair.get(frozenset((e1_id, e2_id)), "NA")
            })
    return output_dataset

# Generate datasets: one list of {"text", "label"} examples per split
train_data, val_data, test_data = (
    generate_annotated_pairs(table)
    for table in (train_table, dev_table, test_table)
)

# Each split is written to an "input_biolinkbert" folder located next to the
# CSV file it was read from.
train_output_dir, dev_output_dir, test_output_dir = (
    os.path.join(os.path.dirname(csv_path), "input_biolinkbert")
    for csv_path in (train_path, dev_path, test_path)
)

for d in (train_output_dir, dev_output_dir, test_output_dir):
    os.makedirs(d, exist_ok=True)

# Save JSON files (UTF-8, non-ASCII characters kept as-is, 2-space indent)
for out_dir, file_name, records in (
    (train_output_dir, "train.json", train_data),
    (dev_output_dir, "validation.json", val_data),
    (test_output_dir, "test.json", test_data),
):
    with open(os.path.join(out_dir, file_name), "w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, indent=2)

# Report how many examples each split contains
print(f" Data saved:\n  Train: {len(train_data)}\n  Validation: {len(val_data)}\n  Test: {len(test_data)}")
