## Adverse Drug Event Extraction Using Generative NLP Models (CADEC Dataset)

An NLP system that extracts adverse drug events (ADEs) from CADEC patient forum texts using generative models from Hugging Face.


## Setup and Initialization


### Mount drive and load dataset

In [1]:
from google.colab import drive
drive.mount('/content/drive')

!cp /content/drive/MyDrive/Datasets/CADEC.v2.zip /content/
!unzip -q CADEC.v2.zip -d CADEC.v2

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
replace CADEC.v2/cadec/meddra/ARTHROTEC.1.ann? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


### Install Required Packages


In [2]:
%%capture
%pip install langchain transformers langchain-huggingface bitsandbytes accelerate rapidfuzz

### Import Libraries and Define Constants


In [None]:
from transformers import pipeline
from rapidfuzz import fuzz, process
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import lru_cache
import json
import requests
import torch
import re
import os

MAX_RETRIES = 3
BATCH_SIZE = 20
SAMPLE_SIZE = 400
GENERATIVE_MODEL_NAME = "unsloth/Phi-4-mini-instruct-unsloth-bnb-4bit"
EMBEDDING_MODEL_NAME = "paraphrase-MiniLM-L6-v2"
UMLS_API_KEY = "xxxxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
UMLS_URL = "https://uts-ws.nlm.nih.gov/rest/search/current"



print(f"CUDA available: {torch.cuda.is_available()}")
print(
    f"Current CUDA device: {torch.cuda.current_device() if torch.cuda.is_available() else 'None'}"
)

CUDA available: True
Current CUDA device: 0


## Data Preprocessing


In [4]:
directory = "CADEC.v2/cadec"
text_dir = directory + "/text"
original_ann_dir = directory + "/original"

### Load posts


In [5]:
def load_posts(directory):
    """
    Load posts from text files in the specified directory.
    Args:
        directory (str): Directory path containing post text files
    Returns:
        list: List of dictionaries with file name and post content
    """
    posts = []
    for filename in sorted(os.listdir(directory)):
        if filename.endswith(".txt"):
            filepath = os.path.join(directory, filename)
            with open(filepath, "r", encoding="utf-8") as file:
                text = file.read()
                posts.append({"file_name": filename, "content": text})
    return posts


posts = load_posts(text_dir)
print(f"Loaded {len(posts)} posts")
posts[:2]

Loaded 1250 posts


[{'file_name': 'ARTHROTEC.1.txt',
  'content': "I feel a bit drowsy & have a little blurred vision, so far no gastric problems.\nI've been on Arthrotec 50 for over 10 years on and off, only taking it when I needed it.\nDue to my arthritis getting progressively worse, to the point where I am in tears with the agony, gp's started me on 75 twice a day and I have to take it.\nevery day for the next month to see how I get on, here goes.\nSo far its been very good, pains almost gone, but I feel a bit weird, didn't have that when on 50.\n"},
 {'file_name': 'ARTHROTEC.10.txt',
  'content': 'Hunger pangs.\nBrilliant, I have a new lease of life, i walk up & down steps properly, no longer sideways like a toddler, hip pain as gone other than if i jar it.\n'}]

### Load annotations


In [6]:
def load_annotations(directory):
    """
    Load annotations from .ann files in the specified directory.
    Args:
        directory (str): Directory path containing annotation files
    Returns:
        list: List of dictionaries with file name and annotations
    """
    annotated_posts = []
    for filename in sorted(os.listdir(directory)):
        if filename.endswith(".ann"):
            filepath = os.path.join(directory, filename)
            post_annotations = {
                "file_name": filename,
                "drugs": [],
                "ades": [],
                "symptoms_diseases": [],
            }
            valid_file = False
            try:
                with open(filepath, "r", encoding="utf-8") as file:
                    for line in file:
                        line = line.strip()
                        if not line or line.startswith("#") or line.startswith("---"):
                            continue
                        parts = line.split("\t")
                        if len(parts) != 3:
                            continue
                        valid_file = True
                        ann_id, ann_info, text = parts[0], parts[1], parts[2]
                        info_fields = ann_info.split()
                        if not info_fields:
                            continue
                        ann_type = info_fields[0].strip().lower()
                        if ann_type == "drug":
                            post_annotations["drugs"].append(text.strip())
                        elif ann_type == "adr":
                            post_annotations["ades"].append(text.strip())
                        elif ann_type in ("symptom", "disease", "finding"):
                            post_annotations["symptoms_diseases"].append(text.strip())
                        else:
                            continue
                    if not valid_file:
                        print(f"Warning: Empty file {filename}")
                    annotated_posts.append(post_annotations)
            except Exception as e:
                print(f"Warning: Could not process file {filename}. Error: {e}")
                annotated_posts.append(post_annotations)
    return annotated_posts


annotated_posts = load_annotations(original_ann_dir)
print(f"Loaded {len(annotated_posts)} annotated posts")
annotated_posts[:2]

Loaded 1250 annotated posts


[{'file_name': 'ARTHROTEC.1.ann',
  'drugs': ['Arthrotec'],
  'ades': ['bit drowsy',
   'little blurred vision',
   'gastric problems',
   'feel a bit weird'],
  'symptoms_diseases': ['arthritis', 'agony', 'pains']},
 {'file_name': 'ARTHROTEC.10.ann',
  'drugs': [],
  'ades': ['Hunger pangs'],
  'symptoms_diseases': ['hip pain', 'walk up & down steps sideways']}]

## Entity Extraction

Using a generative model to expand abbreviations then extract entities from un-annotated posts.


### Initialize pipeline and prompts


In [7]:
hf_pipe = pipeline(
    "text-generation", model=GENERATIVE_MODEL_NAME, batch_size=BATCH_SIZE, model_kwargs={"temperature": 0}
)

EXTRACTION_SYSTEM_PROMPT = """Analyze the provided medical forum post on patient-reported Adverse Drug Events. Extract all mentions of Drugs, Adverse Drug Events, and Symptoms/Diseases.
                Keys:
                - 'drugs': Exact mentions of the name of a medicine or drug. Ignore generic terms like "pills". Ignore dosage strength and form (e.g., "75", "tablet").
                - 'ades': Any adverse drug reactions or side effects implied to be due to a drug.
                - 'symptoms_diseases': Any other medical conditions or symptoms mentioned.

                Use empty lists for categories with no mentions. If an entity could be interpreted as either an ADE or a generic symptom/disease, classify it as an ADE. Do not give duplicates. Return a JSON object with keys 'drugs', 'ades', and 'symptoms_diseases', each mapping to a list of exact extracted strings. Ensure valid JSON with only the extracted terms, no explanations."""

EXPAND_ABB_SYSTEM_PROMPT = "Expand all medical abbreviations in the provided text, replacing each abbreviation with its full term. Return only the original text with these expansions, preserving all other content, structure, and wording unchanged. Do not add any explanation."

Device set to use cuda:0


### Abbreviation Expansion


In [8]:
def expand_abbreviations_batch(posts):
    """
    Expand medical abbreviations in a batch of posts using the language model.
    Args:
        posts (list): List of post dictionaries with 'content' field
    Returns:
        list: List of expanded post texts
    """
    messages = []
    for post in posts:
        messages.append(
            [
                {"role": "system", "content": EXPAND_ABB_SYSTEM_PROMPT},
                {"role": "user", "content": post["content"]},
            ]
        )
    output = hf_pipe(messages, max_new_tokens=512)
    expanded_posts = []
    for i in range(len(posts)):
        expanded_posts.append(
            output[i][0]["generated_text"][len(messages[0])]["content"]
        )
    return expanded_posts

### Entity Extraction


In [9]:
def extract_entities_json(text, feedback=None, previous_attempt=None) -> str:
    """
    Extracts entities from `text` as a JSON string, optionally using feedback and a previous attempt.
    Args:
        text (str): The source text to extract entities from.
        feedback (str, optional): Feedback on the previous extraction output.
        previous_attempt (str, optional): JSON string output from a previous extraction attempt.
    Returns:
        str: raw json string of entities.
    """
    dialogue = [
        {"role": "system", "content": EXTRACTION_SYSTEM_PROMPT},
        {"role": "user", "content": text},
    ]
    if previous_attempt:
        dialogue.append({"role": "assistant", "content": previous_attempt})
    if feedback:
        dialogue.append(
            {
                "role": "user",
                "content": (
                    "I reviewed the previous extraction and have the following feedback:\n"
                    f"{feedback}\n"
                    "Please revise the extraction to address these points while adhering to the formatting instructions."
                ),
            }
        )
    messages = [dialogue]
    result = hf_pipe(messages, max_new_tokens=512)
    output = result[0][0]["generated_text"][len(dialogue)]["content"]
    return output


def extract_entities_batch(texts) -> list:
    """
    Extract entities from a batch of texts as JSON strings. Uses HF batch processing.
    Args:
        texts (list of str): The source texts to extract entities from.
    Returns:
        list of str: List of raw JSON strings of entities.
    """
    messages = []
    for text in texts:
        messages.append(
            [
                {"role": "system", "content": EXTRACTION_SYSTEM_PROMPT},
                {"role": "user", "content": text},
            ]
        )
    output = hf_pipe(messages, max_new_tokens=512)
    extracted_entites_json = []
    for i in range(len(texts)):
        extracted_entites_json.append(output[i][0]["generated_text"][len(messages[0])]["content"])
    return extracted_entites_json

### Expand and extract entities, then save to files


In [10]:
expanded_texts = expand_abbreviations_batch(posts[:SAMPLE_SIZE])
raw_json_strings = extract_entities_batch(expanded_texts)

with open("expanded_texts.txt", "w") as outfile:
    for item in expanded_texts:
        outfile.write(item.strip() + "\n\n")


with open("raw_json_strings.jsonl", "w") as outfile:
    for item in raw_json_strings:
        outfile.write(item.strip() + "\n")

print(f"Processed {len(expanded_texts)} posts\n")
print(f"Sample expanded text: {expanded_texts[0][:200]}...\n")
print(f"Sample extracted entities: {raw_json_strings[0]}")

Processed 400 posts

Sample expanded text: I feel a bit drowsy & have a little blurred vision, so far no gastric problems.
I've been on Arthrotec 50 for over 10 years on and off, only taking it when I needed it.
Due to my arthritis getting pro...

Sample extracted entities: ```json
{
  "drugs": ["Arthrotec", "75"],
  "ades": ["drowsy", "blurred vision", "weird feeling"],
  "symptoms_diseases": ["arthritis", "agony"]
}
```


## Entity standardization using UMLS

Using the UMLS API to standardize the extracted entities. Run and tested with API key.


In [None]:
vocab_map = {
    "drugs": "RXNORM",
    "ades": "SNOMEDCT_US",
    "symptoms_diseases": "SNOMEDCT_US",
}

@lru_cache(maxsize=1024)
def query_umls(entity: str, vocab: str) -> dict:
    """
    Query the UMLS API for standardized terms.
    Args:
        entity (str): The entity term to standardize
        vocab (str): The vocabulary to search in (e.g., "RXNORM", "SNOMEDCT_US")
    Returns:
        dict: Dictionary with original term, standardized term, and concept ID
    """
    params = {"string": entity, "apiKey": UMLS_API_KEY, "sabs": vocab, "pageSize": 1}

    try:
        response = requests.get(UMLS_URL, params=params, timeout=30)
        response.raise_for_status()
        results = response.json().get("result", {}).get("results", [])
        return {
            "original": entity,
            "standard_term": results[0]["name"] if results else None,
            "CUI": results[0]["ui"] if results else None,
        }
    except Exception as e:
        print(f"Error querying UMLS: {e}\nparams={params}")
        return {"original": entity, "standard_term": None, "CUI": None}


def standardize_entities(entities: dict) -> dict:
    """
    Standardize entities using UMLS.
    Args:
        entities (dict): keys: 'drugs', 'ades', 'symptoms_diseases'
    Returns:
        dict: standardized terms
    """
    try:
        return {
            key: [
                (query_umls(item, vocab).get("standard_term") or item)
                for item in entities.get(key, [])
            ]
            for key, vocab in vocab_map.items()
        }
    except Exception as e:
        print(f"Error standardizing entities: {e}\nInput: {entities}")
        return entities


def standardize_annotations_parallel(entities_list, max_workers=30):
    """
    Standardize multiple entity dictionaries in parallel.
    Args:
        entities_list (list): List of entity dictionaries
        max_workers (int): Maximum number of parallel workers
    Returns:
        list: List of standardized entity dictionaries
    """
    results = [
        {
        "file_name": post["file_name"],
        "entities": {cat: [] for cat in vocab_map}
        }
        for post in entities_list
    ]

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(query_umls, value, vocab_map[cat]): (i, cat, value)
            for i, post in enumerate(entities_list)
            for cat in vocab_map
            for value in post.get(cat, [])
        }
        for future in as_completed(futures):
            i, cat, original = futures[future]
            std_term = future.result().get("standard_term") or original
            results[i]["entities"][cat].append(std_term)

    return results

### Run batched standardization

In [None]:
# Standardize ground truth annotated posts first
standardized_annotated_posts = standardize_annotations_parallel(annotated_posts[:SAMPLE_SIZE])
standardized_annotated_posts[:2]

[{'file_name': 'ARTHROTEC.1.ann',
  'entities': {'drugs': ['Arthrotec'],
   'ades': ['gastric problems',
    'little blurred vision',
    'feel a bit weird',
    'bit drowsy'],
   'symptoms_diseases': ['Agony', 'Arthritis', 'Growing pains']}},
 {'file_name': 'ARTHROTEC.10.ann',
  'entities': {'drugs': [],
   'ades': ['Hunger pangs'],
   'symptoms_diseases': ['walk up & down steps sideways', 'Hip joint pain']}}]

## Verification system
Verify the extracted entities by
- parsing JSON
- checking completeness
- checking semantic similarity

In [None]:
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME, device="cuda:0" if torch.cuda.is_available() else "cpu")

SISTER_CATEGORIES = {"ades": "symptoms_diseases", "symptoms_diseases": "ades"}


def parse_json_string(raw_str):
    """
    Parse a JSON string from a potentially messy text.
    Args:
        raw_str (str): Raw string containing JSON
    Returns:
        dict: Parsed JSON object
    """
    try:
        json_match = re.search(r"\{.*?\}", raw_str, re.DOTALL)
        if not json_match:
            raise json.JSONDecodeError("No JSON object found", raw_str, 0)
        json_str = json_match.group(0)
        return json.loads(json_str)
    except json.JSONDecodeError:
        raise
    except Exception as e:
        raise json.JSONDecodeError(f"Failed to extract JSON: {str(e)}", raw_str, 0)


def validate_and_count_entities(extracted_entities, ground_truth_entities):
    """
    Validate entity dictionaries and collect count information.
    Args:
        extracted_entities (dict): Dictionary of extracted entities
        ground_truth_entities (dict): Dictionary of ground truth entities
    Returns:
        tuple: (is_valid, error_message, gt_counts, gen_counts, total_gt, total_gen)
    """
    if not ground_truth_entities:
        return True, None, {}, {}, 0, 0

    gt_counts = {}
    gen_counts = {}
    total_gt = 0
    total_gen = 0

    for category in ground_truth_entities:
        gt_entities = ground_truth_entities[category]
        if not isinstance(gt_entities, list):
            continue
        if category not in extracted_entities:
            return False, f"Feedback: Missing category '{category}'.", {}, {}, 0, 0

        gen_entities = extracted_entities.get(category, [])

        gt_counts[category] = len(gt_entities)
        gen_counts[category] = len(gen_entities)

        total_gt += len(gt_entities)
        total_gen += len(gen_entities)

    return True, None, gt_counts, gen_counts, total_gt, total_gen


def generate_feedback(gt_counts, gen_counts, default_message):
    """
    Generate feedback message based on entity count differences.
    Args:
        gt_counts (dict): Ground truth entity counts by category
        gen_counts (dict): Generated entity counts by category
        default_message (str): Default message if no specific feedback
    Returns:
        str: Feedback message
    """
    feedback_parts = []

    for category in gt_counts:
        gt_count = gt_counts[category]
        gen_count = gen_counts.get(category, 0)

        if gt_count > gen_count:
            feedback_parts.append(
                f"more '{category}' (expected {gt_count}, got {gen_count})"
            )
        elif gt_count < gen_count:
            feedback_parts.append(
                f"fewer '{category}' (expected {gt_count}, got {gen_count})"
            )

    if not feedback_parts:
        return default_message

    return "Feedback: Try extracting " + ", ".join(feedback_parts) + "."


def verify_completeness(
    extracted_entities,
    ground_truth_entities,
    completeness_threshold=0.6,
    fuzzy_match_threshold=60,
    partial_credit = 0.7
):
    """
    Verify if extracted entities match ground truth using fuzzy string matching.
    Allows matches across sister categories with partial credit.
    Args:
        extracted_entities (dict): Dictionary of extracted entities
        ground_truth_entities (dict): Dictionary of ground truth entities
        completeness_threshold (float): Threshold for completeness ratio
        fuzzy_match_threshold (int): Threshold for fuzzy string matching
        partial_credit (float): Credit for matches in sister categories
    Returns:
        tuple: (is_verified, feedback_message)
    """
    is_valid, error, gt_counts, gen_counts, total_gt, total_gen = (
        validate_and_count_entities(extracted_entities, ground_truth_entities)
    )

    if not is_valid:
        return False, error

    total_match_credit = 0.0

    for category, gt_entities in ground_truth_entities.items():
        if not gt_entities:
            continue
        if not isinstance(gt_entities, list):
            continue
        gen_entities = extracted_entities.get(category, [])
        processed_gen_entities = [e.lower() for e in gen_entities]

        for gt_entity in gt_entities:
            match_credit = 0.0
            best_score = 0

            # Try matching in the same category
            if processed_gen_entities:
                _, score, _ = process.extractOne(
                    gt_entity.lower(), processed_gen_entities, scorer=fuzz.token_ratio
                )
                if score >= fuzzy_match_threshold:
                    best_score = score
                    match_credit = 1.0

            # If no match, try sister category
            if best_score < fuzzy_match_threshold and category in SISTER_CATEGORIES:
                sister_category = SISTER_CATEGORIES[category]
                sister_entities = extracted_entities.get(sister_category, [])

                if sister_entities:
                    processed_sister_entities = [e.lower() for e in sister_entities]
                    _, score, _ = process.extractOne(
                        gt_entity.lower(),
                        processed_sister_entities,
                        scorer=fuzz.token_ratio,
                    )
                    if score >= fuzzy_match_threshold:
                        match_credit = partial_credit

            total_match_credit += match_credit

    recall = total_match_credit / total_gt if total_gt > 0 else 1.0
    if recall >= completeness_threshold:
        return True, None

    return False, generate_feedback(
        gt_counts, gen_counts, "Feedback: Check entity accuracy or category assignment."
    )


def verify_similarity(
    extracted_entities,
    ground_truth_entities,
    similarity_threshold=0.7,
    match_threshold=0.6,
    partial_credit=0.7,
):
    """
    Verify if extracted entities are semantically similar to ground truth using embeddings.
    Allows matches across sister categories with partial credit.
    Args:
        extracted_entities (dict): Dictionary of extracted entities
        ground_truth_entities (dict): Dictionary of ground truth entities
        similarity_threshold (float): Threshold for semantic similarity
        match_threshold (float): Threshold for overall matching ratio
        partial_credit (float): Credit for matches in sister categories
    Returns:
        tuple: (is_verified, feedback_message)
    """
    is_valid, error, gt_counts, gen_counts, total_gt, total_gen = (
        validate_and_count_entities(extracted_entities, ground_truth_entities)
    )
    if not is_valid:
        return False, error

    gt_embeddings = {
        cat: embedding_model.encode(gt_list)
        for cat, gt_list in ground_truth_entities.items()
        if gt_list and isinstance(gt_list, list)
    }

    total_match_score = 0.0

    for cat, gt_list in ground_truth_entities.items():
        gen_list = extracted_entities.get(cat, [])

        if not gt_list and not gen_list:
            total_match_score += 1.0
            continue
        if not gt_list or not gen_list:
            continue
        gt_emb = gt_embeddings[cat]
        gen_emb = embedding_model.encode(gen_list)
        sim_mat = cosine_similarity(gen_emb, gt_emb)

        for i in range(len(gen_list)):
            best_sim = sim_mat[i].max()
            if best_sim >= similarity_threshold:
                total_match_score += 1.0
            else:
                sister_cat = SISTER_CATEGORIES.get(cat)
                if sister_cat and sister_cat in gt_embeddings:
                    sister_gt_emb = gt_embeddings[sister_cat]
                    sis_sim = cosine_similarity(gen_emb[i : i + 1], sister_gt_emb).max()
                    if sis_sim >= similarity_threshold:
                        total_match_score += partial_credit

    overall_ratio = total_match_score / max(min(total_gt, total_gen), 1.0)

    if overall_ratio >= match_threshold:
        return True, None
    return False, generate_feedback(
        gt_counts,
        gen_counts,
        "Feedback: Improve entity semantic similarity or check category assignment.",
    )

## Iterative correction with feedback
Run the complete pipeline

In [None]:
def run_pipeline(expanded_texts, raw_json_strings, ground_truth_entities, max_retries=MAX_RETRIES):
    """
    Run the complete entity extraction pipeline with iterative correction.
    Args:
        expanded_texts (list): List of expanded post texts
        raw_json_strings (list): List of raw JSON strings from initial extraction
        ground_truth_entities (list): List of ground truth entity dictionaries
        max_retries (int): Maximum number of retry attempts
    Returns:
        tuple: (extracted_json_entities, success_count)
    """
    extracted_json_entities = []
    success_count = 0

    for i in range(len(expanded_texts)):
        file_name = ground_truth_entities[i]['file_name']
        print(f"\nProcessing post {i+1}/{len(expanded_texts)}: {file_name}")
        print("------------------------------------")

        previous_attempt = None
        feedback = None
        standardized_json_obj = {}

        for attempt in range(max_retries):
            try:
                if attempt == 0:
                    raw_str = raw_json_strings[i]
                else:
                    raw_str = extract_entities_json(expanded_texts[i], feedback, previous_attempt)

                print(f"Attempt {attempt + 1}: Raw JSON string - {raw_str}")

                json_obj = parse_json_string(raw_str)
                print("Parsed successfully")
                standardized_json_obj = standardize_entities(json_obj)
                previous_attempt = raw_str

                print("Extracted: ", standardized_json_obj)
                print("Ground Truth: ", ground_truth_entities[i].get("entities", {}))

                completeness_verified, feedback_completeness = verify_completeness(
                    standardized_json_obj, ground_truth_entities[i].get("entities", {})
                )
                similarity_verified, feedback_similarity = verify_similarity(
                    standardized_json_obj, ground_truth_entities[i].get("entities", {})
                )
                if completeness_verified and similarity_verified:
                    print("ALL CHECKS PASSED!")
                    success_count += 1
                    break

                reasons = []
                if not completeness_verified: reasons.append("completeness verification failed")
                if not similarity_verified: reasons.append("similarity verification failed")
                print(f"Attempt {attempt + 1} failed: {', '.join(reasons)}")

                if not completeness_verified:
                    feedback = feedback_completeness
                else:
                    feedback = feedback_similarity

                if attempt < max_retries - 1:
                    print(f"Retrying with feedback: {feedback}")
                else:
                    print("Max retries reached. Using current result.")

            except json.JSONDecodeError as e:
                print(f"Attempt {attempt + 1} failed with error: {e}")
                if attempt < max_retries - 1:
                    feedback = f"JSON parsing error: {e}. Please ensure the output is valid JSON."
                    previous_attempt = None
                    print(f"Retrying with feedback: {feedback}")
                else:
                    print("Max retries reached. Using empty dictionary.")
                    standardized_json_obj = {}

        extracted_entities = {"file_name": file_name, "entities": standardized_json_obj}
        extracted_json_entities.append(extracted_entities)

    return extracted_json_entities, success_count

In [16]:
extracted_json_entities, success_count = run_pipeline(
    expanded_texts, raw_json_strings, standardized_annotated_posts
)

with open("extracted_entities.json", "w") as outfile:
    json.dump(extracted_json_entities, outfile)

print(f"Successfully processed {success_count} out of {len(expanded_texts)} posts.")
print(f"Success Rate: {success_count / len(expanded_texts) * 100:.2f}%")


Processing post 1/400: ARTHROTEC.1.ann
------------------------------------
Attempt 1: Raw JSON string - ```json
{
  "drugs": ["Arthrotec", "75"],
  "ades": ["drowsy", "blurred vision", "weird feeling"],
  "symptoms_diseases": ["arthritis", "agony"]
}
```
Parsed successfully
Extracted:  {'drugs': ['Arthrotec', 'dextran 75'], 'ades': ['Drowsiness', 'Blurred vision', 'weird feeling'], 'symptoms_diseases': ['Arthritis', 'Agony']}
Ground Truth:  {'drugs': ['Arthrotec'], 'ades': ['gastric problems', 'little blurred vision', 'feel a bit weird', 'bit drowsy'], 'symptoms_diseases': ['Agony', 'Arthritis', 'Growing pains']}
ALL CHECKS PASSED!

Processing post 2/400: ARTHROTEC.10.ann
------------------------------------
Attempt 1: Raw JSON string - {
  "drugs": [],
  "ades": [],
  "symptoms_diseases": ["Hunger pangs", "Hip pain"]
}
Parsed successfully
Extracted:  {'drugs': [], 'ades': [], 'symptoms_diseases': ['Hunger pangs', 'Hip joint pain']}
Ground Truth:  {'drugs': [], 'ades': ['Hunger pangs

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Parsed successfully
Extracted:  {'drugs': ['Cataflam'], 'ades': ['rectal bleeding/collitis'], 'symptoms_diseases': ['GI distress']}
Ground Truth:  {'drugs': ['Cataflam', 'Cataflam'], 'ades': ['Rectal hemorrhage', 'collitis', 'GI distress'], 'symptoms_diseases': []}
Attempt 1 failed: completeness verification failed, similarity verification failed
Retrying with feedback: Feedback: Try extracting more 'drugs' (expected 2, got 1), more 'ades' (expected 3, got 1), fewer 'symptoms_diseases' (expected 0, got 1).
Attempt 2: Raw JSON string - {
  "drugs": ["Cataflam"],
  "ades": ["rectal bleeding/collitis", "GI distress", "hospitalization"],
  "symptoms_diseases": ["GI distress"]
}
Parsed successfully
Extracted:  {'drugs': ['Cataflam'], 'ades': ['rectal bleeding/collitis', 'GI distress', 'Home hospitalization'], 'symptoms_diseases': ['GI distress']}
Ground Truth:  {'drugs': ['Cataflam', 'Cataflam'], 'ades': ['Rectal hemorrhage', 

In [18]:
for i in range(10):
    print(f"Post {i+1}:")
    print("Ground Truth:")
    print(standardized_annotated_posts[i])
    print("\nExtracted:")
    print(extracted_json_entities[i])
    print("\n" + "-"*50)

Post 1:
Ground Truth:
{'file_name': 'ARTHROTEC.1.ann', 'entities': {'drugs': ['Arthrotec'], 'ades': ['gastric problems', 'little blurred vision', 'feel a bit weird', 'bit drowsy'], 'symptoms_diseases': ['Agony', 'Arthritis', 'Growing pains']}}

Extracted:
{'file_name': 'ARTHROTEC.1.ann', 'entities': {'drugs': ['Arthrotec', 'dextran 75'], 'ades': ['Drowsiness', 'Blurred vision', 'weird feeling'], 'symptoms_diseases': ['Arthritis', 'Agony']}}

--------------------------------------------------
Post 2:
Ground Truth:
{'file_name': 'ARTHROTEC.10.ann', 'entities': {'drugs': [], 'ades': ['Hunger pangs'], 'symptoms_diseases': ['walk up & down steps sideways', 'Hip joint pain']}}

Extracted:
{'file_name': 'ARTHROTEC.10.ann', 'entities': {'drugs': [], 'ades': [], 'symptoms_diseases': ['Hunger pangs', 'Hip joint pain']}}

--------------------------------------------------
Post 3:
Ground Truth:
{'file_name': 'ARTHROTEC.100.ann', 'entities': {'drugs': [], 'ades': ['Vaginal Hemorrhage', 'Headache', 