In [27]:
import json
import pandas as pd
from pathlib import Path
from utils import clean_extracted_data
import re
import string

In [28]:
# Load extracted data
output_dir = Path('../../outputs/nv_task/extractions')

subset = [
    '*NOMC-zeroshot_NuExtract-v1.5.jsonl',
    '*NOMC-noexample-zeroshot_NuExtract-2-4B_temperature=0_do_sample=False.jsonl',
    '*gpt-4o-mini-2024-07-18*'
    ]


# Extract substring after NuExtract for each output
nv_task_nu_json = {}
for s in subset:
    t = list(output_dir.glob(s))[0]

    # Everying after zeroshot_
    key = t.stem.split('zeroshot_')[1]
    with open(t, 'r') as f:
        nv_task_nu_json[key] = json.load(f)

nv_task_nu_dict = {}
for params, data in nv_task_nu_json.items():
    nv_task_nu_dict[params] = clean_extracted_data(data)

# Load documents

docs = pd.read_json('../../../labelbuddy-annotations/projects/nv_task/documents/batch_0.jsonl', lines=True)


In [71]:
# Add pmcid to docs
docs['pmcid'] = docs['metadata'].apply(lambda x: x['pmcid'])

In [78]:
docs[docs['pmcid'] == 10028637][0]

KeyError: 0

In [200]:
compare_cols = ['StudyObjective', 'TaskName', 'TaskDescription']
# --- End Sample Data ---

import unicodedata

# --- Helper Function for Text Normalization ---
def normalize_text(text):
    """
    Normalizes text extensively:
    - Ensures input is a string.
    - Converts to lowercase.
    - Performs Unicode normalization (NFKC) to handle ligatures (e.g., ﬁ -> fi)
      and other compatibility characters.
    - Removes accents/diacritics (e.g., é -> e).
    - Normalizes various dashes and hyphens to a standard hyphen (-).
    - Removes remaining punctuation (excluding the standard hyphen).
    - Normalizes whitespace (collapses multiple spaces/tabs/newlines, strips ends).
    """
    # 1. Ensure input is a string
    if not isinstance(text, str):
        return "" # Return empty string if input is not a string

    # 2. Convert to lowercase
    text = text.lower()

    # 3. Unicode Normalization (NFKC)
    # Decomposes compatibility characters (like ligatures ﬁ, ﬂ)
    # and recomposes them in canonical form.
    try:
        text = unicodedata.normalize('NFKC', text)
    except Exception as e:
        # Handle potential errors during normalization if needed
        # print(f"Warning: Unicode normalization failed for text chunk: {e}")
        pass # Continue with the text as is if normalization fails

    # 4. Remove accents (diacritics)
    # Decompose into base character and combining marks (NFKD)
    # then filter out the combining marks.
    try:
        nfkd_form = unicodedata.normalize('NFKD', text)
        text = "".join([c for c in nfkd_form if not unicodedata.combining(c)])
    except Exception as e:
        # print(f"Warning: Accent removal failed for text chunk: {e}")
        pass # Continue with the text as is if accent removal fails


    # 5. Normalize different types of dashes/hyphens to a standard hyphen (-)
    # Includes En Dash, Em Dash, Figure Dash, Horizontal Bar, Minus Sign, Hyphen Bullet, Non-breaking Hyphen, standard Hyphen itself
    # Ensures all these variants become the standard ASCII hyphen U+002D.
    # Using r'[...]' creates a character set.
    dashes_pattern = r'[–—‒―−‐‑]' # Note: standard hyphen '-' is implicitly handled if present, but explicitly converting others to it is key.
    text = re.sub(dashes_pattern, '-', text)

    # 6. Define punctuation to remove (excluding the standard hyphen)
    # Start with string.punctuation and remove the standard hyphen '-' from the set.
    punctuation_to_remove = string.punctuation.replace('-', '')
    # Escape the punctuation characters for safe use in the regex pattern
    escaped_punctuation = re.escape(punctuation_to_remove)

    # 7. Remove the defined punctuation characters
    # Using f'[{escaped_punctuation}]' creates a character set of punctuation to remove.
    text = re.sub(f'[{escaped_punctuation}]', '', text)

    # 8. Normalize whitespace
    # Replace sequences of one or more whitespace characters (space, tab, newline, etc.) with a single space.
    text = re.sub(r'\s+', ' ', text)
    # Remove leading/trailing whitespace.
    text = text.strip()

    return text
# --- Main Logic ---
results = {}


def _compare_string(extracted_data, normalized_original_text):
    if isinstance(extracted_data, str) and extracted_data.strip():
        normalized_extracted = normalize_text(extracted_data)
        # Check only if normalized extracted string is not empty
        is_substring = normalized_extracted in normalized_original_text if normalized_extracted else False
        return is_substring
    else:
        return False # Treat empty/non-string as not found

for model_name, model_data in nv_task_nu_dict.items():
    results[model_name] = {}
    print(f"Processing Model: {model_name}")

    for pmcid_str, extracted_values in model_data.items():
        results[model_name][pmcid_str] = {}

        # Find the original text for the current PMCID
        # Assumes PMCID in metadata is integer, key in dict is string
        pmcid_int = int(pmcid_str)
        original_text = docs[docs['pmcid'] == pmcid_int].iloc[0]['text']

        normalized_original_text = normalize_text(original_text)
        

        if not normalized_original_text: # Handle cases where original text is empty/invalid
                raise ValueError(f"Original text is empty for PMCID {pmcid_str}.")

        # Process each column to compare
        for col in compare_cols:

            extracted_data = extracted_values[col]

            # --- StudyObjective (String) ---
            if col == 'StudyObjective':
                results[model_name][pmcid_str][col] = _compare_string(normalize_text(extracted_data), normalized_original_text)

            # --- TaskName & TaskDescription (List of Strings) ---
            elif col in ['TaskName', 'TaskDescription']:
                if isinstance(extracted_data, list):
                    substring_results = []
                    for item in extracted_data:
                        substring_results.append(_compare_string(normalize_text(item), normalized_original_text))

                else:
                        results[model_name][pmcid_str][col] = [] # Treat non-list as empty list
                        print(f"      {col}: Extracted data is not a list.")

                results[model_name][pmcid_str][col] = substring_results


Processing Model: NuExtract-v1.5
Processing Model: NuExtract-2-4B_temperature=0_do_sample=False
Processing Model: gpt-4o-mini-2024-07-18


In [201]:
long_format_data = []

# Iterate through the nested dictionary structure
for model_name, model_results in results.items():
    for pmcid_str, pmcid_results in model_results.items():

        # Check for processing errors first
        if isinstance(pmcid_results, dict) and 'error' in pmcid_results:
            long_format_data.append({
                'model': model_name,
                'pmcid': pmcid_str,
                'column': None, # Or pd.NA
                'list_index': pd.NA, # Use pandas NA for missing integer index
                'is_substring': None, # Or pd.NA
                'status': f"Error: {pmcid_results['error']}"
            })
            continue # Skip to the next PMCID if there was a general error

        # Process results for each column if no general error
        for col in compare_cols: # Use the same compare_cols list
            if col not in pmcid_results:
                # This case might not happen if we handled it previously,
                # but good for robustness
                 long_format_data.append({
                    'model': model_name,
                    'pmcid': pmcid_str,
                    'column': col,
                    'list_index': pd.NA,
                    'is_substring': None, # Or pd.NA
                    'status': 'Column Not Processed' # Or similar indicator
                })
                 continue

            result_value = pmcid_results[col]

            if result_value is None:
                 # Case where the column was present in compare_cols
                 # but missing in the specific extraction output
                 long_format_data.append({
                    'model': model_name,
                    'pmcid': pmcid_str,
                    'column': col,
                    'list_index': pd.NA,
                    'is_substring': None, # Or pd.NA
                    'status': 'Column Missing in Extraction'
                })

            elif isinstance(result_value, bool): # StudyObjective case
                long_format_data.append({
                    'model': model_name,
                    'pmcid': pmcid_str,
                    'column': col,
                    'list_index': pd.NA, # No list index for single boolean
                    'is_substring': result_value,
                    'status': 'OK'
                })

            elif isinstance(result_value, list): # TaskName/TaskDescription case
                if not result_value: # Handle empty lists
                    long_format_data.append({
                        'model': model_name,
                        'pmcid': pmcid_str,
                        'column': col,
                        'list_index': pd.NA,
                        'is_substring': None, # Or pd.NA
                        'status': 'Extracted List Empty'
                    })
                else:
                    for idx, bool_val in enumerate(result_value):
                         long_format_data.append({
                            'model': model_name,
                            'pmcid': pmcid_str,
                            'column': col,
                            'list_index': idx, # Add the index from the list
                            'is_substring': bool_val,
                            'status': 'OK'
                        })

In [202]:
# Create the DataFrame
results_long_df = pd.DataFrame(long_format_data)

# Optional: Set specific data types if needed
results_long_df['pmcid'] = results_long_df['pmcid'].astype(str) # Keep PMCID as string
results_long_df['list_index'] = results_long_df['list_index'].astype(pd.Int64Dtype()) # Use nullable integer type
results_long_df['is_substring'] = results_long_df['is_substring'].astype(pd.BooleanDtype()) # Use nullable boolean type


In [203]:
results_long_df.groupby(['model', 'column'])['is_substring'].mean()

model                                         column         
NuExtract-2-4B_temperature=0_do_sample=False  StudyObjective     0.828571
                                              TaskDescription    0.595455
                                              TaskName           0.945455
NuExtract-v1.5                                StudyObjective     0.820755
                                              TaskDescription    0.863158
                                              TaskName           0.991228
gpt-4o-mini-2024-07-18                        StudyObjective     0.211538
                                              TaskDescription    0.111111
                                              TaskName            0.79798
Name: is_substring, dtype: Float64

In [204]:
results_long_df[(results_long_df['is_substring'] == False) & (results_long_df['column'] == 'TaskDescription')]

Unnamed: 0,model,pmcid,column,list_index,is_substring,status
10,NuExtract-v1.5,10129386,TaskDescription,0,False,OK
131,NuExtract-v1.5,4517759,TaskDescription,1,False,OK
132,NuExtract-v1.5,4517759,TaskDescription,2,False,OK
162,NuExtract-v1.5,5324609,TaskDescription,0,False,OK
163,NuExtract-v1.5,5324609,TaskDescription,1,False,OK
...,...,...,...,...,...,...
1255,gpt-4o-mini-2024-07-18,9308012,TaskDescription,0,False,OK
1258,gpt-4o-mini-2024-07-18,9454014,TaskDescription,0,False,OK
1264,gpt-4o-mini-2024-07-18,9837608,TaskDescription,0,False,OK
1268,gpt-4o-mini-2024-07-18,9910278,TaskDescription,0,False,OK


In [205]:
pmcid = 10129386
_sub = normalize_text(nv_task_nu_dict['NuExtract-2-4B_temperature=0_do_sample=False'][pmcid]['TaskDescription'][0]) 
_text = normalize_text(docs[docs['pmcid'] == pmcid].iloc[0]['text'])
_sub in _text

False

In [206]:
_sub

'participants were administered a set of trait adjectives from the affective norms of emotion words database the original 20 positive and 20 negative trait adjectives were translated to swedish by the first author slightly modified to make the words more relevant in swedish and divided into blocks of five words each which were either all positive or all negative to each block one ‘fluid’ word and one ‘constant’ word judged to be of the same valence as the rest of the block were added total 16 words relating to fluiditysolidity each word was presented for 3 s resulting in a total of 21 s per seven-word block a fixation cross was presented for 4 s between each block the blocks were presented three times once in conjunction with each of three questions ‘describes me’ self condition ‘is positive’ valence condition and ‘is uppercase’ case condition'

In [209]:
_s = 'participants were administered a set of trait adjectives from the affective norms of emotion words database the original 20 positive and 20 negative trait adjectives were translated to swedish by the first author slightly modified to make the words more relevant in swedish and divided into blocks of five words each which were either all positive or all negative to each block one ‘fluid’ word and one ‘constant’ word judged to be of the same valence as the rest of the block were added total 16 words relating to fluiditysolidity examples of such words were ‘bestandig’ durable positive-constant ‘dynamisk’ dynamic positive-fluid ‘statisk’ static negative-constant and ‘flyktig’ volatile negative-fluid half of the words evenly distributed across positive and negative were presented in uppercase and the other half in lowercase letters each word was presented for 3 s resulting in a total of 21 s per seven-word block a fixation cross was presented for 4 s between each block the blocks were presented three times once in conjunction with each of three questions ‘describes me’ self condition ‘is positive’ valence condition and ‘is uppercase’ case condition'

## Take Home findings

For the most part, NuExtract models are fairly truthfully quoting the original text, although occasionally both models will leave out a word

However, more annoyingly, due to punctionation and slight differences in text normalization, it can be difficult to find the substring in the original text.

In longer texts it is likely it is this normalization that is throwing things off, although in NuExtract 2 it is very often modifying the original text

Meanwhile, GPT 4o is not too far from accurately quoting the original task for taskname, but veers further in StudyObject and TaskDescription, although this can often be a good thing