In [1]:
# NOTE(review): this install is unpinned (the captured output below shows 8.0.1 was
# resolved), and no cell visible in this notebook imports deepdiff — confirm it is
# still needed before keeping this cell.
!pip install deepdiff

Collecting deepdiff
  Downloading deepdiff-8.0.1-py3-none-any.whl.metadata (8.5 kB)
Collecting orderly-set==5.2.2 (from deepdiff)
  Downloading orderly_set-5.2.2-py3-none-any.whl.metadata (6.3 kB)
Downloading deepdiff-8.0.1-py3-none-any.whl (82 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.7/82.7 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading orderly_set-5.2.2-py3-none-any.whl (11 kB)
Installing collected packages: orderly-set, deepdiff
Successfully installed deepdiff-8.0.1 orderly-set-5.2.2


In [2]:
import json

def strip_json_code_block(json_str):
    """
    Remove a surrounding Markdown code fence from a JSON string, if present.

    A leading fence (three backticks), optionally followed by a ``json``
    language tag in any letter case, is removed together with the whitespace
    or line break (LF or CRLF) that follows it. A trailing fence is removed as
    well. Text without fences is returned with only its outer whitespace
    trimmed.

    Args:
        json_str: The raw text, possibly fenced. ``None`` is treated as empty.

    Returns:
        The unfenced text with leading and trailing whitespace removed.
    """
    if json_str is None:
        return ''

    text = json_str.strip()
    if text.startswith('```'):
        text = text[3:]
        # Drop the optional language tag. Matching it separately from the line
        # break keeps CRLF-terminated and upper-case tags from leaking into the
        # payload and breaking the later json.loads call.
        if text[:4].lower() == 'json':
            text = text[4:]
        text = text.lstrip()
    if text.endswith('```'):
        text = text[:-3]
    return text.strip()

def preprocess_item(item):
    """
    Return a copy of a decoded JSON value with surrounding whitespace trimmed.

    Strings are stripped, lists and dicts are rebuilt recursively (dict keys
    are stripped too), and every other value is returned unchanged. When two
    keys become equal after stripping, the later one wins.
    """
    if isinstance(item, str):
        return item.strip()

    if isinstance(item, list):
        return [preprocess_item(element) for element in item]

    if isinstance(item, dict):
        cleaned = {}
        for raw_key, raw_value in item.items():
            trimmed_key = raw_key.strip()
            cleaned[trimmed_key] = preprocess_item(raw_value)
        return cleaned

    # Numbers, booleans and None carry no whitespace to trim.
    return item

def parse_json_line(line):
    """
    Decode one JSONL line into a whitespace-normalised Python value.

    If the decoded value is itself a string, it is decoded again, repeatedly,
    until something other than a string comes out.

    Returns:
        A ``(value, error)`` pair. On success ``error`` is None; on any
        failure ``value`` is None and ``error`` is the exception raised.
    """
    try:
        decoded = json.loads(line)
        # Unwrap JSON that was serialised as a string one or more times.
        while True:
            if not isinstance(decoded, str):
                break
            decoded = json.loads(decoded)
        outcome = (preprocess_item(decoded), None)
    except Exception as exc:
        # JSONDecodeError is a subclass of Exception, so one handler covers both.
        outcome = (None, exc)
    return outcome

def read_jsonl_file(file_path):
    """
    Read a JSONL file and index its records by their 'task-id' value.

    Each line is decoded with parse_json_line. Lines that fail to decode, do
    not decode to a JSON object, lack a 'task-id', or carry a 'task-id' that
    cannot be used as a dictionary key are skipped and reported instead of
    aborting the whole read. When several lines share a 'task-id', the last
    one wins.

    Args:
        file_path: Path of the UTF-8 encoded JSONL file.

    Returns:
        A ``(data_dict, errors)`` pair: ``data_dict`` maps each 'task-id' to
        its record, and ``errors`` is a list of ``(line_number, problem)``
        tuples where ``problem`` is an exception or a message string.
    """
    data_dict = {}
    errors = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, 1):
            item, error = parse_json_line(line)
            if error is not None:
                errors.append((line_number, error))
                continue  # Skip this line

            # A line may hold valid JSON that is not an object (list, number,
            # null); such a value has no .get and cannot carry a 'task-id'.
            if not isinstance(item, dict):
                errors.append(
                    (line_number, f"Expected a JSON object, got {type(item).__name__}")
                )
                continue

            task_id = item.get('task-id')
            if task_id is None:
                errors.append((line_number, "Missing 'task-id'"))
                continue

            try:
                data_dict[task_id] = item
            except TypeError:
                # A list or dict 'task-id' is unhashable and cannot be a key.
                errors.append((line_number, "Unhashable 'task-id'"))
    return data_dict, errors

def flatten_json(y):
    """
    Flatten a nested JSON value into a single-level dict of path -> leaf.

    Dict keys are joined with '.', and list positions are appended as '[i]'.
    A bare scalar is stored under the empty-string path. Empty dicts and
    empty lists contribute no entries at all.
    """
    def walk(node, prefix):
        # Yield (path, leaf) pairs in depth-first, document order.
        if isinstance(node, dict):
            for key in node:
                child_path = f"{prefix}.{key}" if prefix else key
                yield from walk(node[key], child_path)
        elif isinstance(node, list):
            for position, element in enumerate(node):
                yield from walk(element, f"{prefix}[{position}]")
        else:
            yield prefix, node

    flat = {}
    for path, leaf in walk(y, ''):
        flat[path] = leaf
    return flat

def compare_jsons(gold_json, llm_json):
    """
    Score an LLM JSON object against the gold one, leaf by leaf.

    Both objects are flattened to path -> value maps and counted as follows:
      * TP: path present in both with equal values;
      * FN: gold path missing from the LLM output, or present with a
        different value;
      * FP: path present only in the LLM output.

    Returns:
        A dict with the integer counts under 'TP', 'FP' and 'FN'.
    """
    expected = flatten_json(gold_json)
    predicted = flatten_json(llm_json)

    shared_paths = expected.keys() & predicted.keys()
    extra_paths = predicted.keys() - expected.keys()

    # Only paths found on both sides can match.
    matches = sum(1 for path in shared_paths if expected[path] == predicted[path])

    # Every gold path that is not a match is a miss, whether absent or wrong.
    misses = len(expected) - matches

    return {'TP': matches, 'FP': len(extra_paths), 'FN': misses}

def _load_json_field(item):
    """
    Extract and decode the 'json' payload of one record.

    Returns a ``(present, payload)`` pair. ``present`` is False when the field
    is absent, null or empty once any code fence is stripped; otherwise
    ``payload`` is the decoded, whitespace-normalised value. Raises
    json.JSONDecodeError when the payload is not valid JSON.
    """
    raw = strip_json_code_block(item.get('json') or '')
    if not raw:
        return False, None
    return True, preprocess_item(json.loads(raw))


def compare_jsonl_files(gold_corpus_file, llm_output_file):
    """
    Compare two JSONL files focusing on the 'json' key and report statistics.

    Records are paired by 'task-id'. For each pair the 'json' payloads are
    flattened and scored with compare_jsons. Gold records with no LLM
    counterpart count entirely as false negatives, LLM records with no gold
    counterpart count entirely as false positives, and an LLM payload that is
    missing or unparseable counts the whole gold record as false negatives.
    Task IDs are processed in sorted order so the printed report is the same
    on every run.

    Args:
        gold_corpus_file: Path to the gold-standard JSONL file.
        llm_output_file: Path to the JSONL file with the model's outputs.

    Returns:
        None. Totals, metrics and any parsing errors are printed.

    Raises:
        Exception: If a gold record that has an LLM counterpart lacks a
            'json' payload or its payload cannot be parsed.
    """
    gold_dict, gold_errors = read_jsonl_file(gold_corpus_file)
    llm_dict, llm_errors = read_jsonl_file(llm_output_file)

    # (task_id, message) pairs for problems found inside the 'json' field
    json_parsing_errors = []

    total_TP = 0
    total_FP = 0
    total_FN = 0

    gold_task_ids = set(gold_dict.keys())
    llm_task_ids = set(llm_dict.keys())

    # Set iteration order for string IDs varies between interpreter runs;
    # sorting (by str, so mixed ID types stay comparable) keeps the printed
    # report reproducible.
    missing_task_ids = sorted(gold_task_ids - llm_task_ids, key=str)
    extra_task_ids = sorted(llm_task_ids - gold_task_ids, key=str)
    matching_task_ids = sorted(gold_task_ids & llm_task_ids, key=str)

    # Handle missing Task IDs (LLM failed to produce output)
    for task_id in missing_task_ids:
        print(f"\nMissing Task ID in LLM output: {task_id}")
        try:
            present, gold_json = _load_json_field(gold_dict[task_id])
        except json.JSONDecodeError as e:
            # Report the problem instead of dropping the record silently.
            json_parsing_errors.append(
                (task_id, f"Error parsing gold JSON for Task ID {task_id}: {e}")
            )
            continue
        if not present:
            json_parsing_errors.append(
                (task_id, f"Task ID {task_id} missing 'json' in gold corpus.")
            )
            continue
        total_FN += len(flatten_json(gold_json))

    # Handle extra Task IDs (LLM produced unexpected output)
    for task_id in extra_task_ids:
        print(f"\nExtra Task ID in LLM output: {task_id}")
        try:
            present, llm_json = _load_json_field(llm_dict[task_id])
        except json.JSONDecodeError as e:
            json_parsing_errors.append(
                (task_id, f"Error parsing LLM JSON for Task ID {task_id}: {e}")
            )
            continue
        if not present:
            json_parsing_errors.append(
                (task_id, f"Task ID {task_id} missing 'json' in LLM output.")
            )
            continue
        total_FP += len(flatten_json(llm_json))

    # Compare 'json' key for matching Task IDs
    for task_id in matching_task_ids:
        # A broken gold record invalidates the evaluation, so fail loudly and
        # say which record is at fault.
        try:
            gold_present, gold_json = _load_json_field(gold_dict[task_id])
        except json.JSONDecodeError as e:
            error_message = f"Error parsing gold JSON for Task ID {task_id}: {e}"
            raise Exception(f'Check your gold corpus! {error_message}') from e
        if not gold_present:
            error_message = f"Task ID {task_id} missing 'json' in gold corpus."
            raise Exception(f'Check your gold corpus! {error_message}')

        expected_count = len(flatten_json(gold_json))

        # If the LLM payload is missing or invalid, every expected data point
        # counts as a False Negative.
        try:
            llm_present, llm_json = _load_json_field(llm_dict[task_id])
        except json.JSONDecodeError as e:
            error_message = f"Error parsing LLM JSON for Task ID {task_id}: {e}"
            json_parsing_errors.append((task_id, error_message))
            total_FN += expected_count
            continue
        if not llm_present:
            error_message = f"Task ID {task_id} missing 'json' in LLM output."
            json_parsing_errors.append((task_id, error_message))
            total_FN += expected_count
            continue

        # Compare the JSON structures
        stats = compare_jsons(gold_json, llm_json)

        # Accumulate totals
        total_TP += stats['TP']
        total_FP += stats['FP']
        total_FN += stats['FN']

    # Calculate metrics (all expressed as percentages; 0 when undefined)
    total_items = total_TP + total_FP + total_FN
    precision = (total_TP / (total_TP + total_FP)) * 100 if (total_TP + total_FP) > 0 else 0
    recall = (total_TP / (total_TP + total_FN)) * 100 if (total_TP + total_FN) > 0 else 0
    f1_score = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0
    accuracy = (total_TP / total_items) * 100 if total_items > 0 else 0

    # Report parsing errors
    total_errors = len(gold_errors) + len(llm_errors) + len(json_parsing_errors)
    if total_errors > 0:
        print(f"\nTotal parsing errors: {total_errors}")
        if gold_errors:
            print("\nErrors in gold corpus:")
            for line_number, error in gold_errors:
                print(f"  Line {line_number}: {error}")
        if llm_errors:
            print("\nErrors in LLM output:")
            for line_number, error in llm_errors:
                print(f"  Line {line_number}: {error}")
        if json_parsing_errors:
            print("\nErrors parsing 'json' field:")
            for task_id, error_message in json_parsing_errors:
                print(f"  Task ID {task_id}: {error_message}")

    # Print total statistics and evaluation metrics
    print("\nTotal statistics across all Task IDs:")
    print(f"  Total True Positives (Correctly formatted items): {total_TP}")
    print(f"  Total False Positives (Extra items not in gold standard): {total_FP}")
    print(f"  Total False Negatives (Missing or incorrect items): {total_FN}")
    print("\nEvaluation Metrics:")
    print(f"  Accuracy: {accuracy:.2f}%")
    print(f"  Precision: {precision:.2f}%")
    print(f"  Recall: {recall:.2f}%")
    print(f"  F1 Score: {f1_score:.2f}%")

# Get gold corpus

In [3]:
!wget https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/gold_corpus_structured_yudakhin.jsonl

--2024-09-23 04:57:35--  https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/gold_corpus_structured_yudakhin.jsonl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 81445 (80K) [text/plain]
Saving to: ‘gold_corpus_structured_yudakhin.jsonl’


2024-09-23 04:57:35 (31.5 MB/s) - ‘gold_corpus_structured_yudakhin.jsonl’ saved [81445/81445]



# Get fine-tuned Gemini Flash 1.5's output

In [4]:
!wget https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/gemini_flash_outputs/fine_tuned_gemini_flash_outputs.jsonl

--2024-09-23 04:57:35--  https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/gemini_flash_outputs/fine_tuned_gemini_flash_outputs.jsonl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 90259 (88K) [text/plain]
Saving to: ‘fine_tuned_gemini_flash_outputs.jsonl’


2024-09-23 04:57:35 (23.6 MB/s) - ‘fine_tuned_gemini_flash_outputs.jsonl’ saved [90259/90259]



In [5]:
# Score the fine-tuned Gemini Flash 1.5 outputs against the gold corpus.
gold_corpus_file = 'gold_corpus_structured_yudakhin.jsonl'
llm_output_file = 'fine_tuned_gemini_flash_outputs.jsonl'

compare_jsonl_files(gold_corpus_file, llm_output_file)


Total parsing errors: 3

Errors parsing 'json' field:
  Task ID 4130a6ce-2389-4727-8168-9a0ade47d420: Error parsing LLM JSON for Task ID 4130a6ce-2389-4727-8168-9a0ade47d420: Extra data: line 18 column 1 (char 241)
  Task ID 71f0c0d5-3302-4395-b41e-0f7f97d89163: Error parsing LLM JSON for Task ID 71f0c0d5-3302-4395-b41e-0f7f97d89163: Extra data: line 14 column 1 (char 165)
  Task ID d1c737f7-a5a6-4c6e-803a-34165ac47b25: Error parsing LLM JSON for Task ID d1c737f7-a5a6-4c6e-803a-34165ac47b25: Extra data: line 14 column 1 (char 174)

Total statistics across all Task IDs:
  Total True Positives (Correctly formatted items): 700
  Total False Positives (Extra items not in gold standard): 107
  Total False Negatives (Missing or incorrect items): 148

Evaluation Metrics:
  Accuracy: 73.30%
  Precision: 86.74%
  Recall: 82.55%
  F1 Score: 84.59%


# Get vanilla Gemini Flash 1.5's output

In [6]:
!wget https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/gemini_flash_outputs/untrained_gemini_flash_outputs.jsonl

--2024-09-23 04:57:43--  https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/gemini_flash_outputs/untrained_gemini_flash_outputs.jsonl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 186519 (182K) [text/plain]
Saving to: ‘untrained_gemini_flash_outputs.jsonl’


2024-09-23 04:57:43 (32.5 MB/s) - ‘untrained_gemini_flash_outputs.jsonl’ saved [186519/186519]



### Zero shot

In [7]:
# Score the untrained (vanilla) Gemini Flash 1.5 zero-shot outputs against the gold corpus.
gold_corpus_file = 'gold_corpus_structured_yudakhin.jsonl'
llm_output_file = 'untrained_gemini_flash_outputs.jsonl'

compare_jsonl_files(gold_corpus_file, llm_output_file)


Total parsing errors: 5

Errors parsing 'json' field:
  Task ID 8ffa3ce8-c2d9-40f8-871e-4aff86c31f99: Error parsing LLM JSON for Task ID 8ffa3ce8-c2d9-40f8-871e-4aff86c31f99: Expecting ',' delimiter: line 12 column 25 (char 248)
  Task ID af43de31-8fc5-4a63-93ef-dc570efc4449: Error parsing LLM JSON for Task ID af43de31-8fc5-4a63-93ef-dc570efc4449: Expecting ',' delimiter: line 49 column 25 (char 1259)
  Task ID cc2d5608-c629-4856-856a-4af949c975eb: Error parsing LLM JSON for Task ID cc2d5608-c629-4856-856a-4af949c975eb: Expecting ',' delimiter: line 12 column 29 (char 195)
  Task ID a6661a28-9323-4c5c-a7f9-d6730686a5ff: Error parsing LLM JSON for Task ID a6661a28-9323-4c5c-a7f9-d6730686a5ff: Expecting ',' delimiter: line 11 column 25 (char 195)
  Task ID acb89d97-59b5-48f5-ab7b-0a8dcfe00303: Error parsing LLM JSON for Task ID acb89d97-59b5-48f5-ab7b-0a8dcfe00303: Expecting ',' delimiter: line 11 column 25 (char 208)

Total statistics across all Task IDs:
  Total True Positives (Correc

# Get vanilla Gemini Flash 1.5's output with few-shot examples

In [8]:
!wget https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/gemini_flash_outputs/untrained_gemini_flash_outputs_23_examples.jsonl
!wget https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/gemini_flash_outputs/untrained_gemini_flash_outputs_17_examples.jsonl
!wget https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/gemini_flash_outputs/untrained_gemini_flash_outputs_12_examples.jsonl
!wget https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/gemini_flash_outputs/untrained_gemini_flash_outputs_6_examples.jsonl

--2024-09-23 04:57:50--  https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/gemini_flash_outputs/untrained_gemini_flash_outputs_23_examples.jsonl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 81138 (79K) [text/plain]
Saving to: ‘untrained_gemini_flash_outputs_23_examples.jsonl’


2024-09-23 04:57:50 (29.9 MB/s) - ‘untrained_gemini_flash_outputs_23_examples.jsonl’ saved [81138/81138]

--2024-09-23 04:57:50--  https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/gemini_flash_outputs/untrained_gemini_flash_outputs_17_examples.jsonl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (

In [9]:
# Score untrained Gemini Flash 1.5 prompted with 6 few-shot examples.
gold_corpus_file = 'gold_corpus_structured_yudakhin.jsonl'
llm_output_file = 'untrained_gemini_flash_outputs_6_examples.jsonl'

compare_jsonl_files(gold_corpus_file, llm_output_file)


Total statistics across all Task IDs:
  Total True Positives (Correctly formatted items): 702
  Total False Positives (Extra items not in gold standard): 59
  Total False Negatives (Missing or incorrect items): 146

Evaluation Metrics:
  Accuracy: 77.40%
  Precision: 92.25%
  Recall: 82.78%
  F1 Score: 87.26%


In [10]:
# Score untrained Gemini Flash 1.5 prompted with 12 few-shot examples.
gold_corpus_file = 'gold_corpus_structured_yudakhin.jsonl'
llm_output_file = 'untrained_gemini_flash_outputs_12_examples.jsonl'

compare_jsonl_files(gold_corpus_file, llm_output_file)


Total statistics across all Task IDs:
  Total True Positives (Correctly formatted items): 712
  Total False Positives (Extra items not in gold standard): 41
  Total False Negatives (Missing or incorrect items): 136

Evaluation Metrics:
  Accuracy: 80.09%
  Precision: 94.56%
  Recall: 83.96%
  F1 Score: 88.94%


In [11]:
# Score untrained Gemini Flash 1.5 prompted with 17 few-shot examples.
gold_corpus_file = 'gold_corpus_structured_yudakhin.jsonl'
llm_output_file = 'untrained_gemini_flash_outputs_17_examples.jsonl'

compare_jsonl_files(gold_corpus_file, llm_output_file)


Total statistics across all Task IDs:
  Total True Positives (Correctly formatted items): 719
  Total False Positives (Extra items not in gold standard): 40
  Total False Negatives (Missing or incorrect items): 129

Evaluation Metrics:
  Accuracy: 80.97%
  Precision: 94.73%
  Recall: 84.79%
  F1 Score: 89.48%


In [12]:
# Score untrained Gemini Flash 1.5 prompted with 23 few-shot examples.
gold_corpus_file = 'gold_corpus_structured_yudakhin.jsonl'
llm_output_file = 'untrained_gemini_flash_outputs_23_examples.jsonl'

compare_jsonl_files(gold_corpus_file, llm_output_file)


Total statistics across all Task IDs:
  Total True Positives (Correctly formatted items): 723
  Total False Positives (Extra items not in gold standard): 39
  Total False Negatives (Missing or incorrect items): 125

Evaluation Metrics:
  Accuracy: 81.51%
  Precision: 94.88%
  Recall: 85.26%
  F1 Score: 89.81%


# Get vanilla ChatGPT-4o's zero-shot output

In [13]:
!wget https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/chatgpt_4o_outputs/untrained_chatgpt_4o_outputs.jsonl

--2024-09-23 04:58:06--  https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/chatgpt_4o_outputs/untrained_chatgpt_4o_outputs.jsonl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 163744 (160K) [text/plain]
Saving to: ‘untrained_chatgpt_4o_outputs.jsonl’


2024-09-23 04:58:06 (31.2 MB/s) - ‘untrained_chatgpt_4o_outputs.jsonl’ saved [163744/163744]



In [14]:
# Score the untrained (vanilla) ChatGPT-4o zero-shot outputs against the gold corpus.
gold_corpus_file = 'gold_corpus_structured_yudakhin.jsonl'
llm_output_file = 'untrained_chatgpt_4o_outputs.jsonl'

compare_jsonl_files(gold_corpus_file, llm_output_file)


Total statistics across all Task IDs:
  Total True Positives (Correctly formatted items): 473
  Total False Positives (Extra items not in gold standard): 346
  Total False Negatives (Missing or incorrect items): 375

Evaluation Metrics:
  Accuracy: 39.61%
  Precision: 57.75%
  Recall: 55.78%
  F1 Score: 56.75%


# Get vanilla ChatGPT 4o's outputs with few-shot examples


In [15]:
!wget https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/chatgpt_4o_outputs/untrained_chatgpt_4o_outputs_23_examples.jsonl
!wget https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/chatgpt_4o_outputs/untrained_chatgpt_4o_outputs_17_examples.jsonl
!wget https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/chatgpt_4o_outputs/untrained_chatgpt_4o_outputs_12_examples.jsonl
!wget https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/chatgpt_4o_outputs/untrained_chatgpt_4o_outputs_6_examples.jsonl

--2024-09-23 04:58:10--  https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/chatgpt_4o_outputs/untrained_chatgpt_4o_outputs_23_examples.jsonl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 81025 (79K) [text/plain]
Saving to: ‘untrained_chatgpt_4o_outputs_23_examples.jsonl’


2024-09-23 04:58:10 (21.0 MB/s) - ‘untrained_chatgpt_4o_outputs_23_examples.jsonl’ saved [81025/81025]

--2024-09-23 04:58:10--  https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/chatgpt_4o_outputs/untrained_chatgpt_4o_outputs_17_examples.jsonl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubus

In [16]:
# Score untrained ChatGPT-4o prompted with 6 few-shot examples.
gold_corpus_file = 'gold_corpus_structured_yudakhin.jsonl'
llm_output_file = 'untrained_chatgpt_4o_outputs_6_examples.jsonl'

compare_jsonl_files(gold_corpus_file, llm_output_file)


Total statistics across all Task IDs:
  Total True Positives (Correctly formatted items): 710
  Total False Positives (Extra items not in gold standard): 110
  Total False Negatives (Missing or incorrect items): 138

Evaluation Metrics:
  Accuracy: 74.11%
  Precision: 86.59%
  Recall: 83.73%
  F1 Score: 85.13%


In [17]:
# Score untrained ChatGPT-4o prompted with 12 few-shot examples.
gold_corpus_file = 'gold_corpus_structured_yudakhin.jsonl'
llm_output_file = 'untrained_chatgpt_4o_outputs_12_examples.jsonl'

compare_jsonl_files(gold_corpus_file, llm_output_file)


Total statistics across all Task IDs:
  Total True Positives (Correctly formatted items): 754
  Total False Positives (Extra items not in gold standard): 39
  Total False Negatives (Missing or incorrect items): 94

Evaluation Metrics:
  Accuracy: 85.01%
  Precision: 95.08%
  Recall: 88.92%
  F1 Score: 91.90%


In [18]:
# Score untrained ChatGPT-4o prompted with 17 few-shot examples.
gold_corpus_file = 'gold_corpus_structured_yudakhin.jsonl'
llm_output_file = 'untrained_chatgpt_4o_outputs_17_examples.jsonl'

compare_jsonl_files(gold_corpus_file, llm_output_file)


Total statistics across all Task IDs:
  Total True Positives (Correctly formatted items): 781
  Total False Positives (Extra items not in gold standard): 31
  Total False Negatives (Missing or incorrect items): 67

Evaluation Metrics:
  Accuracy: 88.85%
  Precision: 96.18%
  Recall: 92.10%
  F1 Score: 94.10%


In [19]:
# Score untrained ChatGPT-4o prompted with 23 few-shot examples.
gold_corpus_file = 'gold_corpus_structured_yudakhin.jsonl'
llm_output_file = 'untrained_chatgpt_4o_outputs_23_examples.jsonl'

compare_jsonl_files(gold_corpus_file, llm_output_file)


Total statistics across all Task IDs:
  Total True Positives (Correctly formatted items): 801
  Total False Positives (Extra items not in gold standard): 16
  Total False Negatives (Missing or incorrect items): 47

Evaluation Metrics:
  Accuracy: 92.71%
  Precision: 98.04%
  Recall: 94.46%
  F1 Score: 96.22%


# Get vanilla ChatGPT 4o-mini's zero-shot outputs


In [20]:
!wget https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/chatgpt_4o_mini_outputs/untrained_chatgpt_4o_mini_outputs.jsonl

--2024-09-23 04:58:28--  https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/chatgpt_4o_mini_outputs/untrained_chatgpt_4o_mini_outputs.jsonl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 160193 (156K) [text/plain]
Saving to: ‘untrained_chatgpt_4o_mini_outputs.jsonl’


2024-09-23 04:58:28 (36.9 MB/s) - ‘untrained_chatgpt_4o_mini_outputs.jsonl’ saved [160193/160193]



In [21]:
# Score the untrained (vanilla) ChatGPT 4o-mini zero-shot outputs against the gold corpus.
gold_corpus_file = 'gold_corpus_structured_yudakhin.jsonl'
llm_output_file = 'untrained_chatgpt_4o_mini_outputs.jsonl'

compare_jsonl_files(gold_corpus_file, llm_output_file)


Total statistics across all Task IDs:
  Total True Positives (Correctly formatted items): 298
  Total False Positives (Extra items not in gold standard): 458
  Total False Negatives (Missing or incorrect items): 550

Evaluation Metrics:
  Accuracy: 22.82%
  Precision: 39.42%
  Recall: 35.14%
  F1 Score: 37.16%


# Get vanilla ChatGPT 4o-mini's outputs with few-shot examples

In [22]:
!wget https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/chatgpt_4o_mini_outputs/untrained_chatgpt_4o_mini_outputs_23_examples.jsonl
!wget https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/chatgpt_4o_mini_outputs/untrained_chatgpt_4o_mini_outputs_17_examples.jsonl
!wget https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/chatgpt_4o_mini_outputs/untrained_chatgpt_4o_mini_outputs_12_examples.jsonl
!wget https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/chatgpt_4o_mini_outputs/untrained_chatgpt_4o_mini_outputs_6_examples.jsonl

--2024-09-23 04:58:40--  https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/chatgpt_4o_mini_outputs/untrained_chatgpt_4o_mini_outputs_23_examples.jsonl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 81830 (80K) [text/plain]
Saving to: ‘untrained_chatgpt_4o_mini_outputs_23_examples.jsonl’


2024-09-23 04:58:40 (37.2 MB/s) - ‘untrained_chatgpt_4o_mini_outputs_23_examples.jsonl’ saved [81830/81830]

--2024-09-23 04:58:40--  https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/chatgpt_4o_mini_outputs/untrained_chatgpt_4o_mini_outputs_17_examples.jsonl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githu

In [23]:
# Score untrained ChatGPT 4o-mini prompted with 6 few-shot examples.
gold_corpus_file = 'gold_corpus_structured_yudakhin.jsonl'
llm_output_file = 'untrained_chatgpt_4o_mini_outputs_6_examples.jsonl'

compare_jsonl_files(gold_corpus_file, llm_output_file)


Total statistics across all Task IDs:
  Total True Positives (Correctly formatted items): 637
  Total False Positives (Extra items not in gold standard): 86
  Total False Negatives (Missing or incorrect items): 211

Evaluation Metrics:
  Accuracy: 68.20%
  Precision: 88.11%
  Recall: 75.12%
  F1 Score: 81.09%


In [24]:
# Score untrained ChatGPT 4o-mini prompted with 12 few-shot examples.
gold_corpus_file = 'gold_corpus_structured_yudakhin.jsonl'
llm_output_file = 'untrained_chatgpt_4o_mini_outputs_12_examples.jsonl'

compare_jsonl_files(gold_corpus_file, llm_output_file)


Total statistics across all Task IDs:
  Total True Positives (Correctly formatted items): 608
  Total False Positives (Extra items not in gold standard): 112
  Total False Negatives (Missing or incorrect items): 240

Evaluation Metrics:
  Accuracy: 63.33%
  Precision: 84.44%
  Recall: 71.70%
  F1 Score: 77.55%


In [25]:
# Score untrained ChatGPT 4o-mini prompted with 17 few-shot examples.
gold_corpus_file = 'gold_corpus_structured_yudakhin.jsonl'
llm_output_file = 'untrained_chatgpt_4o_mini_outputs_17_examples.jsonl'

compare_jsonl_files(gold_corpus_file, llm_output_file)


Total statistics across all Task IDs:
  Total True Positives (Correctly formatted items): 640
  Total False Positives (Extra items not in gold standard): 96
  Total False Negatives (Missing or incorrect items): 208

Evaluation Metrics:
  Accuracy: 67.80%
  Precision: 86.96%
  Recall: 75.47%
  F1 Score: 80.81%


In [26]:
# Score untrained ChatGPT 4o-mini prompted with 23 few-shot examples.
gold_corpus_file = 'gold_corpus_structured_yudakhin.jsonl'
llm_output_file = 'untrained_chatgpt_4o_mini_outputs_23_examples.jsonl'

compare_jsonl_files(gold_corpus_file, llm_output_file)


Total statistics across all Task IDs:
  Total True Positives (Correctly formatted items): 647
  Total False Positives (Extra items not in gold standard): 62
  Total False Negatives (Missing or incorrect items): 201

Evaluation Metrics:
  Accuracy: 71.10%
  Precision: 91.26%
  Recall: 76.30%
  F1 Score: 83.11%


# Get fine-tuned ChatGPT-4o-mini's output

In [27]:
!wget https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/chatgpt_4o_mini_outputs/finetuned_chatgpt_4o_mini_outputs.jsonl

--2024-09-23 04:58:47--  https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/chatgpt_4o_mini_outputs/finetuned_chatgpt_4o_mini_outputs.jsonl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80854 (79K) [text/plain]
Saving to: ‘finetuned_chatgpt_4o_mini_outputs.jsonl’


2024-09-23 04:58:47 (31.3 MB/s) - ‘finetuned_chatgpt_4o_mini_outputs.jsonl’ saved [80854/80854]



In [28]:
# Score the fine-tuned ChatGPT-4o-mini outputs against the gold corpus.
gold_corpus_file = 'gold_corpus_structured_yudakhin.jsonl'
llm_output_file = 'finetuned_chatgpt_4o_mini_outputs.jsonl'

compare_jsonl_files(gold_corpus_file, llm_output_file)


Total statistics across all Task IDs:
  Total True Positives (Correctly formatted items): 763
  Total False Positives (Extra items not in gold standard): 33
  Total False Negatives (Missing or incorrect items): 85

Evaluation Metrics:
  Accuracy: 86.61%
  Precision: 95.85%
  Recall: 89.98%
  F1 Score: 92.82%


# Get vanilla Mistral-Nemo's zero-shot outputs


In [29]:
!wget https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/mistral_nemo_outputs/untrained_mistral_nemo_outputs.jsonl

--2024-09-23 04:58:49--  https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/mistral_nemo_outputs/untrained_mistral_nemo_outputs.jsonl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 206358 (202K) [text/plain]
Saving to: ‘untrained_mistral_nemo_outputs.jsonl’


2024-09-23 04:58:49 (62.4 MB/s) - ‘untrained_mistral_nemo_outputs.jsonl’ saved [206358/206358]



In [30]:
# Score the untrained (vanilla) Mistral-Nemo zero-shot outputs against the gold corpus.
gold_corpus_file = 'gold_corpus_structured_yudakhin.jsonl'
llm_output_file = 'untrained_mistral_nemo_outputs.jsonl'

compare_jsonl_files(gold_corpus_file, llm_output_file)


Total parsing errors: 3

Errors parsing 'json' field:
  Task ID 853aa35e-10bd-4afe-8591-16b670a7fafd: Error parsing LLM JSON for Task ID 853aa35e-10bd-4afe-8591-16b670a7fafd: Unterminated string starting at: line 8 column 15 (char 138)
  Task ID d1c737f7-a5a6-4c6e-803a-34165ac47b25: Error parsing LLM JSON for Task ID d1c737f7-a5a6-4c6e-803a-34165ac47b25: Unterminated string starting at: line 8 column 15 (char 155)
  Task ID d9b6c0e6-9896-4edd-b051-afbb676fa563: Error parsing LLM JSON for Task ID d9b6c0e6-9896-4edd-b051-afbb676fa563: Unterminated string starting at: line 8 column 15 (char 135)

Total statistics across all Task IDs:
  Total True Positives (Correctly formatted items): 349
  Total False Positives (Extra items not in gold standard): 572
  Total False Negatives (Missing or incorrect items): 499

Evaluation Metrics:
  Accuracy: 24.58%
  Precision: 37.89%
  Recall: 41.16%
  F1 Score: 39.46%


# Get vanilla Mistral-Nemo's few-shot outputs

In [31]:
!wget https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/mistral_nemo_outputs/untrained_mistral_nemo_outputs_23_examples.jsonl
!wget https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/mistral_nemo_outputs/untrained_mistral_nemo_outputs_17_examples.jsonl
!wget https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/mistral_nemo_outputs/untrained_mistral_nemo_outputs_12_examples.jsonl
!wget https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/mistral_nemo_outputs/untrained_mistral_nemo_outputs_6_examples.jsonl

--2024-09-23 04:58:52--  https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/mistral_nemo_outputs/untrained_mistral_nemo_outputs_23_examples.jsonl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84295 (82K) [text/plain]
Saving to: ‘untrained_mistral_nemo_outputs_23_examples.jsonl’


2024-09-23 04:58:52 (29.4 MB/s) - ‘untrained_mistral_nemo_outputs_23_examples.jsonl’ saved [84295/84295]

--2024-09-23 04:58:52--  https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/mistral_nemo_outputs/untrained_mistral_nemo_outputs_17_examples.jsonl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (

In [32]:
# Score the vanilla Mistral-Nemo few-shot run stored in the "_6_examples" file
# (presumably 6 in-context examples, going by the file name -- confirm)
# against the gold corpus. compare_jsonl_files prints the TP/FP/FN totals and
# accuracy / precision / recall / F1 shown below.
gold_corpus_file = 'gold_corpus_structured_yudakhin.jsonl'
llm_output_file = 'untrained_mistral_nemo_outputs_6_examples.jsonl'

compare_jsonl_files(gold_corpus_file, llm_output_file)


Total statistics across all Task IDs:
  Total True Positives (Correctly formatted items): 590
  Total False Positives (Extra items not in gold standard): 81
  Total False Negatives (Missing or incorrect items): 258

Evaluation Metrics:
  Accuracy: 63.51%
  Precision: 87.93%
  Recall: 69.58%
  F1 Score: 77.68%


In [33]:
# Score the vanilla Mistral-Nemo few-shot run stored in the "_12_examples" file
# (presumably 12 in-context examples, going by the file name -- confirm)
# against the gold corpus. compare_jsonl_files prints the TP/FP/FN totals and
# accuracy / precision / recall / F1 shown below.
gold_corpus_file = 'gold_corpus_structured_yudakhin.jsonl'
llm_output_file = 'untrained_mistral_nemo_outputs_12_examples.jsonl'

compare_jsonl_files(gold_corpus_file, llm_output_file)


Total statistics across all Task IDs:
  Total True Positives (Correctly formatted items): 624
  Total False Positives (Extra items not in gold standard): 117
  Total False Negatives (Missing or incorrect items): 224

Evaluation Metrics:
  Accuracy: 64.66%
  Precision: 84.21%
  Recall: 73.58%
  F1 Score: 78.54%


In [34]:
# Score the vanilla Mistral-Nemo few-shot run stored in the "_17_examples" file
# (presumably 17 in-context examples, going by the file name -- confirm)
# against the gold corpus. compare_jsonl_files prints the TP/FP/FN totals and
# accuracy / precision / recall / F1 shown below.
gold_corpus_file = 'gold_corpus_structured_yudakhin.jsonl'
llm_output_file = 'untrained_mistral_nemo_outputs_17_examples.jsonl'

compare_jsonl_files(gold_corpus_file, llm_output_file)


Total statistics across all Task IDs:
  Total True Positives (Correctly formatted items): 630
  Total False Positives (Extra items not in gold standard): 114
  Total False Negatives (Missing or incorrect items): 218

Evaluation Metrics:
  Accuracy: 65.49%
  Precision: 84.68%
  Recall: 74.29%
  F1 Score: 79.15%


In [35]:
# Score the vanilla Mistral-Nemo few-shot run stored in the "_23_examples" file
# (presumably 23 in-context examples, going by the file name -- confirm)
# against the gold corpus. compare_jsonl_files prints the TP/FP/FN totals and
# accuracy / precision / recall / F1 shown below.
gold_corpus_file = 'gold_corpus_structured_yudakhin.jsonl'
llm_output_file = 'untrained_mistral_nemo_outputs_23_examples.jsonl'

compare_jsonl_files(gold_corpus_file, llm_output_file)


Total statistics across all Task IDs:
  Total True Positives (Correctly formatted items): 654
  Total False Positives (Extra items not in gold standard): 117
  Total False Negatives (Missing or incorrect items): 194

Evaluation Metrics:
  Accuracy: 67.77%
  Precision: 84.82%
  Recall: 77.12%
  F1 Score: 80.79%


# Get fine-tuned Mistral-Nemo's output

In [36]:
!wget https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/mistral_nemo_outputs/finedtuned_mistral_nemo_outputs.jsonl

--2024-09-23 04:59:19--  https://raw.githubusercontent.com/kyrgyz-nlp/structured_yudakhin/refs/heads/main/data/mistral_nemo_outputs/finedtuned_mistral_nemo_outputs.jsonl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 81321 (79K) [text/plain]
Saving to: ‘finedtuned_mistral_nemo_outputs.jsonl’


2024-09-23 04:59:19 (27.9 MB/s) - ‘finedtuned_mistral_nemo_outputs.jsonl’ saved [81321/81321]



In [37]:
# Score the fine-tuned Mistral-Nemo outputs against the gold corpus.
# The "finedtuned" spelling (sic) matches the name of the downloaded file and
# must not be corrected here. compare_jsonl_files prints the TP/FP/FN totals
# and accuracy / precision / recall / F1 shown below.
gold_corpus_file = 'gold_corpus_structured_yudakhin.jsonl'
llm_output_file = 'finedtuned_mistral_nemo_outputs.jsonl'

compare_jsonl_files(gold_corpus_file, llm_output_file)


Total statistics across all Task IDs:
  Total True Positives (Correctly formatted items): 729
  Total False Positives (Extra items not in gold standard): 54
  Total False Negatives (Missing or incorrect items): 119

Evaluation Metrics:
  Accuracy: 80.82%
  Precision: 93.10%
  Recall: 85.97%
  F1 Score: 89.39%
