In [None]:
import spacy
from spacy.scorer import Scorer
from spacy.tokens import Span

import pandas as pd

In [None]:
# Define a function to calculate precision, recall, and F1 score for spans
def calculate_metrics(annotated_spans, predicted_spans):
    """Compute set-overlap precision, recall and F1 between two collections.

    Both arguments are converted to sets, so duplicates are ignored and every
    element must be hashable. A ``str`` argument is iterated character by
    character, exactly like any other iterable.

    Args:
        annotated_spans: Iterable of hashable gold items.
        predicted_spans: Iterable of hashable predicted items.

    Returns:
        Tuple ``(precision, recall, f1_score)`` of floats. A metric whose
        denominator is zero is reported as ``0.0``.
    """
    # Build each set exactly once: cheaper than rebuilding it per metric, and
    # one-shot iterators (generators, zip objects) are not silently exhausted
    # after the first use.
    annotated = set(annotated_spans)
    predicted = set(predicted_spans)

    true_positives = len(annotated & predicted)
    false_positives = len(predicted - annotated)
    false_negatives = len(annotated - predicted)

    precision = true_positives / (true_positives + false_positives) if true_positives or false_positives else 0.0
    recall = true_positives / (true_positives + false_negatives) if true_positives or false_negatives else 0.0
    f1_score = 2 * (precision * recall) / (precision + recall) if precision or recall else 0.0

    return precision, recall, f1_score

In [None]:
# Load the predictions table to be evaluated and preview its first two rows.
df = pd.read_csv('predictions_for_evaluation.csv')
df.head(2)

In [None]:
# List the column names of the loaded table.
df.columns

In [None]:
# Apply the function to each row of the DataFrame.
# calculate_metrics returns one (precision, recall, f1_score) tuple per row;
# zip(*...) transposes those tuples into three sequences, one per new column.
# NOTE(review): df comes from read_csv, so if 'Text' and 'SpanCat_Predictions'
# hold plain strings, the set() calls inside calculate_metrics compare
# individual characters rather than spans — confirm the cell contents.
df['precision'], df['recall'], df['f1_score'] = zip(*df.apply(lambda row: calculate_metrics(row['Text'], row['SpanCat_Predictions']), axis=1))

In [None]:
# Show the first rows of the per-row evaluation results.
# Left as the cell's last expression so the notebook renders the DataFrame
# with its rich table display instead of a plain-text print().
df[['precision', 'recall', 'f1_score']].head()

In [None]:
# Define a function to calculate precision, recall, and F1-score for CONLL-type evaluation
def calculate_conll_metrics(annotated_spans, predicted_spans):
    """Compute precision, recall and F1 from the overlap of two collections.

    The computation is a plain set comparison: an item counts as a true
    positive only when it appears, equal and hashable, in both inputs. A
    ``str`` argument is iterated character by character.

    Args:
        annotated_spans: Iterable of hashable gold items.
        predicted_spans: Iterable of hashable predicted items.

    Returns:
        Tuple ``(precision, recall, f1_score)`` of floats. A metric whose
        denominator is zero is reported as ``0.0``.
    """
    # Convert both inputs to sets once for efficient comparison.
    annotated_set = set(annotated_spans)
    predicted_set = set(predicted_spans)

    # Items in both sets / only predicted / only annotated.
    true_positives = len(annotated_set & predicted_set)
    false_positives = len(predicted_set - annotated_set)
    false_negatives = len(annotated_set - predicted_set)

    # Each ratio falls back to 0.0 when nothing was predicted / annotated.
    precision = true_positives / (true_positives + false_positives) if true_positives or false_positives else 0.0
    recall = true_positives / (true_positives + false_negatives) if true_positives or false_negatives else 0.0

    # Harmonic mean of precision and recall; 0.0 when both are zero.
    f1_score = 2 * (precision * recall) / (precision + recall) if precision or recall else 0.0

    return precision, recall, f1_score


In [None]:
# Apply the function to each row of the DataFrame.
# calculate_conll_metrics returns one (precision, recall, f1_score) tuple per
# row; zip(*...) transposes them into the three *_conll columns.
# NOTE(review): the inputs are the same two columns passed to
# calculate_metrics earlier, and both functions use the same set-overlap
# arithmetic — confirm these columns are meant to differ from the earlier ones.
df['precision_conll'], df['recall_conll'], df['f1_score_conll'] = zip(*df.apply(lambda row: calculate_conll_metrics(row['Text'], row['SpanCat_Predictions']), axis=1))


In [None]:
# Write the table, including the metric columns added above, to a new CSV.
# NOTE(review): unlike the other to_csv calls in this notebook, index=False is
# not passed here — confirm the row index is wanted as a column in the file.
df.to_csv('predictions_for_evaluation_With_metrics.csv')

MUC evaluation

In [None]:
import pandas as pd

In [None]:
# Load test_data.csv for the MUC evaluation; this rebinds `df`, replacing the
# predictions table used in the cells above.
df = pd.read_csv('test_data.csv')

In [None]:
# Preview the first rows of the loaded table.
df.head()

In [None]:
# List the column names of the loaded table.
df.columns

In [None]:
# Preprocessing: map "NA" strings and missing values in the 'span1' column to
# an empty string, so "no predicted span" is always represented as "".
# Only 'span1' is normalised here; 'Text' is left as loaded.
# NOTE(review): read_csv's default NA parsing may already turn "NA" into NaN,
# in which case the replace() is a no-op and fillna() does the work — confirm.

df['span1'] = df['span1'].replace("NA", "").fillna("")

In [None]:
# Function to determine if two spans are Correct, Incorrect, Partial, Missing, or Spurious

def muc_metrics(gold, pred):
    """Label one gold/predicted span pair with a MUC-style category.

    Missing values (``None`` or a float NaN, e.g. an empty CSV cell) on either
    side are treated as an empty span. Without that, a NaN gold value is
    truthy and makes the substring test raise ``TypeError``.

    Args:
        gold: Gold span text; ``None``/NaN count as empty.
        pred: Predicted span text; ``None``/NaN count as empty.

    Returns:
        One of:
        "COR" - the two values are equal (this includes both being empty);
        "MIS" - gold is non-empty but the prediction is empty;
        "SPU" - the prediction is non-empty but gold is empty;
        "PAR" - both non-empty and one is a substring of the other;
        "INC" - anything else.
    """
    # `value != value` is true only for NaN, so no extra import is needed.
    gold, pred = (
        "" if value is None or (isinstance(value, float) and value != value) else value
        for value in (gold, pred)
    )

    if pred == gold:
        return "COR"
    if not pred and gold:
        return "MIS"
    if pred and not gold:
        return "SPU"
    # Both sides are non-empty here, so the substring test is meaningful.
    if pred and gold and (pred in gold or gold in pred):
        return "PAR"
    return "INC"

In [None]:
# Apply the function to each row in the dataframe: 'Text' is passed as the
# gold span and 'span1' as the prediction; the label is stored in 'MUC'.

df['MUC'] = df.apply(lambda row: muc_metrics(row['Text'], row['span1']), axis=1)

In [None]:
# Count how many rows received each MUC label, as a {label: count} dict.
# Labels that never occur are absent from the dict at this point.

metrics_counts = df['MUC'].value_counts().to_dict()

In [None]:
# Make sure all five MUC labels are present, giving absent ones a count of 0

for metric in ("COR", "INC", "PAR", "MIS", "SPU"):
    # setdefault only inserts when the key is absent, so existing counts stay.
    metrics_counts.setdefault(metric, 0)

In [None]:
# Display the {label: count} dict as the cell output.

metrics_counts

In [None]:
# Convert the dictionary to a pandas DataFrame with one row per label
# (columns 'Metric' and 'Count').

metrics_df = pd.DataFrame(list(metrics_counts.items()), columns=['Metric', 'Count'])
 
# Save the DataFrame to a CSV file, without the row index.

metrics_df.to_csv('muc_metrics_counts_501_v2.csv', index=False)

In [None]:
# Number of rows that were evaluated.
len(df)

Use this approach

It uses Wilson score intervals for the precision and recall confidence intervals, and bootstrapping for the F1 confidence intervals.

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import norm


In [None]:
# Reload test_data.csv so this section starts from the file contents rather
# than from the `df` modified by the cells above.
df = pd.read_csv('test_data.csv')

In [None]:
# Preprocessing: map "NA" strings and missing values in the 'span1' column to
# an empty string, so "no predicted span" is always represented as "".
# Only 'span1' is normalised; the comparisons below against "" rely on it.
# NOTE(review): 'Text' is not normalised, so a missing 'Text' cell stays NaN
# and compares as != "" further down — confirm 'Text' has no missing values.

df['span1'] = df['span1'].replace("NA", "").fillna("")

In [None]:
# Function to determine if two spans are correct or partial matches
def match_type(gold, pred):
    """Classify a predicted span against its gold span.

    Missing values (``None`` or a float NaN, e.g. an empty CSV cell) are
    treated as an empty span.

    Args:
        gold: Gold span text.
        pred: Predicted span text.

    Returns:
        "exact" when the two values are equal, "partial" when both are
        non-empty and one is a substring of the other, otherwise "none".
    """
    # `value != value` is true only for NaN, so no extra import is needed.
    gold, pred = (
        "" if value is None or (isinstance(value, float) and value != value) else value
        for value in (gold, pred)
    )

    # NOTE(review): two empty spans compare equal and are reported as "exact";
    # confirm rows with neither a gold nor a predicted span should count as
    # matches downstream.
    if pred == gold:
        return "exact"
    # The empty string is a substring of every string, so an empty side must
    # be rejected before the containment test or it would always be "partial".
    if not pred or not gold:
        return "none"
    if pred in gold or gold in pred:
        return "partial"
    return "none"
 
# Apply the function to each row in the dataframe: 'Text' is passed as the
# gold span and 'span1' as the prediction; the label is stored in 'match'.
df['match'] = df.apply(lambda row: match_type(row['Text'], row['span1']), axis=1)
 
# Function to calculate precision, recall, and F1 score
def calculate_metrics(tp, fp, fn):
    """Turn true-positive, false-positive and false-negative counts into scores.

    Note: an earlier cell of this notebook defines a different
    ``calculate_metrics(annotated_spans, predicted_spans)``; once this cell
    runs, this count-based version is the one bound to the name.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
        fn: Number of false negatives.

    Returns:
        Tuple ``(precision, recall, f1)``. Any score whose denominator is not
        positive is returned as 0.
    """
    precision = recall = f1 = 0

    predicted_total = tp + fp
    if predicted_total > 0:
        precision = tp / predicted_total

    gold_total = tp + fn
    if gold_total > 0:
        recall = tp / gold_total

    if precision + recall > 0:
        f1 = 2 * precision * recall / (precision + recall)

    return precision, recall, f1
 
# Function to calculate Wilson score interval
def wilson_score_interval(p, n, z=1.96):
    """Return the Wilson score confidence interval for a proportion.

    Args:
        p: Observed proportion.
        n: Number of observations the proportion is based on.
        z: Normal quantile for the desired confidence level; the default
            1.96 is the usual value for a two-sided 95% interval.

    Returns:
        Tuple ``(lower_bound, upper_bound)``, clamped to [0, 1]. With no
        observations (``n <= 0``) the interval is ``(0.0, 1.0)``, which is
        also the limit of the formula as ``n`` approaches zero.
    """
    # No data: avoid dividing by zero and report the uninformative interval.
    if n <= 0:
        return 0.0, 1.0

    denominator = 1 + z**2 / n
    center = p + z**2 / (2 * n)
    interval_half_width = z * np.sqrt((p * (1 - p) + z**2 / (4 * n)) / n)

    # Clamp so floating-point rounding at p == 0 or p == 1 cannot push a
    # bound slightly outside the valid range of a proportion.
    lower_bound = max(0.0, (center - interval_half_width) / denominator)
    upper_bound = min(1.0, (center + interval_half_width) / denominator)
    return lower_bound, upper_bound
 
# Function to perform bootstrapping
def bootstrap_f1(df, n_bootstraps=1000, random_state=None):
    """Estimate 95% percentile-bootstrap intervals for lenient and strict F1.

    Each replicate resamples the rows of ``df`` with replacement (same size as
    ``df``), recounts true positives, false positives and false negatives from
    the 'match', 'span1' and 'Text' columns, and computes F1 with
    ``calculate_metrics(tp, fp, fn)``.

    Args:
        df: DataFrame with 'match', 'span1' and 'Text' columns.
        n_bootstraps: Number of bootstrap replicates.
        random_state: Optional seed. When given, the resampling is repeatable;
            when ``None`` (default) ``DataFrame.sample`` uses its default
            randomness, as before.

    Returns:
        Tuple ``(lenient_f1_ci, strict_f1_ci)``; each is the array of the
        2.5th and 97.5th percentiles of the replicate F1 scores.
    """
    # With a seed, one RandomState is shared by every draw so the replicates
    # differ from each other while the run as a whole is reproducible.
    rng = None if random_state is None else np.random.RandomState(random_state)
    n_rows = len(df)

    lenient_f1s, strict_f1s = [], []
    for _ in range(n_bootstraps):
        sample_df = df.sample(n=n_rows, replace=True, random_state=rng)

        # Lenient counts exact and partial matches; strict counts exact only.
        lenient_tp = int(sample_df['match'].isin(["exact", "partial"]).sum())
        strict_tp = int((sample_df['match'] == "exact").sum())
        # FP: a prediction was made but matched nothing.
        fp = int(((sample_df['span1'] != "") & (sample_df['match'] == "none")).sum())
        # FN: no prediction was made although 'Text' is not the empty string.
        fn = int(((sample_df['span1'] == "") & (sample_df['Text'] != "")).sum())

        _, _, lenient_f1 = calculate_metrics(lenient_tp, fp, fn)
        _, _, strict_f1 = calculate_metrics(strict_tp, fp, fn)

        lenient_f1s.append(lenient_f1)
        strict_f1s.append(strict_f1)

    lenient_f1_ci = np.percentile(lenient_f1s, [2.5, 97.5])
    strict_f1_ci = np.percentile(strict_f1s, [2.5, 97.5])
    return lenient_f1_ci, strict_f1_ci
 
# Calculate lenient and strict metrics
# Lenient true positives are rows labelled "exact" or "partial"; strict true
# positives are rows labelled "exact" only.
lenient_tp = len(df[df['match'].isin(["exact", "partial"])])
strict_tp = len(df[df['match'] == "exact"])
# False positives: a non-empty prediction that matched nothing.
fp = len(df[(df['span1'] != "") & (df['match'] == "none")])
# False negatives: an empty prediction on a row whose 'Text' is not "".
fn = len(df[(df['span1'] == "") & (df['Text'] != "")])
 
# The same fp and fn feed both variants; only the true-positive count differs.
# NOTE(review): under strict scoring a "partial" row with a non-empty
# prediction is counted as neither TP, FP nor FN — confirm this is intended.
lenient_precision, lenient_recall, lenient_f1 = calculate_metrics(lenient_tp, fp, fn)
strict_precision, strict_recall, strict_f1 = calculate_metrics(strict_tp, fp, fn)
 
# Calculate Wilson score intervals for precision and recall.
# The sample size passed for each proportion is that proportion's own
# denominator (tp + fp for precision, tp + fn for recall).
lenient_precision_ci = wilson_score_interval(lenient_precision, lenient_tp + fp)
lenient_recall_ci = wilson_score_interval(lenient_recall, lenient_tp + fn)
strict_precision_ci = wilson_score_interval(strict_precision, strict_tp + fp)
strict_recall_ci = wilson_score_interval(strict_recall, strict_tp + fn)
 
# Perform bootstrapping to get confidence intervals for F1 scores.
# bootstrap_f1 resamples without a fixed seed here, so these two intervals
# can differ slightly from run to run.
lenient_f1_ci, strict_f1_ci = bootstrap_f1(df)
 
# Create a dictionary with the results: one list entry per metric, in the
# order Precision, Recall, F1 Score. Precision and recall bounds come from the
# Wilson intervals; F1 bounds come from the bootstrap percentiles.
results = {
    "Metric": ["Precision", "Recall", "F1 Score"],
    "Lenient": [lenient_precision, lenient_recall, lenient_f1],
    "Lenient 95% CI Lower": [lenient_precision_ci[0], lenient_recall_ci[0], lenient_f1_ci[0]],
    "Lenient 95% CI Upper": [lenient_precision_ci[1], lenient_recall_ci[1], lenient_f1_ci[1]],
    "Strict": [strict_precision, strict_recall, strict_f1],
    "Strict 95% CI Lower": [strict_precision_ci[0], strict_recall_ci[0], strict_f1_ci[0]],
    "Strict 95% CI Upper": [strict_precision_ci[1], strict_recall_ci[1], strict_f1_ci[1]]
}
 
# Convert the results to a DataFrame (one row per metric)
results_df = pd.DataFrame(results)

# Display the table as the cell output.
results_df.head()

In [None]:
# Save the results table to a CSV file, without the row index.
results_df.to_csv('evaluation_metrics.csv', index=False)