In [None]:
import spacy
from spacy.scorer import Scorer
from spacy.tokens import Span

import pandas as pd

In [None]:
# Model identifier; it is interpolated into the input and output CSV file names
# used by the cells below.
model = "m6"

In [None]:
# Read the cleaned test set for the selected model and preview its first rows.
df = pd.read_csv(f'test_data_{model}_clean.csv')
df.head(2)

In [None]:
df['label'].value_counts()

In [None]:
df.columns

In [None]:
# Cast the text, span and label columns to plain strings.
# Note: on object columns astype(str) typically renders missing values as the
# literal string 'nan' rather than leaving them as NaN.
for _col in ('TextContent', 'Text', 'span1', 'span2', 'label', 'label1', 'label2'):
    df[_col] = df[_col].astype(str)

In [None]:
# Preprocessing: blank out missing predictions in span1 / span2.
# The astype(str) cast above has already turned missing values into the literal
# string "nan", so fillna("") on its own never fires here; map both "NA" and
# "nan" to the empty string so the later `== ""` / `!= ""` checks see them.

df['span1'] = df['span1'].fillna("").replace(["NA", "nan"], "")
df['span2'] = df['span2'].fillna("").replace(["NA", "nan"], "")

# USE THIS approach

It uses the Wilson score interval for the precision and recall confidence intervals, and bootstrapping for the F1 confidence intervals.

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import norm


In [None]:
# Reload `df` from a different CSV. This replaces the frame prepared above, so the
# earlier string casts and blank-span handling do not carry over to this frame.
# NOTE(review): the metric cells below read a 'match' column from `df` —
# presumably it is already present in this file; confirm.
df = pd.read_csv('m6-check-lenient-strict-match-clean.csv')
df.head(2)

In [None]:
# Function to calculate precision, recall, and F1 score
def calculate_metrics(tp, fp, fn):
    """Compute precision, recall and F1 from raw outcome counts.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
        fn: Number of false negatives.

    Returns:
        (precision, recall, f1). Any score whose denominator is not positive
        is reported as 0.
    """
    predicted_total = tp + fp
    actual_total = tp + fn

    precision = 0
    if predicted_total > 0:
        precision = tp / predicted_total

    recall = 0
    if actual_total > 0:
        recall = tp / actual_total

    if precision + recall > 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0
    return precision, recall, f1
 
# Function to calculate Wilson score interval
def wilson_score_interval(p, n, z=1.96):
    """Return the Wilson score confidence interval for a binomial proportion.

    Args:
        p: Observed proportion (successes / n).
        n: Number of trials the proportion is based on.
        z: Normal quantile for the confidence level (1.96 for about 95%).

    Returns:
        (lower_bound, upper_bound) as floats clamped to [0, 1]. With no trials
        (n <= 0) the uninformative interval (0.0, 1.0) is returned, which is
        also the limit of the Wilson formula as n approaches 0.
    """
    if n <= 0:
        # Happens when e.g. tp + fp == 0; the formula would divide by zero.
        return 0.0, 1.0

    denominator = 1 + z**2 / n
    center = p + z**2 / (2 * n)
    interval_half_width = z * np.sqrt((p * (1 - p) + z**2 / (4 * n)) / n)
    lower_bound = (center - interval_half_width) / denominator
    upper_bound = (center + interval_half_width) / denominator
    # Floating-point rounding can push a bound marginally outside [0, 1].
    return max(0.0, float(lower_bound)), min(1.0, float(upper_bound))
 
# Function to perform bootstrapping
def bootstrap_f1(df, n_bootstraps=1000, random_state=None):
    """Percentile-bootstrap 95% intervals for the lenient and strict F1 scores.

    Rows of `df` are resampled with replacement; for each resample the counts
    are rebuilt from the 'match', 'span1' and 'Text' columns and passed to
    calculate_metrics.

    Args:
        df: DataFrame with 'match', 'span1' and 'Text' columns.
        n_bootstraps: Number of bootstrap resamples.
        random_state: Optional seed (or numpy Generator) for reproducible
            intervals; None keeps the previous unseeded behaviour.

    Returns:
        (lenient_f1_ci, strict_f1_ci), each a length-2 array holding the 2.5th
        and 97.5th percentiles of the resampled F1 scores.
    """
    n_rows = len(df)
    if n_rows == 0:
        # Nothing to resample: every resampled F1 would be 0.
        return np.array([0.0, 0.0]), np.array([0.0, 0.0])

    match = df['match']
    # Row-level outcome flags are computed once; each resample only re-counts them.
    lenient_hit = match.isin(["exact", "partial"]).to_numpy()
    strict_hit = (match == "exact").to_numpy()
    # The point estimate in this notebook filters false positives on "no match",
    # whereas this function used to filter on "none" only; accept both spellings
    # so the interval and the point estimate count the same rows.
    false_pos = ((df['span1'] != "") & match.isin(["none", "no match"])).to_numpy()
    # NOTE(review): a NaN span1 (e.g. a blank CSV cell) is not equal to "" and is
    # therefore not counted as a false negative — confirm blanks are normalised.
    false_neg = ((df['span1'] == "") & (df['Text'] != "")).to_numpy()

    rng = np.random.default_rng(random_state)
    lenient_f1s, strict_f1s = [], []
    for _ in range(n_bootstraps):
        idx = rng.integers(0, n_rows, size=n_rows)
        fp = int(false_pos[idx].sum())
        fn = int(false_neg[idx].sum())

        _, _, lenient_f1 = calculate_metrics(int(lenient_hit[idx].sum()), fp, fn)
        _, _, strict_f1 = calculate_metrics(int(strict_hit[idx].sum()), fp, fn)

        lenient_f1s.append(lenient_f1)
        strict_f1s.append(strict_f1)

    lenient_f1_ci = np.percentile(lenient_f1s, [2.5, 97.5])
    strict_f1_ci = np.percentile(strict_f1s, [2.5, 97.5])
    return lenient_f1_ci, strict_f1_ci
 
# Calculate lenient and strict metrics
# Outcome counts taken from the per-row 'match' labels.
lenient_tp = int(df['match'].isin(["exact", "partial"]).sum())
strict_tp = int((df['match'] == "exact").sum())
fp = int(((df['span1'] != "") & (df['match'] == "no match")).sum())
fn = int(((df['span1'] == "") & (df['Text'] != "")).sum())

# Point estimates under the lenient and the strict matching regime.
lenient_precision, lenient_recall, lenient_f1 = calculate_metrics(lenient_tp, fp, fn)
strict_precision, strict_recall, strict_f1 = calculate_metrics(strict_tp, fp, fn)

# Wilson intervals; the sample size is the denominator of each proportion.
lenient_precision_ci = wilson_score_interval(lenient_precision, lenient_tp + fp)
lenient_recall_ci = wilson_score_interval(lenient_recall, lenient_tp + fn)
strict_precision_ci = wilson_score_interval(strict_precision, strict_tp + fp)
strict_recall_ci = wilson_score_interval(strict_recall, strict_tp + fn)

# Bootstrap intervals for the two F1 scores.
lenient_f1_ci, strict_f1_ci = bootstrap_f1(df)

# One row per metric, one column group per matching regime.
results = {
    "Metric": ["Precision", "Recall", "F1 Score"],
    "Lenient": [lenient_precision, lenient_recall, lenient_f1],
    "Lenient 95% CI Lower": [ci[0] for ci in (lenient_precision_ci, lenient_recall_ci, lenient_f1_ci)],
    "Lenient 95% CI Upper": [ci[1] for ci in (lenient_precision_ci, lenient_recall_ci, lenient_f1_ci)],
    "Strict": [strict_precision, strict_recall, strict_f1],
    "Strict 95% CI Lower": [ci[0] for ci in (strict_precision_ci, strict_recall_ci, strict_f1_ci)],
    "Strict 95% CI Upper": [ci[1] for ci in (strict_precision_ci, strict_recall_ci, strict_f1_ci)],
}

results_df = pd.DataFrame(results)

results_df.head()

In [None]:
# Function to determine if two spans are correct or partial matches
'''def match_type(gold, pred):
    if pred == gold:
        return "exact"
    elif pred in gold or gold in pred:
        return "partial"
    return "none"'''

def match_type(gold, pred):
    """Classify how a single predicted span relates to the gold text.

    Args:
        gold: Gold-standard text for the row.
        pred: Predicted span for the row.

    Returns:
        "no pred (FN)" when there is no prediction (NaN/None, blank, or the
        string "nan"); "complete" when the prediction equals the gold text;
        "partial" when the prediction is a proper substring of the gold text;
        "no match" otherwise, including when the gold text itself is missing.
    """
    if pd.isna(pred):
        return "no pred (FN)"
    pred = str(pred)  # numeric cells would otherwise fail on .strip()
    if pred.strip() == '' or pred == "nan":
        return "no pred (FN)"

    if pd.isna(gold):
        # `pred in gold` raises TypeError on a NaN (float) gold value; a
        # prediction with nothing to match against is a plain miss.
        return "no match"
    gold = str(gold)

    if pred == gold:
        return "complete"
    if pred in gold:
        return "partial"
    return "no match"
    

 
# Apply the function to each row in the dataframe.
# This overwrites any existing 'match' column with the labels match_type returns
# ("complete", "partial", "no match", "no pred (FN)").
# NOTE(review): the metric code in this notebook filters on "exact" rather than
# "complete" — confirm which label set the downstream counts should use.
df['match'] = df.apply(lambda row: match_type(row['Text'], row['span1']), axis=1)

'''
# Function to calculate precision, recall, and F1 score
def calculate_metrics(tp, fp, fn):
    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0
    return precision, recall, f1'''
 
# Function to calculate Wilson score interval
'''def wilson_score_interval(p, n, z=1.96):
    denominator = 1 + z**2 / n
    center = p + z**2 / (2 * n)
    interval_half_width = z * np.sqrt((p * (1 - p) + z**2 / (4 * n)) / n)
    lower_bound = (center - interval_half_width) / denominator
    upper_bound = (center + interval_half_width) / denominator
    return lower_bound, upper_bound'''
 
# Function to perform bootstrapping
'''def bootstrap_f1(df, n_bootstraps=1000):
    lenient_f1s, strict_f1s = [], []
    for _ in range(n_bootstraps):
        sample_df = df.sample(n=len(df), replace=True)
        lenient_tp = len(sample_df[sample_df['match'].isin(["exact", "partial"])])
        strict_tp = len(sample_df[sample_df['match'] == "exact"])
        fp = len(sample_df[(sample_df['span1'] != "") & (sample_df['match'] == "none")])
        fn = len(sample_df[(sample_df['span1'] == "") & (sample_df['Text'] != "")])
 
        _, _, lenient_f1 = calculate_metrics(lenient_tp, fp, fn)
        _, _, strict_f1 = calculate_metrics(strict_tp, fp, fn)
 
        lenient_f1s.append(lenient_f1)
        strict_f1s.append(strict_f1)
    lenient_f1_ci = np.percentile(lenient_f1s, [2.5, 97.5])
    strict_f1_ci = np.percentile(strict_f1s, [2.5, 97.5])
    return lenient_f1_ci, strict_f1_ci'''
''' 
# Calculate lenient and strict metrics
lenient_tp = len(df[df['match'].isin(["exact", "partial"])])
strict_tp = len(df[df['match'] == "exact"])
fp = len(df[(df['span1'] != "") & (df['match'] == "no match")])
fn = len(df[(df['span1'] == "") & (df['Text'] != "")])
 
lenient_precision, lenient_recall, lenient_f1 = calculate_metrics(lenient_tp, fp, fn)
strict_precision, strict_recall, strict_f1 = calculate_metrics(strict_tp, fp, fn)
 
# Calculate Wilson score intervals for precision and recall
#lenient_precision_ci = wilson_score_interval(lenient_precision, lenient_tp + fp)
#lenient_recall_ci = wilson_score_interval(lenient_recall, lenient_tp + fn)
#strict_precision_ci = wilson_score_interval(strict_precision, strict_tp + fp)
#strict_recall_ci = wilson_score_interval(strict_recall, strict_tp + fn)
 
# Perform bootstrapping to get confidence intervals for F1 scores
#lenient_f1_ci, strict_f1_ci = bootstrap_f1(df)
 
# Create a dictionary with the results
results = {
    "Metric": ["Precision", "Recall", "F1 Score"],
    "Lenient": [lenient_precision, lenient_recall, lenient_f1],
   # "Lenient 95% CI Lower": [lenient_precision_ci[0], lenient_recall_ci[0], lenient_f1_ci[0]],
   # "Lenient 95% CI Upper": [lenient_precision_ci[1], lenient_recall_ci[1], lenient_f1_ci[1]],
    "Strict": [strict_precision, strict_recall, strict_f1],
   # "Strict 95% CI Lower": [strict_precision_ci[0], strict_recall_ci[0], strict_f1_ci[0]],
   # "Strict 95% CI Upper": [strict_precision_ci[1], strict_recall_ci[1], strict_f1_ci[1]]
}
 
# Convert the results to a DataFrame
results_df = pd.DataFrame(results)

results_df.head()'''

In [None]:
# `recall` is first assigned in the label-metric cells further down, so a bare
# `recall` here raises NameError on Restart & Run All. Show the span-level
# strict recall instead (the lenient recall is displayed in the next cell).
strict_recall

In [None]:
lenient_recall

In [None]:
df.head()

In [None]:
# Save the frame including its 'match' column. index=False keeps the row index
# out of the file, matching the other to_csv calls in this notebook and avoiding
# a stray unnamed column when the file is read back.
df.to_csv('m6-check-lenient-strict-match2.csv', index=False)

In [None]:
# Write the metric table for this model to disk (without the row index).
results_df.to_csv(f'evaluation_metrics_{model}_CI_v2.csv', index=False)

In [None]:
# for two spans

def match_type(gold, pred1, pred2):
    """Classify how the two predicted spans relate to the gold text.

    Spans are compared as whole strings. The previous version built `set(...)`
    from the strings, which compares their characters: anagrams came out as
    "exact" and a row without any prediction came out as "partial".

    Args:
        gold: Gold-standard text for the row.
        pred1: First predicted span (may be blank or missing).
        pred2: Second predicted span (may be blank or missing).

    Returns:
        "exact" when at least one predicted span equals the gold text;
        "partial" when a predicted span contains, or is contained in, the gold
        text; "none" otherwise, including when there is no prediction or no
        gold text. Values are compared after stripping surrounding whitespace.
    """
    def _clean(value):
        """Return `value` as a stripped string, or '' when it is missing."""
        if value is None or pd.isna(value):
            return ""
        text = str(value).strip()
        # "nan" / "NA" are how missing cells appear after a string cast.
        return "" if text in ("nan", "NA") else text

    gold_text = _clean(gold)
    predictions = [span for span in (_clean(pred1), _clean(pred2)) if span]
    if not gold_text or not predictions:
        return "none"
    if gold_text in predictions:
        return "exact"
    if any(span in gold_text or gold_text in span for span in predictions):
        return "partial"
    return "none"
 
# Apply the function to each row in the DataFrame: gold text comes from 'Text',
# the predictions from 'span1' and 'span2'. This replaces any 'match' column
# already on df with the labels "exact" / "partial" / "none".
df['match'] = df.apply(lambda row: match_type(row['Text'], row['span1'], row['span2']), axis=1)

def bootstrap_f1(df, n_bootstraps=1000, random_state=None):
    """Percentile-bootstrap 95% intervals for lenient and strict F1 (two spans).

    Rows of `df` are resampled with replacement; for each resample the counts
    are rebuilt from the 'match', 'span1', 'span2' and 'Text' columns and
    passed to calculate_metrics.

    Args:
        df: DataFrame with 'match', 'span1', 'span2' and 'Text' columns.
        n_bootstraps: Number of bootstrap resamples.
        random_state: Optional seed (or numpy Generator) for reproducible
            intervals; None keeps the previous unseeded behaviour.

    Returns:
        (lenient_f1_ci, strict_f1_ci), each a length-2 array holding the 2.5th
        and 97.5th percentiles of the resampled F1 scores.
    """
    n_rows = len(df)
    if n_rows == 0:
        # Nothing to resample: every resampled F1 would be 0.
        return np.array([0.0, 0.0]), np.array([0.0, 0.0])

    match = df['match']
    # Row-level outcome flags are computed once; each resample only re-counts them.
    lenient_hit = match.isin(["exact", "partial"]).to_numpy()
    strict_hit = (match == "exact").to_numpy()

    # False positives: a prediction exists (span1 or span2) AND nothing matched.
    # The OR must be parenthesised: `&` binds tighter than `|`, so the previous
    # unparenthesised form counted every row with a non-empty span1 as a false
    # positive, exact matches included.
    has_prediction = (df['span1'] != "") | (df['span2'] != "")
    false_pos = (has_prediction & (match == "none")).to_numpy()

    # False negatives: gold text present but both predicted spans are empty.
    false_neg = ((df['span1'] == "") & (df['span2'] == "") & (df['Text'] != "")).to_numpy()

    rng = np.random.default_rng(random_state)
    lenient_f1s, strict_f1s = [], []
    for _ in range(n_bootstraps):
        idx = rng.integers(0, n_rows, size=n_rows)
        fp = int(false_pos[idx].sum())
        fn = int(false_neg[idx].sum())

        _, _, lenient_f1 = calculate_metrics(int(lenient_hit[idx].sum()), fp, fn)
        _, _, strict_f1 = calculate_metrics(int(strict_hit[idx].sum()), fp, fn)

        lenient_f1s.append(lenient_f1)
        strict_f1s.append(strict_f1)

    # 95% percentile intervals over the resampled F1 scores.
    lenient_f1_ci = np.percentile(lenient_f1s, [2.5, 97.5])
    strict_f1_ci = np.percentile(strict_f1s, [2.5, 97.5])

    return lenient_f1_ci, strict_f1_ci

# Calculate lenient and strict true positives
lenient_tp = len(df[df['match'].isin(["exact", "partial"])])
strict_tp = len(df[df['match'] == "exact"])

# False positives: rows with a prediction (span1 or span2) whose match is "none".
# The OR is parenthesised because `&` binds tighter than `|`; without the
# parentheses every row with a non-empty span1 was counted as a false positive,
# exact matches included.
fp = len(df[((df['span1'] != "") | (df['span2'] != "")) & (df['match'] == "none")])

# False negatives: gold text ('Text') present but both span1 and span2 are empty
fn = len(df[(df['span1'] == "") & (df['span2'] == "") & (df['Text'] != "")])

# Calculate lenient and strict metrics
lenient_precision, lenient_recall, lenient_f1 = calculate_metrics(lenient_tp, fp, fn)
strict_precision, strict_recall, strict_f1 = calculate_metrics(strict_tp, fp, fn)

# Calculate Wilson score intervals for precision and recall
lenient_precision_ci = wilson_score_interval(lenient_precision, lenient_tp + fp)
lenient_recall_ci = wilson_score_interval(lenient_recall, lenient_tp + fn)
strict_precision_ci = wilson_score_interval(strict_precision, strict_tp + fp)
strict_recall_ci = wilson_score_interval(strict_recall, strict_tp + fn)

# Perform bootstrapping to get confidence intervals for F1 scores
lenient_f1_ci, strict_f1_ci = bootstrap_f1(df)

# Create a dictionary with the results
results = {
    "Metric": ["Precision", "Recall", "F1 Score"],
    "Lenient": [lenient_precision, lenient_recall, lenient_f1],
    "Lenient 95% CI Lower": [lenient_precision_ci[0], lenient_recall_ci[0], lenient_f1_ci[0]],
    "Lenient 95% CI Upper": [lenient_precision_ci[1], lenient_recall_ci[1], lenient_f1_ci[1]],
    "Strict": [strict_precision, strict_recall, strict_f1],
    "Strict 95% CI Lower": [strict_precision_ci[0], strict_recall_ci[0], strict_f1_ci[0]],
    "Strict 95% CI Upper": [strict_precision_ci[1], strict_recall_ci[1], strict_f1_ci[1]]
}

# Convert the results to a DataFrame
results_df = pd.DataFrame(results)
results_df.head()

In [None]:
# Write the two-span metric table for this model to disk (without the row index).
results_df.to_csv(f'evaluation_metrics_{model}_CI_v2_2spans.csv', index=False)

Metrics for the predicted labels (`label1`, `label2`) against the gold `label` column.

In [None]:
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
# Gold labels and the first predicted label, used for single-label scoring.
y_true = df['label']
y_pred = df['label1']

In [None]:
# Single-label evaluation: label1 alone is scored against the gold label
# (label2 is not used in this cell).
y_true, y_pred = df['label'], df['label1']

# Macro-averaged precision, recall and F1.
from sklearn.metrics import precision_score, recall_score, f1_score

precision, recall, f1 = (
    score(y_true, y_pred, average='macro')
    for score in (precision_score, recall_score, f1_score)
)

print(f"Precision: {precision}, Recall: {recall}, F1 Score: {f1}")

In [None]:
# Combine label1 and label2 with an intersection: a label counts as predicted
# only when both predictions agree; otherwise the row has no predicted label.
y_true = df['label']
y_pred = df.apply(lambda row: set([row['label1']]).intersection([row['label2']]), axis=1)

# Per-row lists (possibly empty) are the input format MultiLabelBinarizer expects.
y_pred_combined = y_pred.apply(lambda x: list(x))

# precision_score & co. raise ValueError when handed a Series of lists, so score
# binary indicator matrices instead. The label vocabulary is learned from the
# gold labels; predicted labels outside it are ignored by transform (with a warning).
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
y_true_binarized = mlb.fit_transform([[label] for label in y_true])
y_pred_binarized = mlb.transform(y_pred_combined)

# zero_division=0 scores classes that are never predicted as 0 without a warning.
precision = precision_score(y_true_binarized, y_pred_binarized, average='macro', zero_division=0)
recall = recall_score(y_true_binarized, y_pred_binarized, average='macro', zero_division=0)
f1 = f1_score(y_true_binarized, y_pred_binarized, average='macro', zero_division=0)

print(f"Precision: {precision}, Recall: {recall}, F1 Score: {f1}")

In [None]:
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import precision_score, recall_score, f1_score
 
# Treat label1 and label2 together as one multi-label prediction per row.
y_true = df['label']
y_pred = df.apply(lambda row: [row['label1'], row['label2']], axis=1)

# One-hot encode both sides; the binarizer is fitted on the gold labels, each
# wrapped in a one-element list so it is in multi-label format.
mlb = MultiLabelBinarizer()
y_true_binarized = mlb.fit_transform([[gold_label] for gold_label in y_true])
y_pred_binarized = mlb.transform(y_pred)

# Macro-averaged scores over the indicator matrices.
precision, recall, f1 = (
    score(y_true_binarized, y_pred_binarized, average='macro')
    for score in (precision_score, recall_score, f1_score)
)

print(f"Precision: {precision}, Recall: {recall}, F1 Score: {f1}")

In [None]:
def bootstrap_metric(metric_func, y_true, y_pred, n_bootstrap=1000, random_state=None):
    """Percentile-bootstrap 95% confidence interval for a classification metric.

    Args:
        metric_func: Callable invoked as
            metric_func(y_true, y_pred, average='macro', zero_division=0).
        y_true: Gold targets (any array-like; rows are resampled along axis 0).
        y_pred: Predicted targets, same length as y_true.
        n_bootstrap: Number of bootstrap resamples.
        random_state: Optional seed (or numpy Generator) for reproducible
            intervals; None keeps the previous unseeded behaviour.

    Returns:
        (ci_lower, ci_upper): the 2.5th and 97.5th percentiles of the metric
        over the resamples.

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    # Positional fancy indexing below needs arrays; lists and pandas Series
    # (whose [] is label-based) are converted up front.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    n_samples = len(y_true)
    if n_samples == 0:
        raise ValueError("bootstrap_metric needs at least one sample")
    if len(y_pred) != n_samples:
        raise ValueError("y_true and y_pred must have the same length")

    rng = np.random.default_rng(random_state)
    scores = []
    for _ in range(n_bootstrap):
        indices = rng.integers(0, n_samples, size=n_samples)
        score = metric_func(y_true[indices], y_pred[indices], average='macro', zero_division=0)
        scores.append(score)
    ci_lower, ci_upper = np.percentile(scores, [2.5, 97.5])
    return ci_lower, ci_upper
 
# Bootstrap a 95% interval for each of the three macro-averaged scores.
# NOTE(review): y_true / y_pred are whatever the most recently run label cell
# assigned — confirm they are the 1-D label columns intended for this step.
precision_ci, recall_ci, f1_ci = (
    bootstrap_metric(score_fn, y_true.values, y_pred.values)
    for score_fn in (precision_score, recall_score, f1_score)
)

print('\n'.join(
    f'{title} 95% CI: [{bounds[0]:.4f}, {bounds[1]:.4f}]'
    for title, bounds in (
        ('Precision', precision_ci),
        ('Recall', recall_ci),
        ('F1 Score', f1_ci),
    )
))

In [None]:
# Collect each point estimate together with its bootstrap bounds, one row per metric.
metrics_df = pd.DataFrame(
    [
        ('Precision', precision, precision_ci[0], precision_ci[1]),
        ('Recall', recall, recall_ci[0], recall_ci[1]),
        ('F1 Score', f1, f1_ci[0], f1_ci[1]),
    ],
    columns=['Metric', 'Score', '95% CI Lower', '95% CI Upper'],
)

print(metrics_df)

In [None]:
# Write the label metric table for this model to disk (without the row index).
metrics_df.to_csv(f'label_metrics_w_CI_{model}.csv', index=False)

In [None]:
# Rebuild the metric table from the current precision / recall / f1 values and
# their bootstrap bounds.
# NOTE(review): this cell is identical to the earlier metrics_df cell and nothing
# in between reassigns its inputs, so when run top to bottom it produces the
# same table — presumably the two-label cells are meant to be re-run first; confirm.
metrics_df = pd.DataFrame({
    'Metric': ['Precision', 'Recall', 'F1 Score'],
    'Score': [precision, recall, f1],
    '95% CI Lower': [precision_ci[0], recall_ci[0], f1_ci[0]],
    '95% CI Upper': [precision_ci[1], recall_ci[1], f1_ci[1]]
})

print(metrics_df)

In [None]:
# Write the two-label metric table for this model to disk (without the row index).
metrics_df.to_csv(f'label_metrics_w_CI_{model}_2lables.csv', index=False)