## Evaluation: 

We compare the full sets of predicted and reference phrases: every phrase in their union is labeled 1 (present) or 0 (not present) for each set, and we then compute standard classification metrics (accuracy, precision, recall, and F1-score), as well as edit distance and BLEU score. Finally, we save per-row details for debugging.

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.metrics import edit_distance


# Split a '/'-delimited CLEAN_TEXT value into its individual phrases
def split_text(text):
    """Return the non-empty, whitespace-stripped parts of *text* split on '/'.

    Anything that is not a non-empty string (None, NaN, numbers, '') yields
    an empty list.
    """
    if isinstance(text, str) and text:
        stripped_pieces = (piece.strip() for piece in text.split('/'))
        # Drop pieces that were empty or whitespace-only between separators
        return [piece for piece in stripped_pieces if piece]
    return []

# BLEU score between two token lists, with a shortcut for single-token inputs
def calculate_bleu(reference, predicted):
    """Return a BLEU score for *predicted* against *reference*.

    Both arguments are sequences of tokens. When each holds exactly one
    token, the score is an exact-match check (1.0 or 0.0) instead of BLEU.
    Otherwise NLTK's sentence_bleu is used with equal unigram/bigram weights
    and method1 smoothing.
    """
    both_single_token = len(reference) == 1 and len(predicted) == 1
    if both_single_token:
        # A single token has no bigrams, so compare the tokens directly
        if reference[0] == predicted[0]:
            return 1.0
        return 0.0

    smoother = SmoothingFunction().method1
    return sentence_bleu(
        [reference],
        predicted,
        weights=(0.5, 0.5),
        smoothing_function=smoother,
    )


# Function to calculate metrics per row
def calculate_metrics(row, reference_col='CLEAN_TEXT', predicted_col='NER_OUT_drop_(0.25)'):
    """Compute per-row comparison data between reference and predicted phrases.

    Args:
        row: Mapping-like row (e.g. a pandas Series) indexable by column name.
        reference_col: Column holding the '/'-separated reference phrases.
        predicted_col: Column holding the '/'-separated predicted phrases.

    Returns:
        A 5-tuple ``(reference_labels, predicted_labels, all_phrases,
        edit_dist, bleu)`` where the two label lists are binary (1 = phrase
        present in that set) and aligned with ``all_phrases``.
    """
    # Deduplicate with sets, then sort: set iteration order for strings varies
    # between interpreter runs (hash randomization), which would make the
    # order-sensitive edit distance and BLEU scores non-reproducible.
    reference_set = set(split_text(row[reference_col]))
    predicted_set = set(split_text(row[predicted_col]))
    reference = sorted(reference_set)
    predicted = sorted(predicted_set)

    # Combine all unique phrases in a stable order
    all_phrases = sorted(reference_set | predicted_set)

    # Create binary labels for reference and predicted
    reference_labels = [1 if phrase in reference_set else 0 for phrase in all_phrases]
    predicted_labels = [1 if phrase in predicted_set else 0 for phrase in all_phrases]

    # Compute edit distance (over the space-joined phrases) and BLEU score
    edit_dist = edit_distance(' '.join(reference), ' '.join(predicted))
    bleu = calculate_bleu(reference, predicted)

    return reference_labels, predicted_labels, all_phrases, edit_dist, bleu


# Load the processed CSV
file_path = "data/processed_data_test.csv"
df = pd.read_csv(file_path)
if df.empty:
    # Fail early with a clear message instead of a ZeroDivisionError when averaging
    raise ValueError(f"No rows found in {file_path}; nothing to evaluate.")

# Apply metrics calculation to each row; each entry is the 5-tuple
# (reference_labels, predicted_labels, all_phrases, edit_distance, bleu)
metrics = df.apply(calculate_metrics, axis=1)

# Flattened lists feed the global metrics; the row_* lists keep the per-row
# values so they can be written back to the DataFrame without re-iterating.
reference_labels = []
predicted_labels = []
all_phrases = []
edit_distances = []
bleu_scores = []
row_reference_labels = []
row_predicted_labels = []
row_phrases = []

for ref, pred, phrases, edit_dist, bleu in metrics:
    reference_labels.extend(ref)
    predicted_labels.extend(pred)
    all_phrases.extend(phrases)
    edit_distances.append(edit_dist)
    bleu_scores.append(bleu)
    row_reference_labels.append(ref)
    row_predicted_labels.append(pred)
    row_phrases.append(phrases)

# Calculate overall metrics
precision = precision_score(reference_labels, predicted_labels, zero_division=0)
recall = recall_score(reference_labels, predicted_labels, zero_division=0)
f1 = f1_score(reference_labels, predicted_labels, zero_division=0)

# Accuracy and the confusion matrix are computed for completeness, though
# accuracy should not be very indicative in this task: every phrase comes from
# the union of both sets, so there are never any true negatives.
accuracy = accuracy_score(reference_labels, predicted_labels)
cm = confusion_matrix(reference_labels, predicted_labels)

# Calculate overall average metrics
avg_edit_distance = sum(edit_distances) / len(edit_distances)
avg_bleu_score = sum(bleu_scores) / len(bleu_scores)

# Print overall metrics
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")
print(f"Average Edit Distance: {avg_edit_distance:.2f}")
print(f"Average BLEU Score: {avg_bleu_score:.2f}")

# Attach the per-row details to the DataFrame
df['reference_labels'] = row_reference_labels
df['predicted_labels'] = row_predicted_labels
df['all_phrases'] = row_phrases
df['edit_distance'] = edit_distances
df['bleu_score'] = bleu_scores

# Save detailed results for debugging (optional)
output_with_details = "data/output_with_details.csv"
df.to_csv(output_with_details, index=False)
print(f"Detailed results with edit distance and BLEU score have been saved to {output_with_details}")


Accuracy: 69.11%
Precision: 84.37%
Recall: 79.25%
F1 Score: 81.73%
Average Edit Distance: 5.16
Average BLEU Score: 0.70
Detailed results with edit distance and BLEU score have been saved to data/output_with_details.csv



**Rule based** (simple preproc):
Accuracy: 71.79%
Precision: 80.42%
Recall: 86.99%
F1 Score: 83.58%
Average Edit Distance: 4.28
Average BLEU Score: 0.78

**NER only** r=0.25:
Accuracy: 69.11%
Precision: 84.37%
Recall: 79.25%
F1 Score: 81.73%
Average Edit Distance: 5.16
Average BLEU Score: 0.70


**NER & POS** r=(0.25,0.25):
Accuracy: 71.99%
Precision: 80.76%
Recall: 86.90%
F1 Score: 83.72%
Average Edit Distance: 4.25
Average BLEU Score: 0.78




**LLM** (llama-3.1-8B) (Small sample, **not full result!**):
Accuracy: 79.31%
Precision: 88.46%
Recall: 88.46%
F1 Score: 88.46%
Average Edit Distance: 2.40
Average BLEU Score: 0.73

**DL** (the model has not converged properly; it may need more training):
Accuracy: 1.85%
Precision: 2.13%
Recall: 12.35%
F1 Score: 3.63%
Average Edit Distance: 61.23
Average BLEU Score: 0.01

 