In [1]:
import os
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
# Directory that holds the markdown source files; load_markdown_content()
# joins each file name onto this path before opening it.
# NOTE(review): this is a machine-specific absolute path — point it at your
# own markdown directory before running the notebook.
markdown_dir = "/home/subin/Desktop/subin/ritsu_bot/markdown_files"  # Replace with your markdown directory

# Function to load markdown content
def load_markdown_content(filename):
    """Return the full text of a markdown file stored under ``markdown_dir``.

    Args:
        filename: Name of the file, joined onto the module-level
            ``markdown_dir`` with ``os.path.join``.

    Returns:
        The file's contents as a single string, decoded as UTF-8.
    """
    source_path = os.path.join(markdown_dir, filename)
    with open(source_path, encoding='utf-8') as md_file:
        contents = md_file.read()
    return contents

In [3]:
# Extract ground truth answers
def get_ground_truth(filename, question):
    """Return the reference text used as ground truth for a question.

    Args:
        filename: Markdown file to load via ``load_markdown_content``.
        question: The question being evaluated. Currently unused: the
            function does not yet narrow the text down to a specific answer.

    Returns:
        The entire content of the markdown file as a string.
    """
    # TODO: Implement a better answer extraction logic here if needed.
    # Until then the whole markdown document serves as the reference answer.
    return load_markdown_content(filename)


In [4]:
# CSV of model outputs, one row per question. The code in this notebook reads
# its 'Filename', 'Question' and 'Answer' columns.
# NOTE(review): this is a machine-specific absolute path — point it at your
# own CSV file before running the notebook.
csv_path = "/home/subin/Desktop/subin/finetuned_model_answers_2.csv"  # Replace with your CSV file path
data = pd.read_csv(csv_path)

# Add a 'Ground_Truth' column by calling get_ground_truth() once per row
# (axis=1 hands each row to the lambda).
# NOTE(review): get_ground_truth() currently returns the whole markdown file,
# so each answer is scored against the full document rather than against a
# question-specific reference.
data['Ground_Truth'] = data.apply(
    lambda row: get_ground_truth(row['Filename'], row['Question']),
    axis=1
)

# Tokenize the answers (simple whitespace splitting for this example)
def tokenize(text):
    """Lower-case ``text`` and split it on runs of whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lower-cased tokens. Punctuation is not stripped, so it
        stays attached to the neighbouring word.
    """
    lowered = text.lower()
    return lowered.split()

# Compute Precision, Recall, and F1 from the token overlap of each row
def _token_set(value):
    """Return the unique tokens of one cell, or an empty set if it is missing."""
    # A missing cell shows up as None or float NaN. Passing it through str()
    # would yield the literal token "none"/"nan", which could spuriously match
    # a reference text that happens to contain that word.
    if value is None or (isinstance(value, float) and value != value):
        return set()
    return set(tokenize(str(value)))


def compute_metrics(data):
    """Average token-overlap precision, recall and F1 over the rows of ``data``.

    For every row the 'Ground_Truth' and 'Answer' cells are tokenized with
    ``tokenize`` and reduced to sets, so a repeated word counts only once.
    Per-row scores are then averaged with equal weight per row.

    Args:
        data: DataFrame with 'Ground_Truth' and 'Answer' columns.

    Returns:
        Tuple ``(avg_precision, avg_recall, avg_f1)``. A row whose answer or
        reference has no tokens scores 0 for the affected metric. When
        ``data`` has no rows the result is ``(0.0, 0.0, 0.0)`` instead of a
        ZeroDivisionError.
    """
    all_precisions = []
    all_recalls = []
    all_f1s = []

    # Iterate the two columns directly: cheaper than iterrows() and avoids the
    # per-row dtype upcasting that building a row Series can cause.
    for truth_cell, answer_cell in zip(data['Ground_Truth'], data['Answer']):
        ground_truth_tokens = _token_set(truth_cell)
        generated_tokens = _token_set(answer_cell)

        # True positives are the shared tokens. TP + FP is the size of the
        # generated set and TP + FN is the size of the ground-truth set.
        true_positives = len(ground_truth_tokens & generated_tokens)

        precision = true_positives / len(generated_tokens) if generated_tokens else 0
        recall = true_positives / len(ground_truth_tokens) if ground_truth_tokens else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        all_precisions.append(precision)
        all_recalls.append(recall)
        all_f1s.append(f1)

    # Nothing to average: report zeros rather than dividing by zero.
    if not all_precisions:
        return 0.0, 0.0, 0.0

    # Average metrics across all examples
    row_count = len(all_precisions)
    avg_precision = sum(all_precisions) / row_count
    avg_recall = sum(all_recalls) / row_count
    avg_f1 = sum(all_f1s) / row_count

    return avg_precision, avg_recall, avg_f1

# Score the whole dataset; the three values are averages over all rows.
precision, recall, f1 = compute_metrics(data)

# Report each metric on its own line, rounded to four decimal places.
print(
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
    sep="\n",
)

Precision: 0.3045
Recall: 0.1769
F1 Score: 0.1964
