In [None]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict
from src.evaluation.evaluate import predicted_ranks
from src.datasets import dataset_factory
from src.evaluation.metrics import standard_metrics
from sklearn.metrics import classification_report, f1_score, accuracy_score, precision_score, recall_score, confusion_matrix

In [None]:
# Model tags of the pipeline runs to evaluate. Every result file is named
# `sampled_posts-<tag>-k_50.csv`; the evaluation cell reads them from
# `./results/final/pipeline/`.
_model_tags = (
    'mistral-123b',
    'c4ai-104b',
    'llama3_1-70b',
    'llama3_3-70b',
    'qwen2_5-72b',
    'qwen2_5-7b',
    'llama3_1-8b',
    'gemma3-27b',
)
result_paths = [f'sampled_posts-{tag}-k_50.csv' for tag in _model_tags]

In [None]:
# Options forwarded to `dataset_factory`; `.load()` is called on whatever it returns.
# The loaded object's `fact_check_post_mapping` attribute is iterated by
# `calculate_standard_metrics` as (fact_check_id, post_id) pairs.
_dataset_options = dict(
    name="multiclaim",
    crosslingual=False,
    language=None,
    split=None,
    version="original",
)
dataset = dataset_factory(**_dataset_options).load()

In [None]:
def calculate_standard_metrics(post_ids, fact_check_ids):
    """Score per-post fact-check predictions against the dataset's gold mapping.

    Args:
        post_ids: Post identifiers, one per evaluated post.
        fact_check_ids: For each post (same order as ``post_ids``), the
            predicted fact-check ids.

    Returns:
        Whatever ``standard_metrics`` returns for the collected per-post ranks.
    """
    # Gold fact-checks grouped by post, built from the global `dataset`.
    gold_by_post = defaultdict(list)
    for gold_fact_check_id, gold_post_id in dataset.fact_check_post_mapping:
        gold_by_post[gold_post_id].append(gold_fact_check_id)

    ranks = []
    for candidate_ids, post_id in zip(list(fact_check_ids), post_ids):
        # Posts without any gold fact-check get an empty list from the defaultdict.
        gold_ids = gold_by_post[post_id]
        # `default_rank=100` is forwarded as-is; presumably it is the rank given to
        # gold fact-checks missing from the predictions - TODO confirm in predicted_ranks.
        post_ranks = predicted_ranks(
            np.array(candidate_ids),
            np.array(gold_ids),
            default_rank=100,
        )
        ranks.append(post_ranks.values())

    return standard_metrics(ranks)

In [None]:
# Sampled posts with their ground-truth labels; the evaluation cell reads the
# `post_id` and `rating` columns of this table.
_sampled_posts_csv = './datasets/multiclaim/sampled_posts.csv'
df_true = pd.read_csv(_sampled_posts_csv)

In [None]:
# --- Evaluate every pipeline output ------------------------------------------
# For each file in `result_paths` this cell computes
#   1. ranking metrics of the retrieved fact-checks (`fact_check_ids`),
#   2. ranking metrics of the fact-checks kept by the model (`relevant_claims_ids`),
#   3. the model's keep/drop decision scored as a binary classification,
#   4. veracity-prediction scores against the `rating` column of the sampled posts,
# and collects a sample of wrongly predicted posts for manual inspection.
#
# Outputs (module-level): `embedding_results_df`, `veracity_results_df`, `sample_df`.

SAMPLE_SIZE = 20   # wrong predictions sampled per model
SAMPLE_SEED = 42   # fixed seed so the sampled posts are the same on every run
LIST_COLUMNS = ['fact_check_ids', 'summaries', 'relevant_claims_ids']
RETRIEVAL_LABELS = ['no', 'yes']
CLASSIFICATION_METRICS = [
    'macro_f1', 'accuracy', 'macro_precision', 'macro_recall',
    'tnr', 'fnr', 'tpr', 'fpr',
]
SAMPLE_COLUMNS = [
    'post_id', 'relevant_claims_ids', 'model', 'prediction', 'ground_truth',
    'explanation_prompt', 'explanation', 'predicted_explanation',
]
# Non-greedy on purpose: with a single-line JSON answer a greedy `(.*)` would
# capture everything up to the last quote on the line instead of just the label.
VERACITY_PATTERN = re.compile(r'"veracity": "([^"]*)"')
EXPLANATION_PATTERN = re.compile(r'"explanation": "(.*)"')

df_true = pd.read_csv('./datasets/multiclaim/sampled_posts.csv')
df_mapping = pd.read_csv('./datasets/multiclaim/fact_check_post_mapping.csv')

# Collapses the free-form labels produced by the models into three classes.
veracity_mapping = {
    'true': 'true',
    'false': 'false',
    'unverifiable': 'unverifiable',
    'partly true': 'true',
    'partly false': 'false',
    '': 'unverifiable',
    'partially true': 'true',
    'mostly true': 'true',
    'misleading': 'false',
    'not categorized': 'unverifiable',
    'partly_true': 'true',
    'mixture': 'unverifiable',
}


def _parse_list_cell(value):
    """Turn a serialized Python list read from the CSV back into a list (missing -> [])."""
    if isinstance(value, list):
        return value
    if pd.isna(value):
        return []
    # NOTE(security): `eval` executes whatever the CSV cell contains. Only run this
    # on result files you produced yourself; consider `ast.literal_eval` once the
    # stored values are confirmed to be plain literals.
    return eval(str(value))


def _first_metric_values(metrics):
    """Keep only the first element of every metric tuple."""
    return {key: metrics[key][0] for key in metrics}


def _safe_rate(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return float(numerator) / float(denominator) if denominator else float('nan')


def _extract_veracity(explanation):
    """Return the lower-cased `"veracity"` value found in a model answer, or ''."""
    match = VERACITY_PATTERN.search(str(explanation))
    return match.group(1).lower() if match else ''


# Gold (fact_check_id, post_id) pairs as a set: O(1) membership instead of a
# full-frame boolean mask per retrieved pair.
gold_pairs = set(zip(df_mapping['fact_check_id'], df_mapping['post_id']))
# First rating per post, matching the previous `.values[0]` lookup.
rating_by_post = df_true.drop_duplicates(subset='post_id').set_index('post_id')['rating']

# Accumulate per-model pieces in lists and concatenate once at the end
# (growing a DataFrame with `pd.concat` inside a loop is quadratic).
embedding_frames = []
veracity_frames = []
sample_records = []

for path in result_paths:
    df = pd.read_csv(f'./results/final/pipeline/{path}')
    for column in LIST_COLUMNS:
        df[column] = df[column].apply(_parse_list_cell)

    post_ids = [int(post_id) for post_id in df['post_id']]
    fact_check_ids = df['fact_check_ids'].tolist()
    relevant_claims_ids = df['relevant_claims_ids'].tolist()

    # One row per (post, retrieved fact-check) pair.
    retrieved_pairs = [
        (post_id, fc_id)
        for post_id, fc_ids in zip(df['post_id'], fact_check_ids)
        for fc_id in fc_ids
    ]
    # Pairs the model marked as relevant.
    selected_pairs = {
        (post_id, fc_id)
        for post_id, fc_ids in zip(df['post_id'], relevant_claims_ids)
        for fc_id in fc_ids
    }
    matching_df = pd.DataFrame(retrieved_pairs, columns=['post_id', 'fc_id'])
    matching_df['true_relevant'] = [
        'yes' if (fc_id, post_id) in gold_pairs else 'no'
        for post_id, fc_id in retrieved_pairs
    ]
    matching_df[path] = [
        'yes' if pair in selected_pairs else 'no'
        for pair in retrieved_pairs
    ]

    # Evaluate the performance of Multilingual E5 Large model for claim retrieval
    embedding_results = _first_metric_values(
        calculate_standard_metrics(post_ids, fact_check_ids)
    )
    embedding_results['model'] = 'Multilingual E5 Large'
    # The classification view is only computed for the model rows below;
    # the baseline row carries zero placeholders so the columns line up.
    for metric_name in CLASSIFICATION_METRICS:
        embedding_results[metric_name] = 0
    # A baseline row is appended for every file; the following cell keeps the first.
    embedding_frames.append(pd.DataFrame(embedding_results, index=[0]))

    # Evaluate the performance of the particular model for claim retrieval
    model_embedding_results = _first_metric_values(
        calculate_standard_metrics(post_ids, relevant_claims_ids)
    )
    model_embedding_results['model'] = path

    # Retrieval as Classification
    y_true = matching_df['true_relevant'].tolist()
    y_predict = matching_df[path].tolist()
    # `labels=` guarantees a 2x2 matrix even when only one class occurs,
    # so the four-way unpacking below cannot fail.
    tn, fp, fn, tp = confusion_matrix(y_true, y_predict, labels=RETRIEVAL_LABELS).ravel()
    model_embedding_results.update({
        'macro_f1': f1_score(y_true, y_predict, average="macro"),
        'accuracy': accuracy_score(y_true, y_predict),
        'macro_precision': precision_score(y_true, y_predict, average="macro"),
        'macro_recall': recall_score(y_true, y_predict, average="macro"),
        'tnr': _safe_rate(tn, tn + fp),  # irrelevant filtration
        'fnr': _safe_rate(fn, fn + tp),  # relevant filtration
        'tpr': _safe_rate(tp, tp + fn),  # relevant recall
        'fpr': _safe_rate(fp, fp + tn),  # irrelevant recall
    })
    embedding_frames.append(pd.DataFrame(model_embedding_results, index=[1]))

    # Evaluate the performance of model for veracity prediction
    df['ground_truth'] = df['post_id'].map(rating_by_post)
    missing_truth = df['ground_truth'].isna()
    if missing_truth.any():
        raise ValueError(
            f"{path}: no rating in sampled_posts.csv for post ids "
            f"{df.loc[missing_truth, 'post_id'].tolist()}"
        )
    df['ground_truth'] = df['ground_truth'].map(str.lower)

    df['predicted_veracity'] = df['explanation'].apply(_extract_veracity)

    print(path)
    print(df.predicted_veracity.unique())

    # Labels missing from `veracity_mapping` would become NaN and break
    # `classification_report`; report them and count them as unverifiable,
    # the same bucket used for answers without a parsable label.
    unmapped_labels = sorted(set(df['predicted_veracity']) - set(veracity_mapping))
    if unmapped_labels:
        print(f'unmapped veracity labels treated as unverifiable: {unmapped_labels}')
    df['predicted_veracity'] = (
        df['predicted_veracity'].map(veracity_mapping).fillna('unverifiable')
    )
    print(df.predicted_veracity.unique())

    y_true = df['ground_truth'].tolist()
    y_pred = df['predicted_veracity'].tolist()
    report = classification_report(y_true, y_pred, output_dict=True)

    prediction_counts = df['predicted_veracity'].value_counts()
    veracity_results = {
        'model': path,
        'accuracy': report['accuracy'],
        'macro_f1': report['macro avg']['f1-score'],
        'macro_precision': report['macro avg']['precision'],
        'macro_recall': report['macro avg']['recall'],
        'true_count': int(prediction_counts.get('true', 0)),
        'false_count': int(prediction_counts.get('false', 0)),
        'unverifiable_count': int(prediction_counts.get('unverifiable', 0)),
    }
    veracity_frames.append(pd.DataFrame(veracity_results, index=[0]))

    # Randomly sample up to SAMPLE_SIZE incorrect predictions that have an explanation.
    incorrect_predictions = df[
        (df['ground_truth'] != df['predicted_veracity']) & df['explanation'].notna()
    ]
    incorrect_predictions = incorrect_predictions.sample(
        n=min(SAMPLE_SIZE, len(incorrect_predictions)),  # fewer than SAMPLE_SIZE is fine
        random_state=SAMPLE_SEED,
    )

    for _, row in incorrect_predictions.iterrows():
        explanation_match = EXPLANATION_PATTERN.search(str(row['explanation']))
        sample_records.append({
            'post_id': row['post_id'],
            'relevant_claims_ids': row['relevant_claims_ids'],
            'model': path,
            'prediction': row['predicted_veracity'],
            'ground_truth': row['ground_truth'],
            'predicted_explanation': row['explanation'],
            'explanation_prompt': row['explanation_prompt'],
            'explanation': explanation_match.group(1) if explanation_match else '',
        })

embedding_results_df = pd.concat(embedding_frames) if embedding_frames else pd.DataFrame()
veracity_results_df = pd.concat(veracity_frames) if veracity_frames else pd.DataFrame()
sample_df = pd.DataFrame(sample_records, columns=SAMPLE_COLUMNS)

In [None]:
# The evaluation loop appends a 'Multilingual E5 Large' row for every result file;
# keep only the first row per model name before displaying the retrieval table.
embedding_results_df = embedding_results_df.drop_duplicates(subset=['model'], keep='first')
embedding_results_df

In [None]:
# Per-model veracity-prediction scores (accuracy, macro F1/precision/recall) and the
# number of posts predicted as true / false / unverifiable.
veracity_results_df