# LLM zero-shot Evaluation

## Initialization

#### Setup project paths:

In [4]:
import os
import sys
from os.path import join, exists
from py_markdown_table.markdown_table import markdown_table

from tqdm.notebook import tqdm

# Enable tqdm for pandas (uses the notebook flavour of tqdm imported above).
# Must run before any pandas call that is expected to show a progress bar.
tqdm.pandas()

# ANSI escape codes for highlighted console output: https://stackoverflow.com/a/21786287
h_red, h_green, h_yellow = '\x1b[1;30;41m', '\x1b[1;30;42m', '\x1b[1;30;43m'
h_stop = '\x1b[0m'  # emitted after a highlighted span (see the print calls below)

## Setup project paths:
# Every path is derived from the current working directory.
project_path = os.getcwd()
datasets_path = join(project_path, "Source datasets")

# Directories located directly under the project root.
models_path, multicw_path = (
    join(project_path, folder) for folder in ("Models", 'Final-dataset')
)

# Individual source datasets inside `datasets_path`.
multiclaim_path, lesa_dst_dir = (
    join(datasets_path, folder) for folder in ("MultiClaim", 'LESA-EACL-2021')
)
print('done')

done


## Results
Loading the results of the zero-shot evaluation of selected LLMs.

In [6]:
import os
import pandas as pd
from sklearn.metrics import classification_report

# Folder holding the test split and the per-model prediction CSVs
labels_folder = join('Results')

# Load the test split and attach the Nemotron predictions as an extra column.
multicw_test = pd.read_csv(join(labels_folder, 'multicw-test.csv'))
nemotron = pd.read_csv(join(labels_folder, 'nemotron_4_340b_CoT_multicw_test.csv'))
# NOTE(review): this assignment aligns on the row index, so it presumably relies on
# both CSVs listing the same samples in the same order — confirm.
multicw_test['nemotron_4_340b_CoT'] = nemotron['answer']

# Prediction columns (already present in the test CSV) to evaluate after Nemotron.
columns = ['fc_worthy_CoT_CLEF_on_Q_claude-3-5-haiku-20241022', 'fc_worthy_CoT_CLEF_on_Q_llama3.1:70b']

# Prediction column -> heading printed above its classification report.
# A column missing from this table is printed under its raw column name.
model_titles = {
    'nemotron_4_340b_CoT': 'Nemotron-4 340B (CoT)',
    'fc_worthy_CoT_CLEF_on_Q_claude-3-5-haiku-20241022': 'Claude 3.5 Haiku (CoT)',
    'fc_worthy_CoT_CLEF_on_Q_llama3.1:70b': 'Llama3.1:70b (CoT)',
}

languages = multicw_test['lang'].unique()
languages.sort()

for lang in languages:
    print(f'{h_green}Language: {lang}:{h_stop}')
    # Language specific subset; the ground truth is shared by every model below.
    multicw_lang = multicw_test[multicw_test['lang'] == lang]
    ground_truth = multicw_lang['label']

    # Nemotron first, then the remaining models, each through the same code path.
    for column in ['nemotron_4_340b_CoT', *columns]:
        results = multicw_lang[column]

        # Only the text report is printed, so it is computed once
        # (the unused `output_dict=True` variant was dropped).
        report_str = classification_report(ground_truth, results)

        print(f'{h_yellow}{model_titles.get(column, column)}:{h_stop}')
        print(report_str)


[1;30;42mLanguage: ar:[0m
[1;30;43mNemotron-4 340B (CoT):[0m
              precision    recall  f1-score   support

         0.0       0.75      0.43      0.55       599
         1.0       0.60      0.85      0.70       599

    accuracy                           0.64      1198
   macro avg       0.67      0.64      0.63      1198
weighted avg       0.67      0.64      0.63      1198

[1;30;43mClaude 3.5 Haiku (CoT):[0m
              precision    recall  f1-score   support

         0.0       0.70      0.26      0.37       599
         1.0       0.54      0.89      0.68       599

    accuracy                           0.57      1198
   macro avg       0.62      0.57      0.52      1198
weighted avg       0.62      0.57      0.52      1198

[1;30;43mLlama3.1:70b (CoT):[0m
              precision    recall  f1-score   support

         0.0       0.64      0.36      0.46       599
         1.0       0.56      0.80      0.65       599

    accuracy                           0.58  