In [None]:
import pandas as pd
import os
import numpy as np
from sklearn.metrics import roc_curve, auc, f1_score, accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.utils import resample

In [None]:
# Every experiment run to evaluate. Each name is '<model>-<size>-<prompting technique>'
# with an optional '-en' suffix, and matches the file ./results/<name>.csv.
model_names = [
    # ---- > 70b params ----
    # Zero-Shot
    'mistral-123b-vanilla', 'c4ai-104b-vanilla', 'qwen2_5-72b-vanilla', 'llama3_1-70b-vanilla',
    # Zero-Shot + Task
    'mistral-123b-instruct', 'c4ai-104b-instruct', 'qwen2_5-72b-instruct', 'llama3_1-70b-instruct',
    # Few-Shot + Task
    'mistral-123b-fewshot', 'c4ai-104b-fewshot', 'qwen2_5-72b-fewshot', 'llama3_1-70b-fewshot',
    # CoT
    'mistral-123b-cot', 'c4ai-104b-cot', 'qwen2_5-72b-cot', 'llama3_1-70b-cot',
    # XLT
    'c4ai-104b-xlt', 'mistral-123b-xlt', 'qwen2_5-72b-xlt', 'llama3_1-70b-xlt',

    # ---- < 10b params ----
    # Zero-Shot
    'llama3_1-8b-vanilla', 'qwen2_5-7b-vanilla', 'mistral-7b-vanilla',
    # Zero-Shot + Task
    'llama3_1-8b-instruct', 'qwen2_5-7b-instruct', 'mistral-7b-instruct',
    # Few-Shot + Task
    'llama3_1-8b-fewshot', 'qwen2_5-7b-fewshot', 'mistral-7b-fewshot',
    # CoT
    'llama3_1-8b-cot', 'qwen2_5-7b-cot', 'mistral-7b-cot',
    # XLT
    'llama3_1-8b-xlt', 'qwen2_5-7b-xlt', 'mistral-7b-xlt',

    # ---- < 10b params English ----
    # Zero-Shot
    'llama3_1-8b-vanilla-en', 'qwen2_5-7b-vanilla-en', 'mistral-7b-vanilla-en',
    # Zero-Shot + Task
    'llama3_1-8b-instruct-en', 'qwen2_5-7b-instruct-en', 'mistral-7b-instruct-en',
    # Few-Shot + Task
    'llama3_1-8b-fewshot-en', 'qwen2_5-7b-fewshot-en', 'mistral-7b-fewshot-en',
    # CoT
    'llama3_1-8b-cot-en', 'qwen2_5-7b-cot-en', 'mistral-7b-cot-en',

    # ---- > 70b params English ----
    # Zero-Shot
    'mistral-123b-vanilla-en', 'c4ai-104b-vanilla-en', 'qwen2_5-72b-vanilla-en', 'llama3_1-70b-vanilla-en',
    # Zero-Shot + Task
    'mistral-123b-instruct-en', 'c4ai-104b-instruct-en', 'qwen2_5-72b-instruct-en', 'llama3_1-70b-instruct-en',
    # Few-Shot + Task
    'mistral-123b-fewshot-en', 'c4ai-104b-fewshot-en', 'qwen2_5-72b-fewshot-en', 'llama3_1-70b-fewshot-en',
    # CoT
    'mistral-123b-cot-en', 'c4ai-104b-cot-en', 'qwen2_5-72b-cot-en', 'llama3_1-70b-cot-en',
]
# Annotated pairs table; the metrics cell reads its 'factcheck_language',
# 'post_language' and 'rating' columns. (Plain string: the path has no
# placeholders, so the f-prefix was unnecessary.)
annotated_df = pd.read_csv('./datasets/annotation_pairs.csv')

In [None]:
# Evaluation subsets, in the order they become columns of each metrics table.
# How each name is turned into a row filter is defined in the metrics cell.
languages = [
    # whole dataset
    'all',
    # single language codes (fact-check and post share this language)
    'spa', 'eng', 'por', 'fra', 'msa', 'deu', 'ara', 'tha', 'hbs', 'kor',
    'pol', 'slk', 'nld', 'ron', 'ell', 'ces', 'bul', 'hun', 'hin', 'mya',
    # same-language vs. different-language pairs
    'monolingual', 'crosslingual',
    # language-family groups
    'indo-european', 'italic', 'germanic', 'balto-slavic',
    # script groups
    'latin', 'non-latin', 'latin-cross', 'mixed-script',
    # explicit '<post language>-<fact-check language>' pairs
    'spa-eng', 'hin-eng', 'eng-ara', 'fra-eng', 'deu-eng',
    'eng-por', 'spa-por', 'deu-fra', 'slk-ces', 'slk-eng',
    'pol-hbs', 'ces-eng', 'ces-pol', 'nld-deu', 'msa-ara',
    'kor-eng', 'mya-msa', 'ara-fra', 'hun-pol', 'tha-por',
]

In [None]:
# ---------------------------------------------------------------------------
# For every model run: load its predictions, attach the language / rating
# columns from `annotated_df`, compute metrics for each subset in `languages`
# and write a transposed table (metrics as rows, subsets as columns) to
# ./results/metrics/<model_name>.csv.
# ---------------------------------------------------------------------------
RESULT_COLUMNS = ['Language', 'AUC', 'Macro F1 (Accuracy)', 'Macro F1', 'Accuracy', 'Precision', 'Recall', 'TNR (irrelevant)', 'FNR (relevant)', 'TPR', 'FPR', 'Bootstrapped Macro F1']
N_BOOTSTRAPS = 1000   # number of bootstrap resamples for the Macro F1 interval
BOOTSTRAP_SEED = 42   # fixed seed so the reported intervals are reproducible

LATIN_SCRIPT = ['spa', 'eng', 'por', 'fra', 'msa', 'deu', 'hbs', 'pol', 'slk', 'nld', 'ron', 'ces', 'hun']
NON_LATIN_SCRIPT = ['ara', 'tha', 'kor', 'ell', 'bul', 'hin', 'mya']
# Named groups that select monolingual pairs whose language is in the group.
MONOLINGUAL_GROUPS = {
    'indo-european': ['spa', 'eng', 'por', 'fra', 'deu', 'hbs', 'pol', 'slk', 'nld', 'ron', 'ell', 'ces', 'bul', 'hin'],
    'italic': ['spa', 'por', 'fra', 'ron'],
    'germanic': ['eng', 'deu', 'nld'],
    'balto-slavic': ['hbs', 'ces', 'pol', 'slk', 'bul'],
    'latin': LATIN_SCRIPT,
    'non-latin': NON_LATIN_SCRIPT,
}


def _first_prob(raw):
    """Return the first element of a stringified probability (list or bare scalar)."""
    text = raw if raw.startswith('[') else f'[{raw}]'
    # NOTE(review): eval() executes whatever the results CSV contains. Kept so that
    # every format the original accepted still parses; only run on trusted files.
    return eval(text)[0]


def _select_subset(df, language):
    """Return the rows of `df` that belong to the evaluation subset `language`."""
    fc, post = df['FactCheckLang'], df['PostLang']
    if language == 'all':
        return df
    if language == 'monolingual':
        return df[fc == post]
    if language == 'crosslingual':
        return df[fc != post]
    if language in MONOLINGUAL_GROUPS:
        # fc == post already implies the post language is in the group as well.
        return df[fc.isin(MONOLINGUAL_GROUPS[language]) & (fc == post)]
    if language == 'latin-cross':
        return df[fc.isin(LATIN_SCRIPT) & post.isin(LATIN_SCRIPT) & (fc != post)]
    if language == 'mixed-script':
        return df[(fc.isin(LATIN_SCRIPT) & post.isin(NON_LATIN_SCRIPT)) | (fc.isin(NON_LATIN_SCRIPT) & post.isin(LATIN_SCRIPT))]
    if '-' not in language:
        return df[(fc == language) & (post == language)]
    # Pair names are '<post language>-<fact-check language>'.
    post_lang, fc_lang = language.split('-')
    return df[(fc == fc_lang) & (post == post_lang)]


def _subset_metrics(subset, language, rng):
    """Compute one result row (ordered like RESULT_COLUMNS) for a subset.

    `rng` is a numpy RandomState shared across subsets so the bootstrap is
    reproducible. An empty subset yields NaN metrics instead of raising.
    """
    if subset.empty:
        nan = float('nan')
        return [language, nan, f'{nan:.2f} ({nan:.2f})', nan, nan, nan, nan, nan, nan, nan, nan, f'({nan:.3f}, {nan:.3f}, {nan:.3f})']

    pred_col = 'prediction_label' if 'prediction_label' in subset.columns else 'Prediction'
    y_true = np.array([1 if label == 'Yes' else 0 for label in subset['GroundTruth']])
    y_pred = np.array([1 if label.strip() == 'Yes' else 0 for label in subset[pred_col]])

    # Normalise P(Yes) against P(Yes) + P(No) into fresh arrays; the original wrote
    # back into `Series.values`, which mutates `df` and fails on read-only arrays.
    yes = subset['YesProb'].to_numpy(dtype=float)
    no = subset['NoProb'].to_numpy(dtype=float)
    yes_norm = yes / (yes + no)

    roc_fpr, roc_tpr, _ = roc_curve(y_true, yes_norm)
    roc_auc = auc(roc_fpr, roc_tpr)

    f1 = f1_score(y_true, y_pred, average='macro')

    # Bootstrap the Macro F1 to obtain a mean and a 95% percentile interval.
    positions = np.arange(len(y_true))
    boot_scores = []
    for _ in range(N_BOOTSTRAPS):
        sample = resample(positions, replace=True, random_state=rng)
        boot_scores.append(f1_score(y_true[sample], y_pred[sample], average='macro'))
    lower = np.percentile(boot_scores, 2.5)
    upper = np.percentile(boot_scores, 97.5)
    mean_score = np.mean(boot_scores)

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)

    # labels=[0, 1] guarantees a 2x2 matrix even when only one class occurs,
    # so the four-way unpacking cannot fail.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    tnr = tn / (tn + fp)
    fnr = fn / (fn + tp)
    tpr = tp / (tp + fn)
    fpr = fp / (fp + tn)

    return [language, roc_auc, f'{f1:.2f} ({accuracy:.2f})', f1, accuracy, precision, recall, tnr, fnr, tpr, fpr, f'({mean_score:.3f}, {lower:.3f}, {upper:.3f})']


os.makedirs('./results/metrics', exist_ok=True)
for model_name in model_names:
    df = pd.read_csv(f'./results/{model_name}.csv')
    # Assignment aligns on the index, i.e. on row position for default indexes.
    df['FactCheckLang'] = annotated_df['factcheck_language']
    df['PostLang'] = annotated_df['post_language']
    df['GroundTruth'] = annotated_df['rating']

    # Keep only rows that have both a rating and a prediction; .copy() so the
    # column assignments below do not write into a slice of the loaded frame.
    df = df[df['GroundTruth'].notna() & df['Prediction'].notna()].copy()
    df['YesProb'] = df['YesProb'].apply(_first_prob)
    df['NoProb'] = df['NoProb'].apply(_first_prob)

    rng = np.random.RandomState(BOOTSTRAP_SEED)
    rows = [_subset_metrics(_select_subset(df, language), language, rng) for language in languages]

    # Build the table once, then transpose so metrics are rows and subsets columns.
    df_results = pd.DataFrame(rows, columns=RESULT_COLUMNS).set_index('Language').T
    df_results.to_csv(f'./results/metrics/{model_name}.csv', index=True)

## Visualization

In [None]:
from glob import glob
import pandas as pd

paths = glob('./results/metrics/*')

# Split the metric files on two substring tests of the path:
#   size     -> contains '7b' or '8b' (small) vs. neither (large)
#   language -> contains '-en' (English variant) vs. not
small = [p for p in paths if '-en' not in p and ('7b' in p or '8b' in p)]
small_english = [p for p in paths if '-en' in p and ('7b' in p or '8b' in p)]
large = [p for p in paths if '-en' not in p and not ('7b' in p or '8b' in p)]
large_english = [p for p in paths if '-en' in p and not ('7b' in p or '8b' in p)]

# Model name prefixes used to build metrics file names in the plotting cells.
large_models = ['mistral-123b', 'c4ai-104b', 'qwen2_5-72b', 'llama3_1-70b']
small_models = ['llama3_1-8b', 'qwen2_5-7b', 'mistral-7b']

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def group_chart(data, save_as='overall_results-grouped.pdf'):
    """Draw a grouped bar chart of Macro F1 per model, one bar per technique.

    Parameters
    ----------
    data : sequence
        One entry per technique (at most five are drawn). Each entry is a
        sequence with one item per model label below, where an item is indexed
        as ``item[0]`` = bar height, ``item[1]`` = lower bound and
        ``item[2]`` = upper bound, all on the same scale as the y-axis (40-90).
    save_as : str or None
        Path the figure is saved to; pass ``None`` to skip saving. The default
        keeps the file name the function has always written.
    """
    # Labels for models and techniques
    model_labels = [
        "Mistral-Large\n123B", "C4AI Command R+\n104B", "Qwen2.5\n72B", "Llama3.1\n70B",
        "Llama3.1\n8B", "Qwen2.5\n7B", "Mistral\n7B"
    ]
    technique_labels = ['Zero-Shot', 'Zero-Shot +\nTask description', 'Few-Shot +\nTask description', 'CoT', 'XLT']

    num_bars = len(technique_labels)  # Number of techniques

    fig, ax = plt.subplots(figsize=(12, 6))

    colors = ['b', 'r', 'g', 'm', 'c']
    bar_width = 0.15

    x = np.arange(len(model_labels))  # Positions for the groups (models)

    bar_handles = []  # one bar container per technique, used for the legend
    for i, (tech_data, color) in enumerate(zip(data, colors)):
        heights = [item[0] for item in tech_data]
        # Error bars are distances from the bar height, not absolute bounds.
        err_below = [item[0] - item[1] for item in tech_data]
        err_above = [item[2] - item[0] for item in tech_data]
        bars = ax.bar(x + i * bar_width, heights, bar_width, label=technique_labels[i], color=color, alpha=0.7, yerr=[err_below, err_above], capsize=5)
        bar_handles.append(bars)
        # Print each value slightly above its bar to keep it readable.
        for j, value in enumerate(heights):
            ax.text(
                x[j] + i * bar_width,
                value + 2,
                f'{value:.0f}',
                ha='center',
                va='bottom',
                fontsize=8,
                fontweight='bold',
            )

    # Centre the tick under each group of bars.
    ax.set_xticks(x + (num_bars - 1) * bar_width / 2)
    ax.set_xticklabels(model_labels, rotation=45, ha='right', fontsize=12)
    ax.set_ylabel("Macro F1 Score", fontsize=12)  # was misspelled "Macr F1 Score"
    ax.set_ylim(40, 90)

    # Dashed line between the first four model groups and the last three.
    separator_position = 3.8
    ax.axvline(separator_position, color='black', linestyle='--', linewidth=2, label='_nolegend_')
    plt.text(separator_position - 2, 91, '70B+ LLMs', ha='center', fontsize=12, fontweight='bold')
    plt.text(separator_position + 1.7, 91, '10B- LLMs', ha='center', fontsize=12, fontweight='bold')

    ax.grid(axis='y', linestyle='--', alpha=0.6)

    # Legend below the chart. Handles are passed explicitly so each label is
    # bound to its own bar container instead of relying on artist order.
    fig.legend(handles=bar_handles, labels=technique_labels[:len(bar_handles)], loc='lower center', ncol=num_bars, fontsize=12, frameon=False)
    fig.tight_layout(rect=[0, 0.05, 1, 1])

    if save_as:
        plt.savefig(save_as)
    plt.show()
    
metric = 'Bootstrapped Macro F1'
lang_comb = 'all'


def _read_bootstrap_triplet(model, technique):
    """Return the '(mean, lower, upper)' cell of one metrics table as a float array in percent.

    The cell is parsed numerically instead of being passed to eval(), so file
    contents are never executed.
    """
    cell = pd.read_csv(f'./results/metrics/{model}-{technique}.csv', index_col=0).at[metric, lang_comb]
    return np.asarray([float(part) for part in cell.strip('()').split(',')]) * 100


# all_data[t][m]: triplet for technique t and model m (large models first, then small).
all_data = [
    [_read_bootstrap_triplet(model, technique) for model in large_models + small_models]
    for technique in ('vanilla', 'instruct', 'fewshot', 'cot', 'xlt')
]

group_chart(all_data)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

models = large_models + small_models
strategies = ['vanilla', 'instruct', 'fewshot', 'cot']


def _english_gain(model, strategy):
    """Macro F1 on 'all' (in %) of the '-en' run minus the run without the suffix."""
    org = float(pd.read_csv(f'./results/metrics/{model}-{strategy}.csv', index_col=0).at['Macro F1', 'all']) * 100
    eng = float(pd.read_csv(f'./results/metrics/{model}-{strategy}-en.csv', index_col=0).at['Macro F1', 'all']) * 100
    return eng - org


# One row per model, one column per strategy.
data_matrix = [[_english_gain(model, strategy) for strategy in strategies] for model in models]

strategy_labels = ['Zero-Shot', 'Zero-Shot\nTask Description', 'Few-Shot\nTask Description', 'CoT']
model_labels = ['Mistral-Large\n123B', 'C4AI Command R+\n104B', 'Qwen2.5\n72B', 'Llama3.1\n70B', 'Llama3.1\n8B', 'Qwen2.5\n7B', 'Mistral\n7B']

sns.set_theme(rc={'figure.figsize': (7, 5)})
sns.heatmap(
    data_matrix,
    annot=True,
    fmt=".2f",
    xticklabels=strategy_labels,
    yticklabels=model_labels,
    cmap='coolwarm',
    center=0,  # diverging colour map centred on "no difference"
    cbar_kws={'label': 'Macro F1 Score Difference (%)'},
)

# Upward arrow placed to the right of the axes (coordinates are axes fractions).
plt.annotate(
    '',
    xy=(1.35, 0.6),
    xytext=(1.35, 0.45),
    xycoords='axes fraction',
    textcoords='axes fraction',
    arrowprops={'facecolor': 'black', 'width': 2, 'headwidth': 10, 'headlength': 10},
)

heatmap_ax = plt.gca()

# Strategy labels go on top, slanted and anchored at their left end.
heatmap_ax.xaxis.tick_top()
heatmap_ax.set_xticklabels(heatmap_ax.get_xticklabels(), rotation=30, ha='left', rotation_mode='anchor')

# Model labels are centred and pushed away from the axis.
heatmap_ax.yaxis.set_tick_params(pad=50)
heatmap_ax.set_yticklabels(heatmap_ax.get_yticklabels(), rotation=0, ha='center')

plt.tight_layout()
plt.savefig('heatmap-org-vs-eng.pdf')


In [None]:
def create_grouped_charts_in_grid(data, save_as=None, n_cols=9, figsize=(20, 7)):
    """Plot one grouped bar chart per language in a grid of subplots.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format frame with the columns "Language", "Model", "Technique"
        and "Performance". It is not modified; the function works on a copy.
    save_as : str or None
        If given, the figure is saved to this path.
    n_cols : int
        Number of subplot columns; rows are added as needed.
    figsize : tuple
        Figure size passed to ``plt.subplots``.

    Within each subplot the techniques are on the x-axis and each model gets
    one bar per technique, both in order of first appearance in ``data``.
    """
    data = data.copy()  # the categorical conversions below must not leak to the caller

    unique_techniques = data["Technique"].unique()
    unique_models = data["Model"].unique()
    data["Technique"] = pd.Categorical(data["Technique"], categories=unique_techniques, ordered=True)
    data["Model"] = pd.Categorical(data["Model"], categories=unique_models, ordered=True)

    languages = data["Language"].unique()

    n_languages = len(languages)
    n_rows = (n_languages + n_cols - 1) // n_cols  # ceiling division
    # squeeze=False always returns a 2-D array, so flatten() also works for a 1x1 grid.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize, constrained_layout=True, sharey=True, squeeze=False)
    axes = axes.flatten()

    legend_elements = []  # (handle, model name) pairs collected from the first subplot

    for idx, language in enumerate(languages):
        ax = axes[idx]

        lang_data = data[data["Language"] == language]

        pivot_data = lang_data.pivot(index="Technique", columns="Model", values="Performance")
        pivot_data = pivot_data.reindex(index=unique_techniques, columns=unique_models)  # Maintain order

        techniques = pivot_data.index
        n_bars = len(unique_models)
        bar_width = 0.8 / n_bars
        x = np.arange(len(techniques))

        for i, model in enumerate(unique_models):
            bars = ax.bar(
                x + i * bar_width,
                pivot_data[model],
                bar_width,
                label=model if idx == 0 else None,
            )
            if idx == 0:
                legend_elements.append((bars[0], model))

        ax.set_title(language, fontsize=10)
        ax.set_xticks(x + (n_bars - 1) * bar_width / 2)
        ax.set_xticklabels(techniques, rotation=45, ha="right", fontsize=8)
        ax.tick_params(axis="y", labelsize=8)
        ax.set_ylim(20, 100)

        ax.grid(axis="y", linestyle="--", alpha=0.7)
        if idx % n_cols == 0:
            ax.set_ylabel("Macro F1")

    # Hide unused subplots
    for ax in axes[n_languages:]:
        ax.axis("off")

    legend_handles = [handle for handle, _ in legend_elements]
    legend_labels = [label for _, label in legend_elements]
    if n_languages % n_cols != 0:
        # The last cell of the grid is empty, so it hosts the legend.
        empty_ax = axes[-1]
        empty_ax.axis("off")
        empty_ax.legend(
            handles=legend_handles,
            labels=legend_labels,
            title="Models",
            loc="center",
            fontsize=10,
            title_fontsize=12,
        )
    else:
        # Grid is full: put the legend below the figure, with explicit handles
        # so each label is bound to the right bar colour.
        fig.legend(
            handles=legend_handles,
            labels=legend_labels,
            title="Models",
            loc="upper center",
            bbox_to_anchor=(0.5, 0),
            ncol=4, fontsize=10
        )

    # Save or show the plot
    if save_as:
        plt.savefig(save_as, bbox_inches="tight")
    plt.show()


In [None]:
metric = 'Macro F1'
# Monolingual subsets to plot, in alphabetical order.
languages = sorted([
    'spa', 'eng', 'por', 'fra', 'msa', 'deu', 'ara', 'tha', 'hbs', 'kor',
    'pol', 'slk', 'nld', 'ron', 'ell', 'ces', 'bul', 'hun', 'hin', 'mya',
])

models = ["mistral-123b", "c4ai-104b", "qwen2_5-72b", "llama3_1-70b"]
techniques = ["vanilla", "instruct", "fewshot", "cot", "xlt"]

# Read each metrics table once instead of once per (language, model, technique).
metric_frames = {
    (model, tech): pd.read_csv(f'./results/metrics/{model}-{tech}.csv', index_col=0)
    for model in models for tech in techniques
}

# Long-format table: one row per (language, model, technique), value in percent.
data = pd.DataFrame([
    {"Language": lang, "Model": model, "Technique": tech, "Performance": float(metric_frames[(model, tech)].at[metric, lang]) * 100}
    for lang in languages for model in models for tech in techniques
])

# Rename the models
data["Model"] = data["Model"].replace({
    "mistral-123b": "Mistral Large 123B",
    "c4ai-104b": "C4AI Command R+ 104B",
    "qwen2_5-72b": "Qwen2.5 72B Instruct",
    "llama3_1-70b": "Llama3.1 70B Instruct"
})

# Rename the techniques
data["Technique"] = data["Technique"].replace({
    "vanilla": "ZS",
    "instruct": "ZS + Task",
    "fewshot": "FW + Task",
    "cot": "CoT",
    "xlt": "XLT"
})

# Generate the charts
create_grouped_charts_in_grid(data, n_cols=7, save_as="large_language_analysis-part.pdf",)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

def create_heatmap(df, save_as=None, figsize=(10, 10)):
    """Draw a Model x Language heatmap of performance.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame with the columns 'Model', 'Language' and
        'Performance', one row per (model, language).
    save_as : str or None
        If given, the figure is saved to this path.
    figsize : tuple
        Figure size passed to ``plt.figure``.

    Column labels get a '*' after each of the language codes ara, bul, ell,
    hin, kor, mya and tha, including inside 'xxx-yyy' pair labels. The
    separator lines and group captions are at fixed x positions, so they only
    line up when the caller orders the columns to match them.
    """
    starred_codes = {'ara', 'bul', 'ell', 'hin', 'kor', 'mya', 'tha'}

    def mark_label(label):
        # Computed per code so that labels missing from a lookup table keep
        # their text instead of turning into NaN.
        return '-'.join(f'{code}*' if code in starred_codes else code for code in str(label).split('-'))

    pivot_df = df.pivot(index='Model', columns='Language', values='Performance')
    pivot_df.columns = [mark_label(label) for label in pivot_df.columns]

    plt.figure(figsize=figsize)
    sns.heatmap(pivot_df, annot=True, cmap='coolwarm', linewidths=0.5, fmt='.0f', cbar_kws={"pad": 0.02, "shrink": 1})
    plt.xlabel('')
    plt.ylabel('')

    plt.xticks(rotation=45, ha='right')

    # Dashed lines delimit the captioned groups; solid lines close the
    # 'Indo-European' block (x=14) and start 'Cross-Lingual Pairs' (x=20).
    # ymax > 1 with clip_on=False extends the lines above the axes.
    plt.axvline(x=0, color='black', linewidth=2, linestyle='--', ymax=1.075, clip_on=False)
    plt.axvline(x=4, color='black', linewidth=2, linestyle='--', ymax=1.075, clip_on=False)
    plt.axvline(x=7, color='black', linewidth=2, linestyle='--', ymax=1.075, clip_on=False)
    plt.axvline(x=12, color='black', linewidth=2, linestyle='--', ymax=1.075, clip_on=False)
    plt.axvline(x=14, color='black', linewidth=2, ymin=0, ymax=1.15, clip_on=False)
    plt.axvline(x=20, color='black', linewidth=4, ymin=0, ymax=1.15, clip_on=False)

    # Group captions above the heatmap (negative y is above the top row).
    plt.text(2.25, -0.25, 'Italic', fontsize=10, ha='center', va='center')
    plt.text(5.55, -0.25, 'Germanic', fontsize=10, ha='center', va='center')
    plt.text(9.75, -0.25, 'Slavic', fontsize=10, ha='center', va='center')

    plt.text(6.5, -0.8, 'Indo-European', fontsize=10, ha='center', va='center', fontweight='bold')
    plt.text(17, -0.8, 'Others', fontsize=10, ha='center', va='center', fontweight='bold')
    plt.text(30, -0.8, 'Cross-Lingual Pairs', fontsize=10, ha='center', va='center', fontweight='bold')

    if save_as:
        plt.savefig(save_as, bbox_inches='tight')
    plt.show()
    
# Monolingual subsets followed by the cross-lingual pairs.
languages = [
    'ara', 'bul', 'ces', 'deu', 'ell', 'eng', 'fra', 'hbs', 'hin', 'hun', 'kor', 'msa', 'mya', 'nld', 'pol', 'por', 'ron', 'slk', 'spa', 'tha',
    'spa-eng',
    'hin-eng',
    'eng-ara',
    'fra-eng',
    'deu-eng',
    'eng-por',
    'spa-por',
    'deu-fra',
    'slk-ces',
    'slk-eng',
    'pol-hbs',
    'ces-eng',
    'ces-pol',
    'nld-deu',
    'msa-ara',
    'kor-eng',
    'mya-msa',
    'ara-fra',
    'hun-pol',
    'tha-por',
]

models = ["mistral-123b", "c4ai-104b", "qwen2_5-72b", "llama3_1-70b", "llama3_1-8b", "qwen2_5-7b", "mistral-7b"]
techniques = ["vanilla", "instruct", "fewshot", "cot", "xlt"]

# Read each metrics table once instead of once per (language, model, technique).
# `metric` is the value assigned in an earlier cell.
metric_frames = {
    (model, tech): pd.read_csv(f'./results/metrics/{model}-{tech}.csv', index_col=0)
    for model in models for tech in techniques
}

data = pd.DataFrame([
    {"Language": lang, "Model": model, "Technique": tech, "Performance": float(metric_frames[(model, tech)].at[metric, lang]) * 100}
    for lang in languages for model in models for tech in techniques
])

data["Model"] = data["Model"].replace({
    "mistral-123b": "Mistral Large\n123B",
    "c4ai-104b": "C4AI Command R+\n104B",
    "qwen2_5-72b": "Qwen2.5\n72B",
    "llama3_1-70b": "Llama3.1\n70B",
    "llama3_1-8b": "Llama3.1\n8B",
    "qwen2_5-7b": "Qwen2.5\n7B",
    "mistral-7b": "Mistral\n7B"
})

data["Technique"] = data["Technique"].replace({
    "vanilla": "ZS",
    "instruct": "ZS + Task",
    "fewshot": "FW + Task",
    "cot": "CoT",
    "xlt": "XLT"
})

# Category order fixes the row order of the heatmap.
data["Model"] = pd.Categorical(data["Model"], categories=["Mistral Large\n123B", "C4AI Command R+\n104B", "Qwen2.5\n72B", "Qwen2.5\n7B", "Llama3.1\n70B", "Mistral\n7B", "Llama3.1\n8B"], ordered=True)

# Category order fixes the column order; it has to match the separator lines
# and group captions drawn by create_heatmap.
data["Language"] = pd.Categorical(data["Language"], categories=[
    'fra', 'por', 'ron', 'spa', 'deu', 'eng', 'nld', 'bul', 'ces', 'hbs', 'pol', 'slk', 'ell', 'hin', 'ara', 'hun', 'kor', 'msa', 'mya', 'tha', 
    'spa-eng',
    'hin-eng',
    'eng-ara',
    'fra-eng',
    'deu-eng',
    'eng-por',
    'spa-por',
    'deu-fra',
    'slk-ces',
    'slk-eng',
    'pol-hbs',
    'ces-eng',
    'ces-pol',
    'nld-deu',
    'msa-ara',
    'kor-eng',
    'mya-msa',
    'ara-fra',
    'hun-pol',
    'tha-por',
], ordered=True)

# Mean over techniques. observed=False spells out the behaviour the implicit
# default gave for categorical keys and avoids the pandas FutureWarning.
average_performance = data.groupby(["Model", 'Language'], observed=False)["Performance"].mean().reset_index()

create_heatmap(average_performance, figsize=(20, 3.5), save_as="language_analysis_overall2.pdf")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

def create_heatmap(df, save_as=None, figsize=(10, 10)):
    """Draw a Technique x Language heatmap of performance.

    NOTE: this redefines the ``create_heatmap`` from the earlier cell (which
    pivots on 'Model'); after this cell runs, the name refers to this version.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame with the columns 'Technique', 'Language' and
        'Performance', one row per (technique, language).
    save_as : str or None
        If given, the figure is saved to this path.
    figsize : tuple
        Figure size passed to ``plt.figure``.

    Column labels get a '*' after each of the language codes ara, bul, ell,
    hin, kor, mya and tha, including inside 'xxx-yyy' pair labels. The
    separator lines and group captions are at fixed x positions, so they only
    line up when the caller orders the columns to match them.
    """
    starred_codes = {'ara', 'bul', 'ell', 'hin', 'kor', 'mya', 'tha'}

    def mark_label(label):
        # Computed per code so that labels missing from a lookup table keep
        # their text instead of turning into NaN.
        return '-'.join(f'{code}*' if code in starred_codes else code for code in str(label).split('-'))

    pivot_df = df.pivot(index='Technique', columns='Language', values='Performance')
    pivot_df.columns = [mark_label(label) for label in pivot_df.columns]

    plt.figure(figsize=figsize)
    sns.heatmap(pivot_df, annot=True, cmap='coolwarm', linewidths=0.5, fmt='.0f', cbar_kws={"pad": 0.02, "shrink": 1})
    plt.xlabel('')
    plt.ylabel('')
    plt.xticks(rotation=45, ha='right')

    # Dashed lines delimit the captioned groups; the solid line at x=14 closes
    # the 'Indo-European' block. ymax > 1 with clip_on=False extends the lines
    # above the axes.
    plt.axvline(x=0, color='black', linewidth=2, linestyle='--', ymax=1.075, clip_on=False)
    plt.axvline(x=4, color='black', linewidth=2, linestyle='--', ymax=1.075, clip_on=False)
    plt.axvline(x=7, color='black', linewidth=2, linestyle='--', ymax=1.075, clip_on=False)
    plt.axvline(x=12, color='black', linewidth=2, linestyle='--', ymax=1.075, clip_on=False)
    plt.axvline(x=14, color='black', linewidth=2, ymin=0, ymax=1.15, clip_on=False)

    # Group captions above the heatmap (negative y is above the top row).
    plt.text(2.25, -0.25, 'Italic', fontsize=10, ha='center', va='center')
    plt.text(5.55, -0.25, 'Germanic', fontsize=10, ha='center', va='center')
    plt.text(9.75, -0.25, 'Slavic', fontsize=10, ha='center', va='center')

    plt.text(6.5, -0.8, 'Indo-European', fontsize=10, ha='center', va='center', fontweight='bold')
    plt.text(17, -0.8, 'Others', fontsize=10, ha='center', va='center', fontweight='bold')

    # Save or show the plot
    if save_as:
        plt.savefig(save_as, bbox_inches='tight')
    plt.show()
    
# Monolingual subsets only.
languages = [
    'ara', 'bul', 'ces', 'deu', 'ell', 'eng', 'fra', 'hbs', 'hin', 'hun', 'kor', 'msa', 'mya', 'nld', 'pol', 'por', 'ron', 'slk', 'spa', 'tha',
]

models = ["mistral-123b", "c4ai-104b", "qwen2_5-72b", "llama3_1-70b", "llama3_1-8b", "qwen2_5-7b", "mistral-7b"]
techniques = ["vanilla", "instruct", "fewshot", "cot", "xlt"]

# Read each metrics table once instead of once per (language, model, technique).
# `metric` is the value assigned in an earlier cell.
metric_frames = {
    (model, tech): pd.read_csv(f'./results/metrics/{model}-{tech}.csv', index_col=0)
    for model in models for tech in techniques
}

data = pd.DataFrame([
    {"Language": lang, "Model": model, "Technique": tech, "Performance": float(metric_frames[(model, tech)].at[metric, lang]) * 100}
    for lang in languages for model in models for tech in techniques
])

data["Model"] = data["Model"].replace({
    "mistral-123b": "Mistral Large\n123B",
    "c4ai-104b": "C4AI Command R+\n104B",
    "qwen2_5-72b": "Qwen2.5\n72B",
    "llama3_1-70b": "Llama3.1\n70B",
    "llama3_1-8b": "Llama3.1\n8B",
    "qwen2_5-7b": "Qwen2.5\n7B",
    "mistral-7b": "Mistral\n7B"
})

data["Technique"] = data["Technique"].replace({
    "vanilla": "ZS",
    "instruct": "ZS + Task",
    "fewshot": "FW + Task",
    "cot": "CoT",
    "xlt": "XLT"
})

# Category orders fix the row (Technique) and column (Language) order of the
# heatmap; the language order has to match the separators in create_heatmap.
data["Model"] = pd.Categorical(data["Model"], categories=["Mistral Large\n123B", "C4AI Command R+\n104B", "Qwen2.5\n72B", "Qwen2.5\n7B", "Llama3.1\n70B", "Mistral\n7B", "Llama3.1\n8B"], ordered=True)
data["Technique"] = pd.Categorical(data["Technique"], categories=["ZS + Task", "CoT", "XLT", "FW + Task", "ZS"], ordered=True)
data["Language"] = pd.Categorical(data["Language"], categories=[
    'fra', 'por', 'ron', 'spa', 'deu', 'eng', 'nld', 'bul', 'ces', 'hbs', 'pol', 'slk', 'ell', 'hin', 'ara', 'hun', 'kor', 'msa', 'mya', 'tha', 
], ordered=True)

# Mean over models. observed=False spells out the behaviour the implicit
# default gave for categorical keys and avoids the pandas FutureWarning.
average_performance = data.groupby(["Technique", 'Language'], observed=False)["Performance"].mean().reset_index()

create_heatmap(average_performance, figsize=(10, 2.5), save_as="language_analysis_techniques.pdf")

In [None]:
def create_horizontal_bar_charts_for_languages(
    data, 
    save_as=None,
    n_cols=5,
):
    """Plot, per language, a horizontal bar chart ranking every model/technique pair.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format frame with the columns "Language", "Model", "Technique"
        and "Performance". It is not modified.
    save_as : str or None
        If given, the figure is saved to this path.
    n_cols : int
        Number of subplot columns; rows are added as needed.

    Each subplot lists "<Model> - <Technique>" bars sorted by ascending
    performance. Languages appear in order of first occurrence in ``data``.
    """
    unique_languages = data["Language"].unique()
    unique_models = data["Model"].unique()

    n_languages = len(unique_languages)
    n_rows = -(-n_languages // n_cols)  # ceiling division

    # Taller figure when there are five or more models.
    if len(unique_models) < 5:
        figsize = (15, 22)
    else:
        figsize = (15, 25)

    # squeeze=False always returns a 2-D array, so flatten() also works for a 1x1 grid.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize, constrained_layout=True, squeeze=False)
    axes = axes.flatten()

    for idx, language in enumerate(unique_languages):
        ax = axes[idx]

        # .copy() so the helper column is added to an independent frame and
        # not to a slice of the caller's data.
        lang_data = data[data["Language"] == language].copy()
        lang_data["Model_Technique"] = lang_data["Model"].astype(str) + " - " + lang_data["Technique"].astype(str)
        lang_data = lang_data.sort_values(by="Performance", ascending=True)

        ax.barh(lang_data["Model_Technique"], lang_data["Performance"], color='skyblue')

        ax.set_title(language, fontsize=12)
        ax.set_xlabel("Performance", fontsize=10)
        ax.tick_params(axis="y", labelsize=8)
        ax.tick_params(axis="x", labelsize=8)
        ax.grid(axis="x", linestyle="--", alpha=0.7)

        if idx % n_cols == 0:
            ax.set_ylabel("Model - Technique", fontsize=10)

    # Hide unused subplots
    for ax in axes[n_languages:]:
        ax.axis("off")

    if save_as:
        plt.savefig(save_as, bbox_inches="tight")
    plt.show()

In [None]:
# Monolingual subsets to plot, in alphabetical order.
languages = sorted([
    'spa', 'eng', 'por', 'fra', 'msa', 'deu', 'ara', 'tha', 'hbs', 'kor',
    'pol', 'slk', 'nld', 'ron', 'ell', 'ces', 'bul', 'hun', 'hin', 'mya',
])
models = ["mistral-123b", "c4ai-104b", "qwen2_5-72b", "llama3_1-70b"]
techniques = ["vanilla", "instruct", "fewshot", "cot", "xlt"]

# Read each metrics table once instead of once per (language, model, technique).
# `metric` is the value assigned in an earlier cell.
metric_frames = {
    (model, tech): pd.read_csv(f'./results/metrics/{model}-{tech}.csv', index_col=0)
    for model in models for tech in techniques
}

data = pd.DataFrame([
    {"Language": lang, "Model": model, "Technique": tech, "Performance": float(metric_frames[(model, tech)].at[metric, lang]) * 100}
    for lang in languages for model in models for tech in techniques
])

# Display names for the models
data["Model"] = data["Model"].replace({
    "mistral-123b": "Mistral Large 123B",
    "c4ai-104b": "C4AI Command R+ 104B",
    "qwen2_5-72b": "Qwen2.5 72B Instruct",
    "llama3_1-70b": "Llama3.1 70B Instruct"
})

# Display names for the techniques
data["Technique"] = data["Technique"].replace({
    "vanilla": "ZS",
    "instruct": "ZS + Task",
    "fewshot": "FW + Task",
    "cot": "CoT",
    "xlt": "XLT"
})

create_horizontal_bar_charts_for_languages(
    data, 
    n_cols=4
)