# Create Figures for Paper

Ce notebook permet de créer les figures et les tableaux LaTeX de l'article à partir des résultats d'évaluation.

In [1]:
# ---------------------------- PREPARING NOTEBOOK ---------------------------- #
# Autoreload: load the IPython extension and enable mode 2
%load_ext autoreload
%autoreload 2

# Random seed: fix NumPy's global random state
import numpy as np
np.random.seed(42)

# External modules
import os
from IPython.display import display, Markdown, Latex, clear_output
# NOTE: this binds the name `tqdm` to the `tqdm.notebook` module (not to the
# progress-bar class), so a bar has to be created with `tqdm.tqdm(...)`
from tqdm import notebook as tqdm

# Set global log level
import logging
logging.basicConfig(level=logging.INFO)
# NOTE(review): presumably read by the Hugging Face `tokenizers` library;
# nothing in the visible cells uses it directly -- confirm it is still needed
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Define PWD as the current git repository: locate the repository by searching
# the parent directories of the current directory, then make its working tree
# the current directory so that every path below can be built from `pwd`
import git
repo = git.Repo('.', search_parent_directories=True)
pwd = repo.working_dir
os.chdir(pwd)

## Loading Data

### OpenAI Evaluation

In [2]:
import pandas as pd

# Classification metrics of the OpenAI models, read from
# results/difficulty_estimation/OpenAiEvaluation/metrics.csv
openai_evaluation_dir = os.path.join(
    pwd, "results", "difficulty_estimation", "OpenAiEvaluation"
)
path = os.path.join(openai_evaluation_dir, "metrics.csv")
openai_classification_metrics = pd.read_csv(path)
openai_classification_metrics

Unnamed: 0,dataset,context,model,accuracy,f1 (macro),f1 (micro),precision (macro),precision (micro),recall (macro),recall (micro)
0,sentences,CECRL,gpt-3.5-turbo-1106,0.897917,0.89697,0.897917,0.897494,0.897917,0.897917,0.897917
1,sentences,empty,gpt-3.5-turbo-1106,0.866667,0.864868,0.866667,0.866231,0.866667,0.866667,0.866667
2,sentences,CECRL,davinci-002,0.814583,0.812253,0.814583,0.811908,0.814583,0.814583,0.814583
3,ljl,empty,gpt-3.5-turbo-1106,0.733656,0.74584,0.733656,0.74934,0.733656,0.746278,0.733656
4,ljl,CECRL,gpt-3.5-turbo-1106,0.723971,0.735612,0.723971,0.756418,0.723971,0.7255,0.723971
5,sentences,empty,davinci-002,0.825,0.61909,0.825,0.622441,0.825,0.61875,0.825
6,sentences,empty,babbage-002,0.8125,0.609588,0.8125,0.612527,0.8125,0.609375,0.8125
7,sentences,CECRL,babbage-002,0.8125,0.608993,0.8125,0.610264,0.8125,0.609375,0.8125
8,french-difficulty,CECRL,gpt-3.5-turbo-1106,0.498958,0.423531,0.498958,0.428613,0.498958,0.427679,0.498958
9,ljl,empty,davinci-002,0.585956,0.337429,0.585956,0.347085,0.585956,0.331162,0.585956


In [3]:
# Per-example predictions, later turned into confusion matrices.
# The two GPT-3.5 files live in the same directory.
gpt_predictions_dir = os.path.join(pwd, "results", "DifficultyEstimationModel")
ljl_best_predictions = pd.read_csv(
    os.path.join(
        gpt_predictions_dir,
        "test_ljl_empty_gpt-3.5-turbo-1106_prepared_for_fine_tuning_predictions.csv",
    )
)
sentences_best_predictions = pd.read_csv(
    os.path.join(
        gpt_predictions_dir,
        "test_sentences_CECRL_gpt-3.5-turbo-1106_prepared_for_fine_tuning_predictions.csv",
    )
)
# Predictions for the french-difficulty dataset come from another directory
french_difficulty_best_predictions = pd.read_csv(
    os.path.join(pwd, "results", "OpenSourceModelsEvaluation", "french_difficulty.csv")
)

# Show the three tables, in loading order
display(
    ljl_best_predictions,
    sentences_best_predictions,
    french_difficulty_best_predictions,
)

Unnamed: 0,context,user,assistant,predictions
0,,Le soleil se lève sur l' État indien du Mizora...,level4,level4
1,,"— Non , nous ne pouvons pas ! Mani sentit les ...",level4,level4
2,,Manu essaye de se rappeler les nombreuses autr...,level4,level4
3,,Elles se réfugiaient alors dans un endroit ina...,level4,level4
4,,""" Certains sons sont dangereux , nous devons d...",level3,level3
...,...,...,...,...
408,,"Prête pour aller à l' école ? Aujourd'hui , c'...",level2,level2
409,,"« Na adore ces livres , se dit - il . C' est s...",level2,level2
410,,au creux des nuages l’ espoir un été passé san...,level3,level3
411,,Peut - être qu' une petite sieste ne lui ferai...,level4,level4


Unnamed: 0,context,user,assistant,predictions
0,Vous êtes un évaluateur linguistique utilisant...,Il y a des cyniques dans ce genre qui dînent à...,A1,C1
1,Vous êtes un évaluateur linguistique utilisant...,Ce cas me paraît être l'un des plus propres à ...,C2,C2
2,Vous êtes un évaluateur linguistique utilisant...,Et les experts de sonner l' alarme: «Il faut s...,C1,C1
3,Vous êtes un évaluateur linguistique utilisant...,Fonça dans la rue principale en direction de l...,B2,B2
4,Vous êtes un évaluateur linguistique utilisant...,"Retirez-leur leur carte de visite, et ils perd...",C1,C1
...,...,...,...,...
475,Vous êtes un évaluateur linguistique utilisant...,Cet outil – impossible à fabriquer avec des mo...,A2,A2
476,Vous êtes un évaluateur linguistique utilisant...,"Ayant dit, le gendarme s'inclina, son regard f...",B2,B2
477,Vous êtes un évaluateur linguistique utilisant...,"Bref, nous jouissons de la vie comme jamais!» ...",C1,C1
478,Vous êtes un évaluateur linguistique utilisant...,"Car, en instituant l'opération avec exactitude...",C2,C2


Unnamed: 0.1,Unnamed: 0,sentence,difficulty,predictions
0,0,"Dani, est-ce que tu t'es inscrite à ton concou...",A1,A2
1,1,Elle passe aussi à la boulagerie et elle achèt...,A1,A1
2,2,J'ai un frère et une soeur.,A1,A1
3,3,Vous rentrez à quelle heure ?,A1,A1
4,4,Qu'en pensez-vous ?,A1,A1
...,...,...,...,...
955,955,"Je ne savais pas que, bien plus tristement que...",C2,C2
956,956,"Comme Monsieur, un informaticien fondateur de ...",C2,C1
957,957,"Alors qu'au cours du Paléolithique, la diffusi...",C2,C2
958,958,"Ça vous débagoule des raisons, des accusations...",C2,C1


### Open-source Models Evaluation

In [4]:
# Classification metrics stored in OpenSourceModelsEvaluation/bert_metrics.csv
open_source_metrics_dir = os.path.join(
    pwd, "results", "difficulty_estimation", "OpenSourceModelsEvaluation"
)
path = os.path.join(open_source_metrics_dir, "bert_metrics.csv")
bert_classification_metrics = pd.read_csv(path)
bert_classification_metrics

Unnamed: 0.1,Unnamed: 0,accuracy,f1_macro,f1_micro,precision_macro,precision_micro,recall_macro,recall_micro
0,sentences,0.822917,0.821201,0.822917,0.826776,0.822917,0.822917,0.822917
1,ljl,0.624697,0.631121,0.624697,0.626671,0.624697,0.640783,0.624697
2,french_difficulty,0.522917,0.51258,0.522917,0.528225,0.522917,0.522917,0.522917


In [5]:
# Classification metrics stored in OpenSourceModelsEvaluation/mistral_metrics.csv
open_source_metrics_dir = os.path.join(
    pwd, "results", "difficulty_estimation", "OpenSourceModelsEvaluation"
)
path = os.path.join(open_source_metrics_dir, "mistral_metrics.csv")
mistral_classification_metrics = pd.read_csv(path)
mistral_classification_metrics

Unnamed: 0,dataset,context,accuracy,f1_micro,precision_micro,recall_micro
0,sentences,CECRL,0.7479,0.7479,0.7479,0.7479
1,ljl,CECRL,0.6368,0.6368,0.6368,0.6368
2,sentences,no-context,0.6312,0.6312,0.6312,0.6312
3,french-difficulty,CECRL,0.5125,0.5125,0.5125,0.5125
4,ljl,no-context,0.4722,0.4722,0.4722,0.4722
5,french-difficulty,no-context,0.3542,0.3542,0.3542,0.3542


### Pairwise Mismatched Evaluation

In [6]:
# Classification metrics of the readability indices, stored in
# PairwiseMismatch/readability_index_classification_metrics.csv
pairwise_mismatch_dir = os.path.join(
    pwd, "results", "difficulty_estimation", "PairwiseMismatch"
)
path = os.path.join(
    pairwise_mismatch_dir, "readability_index_classification_metrics.csv"
)
readability_index_classification_metrics = pd.read_csv(path)
readability_index_classification_metrics

Unnamed: 0,dataset,model,accuracy,f1_micro,precision_micro,recall_micro
0,ljl,gfi,0.4504,0.4504,0.4504,0.4504
1,ljl,fkgl,0.4213,0.4213,0.4213,0.4213
2,ljl,ari,0.3995,0.3995,0.3995,0.3995
3,french-difficulty,fkgl,0.3458,0.3458,0.3458,0.3458
4,sentences,ari,0.3438,0.3438,0.3438,0.3438
5,french-difficulty,gfi,0.3417,0.3417,0.3417,0.3417
6,french-difficulty,ari,0.3417,0.3417,0.3417,0.3417
7,sentences,fkgl,0.3354,0.3354,0.3354,0.3354
8,sentences,gfi,0.3229,0.3229,0.3229,0.3229


In [7]:
# Pairwise mismatch of the readability indices, stored in
# PairwiseMismatch/readability_index_pairwise_mismatch.csv
pairwise_mismatch_dir = os.path.join(
    pwd, "results", "difficulty_estimation", "PairwiseMismatch"
)
path = os.path.join(pairwise_mismatch_dir, "readability_index_pairwise_mismatch.csv")
readability_index_pairwise_mismatch = pd.read_csv(path)
readability_index_pairwise_mismatch

Unnamed: 0,dataset,model,pairwise_mismatch
0,ljl,gfi,44.0969
1,ljl,ari,47.0944
2,ljl,fkgl,54.5375
3,sentences,ari,88.2375
4,sentences,fkgl,99.3167
5,sentences,gfi,101.9583
6,french-difficulty,ari,111.5646
7,french-difficulty,fkgl,112.3771
8,french-difficulty,gfi,112.6271


In [8]:
# Pairwise mismatch stored in PairwiseMismatch/bert_pairwise_mismatch.csv
pairwise_mismatch_dir = os.path.join(
    pwd, "results", "difficulty_estimation", "PairwiseMismatch"
)
path = os.path.join(pairwise_mismatch_dir, "bert_pairwise_mismatch.csv")
bert_pairwise_mismatch = pd.read_csv(path)
bert_pairwise_mismatch

Unnamed: 0.1,Unnamed: 0,Pairwise mismatch
0,ljl,13.9516
1,sentences,22.2125
2,french_difficulty,36.9812


In [9]:
# Pairwise mismatch stored in PairwiseMismatch/openai_pairwise_mismatch.csv
pairwise_mismatch_dir = os.path.join(
    pwd, "results", "difficulty_estimation", "PairwiseMismatch"
)
path = os.path.join(pairwise_mismatch_dir, "openai_pairwise_mismatch.csv")
openai_pairwise_mismatch = pd.read_csv(path)
openai_pairwise_mismatch

Unnamed: 0,dataset,context,model,pairwise_mismatch
0,ljl,empty,gpt-3.5-turbo-1106,9.2736
1,ljl,CECRL,gpt-3.5-turbo-1106,11.0605
2,sentences,CECRL,gpt-3.5-turbo-1106,13.1125
3,sentences,empty,gpt-3.5-turbo-1106,18.1083
4,ljl,empty,babbage-002,18.3341
5,ljl,CECRL,davinci-002,18.6925
6,sentences,empty,davinci-002,20.9042
7,ljl,empty,davinci-002,21.0896
8,sentences,empty,babbage-002,21.2625
9,ljl,CECRL,babbage-002,23.7482


In [10]:
# Pairwise mismatch stored in PairwiseMismatch/mistral_pairwise_mismatch.csv
pairwise_mismatch_dir = os.path.join(
    pwd, "results", "difficulty_estimation", "PairwiseMismatch"
)
path = os.path.join(pairwise_mismatch_dir, "mistral_pairwise_mismatch.csv")
mistral_pairwise_mismatch = pd.read_csv(path)
mistral_pairwise_mismatch

Unnamed: 0.1,Unnamed: 0,Pairwise mismatch
0,ljl_CECRL,13.6223
1,ljl_no-context,29.9855
2,french_difficulty_no-context,39.3771
3,sentences_CECRL,39.7625
4,french_difficulty_CECRL,48.4146
5,sentences_no-context,61.7542


## Figures Creation

In [11]:
# Gather copies of every loaded table, so the formatting below never modifies
# the frames displayed earlier. Key order matters: each classification table
# must come before the pairwise-mismatch table of the same model family.
metrics = dict(
    readability_index_classification_metrics=readability_index_classification_metrics.copy(),
    bert_classification_metrics=bert_classification_metrics.copy(),
    mistral_classification_metrics=mistral_classification_metrics.copy(),
    openai_classification_metrics=openai_classification_metrics.copy(),
    readability_index_pairwise_mismatch=readability_index_pairwise_mismatch.copy(),
    bert_pairwise_mismatch=bert_pairwise_mismatch.copy(),
    mistral_pairwise_mismatch=mistral_pairwise_mismatch.copy(),
    openai_pairwise_mismatch=openai_pairwise_mismatch.copy(),
)

# Give every classification table the same column names.
# Columns are renamed by POSITION, so each list below has to follow the column
# order of the corresponding table.
MICRO_METRIC_COLUMNS = ["accuracy", "f1 (micro)", "precision (micro)", "recall (micro)"]

## Readability index (no context column in the file: mark it as "empty")
readability_cls = metrics["readability_index_classification_metrics"]
readability_cls.columns = ["dataset", "model", *MICRO_METRIC_COLUMNS]
metrics["readability_index_classification_metrics"] = readability_cls.assign(
    context="empty"
)

## CamemBERT (macro metrics are dropped, model and context are constants)
bert_cls = metrics["bert_classification_metrics"].drop(
    columns=["precision_macro", "recall_macro", "f1_macro"]
)
bert_cls.columns = ["dataset", *MICRO_METRIC_COLUMNS]
metrics["bert_classification_metrics"] = bert_cls.assign(
    model="CamemBERT", context="empty"
)

## Mistral (model name is a constant)
mistral_cls = metrics["mistral_classification_metrics"]
mistral_cls.columns = ["dataset", "context", *MICRO_METRIC_COLUMNS]
metrics["mistral_classification_metrics"] = mistral_cls.assign(model="Mistral-7B")

## OpenAI (macro metrics are dropped)
openai_cls = metrics["openai_classification_metrics"].drop(
    columns=["f1 (macro)", "precision (macro)", "recall (macro)"]
)
openai_cls.columns = ["dataset", "context", "model", *MICRO_METRIC_COLUMNS]
metrics["openai_classification_metrics"] = openai_cls

# Give every pairwise-mismatch table the same column names
# (renaming is positional, as for the classification tables)
## Readability index
readability_pm = metrics["readability_index_pairwise_mismatch"].assign(
    context="empty"
)
readability_pm.columns = ["dataset", "model", "pairwise mismatch", "context"]
metrics["readability_index_pairwise_mismatch"] = readability_pm

## CamemBERT
bert_pm = metrics["bert_pairwise_mismatch"]
bert_pm.columns = ["dataset", "pairwise mismatch"]
metrics["bert_pairwise_mismatch"] = bert_pm.assign(
    model="CamemBERT", context="empty"
)

## Mistral: the first column holds "<dataset>_<context>" identifiers.
## "french_difficulty" is rewritten with a hyphen first, so that the only
## underscore left is the one separating dataset and context.
mistral_pm = metrics["mistral_pairwise_mismatch"]
run_parts = [
    run_name.replace("french_difficulty", "french-difficulty").split("_")
    for run_name in mistral_pm.iloc[:, 0]
]
mistral_pm = mistral_pm.drop(columns=[mistral_pm.columns[0]]).assign(
    dataset=[parts[0] for parts in run_parts],
    context=[parts[1] for parts in run_parts],
)
mistral_pm.columns = ["pairwise mismatch", "dataset", "context"]
metrics["mistral_pairwise_mismatch"] = mistral_pm.assign(model="Mistral-7B")

## OpenAI
metrics["openai_pairwise_mismatch"].columns = [
    "dataset",
    "context",
    "model",
    "pairwise mismatch",
]

# Join, for each model family, its classification metrics with its pairwise
# mismatch. The family name is the key without its last two "_"-separated
# words (e.g. "bert_classification_metrics" -> "bert").
merged_metrics = {}
for key, frame in metrics.items():
    family = "_".join(key.split("_")[:-2])
    already_merged = merged_metrics.get(family)
    if already_merged is None:
        # First table seen for this family
        merged_metrics[family] = frame.copy()
        continue
    # Default (inner) merge: only (dataset, model, context) rows present in
    # both tables are kept
    merged_metrics[family] = already_merged.merge(
        frame, on=["dataset", "model", "context"]
    )

# Stack all families into a single table
concatenated_metrics = pd.concat(list(merged_metrics.values()), ignore_index=True)

# Harmonise labels that differ between result files:
# "no-context" -> "empty" and "french_difficulty" -> "french-difficulty"
concatenated_metrics["context"] = concatenated_metrics["context"].replace(
    "no-context", "empty"
)
concatenated_metrics["dataset"] = concatenated_metrics["dataset"].replace(
    "french_difficulty", "french-difficulty"
)

# Reorder columns
concatenated_metrics = concatenated_metrics[
    [
        "model",
        "context",
        "dataset",
        "pairwise mismatch",
        "accuracy",
        "f1 (micro)",
        "precision (micro)",
        "recall (micro)",
    ]
]

# Remove "babbage-002" model.
# .copy() detaches the filtered frame from its parent, so that the column
# assignments below act on an independent frame instead of raising pandas'
# SettingWithCopyWarning.
concatenated_metrics = concatenated_metrics[
    concatenated_metrics["model"] != "babbage-002"
].copy()

# Capitalize model names
concatenated_metrics["model"] = concatenated_metrics["model"].apply(
    lambda x: x.replace("gpt", "GPT")
    .replace("bert", "BERT")
    .replace("mistral", "Mistral")
    .replace("davinci", "Davinci")
    .replace("gfi", "GFI")
    .replace("ari", "ARI")
    .replace("fkgl", "FKGL")
)

# Round all metrics to 2 decimals
concatenated_metrics = concatenated_metrics.round(2)

# Sort by pairwise mismatch (ascending)
concatenated_metrics = concatenated_metrics.sort_values(by=["pairwise mismatch"])

# Replace context by the symbol used in the LaTeX tables.
# Raw string: "\c" is not a valid escape sequence, so the non-raw literal
# "\checkmark" emits a DeprecationWarning/SyntaxWarning on recent Pythons.
concatenated_metrics["context"] = concatenated_metrics["context"].replace(
    {"empty": "-", "CECRL": r"\checkmark"}
)

# Rename datasets (whole-value matches, so the order of the entries is irrelevant)
concatenated_metrics["dataset"] = concatenated_metrics["dataset"].replace(
    {
        "french-difficulty": "SentencesBooks",
        "ljl": "LjL",
        "sentences": "SentencesInternet",
    }
)

# One table per dataset, indexed by (model, context); datasets appear in the
# order in which they are first met in concatenated_metrics
datasets_metrics = {
    dataset_label: (
        concatenated_metrics.loc[concatenated_metrics["dataset"] == dataset_label]
        .drop(columns=["dataset"])
        .set_index(["model", "context"])
    )
    for dataset_label in concatenated_metrics["dataset"].unique()
}


# Show every table in full (no row/column truncation)
for dataset, metric in datasets_metrics.items():
    display(Markdown(f"### {dataset}"))
    with pd.option_context("display.max_rows", None, "display.max_columns", None):
        display(metric)

### LjL

Unnamed: 0_level_0,Unnamed: 1_level_0,pairwise mismatch,accuracy,f1 (micro),precision (micro),recall (micro)
model,context,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GPT-3.5-turbo-1106,-,9.27,0.73,0.73,0.73,0.73
GPT-3.5-turbo-1106,\checkmark,11.06,0.72,0.72,0.72,0.72
Mistral-7B,\checkmark,13.62,0.64,0.64,0.64,0.64
CamemBERT,-,13.95,0.62,0.62,0.62,0.62
Davinci-002,\checkmark,18.69,0.61,0.61,0.61,0.61
Davinci-002,-,21.09,0.59,0.59,0.59,0.59
Mistral-7B,-,29.99,0.47,0.47,0.47,0.47
GFI,-,44.1,0.45,0.45,0.45,0.45
ARI,-,47.09,0.4,0.4,0.4,0.4
FKGL,-,54.54,0.42,0.42,0.42,0.42


### SentencesInternet

Unnamed: 0_level_0,Unnamed: 1_level_0,pairwise mismatch,accuracy,f1 (micro),precision (micro),recall (micro)
model,context,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GPT-3.5-turbo-1106,\checkmark,13.11,0.9,0.9,0.9,0.9
GPT-3.5-turbo-1106,-,18.11,0.87,0.87,0.87,0.87
Davinci-002,-,20.9,0.82,0.82,0.82,0.82
CamemBERT,-,22.21,0.82,0.82,0.82,0.82
Davinci-002,\checkmark,23.82,0.81,0.81,0.81,0.81
Mistral-7B,\checkmark,39.76,0.75,0.75,0.75,0.75
Mistral-7B,-,61.75,0.63,0.63,0.63,0.63
ARI,-,88.24,0.34,0.34,0.34,0.34
FKGL,-,99.32,0.34,0.34,0.34,0.34
GFI,-,101.96,0.32,0.32,0.32,0.32


### SentencesBooks

Unnamed: 0_level_0,Unnamed: 1_level_0,pairwise mismatch,accuracy,f1 (micro),precision (micro),recall (micro)
model,context,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CamemBERT,-,36.98,0.52,0.52,0.52,0.52
Mistral-7B,-,39.38,0.35,0.35,0.35,0.35
Mistral-7B,\checkmark,48.41,0.51,0.51,0.51,0.51
GPT-3.5-turbo-1106,\checkmark,51.98,0.5,0.5,0.5,0.5
GPT-3.5-turbo-1106,-,61.12,0.49,0.49,0.49,0.49
Davinci-002,\checkmark,66.44,0.47,0.47,0.47,0.47
Davinci-002,-,76.52,0.47,0.47,0.47,0.47
ARI,-,111.56,0.34,0.34,0.34,0.34
FKGL,-,112.38,0.35,0.35,0.35,0.35
GFI,-,112.63,0.34,0.34,0.34,0.34


In [20]:
# datasets_metrics maps each dataset name to its metrics table: keep the
# accuracy of each one and place them side by side, one column per dataset
datasets_metrics_df = pd.concat(
    {name: table["accuracy"] for name, table in datasets_metrics.items()}, axis=1
)

# Order the rows by mean rank across datasets (rank 1 = highest accuracy);
# the helper column is only needed for sorting
mean_rank = datasets_metrics_df.rank(axis=0, ascending=False).mean(axis=1)
datasets_metrics_df = (
    datasets_metrics_df.assign(Ranking=mean_rank)
    .sort_values(by="Ranking")
    .drop(columns="Ranking")
)
datasets_metrics_df

Unnamed: 0_level_0,Unnamed: 1_level_0,LjL,SentencesInternet,SentencesBooks
model,context,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GPT-3.5-turbo-1106,\checkmark,0.72,0.9,0.5
GPT-3.5-turbo-1106,-,0.73,0.87,0.49
CamemBERT,-,0.62,0.82,0.52
Mistral-7B,\checkmark,0.64,0.75,0.51
Davinci-002,-,0.59,0.82,0.47
Davinci-002,\checkmark,0.61,0.81,0.47
Mistral-7B,-,0.47,0.63,0.35
FKGL,-,0.42,0.34,0.35
GFI,-,0.45,0.32,0.34
ARI,-,0.4,0.34,0.34


In [13]:
# For each dataset: the predictions table and the name of the column that
# holds the reference label in that table
prediction_sources = {
    "LjL": (ljl_best_predictions, "assistant"),
    "SentencesInternet": (sentences_best_predictions, "assistant"),
    "SentencesBooks": (french_difficulty_best_predictions, "difficulty"),
}

# Keep only reference and predicted labels, under common column names
predictions = {
    dataset_label: frame[[label_column, "predictions"]].rename(
        columns={label_column: "y_true", "predictions": "y_pred"}
    )
    for dataset_label, (frame, label_column) in prediction_sources.items()
}

## Exporting as Latex

### Tables

In [14]:
# Mute pandas warning
import warnings
import seaborn as sns

# Global filter: every FutureWarning (from any library, not only pandas) is
# ignored from this point on, including in the cells executed afterwards
warnings.simplefilter(action="ignore", category=FutureWarning)


# Bold best results
def highlight_best(x):
    """Build the CSS table that highlights the best value of each metric.

    Written for ``Styler.apply(highlight_best, axis=None)``, which passes the
    whole table and expects a frame of CSS strings of the same shape.

    Parameters
    ----------
    x : pandas.DataFrame
        Metrics table. It must contain the columns "accuracy", "f1 (micro)",
        "precision (micro)" and "recall (micro)"; other columns are left
        unstyled.

    Returns
    -------
    pandas.DataFrame
        Frame with the index and columns of ``x``. Cells holding the maximum
        of one of the four metrics contain the highlight CSS, every other
        cell contains an empty string. Rows tied for the maximum are all
        highlighted.

    Raises
    ------
    KeyError
        If one of the four metric columns is missing from ``x``.
    """
    best_style = "font-weight: bold; color: #FF9999;"

    # Start from a table without any styling
    df = pd.DataFrame("", index=x.index, columns=x.columns)

    for metric in ["accuracy", "f1 (micro)", "precision (micro)", "recall (micro)"]:
        # Single .loc assignment with a positional boolean mask. The previous
        # chained form (df[metric].loc[...] += ...) writes to an intermediate
        # object and has no effect when pandas Copy-on-Write is enabled.
        is_best = (x[metric] == x[metric].max()).to_numpy()
        df.loc[is_best, metric] = best_style

    return df


# Build, display, save and print one LaTeX table per dataset
for dataset_name, df in datasets_metrics.items():
    # Remove pairwise mismatch
    df = df.drop(columns=["pairwise mismatch"])
    # Sort by accuracy
    df = df.sort_values(by=["accuracy"], ascending=False)

    # Green gradient on the values, best value per metric highlighted,
    # bold row and column headers, 2 decimals
    styled_df = (
        df.style.background_gradient(cmap=sns.light_palette("green", as_cmap=True))
        .apply(highlight_best, axis=None)
        .applymap_index(lambda v: "font-weight: bold;", axis="columns")
        .applymap_index(lambda v: "font-weight: bold;", axis="rows")
        .format(decimal=".", thousands=",", precision=2)
    )
    display(styled_df)

    latex = styled_df.to_latex(
        caption=f"Metrics for the {dataset_name} dataset",
        clines="skip-last;data",
        convert_css=True,
        position_float="centering",
        multicol_align="|c|",
        hrules=True,
    )

    # Wrap the tabular in \begin{adjustbox}{center} ... \end{adjustbox} and
    # add the table label right after the tabular
    latex = latex.replace(
        "\\begin{tabular}", "\\begin{adjustbox}{center}\n\\begin{tabular}"
    ).replace(
        "\\end{tabular}",
        f"\\end{{tabular}}\n\\label{{tab:{dataset_name}_metrics}}\n\\end{{adjustbox}}",
    )

    # Force position of table
    latex = latex.replace("\\begin{table}", "\\begin{table}[!h]")

    # Save the table: the target path was previously computed (and its
    # directory created) but the LaTeX source was never written to it
    path = os.path.join(pwd, "figures", "difficulty_estimation", f"{dataset_name}.tex")
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w", encoding="utf-8") as file:
        file.write(latex)

    print(latex)

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,f1 (micro),precision (micro),recall (micro)
model,context,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GPT-3.5-turbo-1106,-,0.73,0.73,0.73,0.73
GPT-3.5-turbo-1106,\checkmark,0.72,0.72,0.72,0.72
Mistral-7B,\checkmark,0.64,0.64,0.64,0.64
CamemBERT,-,0.62,0.62,0.62,0.62
Davinci-002,\checkmark,0.61,0.61,0.61,0.61
Davinci-002,-,0.59,0.59,0.59,0.59
Mistral-7B,-,0.47,0.47,0.47,0.47
GFI,-,0.45,0.45,0.45,0.45
FKGL,-,0.42,0.42,0.42,0.42
ARI,-,0.4,0.4,0.4,0.4


\begin{table}[!h]
\centering
\caption{Metrics for the LjL dataset}
\begin{adjustbox}{center}
\begin{tabular}{llrrrr}
\toprule
 &  & \bfseries accuracy & \bfseries f1 (micro) & \bfseries precision (micro) & \bfseries recall (micro) \\
model & context &  &  &  &  \\
\midrule
\multirow[c]{2}{*}{\bfseries GPT-3.5-turbo-1106} & \bfseries - & {\cellcolor[HTML]{008000}} \color[HTML]{F1F1F1} \bfseries \color[HTML]{FF9999} 0.73 & {\cellcolor[HTML]{008000}} \color[HTML]{F1F1F1} \bfseries \color[HTML]{FF9999} 0.73 & {\cellcolor[HTML]{008000}} \color[HTML]{F1F1F1} \bfseries \color[HTML]{FF9999} 0.73 & {\cellcolor[HTML]{008000}} \color[HTML]{F1F1F1} \bfseries \color[HTML]{FF9999} 0.73 \\
\bfseries  & \bfseries \checkmark & {\cellcolor[HTML]{068306}} \color[HTML]{F1F1F1} 0.72 & {\cellcolor[HTML]{068306}} \color[HTML]{F1F1F1} 0.72 & {\cellcolor[HTML]{068306}} \color[HTML]{F1F1F1} 0.72 & {\cellcolor[HTML]{068306}} \color[HTML]{F1F1F1} 0.72 \\
\cline{1-6}
\bfseries Mistral-7B & \bfseries \checkmark & {

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,f1 (micro),precision (micro),recall (micro)
model,context,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GPT-3.5-turbo-1106,\checkmark,0.9,0.9,0.9,0.9
GPT-3.5-turbo-1106,-,0.87,0.87,0.87,0.87
Davinci-002,-,0.82,0.82,0.82,0.82
CamemBERT,-,0.82,0.82,0.82,0.82
Davinci-002,\checkmark,0.81,0.81,0.81,0.81
Mistral-7B,\checkmark,0.75,0.75,0.75,0.75
Mistral-7B,-,0.63,0.63,0.63,0.63
ARI,-,0.34,0.34,0.34,0.34
FKGL,-,0.34,0.34,0.34,0.34
GFI,-,0.32,0.32,0.32,0.32


\begin{table}[!h]
\centering
\caption{Metrics for the SentencesInternet dataset}
\begin{adjustbox}{center}
\begin{tabular}{llrrrr}
\toprule
 &  & \bfseries accuracy & \bfseries f1 (micro) & \bfseries precision (micro) & \bfseries recall (micro) \\
model & context &  &  &  &  \\
\midrule
\multirow[c]{2}{*}{\bfseries GPT-3.5-turbo-1106} & \bfseries \checkmark & {\cellcolor[HTML]{008000}} \color[HTML]{F1F1F1} \bfseries \color[HTML]{FF9999} 0.90 & {\cellcolor[HTML]{008000}} \color[HTML]{F1F1F1} \bfseries \color[HTML]{FF9999} 0.90 & {\cellcolor[HTML]{008000}} \color[HTML]{F1F1F1} \bfseries \color[HTML]{FF9999} 0.90 & {\cellcolor[HTML]{008000}} \color[HTML]{F1F1F1} \bfseries \color[HTML]{FF9999} 0.90 \\
\bfseries  & \bfseries - & {\cellcolor[HTML]{0C860C}} \color[HTML]{F1F1F1} 0.87 & {\cellcolor[HTML]{0C860C}} \color[HTML]{F1F1F1} 0.87 & {\cellcolor[HTML]{0C860C}} \color[HTML]{F1F1F1} 0.87 & {\cellcolor[HTML]{0C860C}} \color[HTML]{F1F1F1} 0.87 \\
\cline{1-6}
\bfseries Davinci-002 & \bfseries

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,f1 (micro),precision (micro),recall (micro)
model,context,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CamemBERT,-,0.52,0.52,0.52,0.52
Mistral-7B,\checkmark,0.51,0.51,0.51,0.51
GPT-3.5-turbo-1106,\checkmark,0.5,0.5,0.5,0.5
GPT-3.5-turbo-1106,-,0.49,0.49,0.49,0.49
Davinci-002,\checkmark,0.47,0.47,0.47,0.47
Davinci-002,-,0.47,0.47,0.47,0.47
Mistral-7B,-,0.35,0.35,0.35,0.35
FKGL,-,0.35,0.35,0.35,0.35
ARI,-,0.34,0.34,0.34,0.34
GFI,-,0.34,0.34,0.34,0.34


\begin{table}[!h]
\centering
\caption{Metrics for the SentencesBooks dataset}
\begin{adjustbox}{center}
\begin{tabular}{llrrrr}
\toprule
 &  & \bfseries accuracy & \bfseries f1 (micro) & \bfseries precision (micro) & \bfseries recall (micro) \\
model & context &  &  &  &  \\
\midrule
\bfseries CamemBERT & \bfseries - & {\cellcolor[HTML]{008000}} \color[HTML]{F1F1F1} \bfseries \color[HTML]{FF9999} 0.52 & {\cellcolor[HTML]{008000}} \color[HTML]{F1F1F1} \bfseries \color[HTML]{FF9999} 0.52 & {\cellcolor[HTML]{008000}} \color[HTML]{F1F1F1} \bfseries \color[HTML]{FF9999} 0.52 & {\cellcolor[HTML]{008000}} \color[HTML]{F1F1F1} \bfseries \color[HTML]{FF9999} 0.52 \\
\cline{1-6}
\bfseries Mistral-7B & \bfseries \checkmark & {\cellcolor[HTML]{0D860D}} \color[HTML]{F1F1F1} 0.51 & {\cellcolor[HTML]{0D860D}} \color[HTML]{F1F1F1} 0.51 & {\cellcolor[HTML]{0D860D}} \color[HTML]{F1F1F1} 0.51 & {\cellcolor[HTML]{0D860D}} \color[HTML]{F1F1F1} 0.51 \\
\cline{1-6}
\multirow[c]{2}{*}{\bfseries GPT-3.5-turbo-

In [21]:
# Mute pandas warning
import warnings
import seaborn as sns

# Global filter: every FutureWarning (from any library, not only pandas) is
# ignored from this point on, including in the cells executed afterwards
warnings.simplefilter(action="ignore", category=FutureWarning)


# Bold best results
def highlight_best(x):
    """Build the CSS table that highlights the best value of every column.

    Written for ``Styler.apply(highlight_best, axis=None)``, which passes the
    whole table and expects a frame of CSS strings of the same shape.

    Parameters
    ----------
    x : pandas.DataFrame
        Table of scores; every column is treated as a metric where higher is
        better.

    Returns
    -------
    pandas.DataFrame
        Frame with the index and columns of ``x``. Cells holding the maximum
        of their column contain the highlight CSS, every other cell contains
        an empty string. Rows tied for the maximum are all highlighted.
    """
    best_style = "font-weight: bold; color: #FF9999;"

    # Start from a table without any styling
    df = pd.DataFrame("", index=x.index, columns=x.columns)

    for metric in df.columns:
        # Single .loc assignment with a positional boolean mask. The previous
        # chained form (df[metric].loc[...] += ...) writes to an intermediate
        # object and has no effect when pandas Copy-on-Write is enabled.
        is_best = (x[metric] == x[metric].max()).to_numpy()
        df.loc[is_best, metric] = best_style

    return df

df = datasets_metrics_df.copy()

# Styled dataframe: green gradient on the values, best value per dataset
# highlighted, bold row and column headers, 2 decimals
styled_df = (
    df.style.background_gradient(cmap=sns.light_palette("green", as_cmap=True))
    .apply(highlight_best, axis=None)
    .applymap_index(lambda v: "font-weight: bold;", axis="columns")
    .applymap_index(lambda v: "font-weight: bold;", axis="rows")
    .format(decimal=".", thousands=",", precision=2)
)
display(styled_df)

latex = styled_df.to_latex(
    caption="Difficulty estimation metrics for all datasets",
    clines="skip-last;data",
    convert_css=True,
    position_float="centering",
    multicol_align="|c|",
    hrules=True,
)

# Wrap the tabular in \begin{adjustbox}{center} ... \end{adjustbox} and add
# the table label right after the tabular
latex = latex.replace(
    "\\begin{tabular}", "\\begin{adjustbox}{center}\n\\begin{tabular}"
).replace(
    "\\end{tabular}",
    "\\end{tabular}\n\\label{tab:difficulty_estimation_metrics}\n\\end{adjustbox}",
)

# Force position of table
latex = latex.replace("\\begin{table}", "\\begin{table}[!h]")

# Save the table: the target path was previously computed (and its directory
# created) but the LaTeX source was never written to it
path = os.path.join(
    pwd, "figures", "difficulty_estimation", "difficulty_estimation_metrics.tex"
)
os.makedirs(os.path.dirname(path), exist_ok=True)
with open(path, "w", encoding="utf-8") as file:
    file.write(latex)

print(latex)

Unnamed: 0_level_0,Unnamed: 1_level_0,LjL,SentencesInternet,SentencesBooks
model,context,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GPT-3.5-turbo-1106,\checkmark,0.72,0.9,0.5
GPT-3.5-turbo-1106,-,0.73,0.87,0.49
CamemBERT,-,0.62,0.82,0.52
Mistral-7B,\checkmark,0.64,0.75,0.51
Davinci-002,-,0.59,0.82,0.47
Davinci-002,\checkmark,0.61,0.81,0.47
Mistral-7B,-,0.47,0.63,0.35
FKGL,-,0.42,0.34,0.35
GFI,-,0.45,0.32,0.34
ARI,-,0.4,0.34,0.34


\begin{table}[!h]
\centering
\caption{Difficulty estimation metrics for all datasets}
\begin{adjustbox}{center}
\begin{tabular}{llrrr}
\toprule
 &  & \bfseries LjL & \bfseries SentencesInternet & \bfseries SentencesBooks \\
model & context &  &  &  \\
\midrule
\multirow[c]{2}{*}{\bfseries GPT-3.5-turbo-1106} & \bfseries \checkmark & {\cellcolor[HTML]{068306}} \color[HTML]{F1F1F1} 0.72 & {\cellcolor[HTML]{008000}} \color[HTML]{F1F1F1} \bfseries \color[HTML]{FF9999} 0.90 & {\cellcolor[HTML]{1A8D1A}} \color[HTML]{F1F1F1} 0.50 \\
\bfseries  & \bfseries - & {\cellcolor[HTML]{008000}} \color[HTML]{F1F1F1} \bfseries \color[HTML]{FF9999} 0.73 & {\cellcolor[HTML]{0C860C}} \color[HTML]{F1F1F1} 0.87 & {\cellcolor[HTML]{279327}} \color[HTML]{F1F1F1} 0.49 \\
\cline{1-5}
\bfseries CamemBERT & \bfseries - & {\cellcolor[HTML]{4EA64E}} \color[HTML]{F1F1F1} 0.62 & {\cellcolor[HTML]{209020}} \color[HTML]{F1F1F1} 0.82 & {\cellcolor[HTML]{008000}} \color[HTML]{F1F1F1} \bfseries \color[HTML]{FF9999} 0.52 \\

### Figures

In [14]:
from matplotlib import pyplot as plt
import matplotlib

# Render through the PGF backend so the figure can be \input in LaTeX
matplotlib.use("pgf")
matplotlib.rcParams.update(
    {
        "pgf.texsystem": "pdflatex",
        "font.family": "serif",
        "text.usetex": True,
        "pgf.rcfonts": False,
        "font.size": 10,
    }
)

# Bar plot of accuracy one bar per model and context on each dataset
df = concatenated_metrics.copy()
# Create model + context column ("<model>" or "<model> + CECRL").
# Raw string: "\c" is not a valid escape sequence in a normal string literal.
df["model + context"] = df["model"] + df["context"].replace(
    {"-": "", r"\checkmark": " + CECRL"}
)

# Create the figure directly at its final size (width 8in, golden-ratio
# height) so that tight_layout() below computes margins for the size that is
# actually saved, instead of for a default-sized figure resized afterwards
fig, ax = plt.subplots(figsize=(8, 8 / 1.618))
barplot = sns.barplot(
    x="dataset",
    y="accuracy",
    hue="model + context",
    data=df,
    palette=sns.color_palette(
        [
            "#6baed6",
            "#3182bd",
            "#e6550d",
            "yellow",
            "#756bb1",
            "#9e9ac8",
            "#fd8d3c",
            "#31a354",
            "#74c476",
            "#a1d99b",
        ]
    ),
    ci=None,
    ax=ax,
)

# Write the accuracy, as a percentage, just above each bar
for p in barplot.patches:
    if p.get_height() > 0.0:
        barplot.annotate(
            # round() rather than int(): int() truncates, so a height such as
            # 0.58 (0.58 * 100 == 57.99999999999999) would be shown as 57%
            "{}%".format(int(round(p.get_height() * 100))),
            (p.get_x() + p.get_width() / 2.0, p.get_height()),
            ha="center",
            va="center",
            xytext=(0, 5),
            textcoords="offset points",
            fontsize=5,
        )

ax.legend(loc="upper right", fontsize=7)
ax.set_xlabel("Dataset", fontsize=12)
ax.set_ylabel("Accuracy", fontsize=12)
sns.despine(ax=ax)  # Remove the top and right spines
fig.tight_layout()
# Reduce font size
# NOTE(review): this runs after the texts of the figure have been created and
# also changes the global default for later cells -- confirm it is intended
plt.rcParams.update({"font.size": 6})

# Export to latex
path = os.path.join(pwd, "figures", "difficulty_estimation", "accuracy.pgf")
os.makedirs(os.path.dirname(path), exist_ok=True)
fig.savefig(path)

# Display latex (explicit encoding: do not depend on the platform default)
with open(path, "r", encoding="utf-8") as file:
    latex = file.read()
print(latex)

%% Creator: Matplotlib, PGF backend
%%
%% To include the figure in your LaTeX document, write
%%   \input{<filename>.pgf}
%%
%% Make sure the required packages are loaded in your preamble
%%   \usepackage{pgf}
%%
%% Also ensure that all the required font packages are loaded; for instance,
%% the lmodern package is sometimes necessary when using math font.
%%   \usepackage{lmodern}
%%
%% Figures using additional raster images can only be included by \input if
%% they are in the same directory as the main LaTeX file. For loading figures
%% from other directories you can use the `import` package
%%   \usepackage{import}
%%
%% and then include the figures with
%%   \import{<path to file>}{<filename>.pgf}
%%
%% Matplotlib used the following preamble
%%   \def\mathdefault#1{#1}
%%   \everymath=\expandafter{\the\everymath\displaystyle}
%%   
%%   \makeatletter\@ifpackageloaded{underscore}{}{\usepackage[strings]{underscore}}\makeatother
%%
\begingroup%
\makeatletter%
\begin{pgfpicture}%
\pgfpa

In [15]:
# ----------------------------- CONFUSION MATRIX ----------------------------- #
from sklearn.metrics import confusion_matrix
from matplotlib import pyplot as plt
import matplotlib
import seaborn as sns

# Render through the PGF backend so the figure can be \input in LaTeX
matplotlib.use("pgf")
matplotlib.rcParams.update(
    {
        "pgf.texsystem": "pdflatex",
        "font.family": "serif",
        "font.size": 12,
        "text.usetex": True,
        "pgf.rcfonts": False,
    }
)

FONTSIZE = 8

# Title suffix and tick labels of each panel; any dataset not listed here is
# drawn with the camembert-base title and the CEFR levels
CEFR_TICK_LABELS = ["A1", "A2", "B1", "B2", "C1", "C2"]
PANEL_SETTINGS = {
    "LjL": (
        "gpt-3.5-turbo-1106 without context",
        ["Level 1", "Level 2", "Level 3", "Level 4"],
    ),
    "SentencesInternet": ("gpt-3.5-turbo-1106 with context", CEFR_TICK_LABELS),
}
DEFAULT_PANEL_SETTINGS = ("camembert-base", CEFR_TICK_LABELS)

# Create confusion matrix for each dataset
# (rows are true labels, columns predicted labels, classes in sorted order)
confusion_matrices = {
    dataset_name: confusion_matrix(df["y_true"], df["y_pred"])
    for dataset_name, df in predictions.items()
}

# Plot confusion matrix
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
# Delete last subplot
fig.delaxes(axes[1, 1])

for i, (dataset_name, cm) in enumerate(confusion_matrices.items()):
    row, col = divmod(i, 2)
    ax = axes[row, col]
    model_description, tick_labels = PANEL_SETTINGS.get(
        dataset_name, DEFAULT_PANEL_SETTINGS
    )

    # The classes of the matrix are inferred from the data: fail with an
    # explicit message if they do not match the hard-coded tick labels
    # (e.g. an unexpected predicted label, or a class absent from the data)
    if len(cm) != len(tick_labels):
        raise ValueError(
            f"Confusion matrix of {dataset_name} has {len(cm)} classes "
            f"but {len(tick_labels)} tick labels are defined"
        )

    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        ax=ax,
        cmap=sns.light_palette("green", as_cmap=True),
        cbar=False,
        annot_kws={"fontsize": 12},
    )

    # Title, tick labels and axis labels (True VS Predicted), all set after
    # the heatmap has been drawn
    ax.set_title(f"{dataset_name} ({model_description})", fontsize=FONTSIZE + 4)
    ax.set_xticklabels(tick_labels, rotation=0, fontsize=FONTSIZE)
    ax.set_yticklabels(tick_labels, rotation=0, fontsize=FONTSIZE)
    ax.set_xlabel("Predicted label", fontsize=FONTSIZE + 2)
    ax.set_ylabel("True label", fontsize=FONTSIZE + 2)

    # NOTE(review): these limits extend the axes half a cell beyond the
    # heatmap (top and left); looks like the workaround for heatmap rows
    # being cut off by some matplotlib versions -- confirm it is still needed
    ax.set_ylim(len(cm), -0.5)
    ax.set_xlim(-0.5, len(cm))
    ax.set_aspect("equal")

# Make graph more compact
plt.tight_layout()

# Export to latex
path = os.path.join(pwd, "figures", "difficulty_estimation", "confusion_matrix.pgf")
os.makedirs(os.path.dirname(path), exist_ok=True)
plt.savefig(path)

# Display latex (explicit encoding: do not depend on the platform default)
with open(path, "r", encoding="utf-8") as file:
    latex = file.read()
print(latex)

%% Creator: Matplotlib, PGF backend
%%
%% To include the figure in your LaTeX document, write
%%   \input{<filename>.pgf}
%%
%% Make sure the required packages are loaded in your preamble
%%   \usepackage{pgf}
%%
%% Also ensure that all the required font packages are loaded; for instance,
%% the lmodern package is sometimes necessary when using math font.
%%   \usepackage{lmodern}
%%
%% Figures using additional raster images can only be included by \input if
%% they are in the same directory as the main LaTeX file. For loading figures
%% from other directories you can use the `import` package
%%   \usepackage{import}
%%
%% and then include the figures with
%%   \import{<path to file>}{<filename>.pgf}
%%
%% Matplotlib used the following preamble
%%   \def\mathdefault#1{#1}
%%   \everymath=\expandafter{\the\everymath\displaystyle}
%%   
%%   \makeatletter\@ifpackageloaded{underscore}{}{\usepackage[strings]{underscore}}\makeatother
%%
\begingroup%
\makeatletter%
\begin{pgfpicture}%
\pgfpa