# Mistral Evaluation

Thanks to our two previous notebooks, we now have a Mistral-7B model that is theoretically capable of simplifying a sentence in French. It's time to make sure that this model works properly and to try and evaluate its performance.

In [1]:
# ---------------------------- PREPARING NOTEBOOK ---------------------------- #
# Autoreload
%load_ext autoreload
%autoreload 2

# Random seed
import numpy as np
np.random.seed(42)

# External modules
import os
from IPython.display import display

# Set global log level
import logging
logging.basicConfig(level=logging.INFO)
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Define PWD as the current git repository
import git
repo = git.Repo('.', search_parent_directories=True)
pwd = repo.working_dir
os.chdir(pwd)

# import

In [2]:
# -------------------------- LOAD PREVIOUS NOTEBOOKS ------------------------- #
import json
import __main__
import black

paths = [
    os.path.join(pwd, "notebooks", "text_simplification", "a_DatasetCreation.ipynb"),
    os.path.join(pwd, "notebooks", "text_simplification", "b_FineTuningMistral.ipynb"),
]
# Read notebooks
code_dict = {}
for path in paths:
    code = ""
    with open(path, "r") as f:
        temp = json.load(f)

    cells = [
        cell
        for cell in temp["cells"]
        if cell["cell_type"] == "code"
        and len(cell["source"]) > 0
        and cell["source"][-1] == "# import"
    ]
    notebook_code = "\n".join(
        line
        for cell in cells
        for line in cell["source"]
        if line != "# import" and len(line) > 0 and line[0] != "%"
    )
    # Create something like a header
    code += f"# {'-'*76} #\n"
    code += f"# {os.path.basename(path).upper():^76} #\n"
    code += f"# {'-'*76} #\n"
    code += notebook_code

    # Add "Module Creation"
    notebook_name = (
        os.path.basename(path).replace("imported_", "").replace(".ipynb", "")
    )
    code += """
# --------------------------------- IMPORTER --------------------------------- #
import types


class MyNotebook:
    pass


NOTEBOOK_NAME = MyNotebook()
# Put every function defined in the notebook in the class
NOTEBOOK_NAME.__dict__.update(
    {
        name: obj
        for name, obj in locals().items()
        if isinstance(obj, (type, types.FunctionType))
        if not (name.startswith("_") or name == "MyNotebook")
    }
)
    """.replace(
        "NOTEBOOK_NAME", notebook_name
    )

    # Remove empty lines
    code = "\n".join([line for line in code.split("\n") if len(line) > 0])
    # Format code
    code = black.format_str(code, mode=black.FileMode())

    # Write scrach file
    path = os.path.join(
        pwd, "scratch", f"imported_{os.path.basename(path).replace('ipynb', 'py')}"
    )
    if not os.path.exists(os.path.dirname(path)):
        os.makedirs(os.path.dirname(path))
    with open(path, "w") as f:
        f.write(code)
    code_dict[path] = code


# Mainify code
for path, code in code_dict.items():
    compiled = compile(code, path, "exec")
    exec(compiled, __main__.__dict__)

# import

## Chargement des données Zeeguu

In [3]:
# --------------------------- DOWNLOAD ZEEGUU DATA --------------------------- #
import pandas as pd


def download_difficulty_estimation(pwd: str = None):
    csv_path = a_DatasetCreation.ft_download_data("Data", pwd)
    csv_dict = {}
    for path in csv_path:
        csv_dict[path.split("/")[-1].split(".")[0]] = pd.read_csv(
            path,
            sep=",",
            names=["Sentence", "Difficulty"],
            header=None,
        ).iloc[1:]

    return csv_dict


# import

In [4]:
# ------------ KEEP ONLY 5 SENTENCE OF EACH LEVEL OF EACH DATASET ------------ #
import pandas as pd


def get_balanced_dataframe(csv_dict: dict, nbr: int = 5):
    if nbr is None:
        nbr = float("inf")

    # Estimate number of sentences
    df_grouped_by_difficulty = [
        df.groupby("Difficulty")
        for key, df in csv_dict.items()
        if "ljl" not in key and "test" not in key
    ]
    to_sample = min(
        nbr, min([int(df.count().min().iloc[0]) for df in df_grouped_by_difficulty])
    )

    # Concatenate all dataframes
    result = pd.concat(
        [df_grouped.sample(to_sample) for df_grouped in df_grouped_by_difficulty]
    )
    # Remove A1
    result = result[result["Difficulty"] != "A1"]
    return result.sort_values(by=["Difficulty"]).reset_index(drop=True)


# import

In [5]:
test_df = get_balanced_dataframe(download_difficulty_estimation(pwd), nbr=100)
test_df.value_counts("Difficulty")

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

Difficulty
A2    200
B1    200
B2    200
C1    200
C2    200
Name: count, dtype: int64

### Prepare for the evaluation

Now that we have our dataframe, let's see if we can use the functions defined in the previous notebooks to obtain a test dataset.

In [6]:
# Charger tokenizer
tokenizer = a_DatasetCreation.download_tokenizer()

# Create dataset
dataset = a_DatasetCreation.format_data(test_df, tokenizer, training=False)

# Encode dataset
encoded_dataset = a_DatasetCreation.encode_dataset(dataset, tokenizer)

INFO:root:Create conversation...
INFO:root:Create dataset...
INFO:root:Format dataset...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

INFO:root:Determine max length...


  0%|          | 0/1000 [00:00<?, ?it/s]

INFO:root:Encode dataset...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

INFO:root:Create labels...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

INFO:root:Create dataset ready for training...


In [7]:
# Decode dataset
display(
    pd.Series(
        encoded_dataset.map(
            lambda e: {"decoded": tokenizer.decode(e["input_ids"])},
            remove_columns=["input_ids", "attention_mask", "labels"],
        )["decoded"]
    )
    .apply(lambda x: x[:40])
    .value_counts()
)
display(
    pd.Series(
        encoded_dataset.map(
            lambda e: {"decoded": tokenizer.decode(e["input_ids"])},
            remove_columns=["input_ids", "attention_mask", "labels"],
        )["decoded"]
    )
    .apply(lambda x: x[-40:])
    .value_counts()
)

pd.Series(
    encoded_dataset.map(
        lambda e: {"size": len(e["input_ids"])},
        remove_columns=["input_ids", "attention_mask", "labels"],
    )["size"]
).astype(int).describe()

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

</s></s></s></s></s></s></s></s></s></s>     999
<s><s> [INST] <<SYS>>\nVous êtes un modèl      1
Name: count, dtype: int64

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

au maximum son sens original [/INST]</s>    1000
Name: count, dtype: int64

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

count    1000.0
mean      604.0
std         0.0
min       604.0
25%       604.0
50%       604.0
75%       604.0
max       604.0
dtype: float64

## Prepare Inference

We have a test set with 60 French sentences at various levels. We also have a pre-trained Mistral-7B model. We will use this model to generate predictions on these sentences and compare them with the original sentences.

### Define the evaluation function

We define the function to be executed on the server. It will :
1. Load the model
2. Load the data
3. Encode data
4. Make predictions

In [8]:
# ---------------------------- EVALUATION FUNCTION --------------------------- #
import os

import pandas as pd
import torch
from peft import PeftModel
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM
from tqdm import tqdm as console_tqdm

MODEL = "bofenghuang/vigostral-7b-chat"

def evaluate_mistral(
    pwd: str = "/scratch/hjamet",
    zero_shot: bool = False,
):
    # Fix partial import bug
    import ray.train.huggingface
    import ray.train.huggingface.transformers

    # Load data
    df = download_difficulty_estimation(pwd)
    test_df = get_balanced_dataframe(df, nbr=100)

    # Charger tokenizer
    tokenizer = a_DatasetCreation.download_tokenizer(training=False)

    # Create dataset
    dataset = a_DatasetCreation.format_data(test_df, tokenizer, training=False)

    # Encode dataset
    encoded_dataset = a_DatasetCreation.encode_dataset(dataset, tokenizer)

    # Load model
    path = os.path.join(
        pwd,
        "models",
        "difficulty_estimation",
        MODEL.replace("/", "_"),
    )
    if zero_shot:
        model = b_FineTuningMistral.load_model(MODEL)
    else:
        base_model = AutoModelForCausalLM.from_pretrained(
            os.path.join(path, "mistral_simplification_trained"),
            device_map="auto",
            use_cache=False,
            trust_remote_code=True,
        )
        model = PeftModel.from_pretrained(
            base_model, os.path.join(path, "mistral_simplification_trained")
        )

    # Move everything to GPU
    model.to("cuda")
    test_loader = DataLoader(encoded_dataset, batch_size=16)

    # Generate predictions
    with torch.no_grad():
        model.eval()
        predictions_ids = []

        for batch in console_tqdm(test_loader):
            input_ids_batch = batch["input_ids"].to("cuda")
            attention_mask_batch = batch["attention_mask"].to("cuda")

            outputs = model.generate(
                input_ids=input_ids_batch,
                attention_mask=attention_mask_batch,
                max_length=max(128, input_ids_batch.shape[1] * 2),
                num_return_sequences=1,
            )

            predictions_ids.extend(outputs)
        predictions = [
            tokenizer.decode(prediction, skip_special_tokens=True)
            for prediction in predictions_ids
        ]
        predictions_series = pd.Series(predictions)

    return predictions_series

### Creation of the Slurmray launcher

We're now going to define the launcher that will allow us to run our code on the server using Slurmray.

In [9]:
# ------------------------------- RAY LAUNCHER ------------------------------- #
from slurmray.RayLauncher import RayLauncher

def compute_predictions(
    zero_shot: bool = False,
):
    # Define launcher
    launcher = RayLauncher(
        project_name="mistral_sentence_simplification",
        func=evaluate_mistral,
        args={
            "zero_shot": zero_shot,
        },
        modules=[],
        node_nbr=1,
        use_gpu=True,
        memory=128,
        max_running_time=60,
        server_run=True,
        server_ssh="curnagl.dcsr.unil.ch",
        server_username="hjamet",
    )
    
    # Compute predictions
    predictions = launcher()
    
    # Return predictions
    return predictions

## Computing Fine-tuned predictions

In [10]:
# --------------------------- CALCULATE PREDICTIONS -------------------------- #
# Calculate predictions
predictions = compute_predictions(zero_shot=False)

Serializing function and arguments...
Connecting to the cluster...


INFO:paramiko.transport:Connected (version 2.0, client OpenSSH_8.0)
INFO:paramiko.transport:Authentication (password) successful!
INFO:paramiko.transport.sftp:[chan 0] Opened sftp connection (server version 3)


Writing slurmray server script...
Downloading server...
Running server...
Installing slurmray server
Writing python script...
Writing slurm script...
No serialization done.
Cluster detected, running on cluster...
Canceling old jobs...
Start to submit job!
Job submitted! Script file is at: </users/hjamet/slurmray-server/.slogs/server/sbatch.sh>. Log file is at: </users/hjamet/slurmray-server/.slogs/server/server_1402-15h18.log>
Start to monitor the queue... You can check the queue at: </users/hjamet/slurmray-server/.slogs/server/server_1402-15h18_queue.log>
Submitted batch job 38687119
IP Head: 10.203.101.82:6379
STARTING HEAD at dnagpu002
2024-02-14 15:18:43,473	INFO usage_lib.py:449 -- Usage stats collection is enabled by default without user confirmation because this terminal is detected to be non-interactive. To disable this, add `--disable-usage-stats` to the command that starts the cluster, or run the following command: `ray disable-usage-stats` before starting the cluster. See ht

In [12]:
# Save original predictions
path = os.path.join(pwd, "results", "text_simplification", "MistralEvaluation")
if not os.path.exists(path):
    os.makedirs(path)
predictions.to_csv(os.path.join(path, "predictions_fine_tuned.csv"), index=False)

# Save formatted predictions
predictions_df = pd.concat(
    [
        predictions.str.extract(r"\"\"\"(.*)\"\"\"")
        .iloc[:, 0]
        .rename("Original")
        .str.strip(),
        predictions.str.extract(r"\[/INST\] (.*[\.\n])")
        .iloc[:, 0]
        .rename("Simplified")
        .str.strip(),
    ],
    axis=1,
)
predictions_df.to_csv(os.path.join(path, "predictions_fine_tuned_formatted.csv"), index=False)
predictions_df

Unnamed: 0,Original,Simplified
0,La classe de Mme Gaudé a fait une pièce de thé...,La classe de Mme Gaudé a joué une pièce en ang...
1,Finalement elles s’égorgèrent l’une l’autre et...,"Finalement, elles se tuèrent l'une l'autre et ..."
2,"D’abord il ne vit rien, mais il finit par déco...","D'abord, il ne voyait rien, mais il a trouvé u..."
3,Aurait-il des visiteurs ?,Y aurait-il des gens qui viennent ici ?
4,"Robinson connaissait cette enfant, il en était...","Robinson avait déjà vu cette enfant, il le sav..."
...,...,...
995,"Selon une récente enquête réalisée par Ipsos, ...",Une étude Ipsos montre que 84% des gens pensen...
996,Et les mèches de ses cheveux roux crespelés pa...,"Les cheveux roux de cette personne, ondulés pa..."
997,"Un néant à l'égard de l'infini, un tout à l'ég...","Pascal dit que l'homme est entre deux infinis,..."
998,Elle n'avait pas assez d'yeux pour contempler ...,Elle était tellement impressionnée par tout ce...


## Computing Zero-shot predictions

In [10]:
predictions = compute_predictions(zero_shot=True)

Serializing function and arguments...
Connecting to the cluster...


INFO:paramiko.transport:Connected (version 2.0, client OpenSSH_8.0)
INFO:paramiko.transport:Authentication (password) successful!
INFO:paramiko.transport.sftp:[chan 0] Opened sftp connection (server version 3)


Writing slurmray server script...
Downloading server...
Running server...
Installing slurmray server
Writing python script...
Writing slurm script...
No serialization done.
Cluster detected, running on cluster...
Canceling old jobs...
Start to submit job!
Job submitted! Script file is at: </users/hjamet/slurmray-server/.slogs/server/sbatch.sh>. Log file is at: </users/hjamet/slurmray-server/.slogs/server/server_1502-10h34.log>
Start to monitor the queue... You can check the queue at: </users/hjamet/slurmray-server/.slogs/server/server_1502-10h34_queue.log>
Submitted batch job 38780508
IP Head: 10.203.101.86:6379
STARTING HEAD at dnagpu006
2024-02-15 10:35:53,071	INFO usage_lib.py:449 -- Usage stats collection is enabled by default without user confirmation because this terminal is detected to be non-interactive. To disable this, add `--disable-usage-stats` to the command that starts the cluster, or run the following command: `ray disable-usage-stats` before starting the cluster. See ht

In [11]:
# Save original predictions
path = os.path.join(pwd, "results", "text_simplification", "MistralEvaluation")
if not os.path.exists(path):
    os.makedirs(path)
predictions.to_csv(os.path.join(path, "predictions_zero_shot.csv"), index=False)

# Save formatted predictions
predictions_df = pd.concat(
    [
        predictions.str.extract(r"\"\"\"(.*)\"\"\"")
        .iloc[:, 0]
        .rename("Original")
        .str.strip(),
        predictions.str.extract(r"\[/INST\] (.*[\.\n])")
        .iloc[:, 0]
        .rename("Simplified")
        .str.strip(),
    ],
    axis=1,
)
predictions_df.to_csv(os.path.join(path, "predictions_zero_shot_formatted.csv"), index=False)
predictions_df

Unnamed: 0,Original,Simplified
0,Je ne sais pas ; pas beaucoup peut-être ; pas ...,Je ne sais pas ; peut-être pas beaucoup ; en t...
1,Il fit ainsi par deux fois le tour de l’épave.,Il a donc fait deux fois le tour de l'épave.
2,Comme il avait froid !,
3,Il referma son tonnelet à tabac et se laissa a...,Il a fermé sa boîte à tabac et est allé paress...
4,En somme sa situation était loin d’être désesp...,"En résumé, sa situation n'était pas très mauva..."
...,...,...
995,"Les langues sont un système de signes, constit...","Les langues sont des systèmes de signes, compo..."
996,Nous sommes tous des planches lithographiques ...,Voici une phrase simplifiée au niveau C1 :
997,"Seules, les grandes familles font usage du nom...",Seules les grandes familles utilisent le nom d...
998,Ils venaient se délasser dans les beaux-arts d...,Ils allaient se détendre dans les beaux-arts p...
