In [None]:
from datasets import *

# CSV file holding the exercises to load.
data_path = 'exos_to_csv.csv'
# Read the file with the CSV builder; the rows are accessed below through the 'train' split.
dataset = load_dataset(path='csv', data_files=data_path)

In [None]:
# Split the dataset into 3 sets for train, test and validation (80% / 10% / 10%).
# A fixed seed makes the shuffling deterministic, so the three partitions are
# identical after every kernel restart and results stay comparable across runs.
SPLIT_SEED = 42

# First cut: 80% train, 20% held out.
train_testsplit = dataset['train'].train_test_split(test_size=0.2, seed=SPLIT_SEED)
# Second cut: divide the held-out 20% into two equal halves (test / validation).
test_validsplit = train_testsplit['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)

In [None]:
# Gather the three partitions under their final names. The validation set is
# the 'train' half of the second split; the test set is its 'test' half.
ds = DatasetDict(
    train=train_testsplit['train'],
    test=test_validsplit['test'],
    valid=test_validsplit['train'],
)

In [None]:
# Display the split names, their columns and row counts as a sanity check.
ds

DatasetDict({
    train: Dataset({
        features: ['filename', 'text'],
        num_rows: 1022
    })
    test: Dataset({
        features: ['filename', 'text'],
        num_rows: 128
    })
    valid: Dataset({
        features: ['filename', 'text'],
        num_rows: 128
    })
})

## Load Mistral Model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2"

# Precision and device placement are decided when the weights are loaded.
# Passing `torch_dtype` / `device_map` to `pipeline()` later has no effect on a
# model object that already exists, so they are set here instead. Without them
# the 7B model is loaded with the default dtype and default placement.
# NOTE: `device_map="auto"` relies on the `accelerate` package being installed.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# With left-padding: for a decoder-only model the generated tokens must directly
# follow the prompt, so padding has to go on the left when prompts are batched.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, padding_side="left")



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



In [None]:
# Most LLM tokenizers don't define a pad token by default, so reuse the
# end-of-sequence token for padding (padding is required to batch prompts).
tokenizer.pad_token = tokenizer.eos_token

### Using HuggingFace Pipeline Method for Text Generation Task

In [None]:
from transformers import pipeline


2024-05-21 16:04:06.311193: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-21 16:04:06.311236: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-21 16:04:06.313060: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-21 16:04:06.321756: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Iterate through the dataset
def data_iterator(corpus):
    """Lazily turn each entry of ``corpus["text"]`` into a user chat message.

    Args:
        corpus: Any object whose ``"text"`` item is an iterable of texts.

    Yields:
        dict: ``{"role": "user", "content": <text>}``, one per text, in order.
    """
    # NOTE: this is the single-message shape that could be handed to
    # `tokenizer.apply_chat_template([message], ...)` if the prompt is ever
    # chat-formatted; for now the messages are consumed as plain dicts.
    yield from (
        {"role": "user", "content": text}
        for text in corpus["text"]
    )


### Generation Process

In [None]:
import torch

In [None]:
def _build_prompt(exercise_text):
    """Embed one exercise text in the instruction prompt sent to the model."""
    return f"Prompt: Each text is an exercise extracted from school manual for elementary French student. Analyze the following text from {exercise_text} and summarize the key themes. Based on the information, generate some semantically exercises, in French, not English. They can contain french grammar, vocabulary, comprehension and so on. Remember to keep the same format and do not generate directly answers to these exercises. You can play the role of a teacher. ***Generated text***:"


def get_generated_text(corpus, model, tokenizer, max_new_tokens=50, batch_size=2):
    """Generate one text per entry of ``corpus["text"]`` with a text-generation pipeline.

    Args:
        corpus: Object exposing a ``"text"`` column (e.g. one split of ``ds``).
        model: Model (or model identifier) handed to ``pipeline``.
        tokenizer: Matching tokenizer. Batching pads the prompts, so it must
            have a pad token set.
        max_new_tokens: Upper bound on newly generated tokens per prompt.
            Defaults to 50, the value previously hard-coded.
        batch_size: Number of prompts processed together. Defaults to 2.

    Returns:
        list[str]: For every input text, the ``generated_text`` of the first
        returned sequence, in input order. Empty list for an empty corpus.
    """
    # Initialize the generator.
    # NOTE(review): `torch_dtype` / `device_map` are model-loading options; when
    # `model` is an already instantiated object they presumably have no effect
    # here -- confirm and set them where the model is loaded.
    generator = pipeline(
        task='text-generation',
        model=model,
        tokenizer=tokenizer,
        torch_dtype=torch.bfloat16,
        device_map='auto',
        batch_size=batch_size,
    )

    # Build every prompt up front so the pipeline receives them in one call.
    # Submitting prompts one at a time never forms a batch, which left the
    # configured `batch_size` unused.
    prompts = [_build_prompt(msg['content']) for msg in data_iterator(corpus)]
    if not prompts:
        # Nothing to generate; avoid calling the pipeline with an empty list.
        return []

    outputs = generator(
        prompts,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.5,
        top_k=50,
        top_p=0.95,
    )

    # For a list input the pipeline returns one list of candidates per prompt;
    # keep the first candidate of each, as before.
    return [candidates[0]['generated_text'] for candidates in outputs]


In [None]:
# Run generation over the whole train split; this yields one generated text per row.
consigne_generated = get_generated_text(ds["train"], model=model, tokenizer=tokenizer)

In [None]:
# Print every generated text followed by a 100-dash separator line.
for sample in consigne_generated:
    print(sample, "-" * 100, sep="\n")

Prompt: Each text is an exercise extracted from school manual for elementary French student. Analyze the following text from Recopie ces phrases et conjugue les verbes entre parenthèses au présent.
a. Elle (prêter) sa voiture.
b. Ils (annoncer) une bonne nouvelle.
c. Tu (chercher) un ballon.
d. J’ (admirer) le paysage.
e. Nous (sauter) dans la piscine. and summarize the key themes. Based on the information summarized, generate the similar exercises, in French, not English. They can contain french grammar, vocabulary, comprehension and so on. Remember to keep the same format. 
 Generated text:
a. Il prête sa voiture.
b. Ils annoncent une bonne nouvelle.
c. Tu cherches un ballon.
d. Je admire le paysage.
e. Nous sautons
----------------------------------------------------------------------------------------------------
Prompt: Each text is an exercise extracted from school manual for elementary French student. Analyze the following text from Recopie en choisissant le verbe être ou le ver