## Imports

In [1]:
import os
import pandas as pd
import sklearn.model_selection as sk
import gc
import transformers
import torch
import csv
import re
import subprocess
import datasets 
from IPython.display import FileLink
import time

## Carga de datos y division del data set

In [2]:
# Check if the zip file is present and has been unzipped
if not os.path.exists("cefr-levelled-english-texts.zip"):
    # Download the dataset if the zip file is not present
    !kaggle datasets download -d amontgomerie/cefr-levelled-english-texts

if not os.path.exists("cefr_leveled_texts.csv"):  # Adjust this to match the folder name after unzipping
    # Unzip the file if the unzipped folder does not exist
    !unzip cefr-levelled-english-texts.zip

In [3]:
# Load the CSV file into a DataFrame
df = pd.read_csv('cefr_leveled_texts.csv')

# Get the minimum number of samples in any class
min_samples = df['label'].value_counts().min()

# Downsample each class to have the same number of samples as the smallest class
df_balanced = df.groupby('label').apply(lambda x: x.sample(n=min_samples, random_state=60)).reset_index(drop=True)

distribution = df_balanced['label'].value_counts()
train, div = sk.train_test_split(df_balanced, test_size=0.2, random_state=70)
dev, holdout = sk.train_test_split(div, test_size=0.5, random_state=50)
#holdout, dev = sk.train_test_split(div, test_size=0.08, random_state=50)

train = train.reset_index(drop=True)
div = div.reset_index(drop=True)
holdout = holdout.reset_index(drop=True)

  df_balanced = df.groupby('label').apply(lambda x: x.sample(n=min_samples, random_state=60)).reset_index(drop=True)


In [4]:
textos_metricas = pd.concat([dev, train])
batch_texts = textos_metricas['text']

## Prompts

In [5]:
sin_lecto = "you are an English teacher and I want you to adapt the following text according to the CEFR classes. I want you to adapt the following text to "
con_lecto = "you are an English teacher and I want you to adapt  the following text for reading comprehension according to the CEFR classes.  I want you to adapt the following text to "
correccion = "I detect that you have a bias to classify everything as B2, correct it."
aclaracion = "I want you to give me just the text."
A1 = "A1 class. "
A2 = "A2 class. "
B1 = "B1 class. "
B2 = "B2 class. "
C1 = "C1 class. "	
C2 = "C2 class. "

In [6]:
prompt_0shot_sin_lecto_A1 = sin_lecto + A1 + aclaracion
prompt_0shot_sin_lecto_A2 = sin_lecto + A2 + aclaracion
prompt_0shot_sin_lecto_B1 = sin_lecto + B1 + aclaracion
prompt_0shot_sin_lecto_B2 = sin_lecto + B2 + aclaracion
prompt_0shot_sin_lecto_C1 = sin_lecto + C1 + aclaracion
prompt_0shot_sin_lecto_C2 = sin_lecto + C2 + aclaracion

prompt_0shot_con_lecto_A1 = con_lecto + A1 + aclaracion
prompt_0shot_con_lecto_A2 = con_lecto + A2 + aclaracion
prompt_0shot_con_lecto_B1 = con_lecto + B1 + aclaracion
prompt_0shot_con_lecto_B2 = con_lecto + B2 + aclaracion
prompt_0shot_con_lecto_C1 = con_lecto + C1 + aclaracion
prompt_0shot_con_lecto_C2 = con_lecto + C2 + aclaracion

## Definicion de los experimentos

In [7]:
experimento_0 = prompt_0shot_sin_lecto_A1
experimento_1 = prompt_0shot_sin_lecto_A2
experimento_2 = prompt_0shot_sin_lecto_B1
experimento_3 = prompt_0shot_sin_lecto_B2
experimento_4 = prompt_0shot_sin_lecto_C1
experimento_5 = prompt_0shot_sin_lecto_C2

experimento_6 = prompt_0shot_con_lecto_A1
experimento_7 = prompt_0shot_con_lecto_A2
experimento_8 = prompt_0shot_con_lecto_B1
experimento_9 = prompt_0shot_con_lecto_B2
experimento_10 = prompt_0shot_con_lecto_C1
experimento_11 = prompt_0shot_con_lecto_C2

In [8]:
#experimentos = [experimento_0]
#experimentos = [experimento_1, experimento_2, experimento_3]
#experimentos = [experimento_4, experimento_5]
#experimentos = [experimento_6, experimento_7, experimento_8]
#experimentos = [experimento_9]
experimentos = [experimento_9, experimento_10, experimento_11]

## Carga del modelo

In [9]:
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
device = "cuda"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device=device,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

## Ejecucion del experimento

In [10]:
gc.collect() 
#the max token in the texts is 2600
max_new_tokens = 3000
j = 9
start_time = time.time()
for experimento in experimentos: 
    system_message = experimento 
    filename =f'experimento_adaptacion_{j}.csv' 
    adaptation_list = []

    for text in batch_texts:
        messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": text}
        ]

        #print (messages)
        # Get the model's classification output for the text
        output = pipeline(messages, max_new_tokens=max_new_tokens)
        
        # Extract the predicted CEFR level (classification) from the generated text
        adaptation = output[0]["generated_text"][2]['content']
        
        # Append the predicted classification to the list
        adaptation_list.append(adaptation)
        
        torch.cuda.empty_cache()
        del output  # Delete the output after each step to free memory
        gc.collect()


    with open(filename, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Adaptacion'])
        for adaptation in adaptation_list:
            writer.writerow([adaptation])

    j = j + 1
end_time = time.time()
elapsed_time_seconds = end_time - start_time
elapsed_time_minutes = elapsed_time_seconds / 60
print(f"Time taken: {elapsed_time_minutes:.4f} minutes")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:Non

Time taken: 800.6161 minutes
