## Imports

In [1]:
import os
import pandas as pd
import sklearn.model_selection as sk
import gc
import transformers
import torch
import csv
import re
import subprocess
from IPython.display import FileLink

## Carga de datos y división del dataset

In [2]:
# Fetch the dataset only when it is not already on disk, so re-running the cell is cheap.
if not os.path.exists("cefr-levelled-english-texts.zip"):
    # Archive missing: download it with the Kaggle CLI (IPython shell escape, notebook-only syntax).
    !kaggle datasets download -d amontgomerie/cefr-levelled-english-texts

if not os.path.exists("cefr_leveled_texts.csv"):  # this is the CSV file the loading cell reads
    # CSV missing: extract the downloaded archive.
    !unzip cefr-levelled-english-texts.zip

In [4]:
# Load the CSV file into a DataFrame
df = pd.read_csv('cefr_leveled_texts.csv')

# Size of the smallest class; every class is downsampled to this many rows.
min_samples = df['label'].value_counts().min()

# Downsample each class independently with the same seed. Iterating the groups
# and concatenating picks exactly the rows the previous
# `groupby('label').apply(lambda x: x.sample(...))` picked (same per-group
# frame, same seed, same group order) without relying on `apply` operating on
# the grouping column, which pandas has deprecated.
df_balanced = pd.concat(
    [grupo.sample(n=min_samples, random_state=60) for _, grupo in df.groupby('label')],
    ignore_index=True,
)

# Kept for interactive inspection: confirms every label has `min_samples` rows.
distribution = df_balanced['label'].value_counts()

# 80 % train; the remaining 20 % is halved into dev and holdout (10 % each).
train, div = sk.train_test_split(df_balanced, test_size=0.2, random_state=70)
dev, holdout = sk.train_test_split(div, test_size=0.5, random_state=50)
#holdout, dev = sk.train_test_split(div, test_size=0.08, random_state=50)

# Give every split a clean 0..n-1 index. `dev` was previously left with its
# shuffled labels because `div` (the intermediate frame) was reset in its place;
# `div` is still reset so any later use of it behaves as before.
train = train.reset_index(drop=True)
div = div.reset_index(drop=True)
dev = dev.reset_index(drop=True)
holdout = holdout.reset_index(drop=True)

# Texts that are actually sent to the model: dev rows first, then train rows.
# ignore_index avoids duplicated index labels in the combined frame; row order
# (and therefore the order of the predictions) is unchanged.
textos_metricas = pd.concat([dev, train], ignore_index=True)

  df_balanced = df.groupby('label').apply(lambda x: x.sample(n=min_samples, random_state=60)).reset_index(drop=True)


## Prompts

In [5]:
# Base system prompts. These strings are sent to the model verbatim, so their
# exact wording (including typos) is part of the experiment definition.
# NOTE(review): "responds" (for "respond"), "readibility" (for "readability")
# and "following test" (propmt_4 / propmt_5, presumably "following text") look
# like typos; fixing them would change the prompts, so confirm before editing.

# Teacher persona, no mention of reading comprehension ("sin lecto").
sin_lecto = "you are an English teacher and I want you to classify the following text according to the CEFR classes. I want you to responds only with the class (A1|B1|C1|A2|B2|C2). without introduction."
# Same persona, explicitly framed as a reading-comprehension task ("con lecto").
con_lecto = "you are an English teacher and I want you to classify the following text for reading comprehension according to the CEFR classes. I want you to responds only with the class (A1|B1|C1|A2|B2|C2). without introduction."
# Extra sentence appended to the "*_correccion" prompt variants.
correccion = "I detect that you have a bias to classify everything as B2, correct it."

# Five alternative stand-alone wordings of the task. The "propmt" spelling is
# kept because later cells reference these names.
propmt_1 = "Can you classify the following text according to its Common European Framework of Reference (CEFR) readibility level? I want you to responds only with the class (A1|B1|C1|A2|B2|C2)"
propmt_2 = "I'm a teacher of English who is preparing a reading task. Can you classify the following text regarding its readibility based on the CEFR? I want you to responds only with the class (A1|B1|C1|A2|B2|C2)"
propmt_3 = "I'm a teacher of English as a foreign language. I'm preparing  a reading task for my students and I would like to use the following text. Can you tell me which CEFR level is the following text suitable for? I want you to responds only with the class (A1|B1|C1|A2|B2|C2)"
propmt_4 = "I'm a teacher of English as a foreign language. I'm preparing  a reading task for my students and I would like to use the following text. Can you classify the following test according to its CEFR readibility level? I want you to responds only with the class (A1|B1|C1|A2|B2|C2)"
propmt_5 = "I'm a teacher of English as a foreign language. I'm preparing  a reading task for my students and I would like to use the following text. Can you annotate the following test according to its CEFR readibility level? I want you to responds only with the class (A1|B1|C1|A2|B2|C2)"

# Introductory line placed before each few-shot example; one per CEFR level,
# all generated from the same sentence template.
(
    header_A1,
    header_A2,
    header_B1,
    header_B2,
    header_C1,
    header_C2,
) = (
    f"this is an example of a text for level {nivel}"
    for nivel in ("A1", "A2", "B1", "B2", "C1", "C2")
)

# Few-shot example texts: two per CEFR level (ejemplo_1_* and ejemplo_2_*).
# The 1-shot prompts use only ejemplo_1_*; the 2-shot prompts use both.
# The texts are inserted into the prompts verbatim, so they are kept exactly as
# they are, including formatting quirks.
# NOTE(review): ejemplo_1_C2 contains "awardshonorary" and "finalistsexhibition",
# which look like apostrophes lost upstream — confirm before correcting, since
# editing the text changes the prompt.

# Level A1: short dialogues.
ejemplo_1_A1 = "Let's play in the pool, Helen.\nOkay, Linda. \nIt's fun.\nIt's so hot today.\nNot in the pool.\nI know. I like it here.\nWe can do this every day.\nSure, we can, Linda."
ejemplo_2_A1 = "Hi Lisa. How are you? \nI'm fine. What are you doing?\nI'm reading a book.\nReally? All of it?\nNo. Just this part.\nWho's that?\nIt's Mary. She has a dog.\nLet's read it together."
# Level A2: longer dialogues.
ejemplo_1_A2 = "I have a 89.5%.\nOkay, so what?\nIt's really close to an A.\nYou know I don't round up.\nBut I really need a 4.0 GPA.\nYou get what you deserve.\nI know I deserve an A. I always do great work and participate.\nI can't change it.\nPlease? I'll wash your car.\nAre you bribing me?\nI just want an A.\nI'll give you a C if you don't leave now."
ejemplo_2_A2 = "The parking permit is $200 per semester, right?\nIt's actually $300.\nWhy?! That's ridiculous.\nIt's because of the budget cuts.\nThat's a terrible excuse.\nIt's the truth.\nGive me something better than that.\nMiss, if you don't want to pay, then don't.\nThis is so stressful.\nI advise writing a letter to the dean.\nI bet he'll just throw it away.\nYou would be surprised."
# Level B1: short narratives.
ejemplo_1_B1 = "Bruce picked up the cat. The cat meowed. The cat didn't like most people. The cat liked to be alone. It liked to sleep on the sofa. It liked to sleep in the fruit bowl. It liked to sleep on top of the TV. It liked to chase bugs in the front yard. It liked to chase lizards in the back yard. It liked to chase flies in the kitchen. Bruce put the cat on the floor. He rubbed the cat's stomach. The cat liked that. The cat licked his hand. Bruce rubbed the cat's stomach some more. The cat meowed. The cat was happy."
ejemplo_2_B1 = "Bob pushed the button on the door handle. He pulled on the door handle. He opened the car door. He got into the car. He sat down. He sat down in the driver's seat. He sat down behind the steering wheel. Bob put the seat belt on. He buckled the seat belt. The seat belt went across his chest. The seat belt went across his lap. The seat belt kept him safe. He put his car key into the ignition. The ignition was next to the steering wheel. The ignition starts a car. Bob turned the car key in the ignition. The car started."
# Level B2: a news item and a formal letter.
ejemplo_1_B2 = "Legendary football coach and broadcaster John Madden is retiring, he announced Thursday.\nJohn Madden appears at the TV Critics Association Press Tour in Beverly Hills, California, in 2008.\n It's been such a great ride... the NFL has been my life for more than 40 years, it has been my passion -- it still is, he said in a statement released by NBC Sports.\nMadden, 73, was a Hall of Fame coach for the Oakland Raiders, but is best known to millions as an ebullient football commentator.\nHe won 16 Emmy awards for outstanding sports analyst/personality, NBC said.\n"
ejemplo_2_B2 = "Dear Professor Henley,\nI am writing to inform you that, unfortunately, I am unable to continue to attend the Logic II course this semester. I would like to request permission to defer as I understand that this is only possible with your approval.\nThe issue is that I am currently doing an internship with ABC Ltd. It started in July and will continue until the end of the semester. The internship takes up 25 hours per week and I am concerned that it does not leave me with enough time to study. I have already asked if I can reduce my hours there, but this is not possible.\nWith your approval, I could take Logic II next semester instead. I realise that this would mean a heavier workload than usual next semester, but I assure you that I would be able to manage my time and keep up.\nThank you for considering my request and I would be happy to come in and discuss the matter further.\nRegards,\nSarah Price"
# Level C1: a formal letter and a news item.
ejemplo_1_C1 = "Dear Ms Leitman,\nI am writing to request your help following a change in my circumstances.\nAs you know, I am enrolled on the Basic Spanish course at your college, which starts in September. However, due to unforeseen family events, I have had to leave the country for a while to assist my parents in Hong Kong.\nAt present it is not clear when I will be able to return and unfortunately I will not be able to start the course as planned.\nI would like to request a refund for the course fees already paid. I apologise for the short notice and for any inconvenience caused. In the event that a refund is not possible, I would be grateful if you could postpone my enrolment until my return.\nThank you in advance for your help and I hope to be able to update you on the situation soon.\nYours sincerely,\nHonor Singh"
ejemplo_2_C1 = "-LRB- CNN -RRB- Spain is officially clear of Ebola, the World Health Organization declared Tuesday, after no new cases were reported since a nurse's assistant who contracted the virus there tested negative for it.\nSince then, 42 days have passed -- double the maximum known incubation period for the virus -- without another case, allowing Spain to be declared free of Ebola.\nSpanish authorities had been monitoring 87 people who came into contact with healthcare worker Teresa Romero Ramos, 15 of whom were considered high-risk and were quarantined at a Madrid hospital, WHO said.\nAnother 145 hospital employees who helped care for Romero during her month-long stay at the Carlos III Hospital were also monitored.\nThe WHO statement said it commends Spain for the measures put in place to identify potential cases and prevent further transmission of the Ebola virus. \nRomero contracted the illness while helping to care for an infected missionary who had been brought back from West Africa. He died of the disease.\nCNN's Anna Maja Rappard contributed to this report.\n"
# Level C2: news items.
ejemplo_1_C2 = "Twelve photographers from four continents have been shortlisted for the fourth Prix Pictet award in photography and sustainability.\nThis year's theme of Power, has enormous breadth, embracing contradiction and paradox in equal measure that has uncovered images and issues that are both awe-inspiring and disturbing, organizers say.\nThe aim of the award is to use the power of photography to raise public awareness of the social and environmental challenges of the new millennium.\nThe winner will be announced by Kofi Annan, the awardshonorary president, in October at the opening of the finalistsexhibition of the shortlisted works at the Saatchi Gallery in London. The exhibition runs from the 10th to the 28th of October 2012.\n"
ejemplo_2_C2 = "Singapore's economy shrank by 4.2 percent in the fourth quarter of 2008, the Ministry of Trade and Industry said Thursday, as it forecast the economy would contract between 2 and 5 percent this year.\nBoats ply under a bridge near the financial district of Singapore.\nCompared to a robust growth of 7.8 percent a year earlier, the economy grew by 1.1 percent for the whole of 2008, the ministry added.\nIt called Gross Domestic Product growth prospects for 2009 weak... on account of the pessimistic global economic outlook. \nAll major sectors, except for construction, business services and information and communications, saw contractions, the ministry said.\nThe ministry cited a decline in private sector investments and private consumption expenditure for dragging down total domestic demand.\nDeclines in global demand for electronics products, pharmaceuticals and chemicals were also likely to weigh on the manufacturing sector.\n"

In [6]:
# Closing instruction of the descriptor-based prompt. At this point the name
# holds only this sentence; the prompt-assembly cell later prepends the
# descriptor lists to it.
propmt_descriptores = (
    "Using the above descriptors I want you to classify the following text "
    "according to its Common European Framework of Reference (CEFR) level. "
    "I want you to responds only with the class (A1|B1|C1|A2|B2|C2)"
)


# Line that introduces the descriptor list of each CEFR level; all six share
# one sentence template.
(
    header_des_A1,
    header_des_A2,
    header_des_B1,
    header_des_B2,
    header_des_C1,
    header_des_C2,
) = (
    f"this is a list of CEFR level descriptors for level {nivel}:"
    for nivel in ("A1", "A2", "B1", "B2", "C1", "C2")
)

# CEFR descriptor lists, one multi-line string per level (one descriptor per
# line). They are pasted verbatim into the descriptor-based prompts, so the
# text is left untouched, including its mix of straight and curly quotes.
# NOTE(review): in descriptores_A1 the "lists and sequences (..." line has no
# closing parenthesis and the last line has a doubled quote before
# "no smoking" — confirm against the source document before correcting, since
# any edit changes the prompt.

# Level A1 descriptors.
descriptores_A1 = r"""very short, simple texts.
short, simple messages on postcards.
short, simple messages sent via social media or e-mail (e.g. proposing what to do, when and where to meet).
store guides (information on which floors departments are on) and directions (e.g. where to find lifts).
basic hotel information (e.g. times when meals are served).
simple, important information in advertisements, programmes for special events, leaflets and brochures (e.g. what is proposed, costs, the date and place of the event, departure times).
short, simple descriptions, especially if there is visual support.
short texts on subjects of personal interest (e.g. news flashes about sports, music, travel or stories) composed in very simple language and supported by illustrations and pictures.
simple messages written by friends or colleagues, for example "back at 4 o’clock”.
familiar names, words and basic phrases.
familiar names, words/signs and very basic phrases on simple notices in the most common everyday situations.
very simple language.
direct commands (e.g., “open the door”).
descriptions of clothes (pattern, colour).
proportions, quantities, and size ratios.
simple negation with <no>, <not>.
a direct request, question or order.
lists and sequences (<and>/<both-and>/<and then>
time indicators (<day-before-yesterday>, <3-years-ago>, etc.), when the time references are clearly indicated.
words and phrases on everyday signs (for example "station”, "car park”, "no parking”, ""no smoking”, "keep left”)."""

# Level A2 descriptors. This is the only list without the raw-string prefix;
# the text contains no backslashes, so the resulting string is unaffected.
descriptores_A2="""Short, simple texts.
Short, simple personal letters.
Simple everyday material such as advertisements, prospectuses, menus, reference lists and timetables.
everyday signs and notices, etc. in public places, such as streets, restaurants, railway stations; in workplaces, such as directions, instructions, hazard warnings.
texts describing people, places, everyday life and culture, etc., provided they use simple language.
information given in illustrated brochures and maps (e.g. the principal attractions of a city).
main points in short news items on subjects of personal interest (e.g. sport, celebrities).
short factual description or report within their own field, provided simple language is used and that it does not contain unpredictable detail.
what people say about themselves in a personal ad or post and what they say they like in other people.
simple instructions on equipment encountered in everyday life – such as a public telephone.
simple, brief instructions, provided they are illustrated and not presented in continuous text.
a simple recipe, especially if there are pictures to illustrate the most important steps.
short narratives and descriptions of someone’s life composed in simple language.
short description of a person (e.g. a celebrity).
very short, simple texts by understanding familiar names, words and basic phrases.
basic information in posters, adverts or catalogues.
short simple greetings and messages e.g. on birthday cards, party invitations or in SMS phone messages.
short simple messages from friends. For example: e-mails, web chats, postcards or short letters.
highest frequency vocabulary, including a proportion of shared international vocabulary items.
simple commands (e.g. “Take before meals” or “Do not take if driving”).
simple language.
high frequency everyday language.
details in an extensive description of a person/object, such as body shape, hairstyle, or occupation.
simple instructions, wishes, recommendations, etc.
basic causal relations (e.g., “I’m late because I got stuck in the traffic”).
indirect messages (questions, requests, wishes, rejection, etc.).
different ways of expressing negation."""

# Level B1 descriptors.
descriptores_B1= r"""straightforward factual texts on subjects related to their field of interest.
description of events, feelings and wishes in personal letters.
straightforward personal letters, e-mails or postings giving a relatively detailed account of events and experiences.
standard formal correspondence and online postings in their area of professional interest.
relevant information in everyday material, such as letters, brochures and short official documents.
information about preparation and usage on the labels on foodstuff and medicine.
information in simple, clearly drafted adverts in newspapers or magazines, provided there are not too many abbreviations.
main points in descriptive notes such as those on museum exhibits and explanatory boards in exhibitions.
clearly expressed, straightforward instructions for a piece of equipment.
simple instructions given on packaging (e.g. cooking instructions).
short safety instructions, (e.g. on public transport or in manuals for the use of electrical equipment).
descriptions of places, events, explicitly expressed feelings and perspectives in narratives, guides and magazine articles that employ high frequency everyday language.
travel diary mainly describing the events of a journey and the experiences and discoveries of the writer.
the plot of stories, simple novels and comics with a clear linear storyline and high frequency everyday language.
brochures, leaflets and other short texts relating to my interests
short newspaper and magazine articles about current and familiar topics.
simple instructions, for example for a game, using familiar types of equipment or cooking a meal.
simplified versions of novels, and follow the story line in short stories with a clear structure.
private letters about events, feelings and wishes.
high frequency everyday language."""

# Level B2 descriptors (left out of the "sin_B2" prompt variant).
descriptores_B2= r"""correspondence relating to their field of interest and readily grasp the essential meaning.
a personal e-mail or posting even where some colloquial language is used.
articles and reports concerned with contemporary problems in which particular stances or viewpoints are adopted.
lengthy, complex instructions in their field, including details on conditions and warnings.
novels with a strong, narrative plot and that use straightforward, unelaborated language.
articles, reports and reviews in which the writers express specific points of view (e.g., political commentary, critiques of exhibitions, plays, films, etc).
lengthy instructions, for example in a user manual for a TV or digital camera, for installing software.
short stories and novels written in a straightforward language and style, if I am familiar with the story and/or the writer.
main points in formal and informal letters relating to my personal and professional interests.
verbal aspect (e.g., completion, repetition, continuation, result of actions).
idioms.
various temporal relationships between the actions and events (simultaneous events, previous event, subsequent event).
direct and indirect speech.
statements that contain predicates that take no agent, e.g., “the water is flowing now”.
rhetorical questions even if linguistically expressed in a very economical way, e.g., by raising the eyebrows."""

# Level C1 descriptors.
descriptores_C1= r"""lengthy, complex texts, whether or not these relate to their own area of speciality.
a wide variety of texts including literary writings, newspaper or magazine articles, and specialised academic or professional publications.
any correspondence.
implicit as well as explicit attitudes, emotions and opinions expressed in e-mails, discussion forums, vlogs/blogs, etc
slang, idiomatic expressions and jokes in private correspondence.
a wide range of lengthy, complex texts likely to be encountered in social, professional or academic life."""

# Level C2 descriptors.
descriptores_C2= r"""all types of texts including abstract, structurally complex, or highly colloquial literary and non-literary writings.
a wide range of long and complex texts.
specialized, formal correspondence on a complex topic.
a complex report or article even outside their area of specialization.
all forms of texts including classical or colloquial literary and non-literary texts in different genres.
lengthy, complex texts, whether or not they relate to my area of speciality.
complex reports, analyses and commentaries in which opinions, viewpoints and connections are discussed.
complex manuals, regulations and contracts even within unfamiliar fields.
formal or informal correspondence.
any kind of text including those written in a very colloquial style and containing many idiomatic expressions or slang.
texts (for example newspaper columns and satirical glosses) in which much is said in an indirect and ambiguous way and which contain hidden value judgements..
classical as well as contemporary literary texts in different genres.
formal correspondence, including on specialized or legal matters."""

In [7]:
# 0 SHOT:
# The plain zero-shot prompts are the base instructions themselves.
propmt_0shot_sin_lecto, propmt_0shot_con_lecto = sin_lecto, con_lecto

# Same prompts followed, on a new line, by the bias-correction sentence.
prompt_0shot_sin_lecto_correccion = f"{sin_lecto}\n{correccion}"
prompt_0shot_con_lecto_correccion = f"{con_lecto}\n{correccion}"

    #PROMPS ELI

# 1 SHOT:
# Each level contributes its header followed by its first example. The pieces
# are grouped so the B2 pair can be dropped for the "sacando_b2" variants, and
# every prompt is the newline-joined sequence of pieces plus the instruction.
_one_shot_a1_to_b1 = [
    header_A1, ejemplo_1_A1,
    header_A2, ejemplo_1_A2,
    header_B1, ejemplo_1_B1,
]
_one_shot_b2 = [header_B2, ejemplo_1_B2]
_one_shot_c1_to_c2 = [
    header_C1, ejemplo_1_C1,
    header_C2, ejemplo_1_C2,
]
_one_shot_all = _one_shot_a1_to_b1 + _one_shot_b2 + _one_shot_c1_to_c2
_one_shot_no_b2 = _one_shot_a1_to_b1 + _one_shot_c1_to_c2

# All six levels, ending with the base instruction.
propmt_1shot_sin_lecto = "\n".join([*_one_shot_all, sin_lecto])
propmt_1shot_con_lecto = "\n".join([*_one_shot_all, con_lecto])

# Same prompts with the bias-correction sentence appended on its own line.
prompt_1shot_sin_lecto_correccion = "\n".join([propmt_1shot_sin_lecto, correccion])
prompt_1shot_con_lecto_correccion = "\n".join([propmt_1shot_con_lecto, correccion])

# Variants that omit the B2 example altogether.
prompt_1shot_sin_lecto_sacando_b2 = "\n".join([*_one_shot_no_b2, sin_lecto])
prompt_1shot_con_lecto_sacando_b2 = "\n".join([*_one_shot_no_b2, con_lecto])

# 2 SHOT:
# Each level contributes header + first example, then header + second example.
# The pieces are grouped so the B2 block can be dropped for the "sacando_b2"
# variants; every prompt is the newline-joined pieces plus the instruction.
_two_shot_a1_to_b1 = [
    header_A1, ejemplo_1_A1, header_A1, ejemplo_2_A1,
    header_A2, ejemplo_1_A2, header_A2, ejemplo_2_A2,
    header_B1, ejemplo_1_B1, header_B1, ejemplo_2_B1,
]
_two_shot_b2 = [header_B2, ejemplo_1_B2, header_B2, ejemplo_2_B2]
_two_shot_c1_to_c2 = [
    header_C1, ejemplo_1_C1, header_C1, ejemplo_2_C1,
    header_C2, ejemplo_1_C2, header_C2, ejemplo_2_C2,
]
_two_shot_all = _two_shot_a1_to_b1 + _two_shot_b2 + _two_shot_c1_to_c2
_two_shot_no_b2 = _two_shot_a1_to_b1 + _two_shot_c1_to_c2

# All six levels, ending with the base instruction.
propmt_2shot_sin_lecto = "\n".join([*_two_shot_all, sin_lecto])
propmt_2shot_con_lecto = "\n".join([*_two_shot_all, con_lecto])

# Same prompts with the bias-correction sentence appended on its own line.
prompt_2shot_sin_lecto_correccion = "\n".join([propmt_2shot_sin_lecto, correccion])
prompt_2shot_con_lecto_correccion = "\n".join([propmt_2shot_con_lecto, correccion])

# Variants that omit both B2 examples.
prompt_2shot_sin_lecto_sacando_b2 = "\n".join([*_two_shot_no_b2, sin_lecto])
prompt_2shot_con_lecto_sacando_b2 = "\n".join([*_two_shot_no_b2, con_lecto])

#DESCRIPTORES:
# On entry `propmt_descriptores` holds only the closing instruction sentence
# (set in the descriptor cell). Capture it under its own name BEFORE the name
# is rebound to the full prompt below: the "sin_B2" prompt used to append the
# already-rebuilt `propmt_descriptores`, so it ended up containing every
# descriptor list a second time, B2 included, which defeated its purpose.
# NOTE: re-running this cell requires re-running the descriptor cell first,
# otherwise the captured "instruction" is the previously built full prompt.
_instruccion_descriptores = propmt_descriptores

# Full descriptor prompt: header + descriptor list for all six levels, then
# the instruction.
propmt_descriptores =(header_des_A1 + "\n" + descriptores_A1 + "\n" +
                      header_des_A2 + "\n" + descriptores_A2 + "\n" +
                      header_des_B1 + "\n" + descriptores_B1 + "\n" +
                      header_des_B2 + "\n" + descriptores_B2 + "\n" +
                      header_des_C1 + "\n" + descriptores_C1 + "\n" +
                      header_des_C2 + "\n" + descriptores_C2 + "\n" +
                      _instruccion_descriptores)

# Same prompt without the B2 header and descriptors.
propmt_descriptores_sin_B2 =( header_des_A1 + "\n" + descriptores_A1 + "\n" +
                              header_des_A2 + "\n" + descriptores_A2 + "\n" +
                              header_des_B1 + "\n" + descriptores_B1 + "\n" +
                              header_des_C1 + "\n" + descriptores_C1 + "\n" +
                              header_des_C2 + "\n" + descriptores_C2 + "\n" +
                              _instruccion_descriptores)

## Definición de los experimentos

In [14]:
# Numbered aliases for every prompt variant, grouped by prompting strategy.

# 0-3: zero-shot base prompts, without / with the bias-correction sentence.
experimento_0, experimento_1 = propmt_0shot_sin_lecto, propmt_0shot_con_lecto
experimento_2, experimento_3 = (
    prompt_0shot_sin_lecto_correccion,
    prompt_0shot_con_lecto_correccion,
)

# 4-8: the five stand-alone prompt wordings.
(experimento_4, experimento_5, experimento_6, experimento_7, experimento_8) = (
    propmt_1, propmt_2, propmt_3, propmt_4, propmt_5,
)

# 9-14: one-shot prompts (plain, with correction, without the B2 example).
(
    experimento_9, experimento_10,
    experimento_11, experimento_12,
    experimento_13, experimento_14,
) = (
    propmt_1shot_sin_lecto, propmt_1shot_con_lecto,
    prompt_1shot_sin_lecto_correccion, prompt_1shot_con_lecto_correccion,
    prompt_1shot_sin_lecto_sacando_b2, prompt_1shot_con_lecto_sacando_b2,
)

# 15-20: two-shot prompts (plain, with correction, without the B2 examples).
(
    experimento_15, experimento_16,
    experimento_17, experimento_18,
    experimento_19, experimento_20,
) = (
    propmt_2shot_sin_lecto, propmt_2shot_con_lecto,
    prompt_2shot_sin_lecto_correccion, prompt_2shot_con_lecto_correccion,
    prompt_2shot_sin_lecto_sacando_b2, prompt_2shot_con_lecto_sacando_b2,
)

# 21-22: descriptor-based prompts (all levels / without B2).
experimento_21, experimento_22 = propmt_descriptores, propmt_descriptores_sin_B2

# Prompts that the execution cell will actually run, in this order.
experimentos = [experimento_0]
#experimentos = [experimento_1, experimento_2, experimento_3, experimento_4, experimento_5, experimento_6, experimento_7, experimento_8, experimento_9, experimento_10, experimento_11, experimento_12, experimento_13, experimento_14, experimento_15, experimento_16, experimento_17, experimento_18, experimento_19, experimento_20, experimento_21, experimento_22]

## Carga del modelo

In [9]:
# One checkpoint id for both the weights and the tokenizer. The tokenizer was
# previously loaded from "microsoft/Orca-2-13b" while the weights came from
# the 7b checkpoint; loading both from the same repository guarantees the
# vocabulary and special tokens match the model being run.
MODEL_NAME = "microsoft/Orca-2-7b"

model = transformers.AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map='auto',           # Automatically maps model layers to available devices
    offload_folder="./offload",  # Specifies a folder for offloading layers to disk if needed
    torch_dtype=torch.float16,   # Load the weights in half precision
)

# Load the tokenizer (use the slow tokenizer as recommended)
tokenizer = transformers.AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_fast=False,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

## Ejecución del experimento

In [None]:
# Run every prompt in `experimentos` over all texts in `textos_metricas` and
# write one CSV of predicted CEFR labels per prompt.
from IPython.display import display  # needed to render FileLink from inside a loop

# First CEFR label that appears as a standalone word in the model's answer.
_CEFR_LEVEL_RE = re.compile(r'\b(A1|B1|C1|A2|B2|C2)\b')

gc.collect()
# NOTE: `j` is the position inside `experimentos`, not the number in the
# `experimento_N` variable name; running a different sub-list reuses the same
# file names and overwrites earlier results.
for j, experimento in enumerate(experimentos):
    system_message = experimento
    filename = f'experimento_{j}real.csv'
    batch_texts = textos_metricas['text']
    predicted_labels = []

    for text in batch_texts:
        # ChatML-style prompt: system message, user text, open assistant turn.
        prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"

        # Send the inputs to wherever the model was placed instead of assuming
        # 'cuda', and pass the whole encoding so generate() also receives the
        # attention mask.
        inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
        output_ids = model.generate(**inputs, max_new_tokens=20)

        # Decode only the newly generated tokens. This replaces the regex that
        # located the assistant turn in the decoded prompt and depended on how
        # special tokens happen to be spaced by the tokenizer.
        prompt_length = inputs["input_ids"].shape[1]
        response = tokenizer.decode(output_ids[0, prompt_length:], skip_special_tokens=True)

        # As before, only the first non-empty line of the answer is inspected.
        first_line = response.strip().split("\n", 1)[0]
        match = _CEFR_LEVEL_RE.search(first_line)
        predicted_labels.append(match.group() if match else 'Unknown')

        # Drop the tensors (not just the decoded string) before clearing the
        # CUDA cache, so their memory can actually be released.
        del inputs, output_ids
        torch.cuda.empty_cache()
        gc.collect()

    # One label per row, in the same order as `textos_metricas`.
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Predicted Labels'])
        writer.writerows([label] for label in predicted_labels)

    # A bare `FileLink(...)` expression inside a loop is never rendered by the
    # notebook; display() shows the download link explicitly.
    display(FileLink(filename))