## Imports

In [None]:
import os
import pandas as pd
import sklearn.model_selection as sk
import gc
import transformers
import torch
import csv
import re
import subprocess
from IPython.display import FileLink

## Carga de datos y división del data set

In [None]:
# Check if the zip file is present and has been unzipped
if not os.path.exists("cefr-levelled-english-texts.zip"):
    # Download the dataset if the zip file is not present
    !kaggle datasets download -d amontgomerie/cefr-levelled-english-texts

if not os.path.exists("cefr_leveled_texts.csv"):  # Adjust this to match the folder name after unzipping
    # Unzip the file if the unzipped folder does not exist
    !unzip cefr-levelled-english-texts.zip

In [None]:
# Load the CSV file into a DataFrame; a 'label' column is required below.
df = pd.read_csv('cefr_leveled_texts.csv')

# Size of the rarest class: every class is downsampled to this many rows.
min_samples = df['label'].value_counts().min()

# Downsample each class to min_samples rows with a fixed seed. The groups are
# sampled explicitly (instead of groupby.apply) so the 'label' column is
# always kept, independent of how the installed pandas version treats the
# grouping column inside apply.
df_balanced = pd.concat(
    [
        group.sample(n=min_samples, random_state=60)
        for _, group in df.groupby('label')
    ]
).reset_index(drop=True)

# Per-label row counts of the balanced set (kept for inspection).
distribution = df_balanced['label'].value_counts()

# 80% train; the remaining 20% ("div") is halved into dev and holdout.
train, div = sk.train_test_split(df_balanced, test_size=0.2, random_state=70)
dev, holdout = sk.train_test_split(div, test_size=0.5, random_state=50)

# Give every split a clean 0..n-1 index. The original reset only `div` and
# left `dev` with the indices inherited from df_balanced.
train = train.reset_index(drop=True)
dev = dev.reset_index(drop=True)
div = div.reset_index(drop=True)
holdout = holdout.reset_index(drop=True)

# Texts the model is scored on: dev rows followed by train rows.
textos_metricas = pd.concat([dev, train])

## Carga del modelo

In [None]:
# Model identifier handed to transformers.pipeline.
model_name = "AbdulSami/bert-base-cased-cefr"

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook does not fail on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Text-classification pipeline; truncation=True is forwarded so over-long
# inputs are truncated (presumably to the model's maximum length -- confirm
# against the transformers documentation).
classifier = transformers.pipeline(
    "text-classification",
    model=model_name,
    truncation=True,
    device=device,
)

## Ejecución del experimento

In [None]:
# Start from a clean slate before the inference run.
gc.collect()

filename = 'experimento_bert_ab.csv'
batch_texts = textos_metricas['text']
predicted_labels = []

# Classify the texts one at a time, keeping (label, score) for each.
for text in batch_texts:
    # The first entry of the pipeline output for a single text carries the
    # 'label' and 'score' read here.
    predictions = classifier(text)
    top = predictions[0]
    predicted_labels.append((top['label'], top['score']))

# Clean up once after the loop instead of after every sample: a full garbage
# collection plus a CUDA cache flush per text only slows the run down.
gc.collect()
torch.cuda.empty_cache()

# One row per input text, in the same order as textos_metricas.
with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Predicted Label', 'Score'])
    writer.writerows(predicted_labels)

# Last expression of the cell: renders a download link for the results file.
FileLink(filename)