In [None]:
# IPython shell escape: run the Hugging Face CLI `login` command from the notebook.
# Presumably this provides the credentials used by the `push_to_hub` call in the
# last cell — TODO confirm.
!huggingface-cli login

In [None]:
import pandas as pd

# Path of the pickled dataset (expected to unpickle to a pandas DataFrame).
data_file = "data/es_nominal_group_dataset.pkl"

# Columns that are kept from the loaded frame.
columns_to_keep = ["text", "tokens", "annotation", "metrics"]

# NOTE(review): unpickling can execute arbitrary code — only load this file
# if it comes from a trusted source.
dataset = pd.read_pickle(data_file).loc[:, columns_to_keep]
dataset

In [None]:
# Convert data to CoNLL format
def to_conll_eval_format(data):
    """Collect the tag of every token, grouped by sentence.

    Args:
        data: Iterable of sentence records. Each record is indexed with
            ``'tokens'`` to obtain an iterable of token records, and each
            token record is indexed with ``'tag'``.

    Returns:
        A list containing one inner list per sentence record; each inner
        list holds that sentence's tags in token order.
    """
    return [
        [token['tag'] for token in sentence['tokens']]
        for sentence in data
    ]

# Token sequences, one list entry per row of the loaded frame.
tokens = dataset["tokens"].tolist()

# Tag sequences extracted from the "metrics" column, one list per row.
conll = to_conll_eval_format(dataset["metrics"].tolist())
# Flat list of every tag across all rows (not used again in this cell).
conll_total = [tag for sentence_tags in conll for tag in sentence_tags]

# Pair each row's tokens with its tags.
data = pd.DataFrame({"tokens": tokens, "ng_tags": conll})

# A row is kept only when none of its tags is None. The mask is forced to a
# real boolean dtype so it is always valid for boolean indexing.
is_fully_tagged = data["ng_tags"].map(lambda tags: None not in tags).astype(bool)

# Filter, renumber the surviving rows from 0, and expose that numbering as
# the leading "id" column.
data = (
    data[is_fully_tagged]
    .reset_index(drop=True)
    .rename_axis("id")
    .reset_index()
)

data

In [None]:
from datasets import Dataset, Features, ClassLabel, Sequence, Value

# Target schema: an integer id, a sequence of string tokens, and a sequence of
# tags encoded as class labels over the three names below.
label_names = ['B-NG', 'I-NG', 'O']
new_features = Features(
    {
        'id': Value(dtype='int64'),
        'tokens': Sequence(feature=Value(dtype='string')),
        'ng_tags': Sequence(feature=ClassLabel(names=label_names)),
    }
)

# Build a `datasets.Dataset` from the DataFrame and cast it to the schema above.
dataset = Dataset.from_pandas(data).cast(new_features)
dataset


In [None]:
# Show the class-label names attached to the "ng_tags" feature.
ng_tags_feature = dataset.features["ng_tags"]
ng_tags_feature.feature.names

In [None]:
# Seed shared by both splits so the partition is reproducible.
split_seed = 13

# train_test_split returns a dictionary-like object with two keys: "train" and "test".
# First split: 70% train, 30% held out.
dataset = dataset.train_test_split(test_size=0.3, shuffle=True, seed=split_seed)

# To also get a validation set, split the held-out "test" part again, in half.
test_divided = dataset["test"].train_test_split(
    test_size=0.5, shuffle=True, seed=split_seed
)
dataset["validation"] = test_divided["train"]
dataset["test"] = test_divided["test"]
dataset

In [None]:
# Upload `dataset` to the Hub under this repository name.
hub_repo_name = "spanish_nominal_groups_conll2003"
dataset.push_to_hub(hub_repo_name)