In [1]:
import pandas as pd
import argilla as rg

data_file = "data/data_etiquetada_attitude.pkl"

dataset = pd.read_pickle(data_file)



In [2]:
def to_conll_eval_format(data):
    output = []
    for d in data:
        sentence_tags = []
        for token in d['tokens']:
            sentence_tags.append(token['tag'])
        output.append(sentence_tags)
    return output

tokens = dataset["tokens"].tolist()
conll = to_conll_eval_format(dataset["metrics"].tolist())



In [3]:
conll_total = [c for s in conll for c in s]
tags = list(set(conll_total))

# Remove "None" tag
tags.remove(None)
tags


['I-propriety (J3)',
 'B-NG',
 'B-capacity (J3)',
 'B-propriety (J3)',
 'I-tenacity (J3)',
 'B-Affect',
 'B-Social Esteem (J2)',
 'I-Negative',
 'I-Social Sanction (J2)',
 'B-Judgment (J1)',
 'I-Affect',
 'I-normality (J3)',
 'O',
 'I-Judgment (J1)',
 'B-Social Sanction (J2)',
 'B-normality (J3)',
 'B-veracity (J3)',
 'B-Appreciation',
 'I-NG',
 'I-Social Esteem (J2)',
 'B-Negative',
 'B-tenacity (J3)',
 'I-capacity (J3)',
 'I-veracity (J3)',
 'I-Appreciation']

In [4]:
import pandas as pd
data = pd.DataFrame({"tokens": tokens, "att_tags": conll})


In [5]:
data

# If any element in ng_tags is None, remove the whole row
data["has_none"] = data["att_tags"].apply(lambda x: None in x)

# Count has_none
print(data["has_none"].value_counts())

# Drop rows with has_none = True
data = data[data["has_none"] == False].reset_index(drop=True)

False    797
True       4
Name: has_none, dtype: int64


In [6]:
data = data.reset_index()
# Rename index to id
data = data.rename(columns={"index": "id"})
#data = data.to_dict('records')


In [7]:
del data["has_none"]

In [8]:
# Len of dataset

# data["len_t"] = data["tokens"].apply(lambda x: len(x))
# data["len_ng"] = data["att_tags"].apply(lambda x: len(x))

# # Check if len_t == len_ng and count the number of sentences that are not equal
# data["len_equal"] = data["len_t"] == data["len_ng"]
# data["len_equal"].value_counts()

In [9]:
#features = Features({"att_tags": Sequence(ClassLabel(num_classes=3, names=['B-NG', 'I-NG', 'O']))})

#hf_dataset = Dataset.from_list(data, features=features)

In [10]:
from datasets import Dataset, Features, ClassLabel, Sequence, Value

hf_dataset = Dataset.from_pandas(data)

new_features = Features(
    {
        'id': Value(dtype='int64', id=None),
        'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
        'att_tags': Sequence(feature=ClassLabel(names=tags, id=None), length=-1, id=None),
    }
)

# Cast the dataset with the updated features.
hf_dataset = hf_dataset.cast(new_features)
hf_dataset

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['id', 'tokens', 'att_tags'],
    num_rows: 797
})

In [11]:
hf_dataset.features["att_tags"].feature.names


['I-propriety (J3)',
 'B-NG',
 'B-capacity (J3)',
 'B-propriety (J3)',
 'I-tenacity (J3)',
 'B-Affect',
 'B-Social Esteem (J2)',
 'I-Negative',
 'I-Social Sanction (J2)',
 'B-Judgment (J1)',
 'I-Affect',
 'I-normality (J3)',
 'O',
 'I-Judgment (J1)',
 'B-Social Sanction (J2)',
 'B-normality (J3)',
 'B-veracity (J3)',
 'B-Appreciation',
 'I-NG',
 'I-Social Esteem (J2)',
 'B-Negative',
 'B-tenacity (J3)',
 'I-capacity (J3)',
 'I-veracity (J3)',
 'I-Appreciation']

In [12]:
hf_dataset["att_tags"]

[[1,
  1,
  1,
  12,
  12,
  1,
  12,
  17,
  24,
  12,
  1,
  18,
  12,
  12,
  3,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  12,
  12,
  12,
  12,
  12,
  12,
  12,
  3,
  0,
  0,
  0,
  12],
 [12,
  12,
  12,
  1,
  18,
  18,
  12,
  1,
  18,
  18,
  12,
  12,
  1,
  18,
  18,
  18,
  18,
  18,
  12,
  12,
  12,
  12,
  12,
  12,
  1,
  12,
  12,
  17,
  24,
  1,
  18,
  18,
  12,
  12,
  17,
  24,
  12],
 [1, 12, 12, 21, 4, 4, 4, 4, 4, 4, 4, 4, 4, 12, 12, 12, 17, 12, 12, 12, 1],
 [1,
  12,
  1,
  12,
  12,
  12,
  12,
  1,
  18,
  12,
  12,
  12,
  12,
  1,
  18,
  12,
  1,
  12,
  12,
  12,
  12,
  1,
  12,
  12,
  2,
  2,
  12,
  12,
  3,
  1,
  12,
  12,
  12,
  1,
  12,
  12,
  12,
  12],
 [1,
  1,
  12,
  1,
  12,
  12,
  12,
  12,
  1,
  18,
  18,
  18,
  2,
  22,
  12,
  12,
  12,
  1,
  12,
  1,
  18,
  18,
  18,
  18,
  18,
  18],
 [12,
  12,
  12,
  12,
  12,
  12,
  12,
  12,
  12,
  12,
  12,
  1,
  18,
  18,
  12,
  1,
  12,
  1,
  12,
  1,
  18,
  18,
  12,
  12,
  12,

In [13]:
hf_dataset = hf_dataset.train_test_split(test_size=0.2, shuffle=True, seed=13)
# Rename datasets["test"] to datasets["validation"]
hf_dataset["validation"] = hf_dataset["test"]
del hf_dataset["test"]
hf_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'att_tags'],
        num_rows: 637
    })
    validation: Dataset({
        features: ['id', 'tokens', 'att_tags'],
        num_rows: 160
    })
})

In [14]:
hf_dataset.push_to_hub("spanish_attitude_conll2003")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/529 [00:00<?, ?B/s]