In [28]:
import os

# Connection settings for the local Argilla server.
# Each value can be overridden through an environment variable so that real
# credentials do not have to be committed with the notebook; the fallbacks are
# the values this notebook originally hardcoded.
localuser = os.environ.get("ARGILLA_LOCAL_USER", "argilla")
password = os.environ.get("ARGILLA_LOCAL_PASSWORD", "1234")
apikey = os.environ.get("ARGILLA_API_KEY", "argilla.apikey")
url = os.environ.get("ARGILLA_API_URL", "http://aicontroller:6900/")

In [1]:
import os

# Connection settings for the Argilla server hosted on a Hugging Face Space.
# Each value can be overridden through an environment variable so that real
# credentials do not have to be committed with the notebook; the fallbacks are
# the values this notebook originally hardcoded.
hfuser = os.environ.get("ARGILLA_HF_USER", "owner")
password = os.environ.get("ARGILLA_HF_PASSWORD", "12345678")
apikey = os.environ.get("ARGILLA_HF_API_KEY", "owner.apikey")
# Kamaljp/yttutorialserver
url = os.environ.get("ARGILLA_HF_API_URL", "https://kamaljp-yttutorialserver.hf.space/")

In [29]:
import argilla as rg

# The server's API docs are served under <space-url>/api/docs.
# Sign in as the owner account (password 12345678) before running the cells below.
# User management guide:
# https://docs.argilla.io/en/latest/getting_started/installation/configurations/user_management.html
# Connect the client using the `url` / `apikey` values set in the config cells.
rg.init(api_key=apikey, api_url=url)

This may lead to potential compatibility issues during your experience.
To ensure a seamless and optimized connection, we highly recommend aligning your client version with the server version.


In [3]:
# Create the workspace named "localgilla" that later cells push datasets into.
rg.Workspace.create("localgilla")

Workspace(id=5ed06c89-dbe5-47a8-a9d0-5b59872481a6, name=localgilla, inserted_at=2024-04-02 07:18:14.945504, updated_at=2024-04-02 07:18:14.945504)

In [4]:
# Register a user with the "annotator" role, assigned to the "localgilla" workspace.
user = rg.User.create(
    username="new-user",
    password="12345678",
    role="annotator",
    first_name="New",
    last_name="User",
    workspaces=["localgilla"],
)

In [7]:
# Inspect an attribute of the user object returned by rg.User.create.
user.first_name

'New'

In [8]:
# Build a single-label text-classification FeedbackDataset with two emotion labels.
# guidelines / metadata_properties / vectors_settings are passed explicitly as None.
dataset = rg.FeedbackDataset.for_text_classification(
    labels=["sadness", "joy"],
    use_markdown=True,
    multi_label=False,
    vectors_settings=None,
    metadata_properties=None,
    guidelines=None,
)

In [9]:
# Display the dataset's configuration (fields, questions, guidelines).
dataset

FeedbackDataset(
   fields=[TextField(name='text', title='Text', required=True, type='text', use_markdown=True)]
   questions=[LabelQuestion(name='label', title='Label', description='Classify the text by selecting the correct label from the given list of labels.', required=True, type='label_selection', labels=['sadness', 'joy'], visible_labels=None)]
   guidelines=This is a text classification dataset that contains texts and labels. Given a set of texts and a predefined set of labels, the goal of text classification is to assign one label to each text based on its content. Please classify the texts by making the correct selection.)
   metadata_properties=[])
   vectors_settings=[])
)

In [10]:
# Workspace name reused by the push/load calls in later cells.
localgilla = "localgilla"

In [None]:
# Push the local dataset to the server as "ds-1" in the workspace named by `localgilla`.
dataset.push_to_argilla(name="ds-1", workspace=localgilla)

In [30]:
# Load the dataset named "ds-1" from the workspace named by `localgilla`.
ds1 = rg.FeedbackDataset.from_argilla(name='ds-1', workspace=localgilla)

In [31]:
# Two toy records; each one fills only the "text" field.
records = [
    rg.FeedbackRecord(fields={"text": sample})
    for sample in ("I am so happy today", "I feel sad today")
]
# Add them to the local (not yet loaded-from-server) dataset object.
dataset.add_records(records)

In [13]:
# Display the records currently held by the local dataset.
dataset.records

[FeedbackRecord(fields={'text': 'I am so happy today'}, metadata={}, vectors={}, responses=[], suggestions=(), external_id=None),
 FeedbackRecord(fields={'text': 'I feel sad today'}, metadata={}, vectors={}, responses=[], suggestions=(), external_id=None)]

In [32]:
# Add the same records to ds1, the dataset obtained via from_argilla.
ds1.add_records(records)

In [None]:
# Load "argilla/emotion" as a FeedbackDataset, restricted to the train[1:101] slice.
dataset_hf = rg.FeedbackDataset.from_huggingface("argilla/emotion", split="train[1:101]")

In [None]:
# Push the loaded dataset to the server as "emogilla" in the workspace named by `localgilla`.
dataset_hf.push_to_argilla(name="emogilla", workspace=localgilla)

In [None]:
from argilla.feedback import TrainingTask

# Tie the dataset's "text" field and "label" question together as a
# text-classification training task.
task = TrainingTask.for_text_classification(
    label=dataset_hf.question_by_name("label"),
    text=dataset_hf.field_by_name("text"),
)

In [None]:
from argilla.feedback import ArgillaTrainer

# Set up a trainer for the task above using the "transformers" framework,
# with train_size=0.8.
trainer = ArgillaTrainer(
    task=task,
    dataset=dataset_hf,
    train_size=0.8,
    framework="transformers",
)

In [None]:
# Show what the trainer's get_trainer_model() returns.
trainer.get_trainer_model()

In [None]:
# Show what the trainer's get_trainer_tokenizer() returns.
trainer.get_trainer_tokenizer()

In [None]:
# Show what the trainer's get_model_kwargs() returns.
trainer.get_model_kwargs()

In [None]:
# Run training; "train_arg_bert" is passed as the output directory.
trainer.train(output_dir="train_arg_bert")

In [None]:
### Uploading Data

# Argilla supports three record types, one per task: TextClassificationRecord,
# TokenClassificationRecord and Text2TextRecord

In [None]:
# Show the result of the client's list_datasets() call.
rg.list_datasets()

In [None]:
# Build another single-label text-classification FeedbackDataset, this time with
# "good" / "bad" labels. This rebinds the notebook-level name `dataset`.
dataset = rg.FeedbackDataset.for_text_classification(
    labels=["good", "bad"],
    use_markdown=True,
    multi_label=False,
    vectors_settings=None,
    metadata_properties=None,
    guidelines=None,
)

In [None]:
# The key attributes of a record are text, annotation, prediction and metadata.
# Here the prediction is a list of (label, score) pairs and the annotation is one label.
textcat_rec = rg.TextClassificationRecord(
    text='Hello there. Its me',
    annotation='LABEL1',
    prediction=[("LABEL1", 0.8), ("LABEL2", 0.2)],
    multi_label=False,
)

In [None]:
# Token-classification record: the tokens are the whitespace-separated words of the text.
tokencat_rec = rg.TokenClassificationRecord(
    text='Argilla is a super awesome library that speeds up annotation',
    tokens='Argilla is a super awesome library that speeds up annotation'.split(),
    # Each prediction is (label, start, end) in character offsets of `text`:
    # text[0:7] is "Argilla" and text[19:26] is "awesome" - the indices matter.
    prediction=[("Name", 0, 7), ("ADJ", 19, 26)],
)

In [None]:
# Text2Text record: the prediction is given as a list of generated strings.
text2text = rg.Text2TextRecord(
    prediction=["More we use it faster we understand it"],
    text='Argilla is a super awesome library that speeds up annotation',
)

In [None]:
# Select the workspace named by `localgilla` for the client
# (presumably the target of the rg.log calls that follow - confirm).
rg.set_workspace(localgilla)

In [None]:
# Log each example record into its own dataset, one dataset per task type.
rg.log(records=textcat_rec, name='my_cat_ds')
rg.log(records=text2text, name='my_t2t_ds')
rg.log(records=tokencat_rec, name='my_tokenclass_ds')

In [None]:
from datasets import load_dataset

# Take a seeded 100-row sample of the IMDB training split.
ds = load_dataset("imdb", split='train').shuffle(seed=42).select(range(100))
# `rename_column` returns a new Dataset instead of renaming in place, so the result
# has to be assigned back; otherwise the "label" column is never exposed as
# "annotation" to the conversion below.
ds = ds.rename_column("label", "annotation")
# Convert the Hugging Face dataset into an Argilla text-classification dataset.
df_rg = rg.read_datasets(ds, task="TextClassification")

In [None]:
# Log the converted IMDB records under the dataset name "imdb" and keep rg.log's return value.
ds_in_arg = rg.log(df_rg, "imdb")

In [None]:
# NOTE(review): this calls .schema() on whatever rg.log returned, not on the
# logged dataset itself - confirm this is the intended object.
ds_in_arg.schema()

In [None]:
# Define the label schema for the "imdb" dataset and apply it as the dataset's settings.
labels = ["pos", "neg"]
settings = rg.TextClassificationSettings(label_schema=labels)
rg.configure_dataset_settings(name="imdb", settings=settings)

In [None]:
# Token-classification task

from datasets import load_dataset

# Seeded (seed=70) 100-row sample of the ag_news training split.
tokenDs = load_dataset("ag_news", split="train").shuffle(seed=70).select(range(100))

In [None]:
def metadata_dict(row):
    """Attach a ``metadata`` entry holding the row's label.

    The row is modified in place: ``row["metadata"]`` is set to
    ``{"label": row["label"]}``. The same row object is returned.
    """
    row["metadata"] = {"label": row["label"]}
    return row

# Assign the mapped dataset back to tokenDs: the tokenization and logging cells below
# read tokenDs, so storing the result under another name would drop the metadata
# column and would overwrite the unrelated `dataset` variable.
# NOTE: this removes the "label" column, so re-run the cell that loads tokenDs
# before re-running this one.
tokenDs = tokenDs.map(metadata_dict, remove_columns=["label"])

In [None]:
# Display the current state of tokenDs.
tokenDs

In [None]:
import spacy
# Load the "en_core_web_sm" spaCy pipeline; `nlp` is used by the tokenizer function below.
nlp = spacy.load("en_core_web_sm")

In [None]:
def tokenizer(row):
    """Run the module-level ``nlp`` pipeline over ``row["text"]``.

    Returns a dict with a single ``"tokens"`` key holding the text of each token.
    """
    parsed = nlp(row["text"])
    return {"tokens": [tok.text for tok in parsed]}

In [None]:
# Apply `tokenizer` to every row, adding the "tokens" column it returns.
tokenDs = tokenDs.map(tokenizer)

In [None]:
# Convert tokenDs into an Argilla dataset for the "TokenClassification" task.
token_rg = rg.read_datasets(tokenDs, task="TokenClassification")

In [None]:
# Log the token-classification records under the dataset name 'ag_news'.
rg.log(token_rg, 'ag_news')

In [None]:
# Define the label schema for the "ag_news" dataset and apply it as the dataset's settings.
# This rebinds the notebook-level names `labels` and `settings`.
labels = ["PER", "LOC", "ORG", "MISC"]

settings = rg.TokenClassificationSettings(label_schema=labels)
rg.configure_dataset_settings(name="ag_news", settings=settings)

In [None]:
# Seeded (seed=25) 150-row sample of the "en2fr" config of europa_ecdc_tm (train split).
t2tds = (
    load_dataset("europa_ecdc_tm", "en2fr", split="train")
    .shuffle(seed=25)
    .select(range(150))
)

In [None]:
def extract(row):
    """Turn a translation row into Text2Text columns.

    Reads ``row["translation"]["en"]`` as the input text and
    ``row["translation"]["fr"]`` as the prediction. The prediction is wrapped in
    a list because a Text2Text record takes a list of predictions (as in the
    ``rg.Text2TextRecord`` example earlier in this notebook); a bare string
    would be iterated character by character.
    """
    translation = row["translation"]
    return {"text": translation["en"],
            "prediction": [translation["fr"]]}

# Apply `extract` to every row and drop the original 'translation' column.
t2tds = t2tds.map(extract, remove_columns=['translation'])

In [None]:
# Convert t2tds into an Argilla dataset for the "Text2Text" task, then log it as "ecdc_en".
t2trg = rg.read_datasets(t2tds, task="Text2Text")
rg.log(t2trg, "ecdc_en")

In [None]:
import argilla as rg

# FeedbackDataset template with two text fields ("prompt", "output") and six questions.
# NOTE: records still need to be provided; none are added in this cell.
dataset = rg.FeedbackDataset(
    guidelines="Add some guidelines for the annotation team here.",
    fields=[
        rg.TextField(name="prompt", title="Human prompt"),
        rg.TextField(name="output", title="Generated output", use_markdown=True),
    ],
    questions=[
        # 1-5 rating of the response.
        rg.RatingQuestion(
            name="rating",
            title="Rate the quality of the response:",
            description="1 = very bad - 5= very good",
            values=[1, 2, 3, 4, 5],
            required=True,
        ),
        # Optional free-text correction.
        rg.TextQuestion(
            name="corrected-text",
            title="Provide a correction to the response:",
            use_markdown=True,
            required=False,
        ),
        # Single-choice question; labels could also be given as ["YES", "NO"].
        rg.LabelQuestion(
            name="relevant",
            title="Is the response relevant for the given prompt?",
            labels={"YES": "Yes", "NO": "No"},
            visible_labels=None,
            required=True,
        ),
        # Multi-choice question; labels could also be given as
        # ["hate", "sexual", "violent", "pii", "untruthful", "not_english", "inappropriate"].
        rg.MultiLabelQuestion(
            name="content_class",
            title="Does the response include any of the following?",
            description="Select all that apply",
            labels={
                "hate": "Hate Speech",
                "sexual": "Sexual content",
                "violent": "Violent content",
                "pii": "Personal information",
                "untruthful": "Untruthful info",
                "not_english": "Not English",
                "inappropriate": "Inappropriate content",
            },
            visible_labels=4,
            required=False,
        ),
        # Ranking question; values could also be given as ["reply-1", "reply-2", "reply-3"].
        rg.RankingQuestion(
            name="preference",
            title="Order replies based on your preference",
            description="1 = best, 3 = worst. Ties are allowed.",
            values={
                "reply-1": "Reply 1",
                "reply-2": "Reply 2",
                "reply-3": "Reply 3",
            },
            required=True,
        ),
        # Stand-in for a span question; labels could also be given as ["PER", "ORG", "EVE"].
        # A `field="text"` argument was left out of this call.
        rg.MultiLabelQuestion(
            name="entities",
            title="Highlight the entities in the text:",
            labels={"PER": "Person", "ORG": "Organization", "EVE": "Event"},
            required=True,
        ),
    ],
)
# SpanQuestion errored out, so a MultiLabelQuestion is used in its place.
dataset.push_to_argilla(name="feedback_ds", workspace="argilla")

To query an rg.Dataset, you need to write queries in the Lucene Query Language (LQL), which is native to Elasticsearch and OpenSearch.

In [None]:
# Load records from the "imdb" dataset, passing a (vector name, vector values) pair
# as the `vector` argument.
# NOTE(review): presumably this is a similarity query against "new_vector" - confirm
# that the "imdb" records actually carry a vector with that name.

record = rg.load(name="imdb", vector=("new_vector", [0, 43, 1985]))

In [None]:
# A text-classification record carrying one named vector, logged to the "withvek" dataset.
record = rg.TextClassificationRecord(
    vectors={"my_vector_name": [0, 42, 1984]},
    text="Hello world, I am a vector record!",
)
rg.log(records=record, name="withvek")

In [None]:
from argilla.labeling.text_classification import add_rules, delete_rules, Rule, update_rules

# Create: a rule pairing the query "positive impact" with the label "optimism",
# added to the "withvek" dataset.
rule = Rule(query="positive impact", label="optimism")
add_rules(dataset="withvek", rules=[rule])

# Update: change the label on the same Rule object and send the update.
rule.label = "pessimism"
update_rules(dataset="withvek", rules=[rule])

# Delete (left disabled so the rule stays on the dataset)
# delete_rules(dataset="withvek", rules=[rule])

In [26]:
# Display the list of FeedbackRecord objects built earlier in the notebook.
records

[FeedbackRecord(fields={'text': 'I am so happy today'}, metadata={}, vectors={}, responses=[], suggestions=(), external_id=None),
 FeedbackRecord(fields={'text': 'I feel sad today'}, metadata={}, vectors={}, responses=[], suggestions=(), external_id=None)]

In [23]:
# Push ds1 to the server as 'ds-1' in the workspace named by `localgilla`.
ds1.push_to_argilla(name='ds-1',workspace=localgilla)



RemoteFeedbackDataset(
   id=7c84a580-aedc-4bee-a1bb-7dde00e83bdd
   name=ds-1
   workspace=Workspace(id=5ed06c89-dbe5-47a8-a9d0-5b59872481a6, name=localgilla, inserted_at=2024-04-02 07:18:14.945504, updated_at=2024-04-02 07:18:14.945504)
   url=https://kamaljp-yttutorialserver.hf.space/dataset/7c84a580-aedc-4bee-a1bb-7dde00e83bdd/annotation-mode
   fields=[RemoteTextField(id=UUID('3f0de2e8-48f5-4e7e-8bd9-b7897ca630d0'), client=None, name='text', title='Text', required=True, type='text', use_markdown=True)]
   questions=[RemoteLabelQuestion(id=UUID('3a243fd3-6d6d-4b35-ba75-74be2ac33f1c'), client=None, name='label', title='Label', description=None, required=True, type='label_selection', labels=['sadness', 'joy'], visible_labels=None)]
   guidelines=This is a text classification dataset that contains texts and labels. Given a set of texts and a predefined set of labels, the goal of text classification is to assign one label to each text based on its content. Please classify the texts by 