# Prepare datasets with Argilla

## 1. Deploy Argilla server locally

```bash
# Create (or reuse) a working directory for the Argilla deployment.
# -p keeps the snippet re-runnable when the directory already exists.
cd /workspace
mkdir -p argilla && cd argilla
# Fetch the docker-compose file from the Argilla repository.
wget -O docker-compose.yaml https://raw.githubusercontent.com/argilla-io/argilla/main/examples/deployments/docker/docker-compose.yaml

# Start the Docker daemon, then bring the compose services up in the background.
service docker start
docker compose up -d
```

Connect to: http://localhost:6900/

Login:

> cat docker-compose.yaml

- USERNAME: argilla
- PASSWORD: 12345678

## 2. Install Argilla client SDK

In [None]:
pip install argilla -U --pre

In [5]:
# Display the installed version of the `argilla` distribution.
from importlib.metadata import version
version('argilla')

'2.0.0'

API key:

> cat docker-compose.yaml

API_KEY: argilla.apikey

In [7]:
import os

import argilla as rg

# Connection settings: taken from the environment when set, otherwise fall back
# to the values used by the local docker-compose deployment. This keeps real
# credentials out of the notebook when pointing at a non-local server.
ARGILLA_API_URL = os.environ.get("ARGILLA_API_URL", "http://localhost:6900/")
ARGILLA_API_KEY = os.environ.get("ARGILLA_API_KEY", "argilla.apikey")

client = rg.Argilla(api_url=ARGILLA_API_URL, api_key=ARGILLA_API_KEY)

# Trailing expression: shows the first name of the authenticated user,
# which doubles as a connectivity check.
client.me.first_name

'argilla'

In [24]:
# Build a workspace named "argilla" and call create() on it.
# NOTE(review): re-running this cell once the workspace exists presumably
# fails with a conflict — confirm against the server before Run All.
workspace_to_create = rg.Workspace(name="argilla")
created_workspace = workspace_to_create.create()

## 3. Import Hugging Face dataset

https://huggingface.co/datasets/frenchtext/banque-fr-2311

Dataset extracted from public websites by wordslab-webscraper in 2311:
- domain: banque
- language: fr
- license: Apache 2.0

In [None]:
pip install datasets

In [27]:
from datasets import load_dataset

# The Hugging Face access token is kept in a file outside the notebook;
# surrounding whitespace (e.g. a trailing newline) is stripped.
with open("/workspace/myhftoken") as token_file:
    myhftoken = token_file.read().strip()

# Load the dataset from the Hub, authenticating with the token.
hf_dataset = load_dataset("frenchtext/banque-fr-2311", token=myhftoken)

Downloading readme:   0%|          | 0.00/7.03k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/42 [00:00<?, ?files/s]

Downloading data:   0%|          | 0/42 [00:00<?, ?files/s]

Downloading data:   0%|          | 0/42 [00:00<?, ?files/s]

Generating train split:   0%|          | 0/68166 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/8522 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8541 [00:00<?, ? examples/s]

```yaml
dataset_info:
  features:
    - name: Uri
      dtype: string
    - name: Timestamp
      dtype: string
    - name: Lang
      dtype: string
    - name: Title
      dtype: string
    - name: Text
      dtype: string
    - name: Words
      dtype: int32
    - name: AvgWordsLength
      dtype: int32
    - name: Chars
      dtype: int32
    - name: LetterChars
      dtype: int32
    - name: NumberChars
      dtype: int32
    - name: OtherChars
      dtype: int32
    - name: Website
      dtype: string
    - name: PDF
      dtype: bool
  config_name: default
  splits:
    - name: train
      num_examples: 68166
    - name: valid
      num_examples: 8522
    - name: test
      num_examples: 8541
  download_size: 247147772
  ```

In [22]:
# Field shown to annotators: the page text, rendered as markdown.
page_text_field = rg.TextField(
    name="Text", title="Web page text", required=True, use_markdown=True
)

# Multi-label question about the kind of content found on the page.
content_type_question = rg.MultiLabelQuestion(
    name="ContentType",
    title="Does the web page include any of these content types?",
    labels=["info", "news", "product", "process", "ads", "metadata"],
)

# Metadata properties, named after columns of the Hugging Face dataset.
metadata_properties = [
    rg.TermsMetadataProperty(name="Uri"),
    rg.TermsMetadataProperty(name="Lang"),
    rg.IntegerMetadataProperty(name="Words"),
    rg.TermsMetadataProperty(name="Website"),
    # NOTE(review): the dataset card lists PDF with dtype bool, but it is
    # declared as an integer property here — confirm records are accepted.
    rg.IntegerMetadataProperty(name="PDF"),
]

# Vector slot for text embeddings; 768 dimensions are declared here.
text_embedding_vector = rg.VectorField(name="Text_e5_embeddings", dimensions=768)

settings = rg.Settings(
    guidelines="Explore french banking websites dataset - date ",
    fields=[page_text_field],
    questions=[content_type_question],
    metadata=metadata_properties,
    vectors=[text_embedding_vector],
)

In [25]:
# Describe the dataset (name, target workspace, settings defined in `settings`).
dataset = rg.Dataset(name="banque-fr-2311", workspace="argilla", settings=settings)

# Trailing expression: call create() and display what it returns.
dataset.create()

Dataset(id=UUID('2cda3686-e350-4999-897d-8f30c01c81f1') inserted_at=datetime.datetime(2024, 8, 1, 22, 1, 48, 500104) updated_at=datetime.datetime(2024, 8, 1, 22, 1, 49, 262278) name='banque-fr-2311' status='ready' guidelines='Explore french banking websites dataset - date' allow_extra_metadata=False distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('485e94ed-4e72-496c-9d92-756baea3882d') last_activity_at=datetime.datetime(2024, 8, 1, 22, 1, 49, 262278))

MTEB-French: Resources for French Sentence Embedding Evaluation and Analysis

https://arxiv.org/pdf/2405.20468v2

=> multilingual-e5-base for clustering = 768 embedding dimensions

https://github.com/microsoft/unilm/tree/master/e5

https://huggingface.co/intfloat/multilingual-e5-small --- https://huggingface.co/intfloat/multilingual-e5-base

In [None]:
pip install sentence_transformers

In [9]:
from sentence_transformers import SentenceTransformer

# Load the "base" multilingual E5 checkpoint by its model identifier.
model = SentenceTransformer("intfloat/multilingual-e5-base")

modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/179k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

In [16]:
# Sample inputs using the "query: " / "passage: " prefixes: two queries and
# two passages (English and Chinese).
# The passage literals previously contained runs of spaces left over from
# copy-paste line wraps (e.g. "i     s", "traini     ng"); they are removed so
# the model embeds the intended text.
input_texts = [
    'query: how much protein should a female eat',
    'query: 南瓜的家常做法',
    "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
    "passage: 1.清炒南瓜丝 原料:嫩南瓜半个 调料:葱、盐、白糖、鸡精 做法: 1、南瓜用刀薄薄的削去表面一层皮,用勺子刮去瓤 2、擦成细丝(没有擦菜板就用刀慢慢切成细丝) 3、锅烧热放油,入葱花煸出香味 4、入南瓜丝快速翻炒一分钟左右,放盐、一点白糖和鸡精调味出锅 2.香葱炒南瓜 原料:南瓜1只 调料:香葱、蒜末、橄榄油、盐 做法: 1、将南瓜去皮,切成片 2、油锅8成热后,将蒜末放入爆香 3、爆香后,将南瓜片放入,翻炒 4、在翻炒的同时,可以不时地往锅里加水,但不要太多 5、放入盐,炒匀 6、南瓜差不多软和绵了之后,就可以关火 7、撒入香葱,即可出锅"
]
# Encode all four texts in one call, requesting normalized embeddings.
embeddings = model.encode(input_texts, normalize_embeddings=True)

In [17]:
# Display the shape of the encoded batch.
embeddings.shape

(4, 768)

In [18]:
# Load the "small" multilingual E5 checkpoint for comparison.
# NOTE(review): despite the plural name, `models` holds a single
# SentenceTransformer; the name is kept because a later cell refers to it.
models = SentenceTransformer('intfloat/multilingual-e5-small')

modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/498k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

(4, 768)

In [21]:
# Re-encode the same texts with the small checkpoint; this overwrites the
# `embeddings` variable assigned by the earlier encoding cell.
embeddings = models.encode(input_texts, normalize_embeddings=True)
# Display the shape of the new batch.
embeddings.shape

(4, 384)