In [None]:
# Define the URL where Label Studio is accessible
import os
import dotenv

dotenv.load_dotenv()

LABEL_STUDIO_URL = "http://185.8.172.121:8080/"
# API key is available at the Account & Settings page in Label Studio UI
LABEL_STUDIO_API_KEY = os.environ["LABELSTUDIO_TOKEN"]


In [10]:
import os
import pandas as pd

DATA_DIR = "dataset/href_queries"

# Collect text files
files = [f for f in os.listdir(DATA_DIR) if f.endswith(".txt")]


# Generator of DataFrames — avoids building large Python lists
def load_file(path, filename):
    return pd.read_csv(
        path,
        header=None,  # no header row
        names=["query"],
        engine="python",  # safe for \n separation
        encoding="utf-8",
    ).assign(source=filename)


# Concatenate all at once (pandas handles internal chunking efficiently)
df = pd.concat(
    (load_file(os.path.join(DATA_DIR, f), f) for f in files),
    ignore_index=True,
)

print(df.info())

df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   query   5000 non-null   object
 1   source  5000 non-null   object
dtypes: object(2)
memory usage: 78.3+ KB
None


Unnamed: 0,query,source
0,خرید لباس مجلسی پسرانه,keywords-04-tir-random.txt
1,امداد خودرو رنو,keywords-04-tir-random.txt
2,کراتین,keywords-04-tir-random.txt
3,شکایت برای ارثیه,keywords-04-tir-random.txt
4,لگو فانتزی دخترانه,keywords-04-tir-random.txt
...,...,...
4995,تور سوئیس,keywords-08-aban-random.txt
4996,کاشت موی طبیعی,keywords-08-aban-random.txt
4997,نوار نقاله,keywords-08-aban-random.txt
4998,ضایعات الکترونیکی,keywords-08-aban-random.txt


In [None]:
import pandas as pd
import requests
import json
import random
import time
from urllib.parse import quote
import tqdm.auto as tqdm

# === CONFIG ===
INPUT_CSV = "dataset/queries (1).csv"
OUTPUT_JSON = "dataset/unannotated_dataset.json"
API_URL = "http://185.8.172.121:8000/similar-keywords?query={}&top_k=64"
SAMPLES_PER_QUERY = 10
SLEEP_BETWEEN_REQUESTS = 0.0  # seconds, to avoid hammering the API

# === LOAD QUERIES ===
df = pd.read_csv(INPUT_CSV)
queries = df.dropna()

dataset = []

for i, query in tqdm.tqdm(queries.iterrows(), unit="queries", total=len(df)):
    try:
        url = API_URL.format(quote(query["keyword"]))
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        results = response.json()

        if not results:
            continue

        # sample up to 10 results
        options = random.sample(results, min(SAMPLES_PER_QUERY, len(results)))

        entry = {
            "query": query,
            "site_count": query["site_count"],
            "options": [
                {"value": item["keyword"], "score": item["similarity"]}
                for item in options
            ],
        }
        dataset.append(entry)

        # print(f"[{i + 1}/{len(queries)}] ✅ {query} ({len(sampled)} options)")

    except Exception as e:
        print(f"[{i + 1}/{len(queries)}] ⚠️ Error for '{query}': {e}")

    time.sleep(SLEEP_BETWEEN_REQUESTS)


  0%|          | 0/1024 [00:00<?, ?queries/s]

KeyboardInterrupt: 

In [11]:
df.to_csv("queries/href_queries.csv")

In [None]:
import pandas as pd
import requests
import json
import random
from urllib.parse import quote
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.auto import tqdm

# === CONFIG ===
INPUT_CSV = "dataset/queries (1).csv"
OUTPUT_JSON = "dataset/unannotated_dataset.json"
API_URL = "http://185.8.172.121:8000/similar-keywords?query={}&top_k=64"
SAMPLES_PER_QUERY = 10
MAX_WORKERS = 4  # you can safely increase this to 64 or even 128 if API is fast

# === LOAD QUERIES ===
df = pd.read_csv(INPUT_CSV).dropna(subset=["keyword"])
queries = df.to_dict("records")


def fetch(query_row: dict):
    keyword = query_row["keyword"]
    while True:
        try:
            url = API_URL.format(quote(keyword))
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            results = response.json()
            if not results:
                return None

            sampled = random.sample(results, min(SAMPLES_PER_QUERY, len(results)))
            sampled.sort(key=lambda e: e["similarity"], reverse=True)

            return {
                "query": keyword,
                "site_count": query_row["site_count"],
                "options": [
                    {"value": item["keyword"], "score": item["similarity"]}
                    for item in sampled
                ],
            }

        except Exception as e:
            print("Error:", e)
            continue
            # return {"query": keyword, "error": str(e)}


# === MULTITHREADED FETCH ===
dataset = []
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = [executor.submit(fetch, q) for q in queries]
    for f in tqdm(as_completed(futures), total=len(futures), unit="queries"):
        result = f.result()
        if result and "error" not in result:
            dataset.append(result)


  0%|          | 0/1024 [00:00<?, ?queries/s]

Error: HTTPConnectionPool(host='185.8.172.121', port=8000): Read timed out. (read timeout=10)
Error: HTTPConnectionPool(host='185.8.172.121', port=8000): Read timed out.
Error: HTTPConnectionPool(host='185.8.172.121', port=8000): Read timed out.
Error: HTTPConnectionPool(host='185.8.172.121', port=8000): Read timed out. (read timeout=10)
Error: HTTPConnectionPool(host='185.8.172.121', port=8000): Read timed out.
Error: HTTPConnectionPool(host='185.8.172.121', port=8000): Read timed out.
Error: HTTPConnectionPool(host='185.8.172.121', port=8000): Read timed out.
Error: HTTPConnectionPool(host='185.8.172.121', port=8000): Read timed out.
Error: HTTPConnectionPool(host='185.8.172.121', port=8000): Read timed out.
Error: HTTPConnectionPool(host='185.8.172.121', port=8000): Read timed out.
Error: HTTPConnectionPool(host='185.8.172.121', port=8000): Read timed out. (read timeout=10)
Error: HTTPConnectionPool(host='185.8.172.121', port=8000): Read timed out.
Error: HTTPConnectionPool(host='18

In [4]:
# === SAVE OUTPUT ===
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
    json.dump(dataset, f, ensure_ascii=False, indent=2)

print(f"\n✅ Saved {len(dataset)} items to {OUTPUT_JSON}")



✅ Saved 1024 items to dataset/unannotated_dataset.json


username: 20.mahdikh.0
email: 20.mahdikh.0@gmail.com


In [None]:
from openai import project


ls_client.tasks.create(
    data={"query":},
    project=project.id,
)

In [None]:
import asyncio
import aiohttp
import pandas as pd
import random
import os
import pandas as pd
import dotenv
from urllib.parse import quote

# import tqdm.auto as tqdm
from label_studio_sdk import AsyncLabelStudio

dotenv.load_dotenv()


# === CONFIG ===
DATA_DIR = "dataset/href_queries"
LABEL_STUDIO_URL = "http://185.8.172.121:8080/"
# API key is available at the Account & Settings page in Label Studio UI
LABEL_STUDIO_API_KEY = os.environ["LABELSTUDIO_TOKEN"]

API_URL = "http://185.8.172.121:8000/similar-keywords?query={}&top_k=64"
SAMPLES_PER_QUERY = 10
MAX_WORKERS = 4  # you can safely increase this to 64 or even 128 if API is fast


# Connect to the Label Studio API
ls_client = AsyncLabelStudio(
    base_url=LABEL_STUDIO_URL,
    api_key=LABEL_STUDIO_API_KEY,
)

project = await ls_client.projects.get(id=1)

# A basic request to verify connection is working
me = await ls_client.users.whoami()

print("username:", me.username)
print("email:", me.email)


# === LOAD QUERIES ===

# Collect text files
files = [f for f in os.listdir(DATA_DIR) if f.endswith(".txt")]


# Generator of DataFrames — avoids building large Python lists
def load_file(path, filename):
    return pd.read_csv(
        path,
        header=None,  # no header row
        names=["query"],
        engine="python",  # safe for \n separation
        encoding="utf-8",
    ).assign(source=filename)


# Concatenate all at once (pandas handles internal chunking efficiently)
queries_df = pd.concat(
    (load_file(os.path.join(DATA_DIR, f), f) for f in files),
    ignore_index=True,
)


async def fetch_similar(
    session: aiohttp.ClientSession, query: str, top_k=10
) -> list[dict[str, str]]:
    url = API_URL.format(quote(query))
    response = await session.get(url, timeout=10)
    response.raise_for_status()
    results = await response.json()
    if not results:
        raise ValueError(f"Server returned no response for `{query}`: {results}")

    if top_k < len(results):
        results = random.sample(results, min(top_k, len(results)))

    results.sort(key=lambda e: e["similarity"], reverse=True)
    return results


async def create_task(
    query_row: dict[str, str],
    options: list[dict[str, str]],
):
    return await ls_client.tasks.create(
        data={
            "options": [
                {
                    "value": item["keyword"],
                    "score": item["similarity"],
                    "hint": "translation I guess",
                }
                for item in options
            ],
            **query_row,
        },
        project=project.id,
    )


async def create_task_from_query(
    session: aiohttp.ClientSession,
    query_row: dict[str, str],
    sem: asyncio.Semaphore,
):
    while True:
        async with sem:
            try:
                options = await fetch_similar(
                    session,
                    query_row["query"],
                    top_k=SAMPLES_PER_QUERY,
                )
                break

            except Exception as e:
                print(f"Retrying query {query_row}: {e}")
                await asyncio.sleep(3)

    while True:
        try:
            return await create_task(
                query_row=query_row,
                options=options,
            )

        except Exception as e:
            print(f"Retrying task task creation {query_row}: {e}")
            await asyncio.sleep(3)


username: 20.mahdikh.0
email: 20.mahdikh.0@gmail.com


In [39]:
import asyncio
import os

import aiohttp
import dotenv
import tqdm.asyncio
from googletrans import Translator
from label_studio_sdk import AsyncLabelStudio, LabelStudio, RoleBasedTask

# import tqdm.auto as tqdm

dotenv.load_dotenv()


# === CONFIG ===
LABEL_STUDIO_URL = "http://185.8.172.121:8080/"
# API key is available at the Account & Settings page in Label Studio UI
LABEL_STUDIO_API_KEY = os.environ["LABELSTUDIO_TOKEN"]

MAX_WORKERS = 4  # you can safely increase this to 64 or even 128 if API is fast


# Connect to the Label Studio API
als_client = AsyncLabelStudio(
    base_url=LABEL_STUDIO_URL,
    api_key=LABEL_STUDIO_API_KEY,
)

# Connect to the Label Studio API
ls_client = LabelStudio(
    base_url=LABEL_STUDIO_URL,
    api_key=LABEL_STUDIO_API_KEY,
)


project = ls_client.projects.get(id=1)
print(project)
tasks = ls_client.tasks.list(project=project.id)
annotations = []

# A basic request to verify connection is working
me = ls_client.users.whoami()

print("username:", me.username)
print("email:", me.email)

translator = Translator()


async def enrich_translations(task: RoleBasedTask):
    translations = await translator.translate(
        [entry["value"] for entry in task.data["options"]],
        src="auto",
        dest="fa",
    )

    for entry, translation in zip(task.data["options"], translations):
        entry["hint"] = translation.text

    return task


async def translate_task(
    task: RoleBasedTask,
    sem: asyncio.Semaphore,
):
    annotations.extend(task.annotations)
    if task.annotations or all("hint" in cand for cand in task.data["options"]):
        return

    while True:
        async with sem:
            try:
                task = await enrich_translations(task)
                break

            except Exception as e:
                print(f"Translating {task.id}: {e}")
                await asyncio.sleep(3)

    while True:
        try:
            return await als_client.tasks.update(
                task.id, project=project.id, data=task.data
            )

        except Exception as e:
            print(f"Retrying task update {task.id}: {e}")
            await asyncio.sleep(3)


tasks = list(ls_client.tasks.list(project=project.id))


sem = asyncio.Semaphore(MAX_WORKERS)  # concurrency control
async with aiohttp.ClientSession() as session:
    # Iterate asynchronously
    for coro in tqdm.asyncio.tqdm.as_completed(
        [translate_task(t, sem) for t in tasks], unit="tasks"
    ):
        try:
            result = await coro

        except Exception as e:
            print("❌ Error:", e)


agreement_threshold=None allow_stream=None annotation_limit_count=None annotation_limit_percent=None annotator_evaluation_minimum_score=None annotator_evaluation_minimum_tasks=None assignment_settings=None color='#FFFFFF' comment_classification_config=None config_has_control_tags=True config_suitable_for_bulk_annotation=False control_weights={'rel': {'overall': 1.0, 'type': 'Choices', 'labels': {}}} created_at=datetime.datetime(2025, 11, 8, 6, 46, 8, 364936, tzinfo=datetime.timezone.utc) created_by=UserSimple(avatar=None, email='20.mahdikh.0@gmail.com', first_name='', id=1, last_name='') custom_script=None custom_task_lock_ttl=None data_types=None description='' duplication_done=None duplication_status=None enable_empty_annotation=True evaluate_predictions_automatically=False expert_instruction='For each query, you are given a few sample keywords and your job is to select and check the options that are relevant and uncheck the ones that are irrelevant.' finished_task_number=186 ground_

 18%|█▊        | 1104/6019 [05:35<33:15,  2.46tasks/s] 

Translating 2405: Server disconnected
Translating 2406: Server disconnected
Translating 3855: Server disconnected
Translating 180: Server disconnected
Retrying task update 5870: Server disconnected without sending a response.
Retrying task update 3853: Server disconnected without sending a response.


 46%|████▋     | 2798/6019 [16:23<41:37,  1.29tasks/s]  

Translating 6606: 
Translating 6605: 
Translating 2574: 
Translating 4590: 
Retrying task update 4589: 
Translating 2575: [Errno 11001] getaddrinfo failed
Translating 4591: 
Translating 4592: [Errno 11001] getaddrinfo failed
Translating 6607: 
Translating 2576: [Errno 11001] getaddrinfo failed
Translating 6608: 
Translating 4593: [Errno 11001] getaddrinfo failed
Translating 2577: 
Retrying task update 4589: 
Translating 6609: 
Translating 2578: [Errno 11001] getaddrinfo failed
Translating 6610: 
Translating 4594: 
Translating 4595: 
Translating 2579: 
Translating 4596: 
Retrying task update 4589: 
Translating 6612: [Errno 11001] getaddrinfo failed
Translating 6611: 
Translating 2580: [Errno 11001] getaddrinfo failed
Translating 4597: [Errno 11001] getaddrinfo failed
Translating 2581: 
Translating 6613: 
Translating 2582: 
Retrying task update 4589: Server disconnected without sending a response.
Translating 4598: 
Translating 6614: 
Translating 4599: [SSL: CERTIFICATE_VERIFY_FAILED] ce

100%|██████████| 6019/6019 [1:02:12<00:00,  1.61tasks/s] 


In [None]:
tasks = list(ls_client.tasks.list(project=project.id))


In [42]:
annotations = []
for task in tasks:
    annotations.extend(task.annotations)

In [50]:
import pandas as pd


annotations_df = pd.DataFrame(annotations)

In [51]:
annotations_df["created_at_dt"] = pd.to_datetime(annotations_df["created_at"])
annotations_df["updated_at_dt"] = pd.to_datetime(annotations_df["updated_at"])

In [79]:
import random

random_task = random.choice(tasks)

In [83]:
random_task.annotations

[{'id': 414,
  'result': [{'value': {'choices': ['دستگاه لیزر',
      'بهترین دستگاه لیزر',
      'لیزر خال',
      'لیزر جای جوش',
      'لیزر فرکشنال',
      'دستگاه لیزر تیتانیوم',
      'دستگاه لیزر بدن']},
    'id': 'xX8w8WKIlj',
    'from_name': 'rel',
    'to_name': 'q',
    'type': 'choices',
    'origin': 'manual'}],
  'created_username': ' rafatesmino@gmail.com, 5',
  'created_ago': '5\xa0hours, 34\xa0minutes',
  'completed_by': 5,
  'was_cancelled': False,
  'ground_truth': False,
  'created_at': '2025-11-15T08:53:12.154835Z',
  'updated_at': '2025-11-15T08:53:12.154879Z',
  'draft_created_at': '2025-11-15T08:50:53.987440Z',
  'lead_time': 149.639,
  'import_id': None,
  'last_action': None,
  'bulk_created': False,
  'task': 2102,
  'project': 1,
  'updated_by': 5,
  'parent_prediction': None,
  'parent_annotation': None,
  'last_created_by': None}]

In [76]:
import datetime
import plotly.express as px

threshold = datetime.datetime.now(datetime.timezone.utc).replace(
    hour=9, minute=0, second=0, microsecond=0
)

is_today = annotations_df["created_at_dt"] > threshold


fig = px.histogram(
    annotations_df[is_today],
    x="created_at_dt",
    color="created_username",
    nbins=100,  # adjust bin count for smoothness
    title="Annotation Density Over Time by User",
)

fig.update_layout(
    bargap=0.05,
    xaxis_title="Time",
    yaxis_title="Number of Annotations",
)

fig.show()

In [60]:
import plotly.express as px

fig = px.density_heatmap(
    annotations_df,
    x="created_at_dt",
    y="created_username",
    color_continuous_scale="Viridis",
    title="Annotation Density Heatmap",
)

fig.update_layout(
    xaxis_title="Time",
    yaxis_title="User",
)
fig.show()


In [61]:
df = annotations_df.copy()
df["timestamp"] = df["created_at_dt"].astype("int64") // 10**9  # convert to seconds

# Count events per minute per user
df_resampled = (
    df.set_index("created_at_dt")
    .groupby("created_username")
    .resample("1min")["created_username"]
    .count()
    .rename("count")
    .reset_index()
)

fig = px.line(
    df_resampled,
    x="created_at_dt",
    y="count",
    color="created_username",
    title="Smoothed Annotation Density Over Time",
)

fig.show()


In [24]:
from googletrans import Translator

import asyncio

from label_studio_sdk import RoleBasedTask


translator = Translator()


async def enrich_translations(task: RoleBasedTask):
    translations = await translator.translate(
        [entry["value"] for entry in task.data["options"]],
        src="auto",
        dest="fa",
    )

    for entry, translation in zip(task.data["options"], translations):
        entry["hint"] = translation.text

    return task


In [25]:
await enrich_translations(task)

print(task.data["options"])

[{'value': 'מהגרים', 'score': 0.8278806, 'hint': 'مهاجران'}, {'value': 'imigration', 'score': 0.800049, 'hint': 'فراموش کردن'}, {'value': 'مهاجرین', 'score': 0.7876773, 'hint': 'مهاجرین'}, {'value': 'سازمان مهاجرتی', 'score': 0.7745429, 'hint': 'سازمان مهاجرتی'}, {'value': 'moving abroad', 'score': 0.7697719, 'hint': 'حرکت به خارج از کشور'}, {'value': 'expat or immigrant', 'score': 0.76735723, 'hint': 'مهاجر یا مهاجر'}, {'value': 'موسسه مهاجرتی تراست', 'score': 0.74778056, 'hint': 'موسسه مهاجرتی تراست'}, {'value': 'مهاجرت بدون پول', 'score': 0.7448804, 'hint': 'مهاجرت بدون پول'}, {'value': 'الهجرة الى اوروبا', 'score': 0.74148244, 'hint': 'مهاجرت به اروپا'}, {'value': 'immigration:', 'score': 0.7411676, 'hint': 'مهاجرت:'}]


In [35]:
tasks = ls_client.tasks.list(project=project.id)
annotations = []

In [None]:
import tqdm.auto as tqdm


for task in tqdm.tqdm(tasks, total=):
    annotations.extend(task.annotations)
    if task.annotations or all("hint" in cand for cand in task.data["options"]):
        continue

    print(task)
    task = await enrich_translations(task)
    await als_client.tasks.update(task.id, project=project.id, data=task.data)
    break

print(task.id)

0it [00:00, ?it/s]

agreement=None agreement_selected=None annotations=[] annotations_ids='' annotations_results='' annotators=[] annotators_count=None avg_lead_time=None cancelled_annotations=0 comment_authors=[] comment_authors_count=None comment_count=0 comments=None completed_at=None created_at=datetime.datetime(2025, 11, 10, 14, 9, 24, 747569, tzinfo=datetime.timezone.utc) data={'query': 'ضبط صوت سامسونگ', 'site_count': 24, 'options': [{'value': 'ضبط صوت سامسونگ', 'score': 0.99999994}, {'value': 'وصل كردن گوشي به تلويزيون سامسونگ', 'score': 0.8253835}, {'value': 'باز كردن رمز گوشي سامسونگ', 'score': 0.8237331}, {'value': 'وصل كردن گوشي سامسونگ به تلویزیون', 'score': 0.8068063}, {'value': 'پخش کننده موسیقی سامسونگ', 'score': 0.77631205}, {'value': 'voice focus samsung', 'score': 0.77151215}, {'value': 'نصب شنود روی گوشی', 'score': 0.7510103}, {'value': 'call ضبط مکالمه دو طرفه گوشی سامسونگ', 'score': 0.7452122}, {'value': 'قيمت گوشي سامسونگ', 'score': 0.7358978}, {'value': 'دانلود برنامه موسیقی سامسون

In [None]:
task

In [38]:
task.id

17

In [44]:
len(finished)

5802

In [43]:
tasks = await ls_client.tasks.list(project=project.id, include=["data"])

while tasks.has_next:
    for task in tasks.items:
        finished.add(task.data["query"])

    tasks = await tasks.get_next()

  obj, end = self.scan_once(s, idx)


ApiError: status_code: 404, body: {'id': '4f1b4115-8ace-4a70-94af-726f581e5097', 'status_code': 404, 'version': '1.21.0', 'detail': 'Invalid page.', 'exc_info': None}

In [40]:
queries = [q for q in queries_df.to_dict("records") if q["query"] not in finished]


sem = asyncio.Semaphore(1)  # concurrency control
async with aiohttp.ClientSession() as session:
    # Iterate asynchronously
    for coro in tqdm.asyncio.tqdm.as_completed(
        [create_task_from_query(session, t, sem) for t in queries],
        unit="queries",
    ):
        try:
            result = await coro

        except Exception as e:
            print("❌ Error:", e)


100%|██████████| 4994/4994 [34:02<00:00,  2.45queries/s]  


In [38]:
query_row = queries[0]
query_row

{'query': 'امداد خودرو رنو', 'source': 'keywords-04-tir-random.txt'}

In [45]:
query_row = {"query": "ssd سریع"}

In [46]:
async with aiohttp.ClientSession() as session:
    options = await fetch_similar(
        session,
        query_row["query"],
        top_k=SAMPLES_PER_QUERY,
    )

In [47]:
options

[{'keyword': 'بهتر شدن سرعت ssd', 'similarity': 0.8863002},
 {'keyword': 'پایین آمدن سرعت ssd', 'similarity': 0.82714355},
 {'keyword': 'ssd samsung', 'similarity': 0.8130125},
 {'keyword': 'as ssd', 'similarity': 0.8045932},
 {'keyword': 'بهترین هارد ssd', 'similarity': 0.7971209},
 {'keyword': 'tweak ssd', 'similarity': 0.777882},
 {'keyword': 'fastdisk', 'similarity': 0.7752943},
 {'keyword': 'ssd msata', 'similarity': 0.7731728},
 {'keyword': 'msata ssd 512gb', 'similarity': 0.769642},
 {'keyword': 'is internal ssd faster than external', 'similarity': 0.7685696}]

In [13]:
len(options)

10

In [48]:
# async def create_task(
#     query_row: dict[str, str],
#     options: list[dict[str, str]],
# ):
#     return await ls_client.tasks.create(
#         data={
#             "options": [
#                 {"value": item["keyword"], "score": item["similarity"]}
#                 for item in options
#             ],
#             **query_row,
#         },
#         project=project.id,
#     )


task = await create_task(query_row, options)
task

LseTask(agreement=None, agreement_selected=None, annotations=None, annotations_ids=None, annotations_results=None, annotators=None, annotators_count=None, avg_lead_time=None, cancelled_annotations=0, comment_authors=[], comment_authors_count=None, comment_count=0, comments=None, completed_at=None, created_at=datetime.datetime(2025, 11, 12, 15, 22, 59, 789207, tzinfo=datetime.timezone.utc), data={'options': [{'value': 'بهتر شدن سرعت ssd', 'score': 0.8863002, 'hint': 'translation I guess'}, {'value': 'پایین آمدن سرعت ssd', 'score': 0.82714355, 'hint': 'translation I guess'}, {'value': 'ssd samsung', 'score': 0.8130125, 'hint': 'translation I guess'}, {'value': 'as ssd', 'score': 0.8045932, 'hint': 'translation I guess'}, {'value': 'بهترین هارد ssd', 'score': 0.7971209, 'hint': 'translation I guess'}, {'value': 'tweak ssd', 'score': 0.777882, 'hint': 'translation I guess'}, {'value': 'fastdisk', 'score': 0.7752943, 'hint': 'translation I guess'}, {'value': 'ssd msata', 'score': 0.7731728,

In [20]:
task.data

{'options': [{'value': 'لباس مجلسی پسرانه', 'score': 0.9361354},
  {'value': 'لباس پسرانه مجلسی', 'score': 0.92647344},
  {'value': 'خرید لباس مجلسی بچه گانه دخترانه شیک', 'score': 0.848766},
  {'value': 'لباس های مجلسی شیک', 'score': 0.8246183},
  {'value': 'لباسهای مجلسی', 'score': 0.8160416},
  {'value': 'نوجوان لباس مجلسی شیک دخترانه', 'score': 0.81401956},
  {'value': 'لباس مجلسی دخترانه شیک', 'score': 0.8134425},
  {'value': 'لباس مجلسی', 'score': 0.806944},
  {'value': 'لباس مجلسی دخترانه نوجوان', 'score': 0.8037823},
  {'value': 'لباس مجلس', 'score': 0.80318695}],
 'query': 'خرید لباس مجلسی پسرانه',
 'source': 'keywords-04-tir-random.txt'}

In [None]:
from label_studio_sdk.label_interface.objects import PredictionValue
import tqdm.auto as tqdm
import json

PROJECT_ID = 1

project = ls_client.projects.get(id=PROJECT_ID)

# Get the parsed labeling interface to build valid prediction payloads
li = project.get_label_interface()
task_filter = {
    "filters": {
        "conjunction": "and",
        "items": [
            {
                "filter": "filter:tasks:total_annotations",
                "operator": "equal",
                "value": 0,
                "type": "Number",
            },
            {
                "filter": "filter:tasks:cancelled_annotations",
                "operator": "equal",
                "value": 0,
                "type": "Number",
            },
        ],
    }
}

# Iterate tasks and attach predictions
for task in tqdm.tqdm(
    ls_client.tasks.list(
        project=project.id,
        query=json.dumps(task_filter),
    )
):
    task_data = task.data
    new_predictions = ...

    predicted_label = li.get_control("rel").label(new_predictions)
    score = None

    if task.predictions:
        prev_predictions = set(task.predictions[0].result[0]["value"]["choices"])

        common = prev_predictions.intersection(new_predictions)
        union = prev_predictions.union(new_predictions)

        score = len(common) / len(union)

    prediction = PredictionValue(
        model_version="mahdi",
        score=score,
        result=[predicted_label],
    )

    ls_client.predictions.create(task=task.id, **prediction.model_dump())

0it [00:00, ?it/s]

In [None]:
ls_client.tasks.create(data=)

In [None]:
ls_client.tasks.list(
    project=project.id,
    query=json.dumps(task_filter),
)

SyncPagerExt(has_next=True, items=[LseTask(agreement=None, agreement_selected=None, annotations=[], annotations_ids='', annotations_results='', annotators=[], annotators_count=None, avg_lead_time=None, cancelled_annotations=0, comment_authors=[], comment_authors_count=None, comment_count=0, comments=None, completed_at=None, created_at=datetime.datetime(2025, 11, 10, 14, 9, 24, 747569, tzinfo=datetime.timezone.utc), data={'query': 'ضبط صوت سامسونگ', 'site_count': 24, 'options': [{'value': 'ضبط صوت سامسونگ', 'score': 0.99999994}, {'value': 'وصل كردن گوشي به تلويزيون سامسونگ', 'score': 0.8253835}, {'value': 'باز كردن رمز گوشي سامسونگ', 'score': 0.8237331}, {'value': 'وصل كردن گوشي سامسونگ به تلویزیون', 'score': 0.8068063}, {'value': 'پخش کننده موسیقی سامسونگ', 'score': 0.77631205}, {'value': 'voice focus samsung', 'score': 0.77151215}, {'value': 'نصب شنود روی گوشی', 'score': 0.7510103}, {'value': 'call ضبط مکالمه دو طرفه گوشی سامسونگ', 'score': 0.7452122}, {'value': 'قيمت گوشي سامسونگ', '

In [52]:
list(_47)

[LseTask(agreement=None, agreement_selected=None, annotations=[], annotations_ids='', annotations_results='', annotators=[], annotators_count=None, avg_lead_time=None, cancelled_annotations=0, comment_authors=[], comment_authors_count=None, comment_count=0, comments=None, completed_at=None, created_at=datetime.datetime(2025, 11, 10, 14, 9, 24, 747569, tzinfo=datetime.timezone.utc), data={'query': 'ضبط صوت سامسونگ', 'site_count': 24, 'options': [{'value': 'ضبط صوت سامسونگ', 'score': 0.99999994}, {'value': 'وصل كردن گوشي به تلويزيون سامسونگ', 'score': 0.8253835}, {'value': 'باز كردن رمز گوشي سامسونگ', 'score': 0.8237331}, {'value': 'وصل كردن گوشي سامسونگ به تلویزیون', 'score': 0.8068063}, {'value': 'پخش کننده موسیقی سامسونگ', 'score': 0.77631205}, {'value': 'voice focus samsung', 'score': 0.77151215}, {'value': 'نصب شنود روی گوشی', 'score': 0.7510103}, {'value': 'call ضبط مکالمه دو طرفه گوشی سامسونگ', 'score': 0.7452122}, {'value': 'قيمت گوشي سامسونگ', 'score': 0.7358978}, {'value': 'دان

In [24]:
task.cancelled_annotations

0

In [None]:
new_predictions = [
    "وصل كردن گوشي سامسونگ به تلویزیون",
    "ضبط صوت سامسونگ",
]

predicted_label = li.get_control("rel").label(new_predictions)
score = None

if task.predictions:
    prev_predictions = set(task.predictions[0].result[0]["value"]["choices"])

    common = prev_predictions.intersection(new_predictions)
    union = prev_predictions.union(new_predictions)

    score = len(common) / len(union)

prediction = PredictionValue(
    model_version="mahdi",
    score=score,
    result=[predicted_label],
)

ls_client.predictions.create(task=task.id, **prediction.model_dump())