In [None]:
from datasets import load_dataset, Dataset
import pandas as pd
from huggingface_hub import login
from transformers import pipeline
import torch
import accelerate
import os

In [16]:
# Load the Question-Sparql dataset from the Hugging Face Hub by repo id.
ds = load_dataset("julioc-p/Question-Sparql")

In [None]:
# Authenticate against the Hugging Face Hub. The token is read from the
# HF_TOKEN environment variable so it never appears in the notebook itself;
# a missing variable raises KeyError here.
login(token=os.environ["HF_TOKEN"])

In [None]:
# Text-generation pipeline used as the yes/no classifier further down.
# Weights are loaded in bfloat16 and the pipeline is bound to device index 0.
pipe = pipeline(
    task="text-generation",
    model="google/gemma-2-27b-it",
    model_kwargs=dict(torch_dtype=torch.bfloat16),
    device=0,
)

Loading checkpoint shards:   0%|          | 0/12 [00:00<?, ?it/s]

In [None]:
def get_answer_batch(texts, batch_size=128, max_new_tokens=5, generator=None):
    """Ask the chat model, for each text, whether it is a statement.

    Every text is wrapped in a single-turn user prompt that asks for a bare
    "yes"/"no" answer. A text is flagged ``True`` when the model's reply
    contains "yes" (compared case-insensitively, so "Yes" and "YES" count).

    Args:
        texts: Iterable of strings to classify.
        batch_size: Batch size forwarded to the pipeline call.
        max_new_tokens: Generation budget forwarded to the pipeline call.
        generator: Callable with the text-generation pipeline's chat
            interface. Defaults to the notebook-level ``pipe``.

    Returns:
        A list of booleans, one per input text and in the same order.
        An empty input yields an empty list without calling the model.
    """
    texts = list(texts)
    if not texts:
        # Nothing to classify; avoid handing an empty batch to the pipeline.
        return []

    if generator is None:
        generator = pipe

    messages = [
        [{"role": "user", "content": f'Tell me if the following text is a statement and not a query (asking for information), answer "yes" or "no" without any further characters: "{text}"'}]
        for text in texts
    ]

    with torch.inference_mode():
        outputs = generator(messages, max_new_tokens=max_new_tokens, batch_size=batch_size)

    # For chat input, each output's "generated_text" is the conversation with
    # the model's reply appended as the last message. Lower-casing the reply
    # before the containment check keeps "Yes" from being read as a "no".
    return [
        "yes" in output[0]["generated_text"][-1]["content"].strip().lower()
        for output in outputs
    ]

In [9]:
def is_statement_batch(texts):
    """Return one boolean per text: True where the model answered "yes".

    Thin alias over ``get_answer_batch``; the argument is passed through
    unchanged and the result is returned as-is.
    """
    statement_flags = get_answer_batch(texts)
    return statement_flags

In [26]:
# Convert the "train" split to a pandas DataFrame for row-level filtering.
df = ds["train"].to_pandas()

In [None]:
# Separate English and non-English rows.
# df_en is copied explicitly because a column is added to it below; writing
# into a boolean-mask slice of df is what triggers SettingWithCopyWarning.
df_en = df[df["language"] == "en"].copy()
df_non_en = df[df["language"] != "en"]

# Run inference only on the English text_query values
df_en["is_statement"] = is_statement_batch(df_en["text_query"].tolist())

# Keep only English rows that are not statements + all non-English rows.
# The is_statement helper column is still present after the concat (it is
# missing/NaN for the non-English rows).
df_filtered = pd.concat([df_non_en, df_en[~df_en["is_statement"]]], ignore_index=True)

In [None]:
# Remove the helper column used for filtering. errors="ignore" keeps this cell
# safe to re-run: a second execution no longer raises KeyError once the
# column is already gone.
df_filtered = df_filtered.drop(columns="is_statement", errors="ignore")

In [None]:
# Show the filtered frame as this cell's output for a visual sanity check.
df_filtered

In [None]:
# Wrap the filtered DataFrame back into a datasets.Dataset for upload.
filtered_ds = Dataset.from_pandas(df_filtered)

In [None]:
# Upload the filtered dataset to the Hub.
# NOTE(review): the target repo id is the same one the data was loaded from,
# so this appears to overwrite the source dataset and a later re-run of the
# notebook would start from already-filtered data — confirm this is intended.
filtered_ds.push_to_hub("julioc-p/Question-Sparql")