In [None]:
import os
import pandas as pd
from transformers import pipeline
from datasets import load_dataset, Dataset
from tqdm import tqdm

# Path to the cleaned CSV. The file must contain a column named
# 'cleaned_text', which holds the review text that gets classified.
path = 'data/Cleaned_review-Alaska_10.csv'

# Number of reviews to label, and the seed that makes the draw repeatable.
SAMPLE_SIZE = 100
SEED = 42

dataset = pd.read_csv(path)

# read_csv turns blank cells into NaN (a float, not text), which the text
# classifier cannot process, so rows without review text are left out of
# the pool that is sampled from.
labelable = dataset.dropna(subset=['cleaned_text'])

# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling without replacement), so cap n at the pool size. The fixed
# random_state means a Restart & Run All labels the same reviews each time.
# The original row index is kept on purpose: a later cell drops the
# '__index_level_0__' column that it produces.
test = labelable.sample(n=min(SAMPLE_SIZE, len(labelable)), random_state=SEED)

In [None]:
# Wrap the sampled reviews in a Hugging Face Dataset so they can be
# processed with Dataset.map further down.
hf_dataset = Dataset.from_pandas(test)

# Zero-shot classification pipeline on the facebook/bart-large-mnli
# checkpoint, configured with a pipeline batch size of 16.
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
    batch_size=16,
)

# Label set handed to the zero-shot classifier: one "relevant" label followed
# by one label per reason a review can be judged irrelevant. Every irrelevant
# label has the form "irrelevant – <reason>" (the separator is an en dash),
# so only the reason text is spelled out here.
_IRRELEVANT_REASONS = (
    "advertisement or promotional content",
    "marketing or self-promotion",
    "contains discounts, sales, or offers",
    "spammy content or repeated promotions",
    "does not describe the business or location",
    "mentions another place instead",
    "talks about unrelated personal matters",
    "contains generic statements with no context",
    "rant without context or explanation",
    "vague or emotional outburst only",
    "contains only star ratings or short phrases with no details",
    "irrelevant complaint with no connection to the service",
    "duplicate review content",
    "gibberish, nonsense text, or random characters",
    "copy-pasted text not related to this location",
    "repeated meaningless phrases or emojis",
    "fake review not based on a real experience",
    "misleading or fabricated content",
    "hostile trolling or abusive language unrelated to the service",
    "written in an unsupported language",
    "contains only symbols, numbers, or links",
    "irrelevant website links or contact info",
)

candidate_labels = [
    "relevant – genuine review describing a real experience",
    *(f"irrelevant – {reason}" for reason in _IRRELEVANT_REASONS),
]

# Define a processing function
def classify_batch(batch):
    """Label every review in a batch as relevant (1) or irrelevant (0).

    Written to be passed to ``Dataset.map(..., batched=True)``. The texts are
    scored by the module-level ``classifier`` against the module-level
    ``candidate_labels`` with ``multi_label=False``, and only the top label
    of each result is used.

    Args:
        batch: Mapping from column name to a list of values. The review
            texts are read from ``batch["cleaned_text"]``.

    Returns:
        A dict ``{"classification": [...]}`` holding one int per input text:
        1 when the top label starts with "relevant", otherwise 0.
    """
    results = classifier(
        batch["cleaned_text"],
        candidate_labels,
        multi_label=False,
    )

    # A batch holding a single text can come back as one bare result dict
    # instead of a list of dicts (older transformers releases unwrap it).
    # Iterating a dict would walk its keys and break the lookup below, so
    # normalize to a list first.
    if isinstance(results, dict):
        results = [results]

    # Index 0 of each result's "labels" is taken as the top label. The
    # irrelevant labels start with "irrelevant", so the startswith check
    # only matches the relevant label.
    return {
        "classification": [
            1 if result["labels"][0].startswith("relevant") else 0
            for result in results
        ]
    }

# Apply classify_batch across the dataset, 32 reviews per call. The dict it
# returns adds a "classification" column to the mapped dataset.
hf_dataset = hf_dataset.map(
    classify_batch,
    batched=True,
    batch_size=32,
)

In [None]:
# Convert the labelled dataset back to pandas for inspection and export.
pd_dataset = hf_dataset.to_pandas()

# Dataset.from_pandas stores a non-default DataFrame index as an
# '__index_level_0__' column. It is bookkeeping rather than review data, so
# remove it. errors='ignore' keeps this cell working when the column is
# absent (for example when the source frame had a default RangeIndex),
# where a plain drop would raise KeyError.
pd_dataset_final = pd_dataset.drop(columns='__index_level_0__', errors='ignore')

In [None]:
# Preview the first five labelled rows.
pd_dataset_final.head(n=5)

In [None]:
# Write the labelled reviews next to the input file, using the input's file
# name prefixed with 'Labelled_'.
dir_name, base_name = os.path.split(path)
file_name = f'Labelled_{base_name}'
save_path = os.path.join(dir_name, file_name)

# index=False: the DataFrame index is not written to the CSV.
pd_dataset_final.to_csv(save_path, index=False)