In [None]:
# ===============================
# Review Category Relevance Model
# ===============================
!pip install --upgrade transformers sentence-transformers --quiet

import pandas as pd
from sentence_transformers import SentenceTransformer, util
from huggingface_hub import InferenceClient
from google.colab import drive
import torch
import os
import json
import math

# -------- SETTINGS --------
# Maximum number of keywords kept per category (default for generate_keywords_batch).
TOP_N_WORDS = 50
# Weights of the two similarity signals combined into the final relevanceScore.
WEIGHT_CAT = 0.5      # weight of category-name vs review similarity
WEIGHT_KEYEMB = 0.5   # weight of best keyword vs review similarity
# Sentence-transformer checkpoint used for all embeddings.
MODEL_NAME_EMB = "sentence-transformers/all-MiniLM-L6-v2"
# Chat model queried through the Hugging Face Inference API for keyword generation.
HF_MODEL = "Qwen/Qwen3-4B-Instruct-2507"
# Prefer the HF_TOKEN environment variable so a real secret never has to be
# written into the notebook; the placeholder is kept as a fallback so the
# original "edit this line" workflow still works.
HF_TOKEN = os.environ.get("HF_TOKEN") or "<token>" #replace with Hugging Face token

# -------- MOUNT DRIVE --------
# Google Drive holds the input workbook, the scored output and the keyword cache.
drive.mount('/content/drive')
FILE_PATH = "/content/drive/MyDrive/toktoktest/scraped_dataset_cleaned_scored.xlsx"
OUTPUT_FILE = "/content/drive/MyDrive/toktoktest/dataset_with_category_relevance_score.xlsx"
PARTIAL_FILE = "/content/drive/MyDrive/toktoktest/category_keywords_partial.json"

# -------- LOAD DATA --------
print("Loading dataset...")
df = pd.read_excel(FILE_PATH)

# -------- CLEAN COLUMNS TO STRINGS --------
# Missing cells become empty strings so both text columns are plain str.
for _text_column in ("category", "reviewText"):
    df[_text_column] = df[_text_column].fillna("").astype(str)

# -------- LOAD SENTENCE TRANSFORMER --------
print(f"Loading sentence transformer: {MODEL_NAME_EMB} ...")
# Run the embedding model on the GPU when one is visible to torch.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model = SentenceTransformer(MODEL_NAME_EMB, device=device)

# -------- SETUP HF INFERENCE API --------
print(f"Connecting to Hugging Face Inference API: {HF_MODEL} ...")
client = InferenceClient(HF_MODEL, token=HF_TOKEN)

# -------- LOAD OR INITIALIZE PARTIAL RESULTS --------
# category_to_words maps a category name to its list of generated keywords.
# It is restored from PARTIAL_FILE so an interrupted run can resume; a file
# that is truncated or does not hold a JSON object is ignored (with a message)
# instead of aborting every later run.
category_to_words = {}
if os.path.exists(PARTIAL_FILE):
    try:
        with open(PARTIAL_FILE, "r") as f:
            _loaded_keywords = json.load(f)
    except json.JSONDecodeError as exc:
        print(f"Could not parse {PARTIAL_FILE} ({exc}); starting with no cached keywords.")
    else:
        if isinstance(_loaded_keywords, dict):
            category_to_words = _loaded_keywords
        else:
            print(f"{PARTIAL_FILE} does not contain a JSON object; starting with no cached keywords.")

# -------- FUNCTION TO GENERATE KEYWORDS FOR BATCH --------
def _parse_keyword_json(text):
    """
    Parse a model reply into a dict, or return None when no JSON object is found.

    The whole reply is tried first; if that fails, the span from the first "{"
    to the last "}" is tried, which covers replies wrapped in Markdown code
    fences or surrounded by explanatory prose.
    """
    candidates = [text]
    start, end = text.find("{"), text.rfind("}")
    if 0 <= start < end:
        candidates.append(text[start:end + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # Only a JSON object can be interpreted as {category: keywords}.
        if isinstance(parsed, dict):
            return parsed
    return None


def generate_keywords_batch(categories, top_n=TOP_N_WORDS):
    """
    Generate keywords for multiple categories in a single API call.

    Args:
        categories: iterable of category names; each is converted with str()
            and sent to the chat model as a JSON array.
        top_n: number of keywords requested per category; each returned list
            is truncated to at most this many entries.

    Returns:
        A dict: {category: [keyword1, keyword2, ...]} with keywords stripped
        and lower-cased. Keys are whatever the model replied with, so a
        requested category may be absent. An empty dict is returned when the
        reply contains no parseable JSON object.
    """
    prompt = (
        f"For each of the following categories, generate {top_n} keywords that "
        "will appear in reviews, as a comma-separated list, no explanation. "
        "Respond in JSON format with category as key and list of keywords as value.\n"
    )
    prompt += json.dumps([str(c) for c in categories])

    # NOTE(review): the reply budget is fixed; a large batch may be cut off
    # mid-JSON, in which case parsing fails and {} is returned - confirm the
    # batch sizes used by the caller fit in 2000 tokens.
    response = client.chat.completions.create(
        model=HF_MODEL,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=2000
    )

    # Guard against a reply without text content.
    text = response.choices[0].message["content"] or ""

    parsed = _parse_keyword_json(text)
    if parsed is None:
        print("Error parsing JSON from response:")
        print(text)
        return {}

    # Normalise every value to a clean list of at most top_n keywords.
    result = {}
    for category, raw_keywords in parsed.items():
        if isinstance(raw_keywords, list):
            # Skip non-string items (null, nested lists, ...) instead of crashing.
            pieces = [w for w in raw_keywords if isinstance(w, str)]
        else:
            # fallback: split string by commas
            pieces = str(raw_keywords).split(",")
        cleaned = [w.strip().lower() for w in pieces]
        result[category] = [w for w in cleaned if w][:top_n]
        print(f"    {category} → {result[category]}")

    return result

# -------- GENERATE KEYWORDS IN BATCHES WITH RETRY --------
unique_categories = df["category"].unique()
print(unique_categories)
# Remove empty strings
unique_categories = [c for c in unique_categories if c.strip()]
# A category is done only when it already has a non-empty keyword list, so an
# empty list left over from an earlier run is regenerated.
remaining_categories = [c for c in unique_categories if not category_to_words.get(c)]
print(f"{len(remaining_categories)} categories remaining to generate keywords...")

if remaining_categories:
    # Split the work into (at most) five API calls.
    batch_size = max(1, math.ceil(len(remaining_categories) / 5))

    for i in range(0, len(remaining_categories), batch_size):
        batch_cats = remaining_categories[i:i+batch_size]
        batch_results = generate_keywords_batch(batch_cats, TOP_N_WORDS)

        # Update partial results
        category_to_words.update(batch_results)

        # Retry individually every category that is absent from the reply or
        # came back with no keywords.
        missing = [c for c in batch_cats if not batch_results.get(c)]
        if missing:
            print("Retrying missing categories individually:", missing)
            for cat in missing:
                retry_result = generate_keywords_batch([cat], TOP_N_WORDS)
                if not retry_result.get(cat) and len(retry_result) == 1:
                    # Only one category was requested, so a single entry under
                    # a re-spelled key still belongs to this category.
                    retry_result = {cat: next(iter(retry_result.values()))}
                category_to_words.update(retry_result)

        # Save partial progress after each batch
        with open(PARTIAL_FILE, "w") as f:
            json.dump(category_to_words, f)

    # Make remaining gaps visible: these categories get no keyword signal later.
    still_missing = [c for c in remaining_categories if not category_to_words.get(c)]
    if still_missing:
        print("Warning: no keywords generated for:", still_missing)
else:
    print("No categories left to process.")

# -------- MAP KEYWORDS TO DF --------
# Look up each row's keyword list by its category name.
df["relevantWords"] = df["category"].map(category_to_words)

# -------- SIMILARITY SCORE (Category ↔ Review) --------
print("Computing embeddings similarity (category vs review)...")
category_texts = df["category"].tolist()
review_texts = df["reviewText"].tolist()
cat_embs = model.encode(category_texts, convert_to_tensor=True)
rev_embs = model.encode(review_texts, convert_to_tensor=True)

# Score each row: cosine similarity between its category and its review.
row_scores = []
for cat_vec, rev_vec in zip(cat_embs, rev_embs):
    row_scores.append(util.cos_sim(cat_vec, rev_vec).item())
df["similarity_catReview"] = row_scores

# -------- KEYWORD EMBEDDING SIMILARITY --------
print("Computing similarity to keyword embeddings...")

# Keyword embeddings already computed, keyed by (id(model), tuple(keywords)).
# The model is stored next to the embeddings so it stays alive and its id()
# cannot be reused by another object. One entry per distinct keyword list.
_KEYWORD_EMB_CACHE = {}


def review_keyword_similarity(review, keywords, model):
    """
    Return the highest cosine similarity between a review and any keyword.

    Args:
        review: review text passed to model.encode.
        keywords: list of keyword strings; anything that is not a non-empty
            list (for example NaN for an unmapped category) yields 0.0.
        model: object exposing encode(..., convert_to_tensor=True).

    Returns:
        The maximum review/keyword cosine similarity as a float, or 0.0 when
        there are no keywords.
    """
    if not isinstance(keywords, list) or not keywords:  # Check if keywords is a list and not empty
        return 0.0

    # Rows of the same category share one keyword list, so encode it once
    # instead of once per row.
    cache_key = (id(model), tuple(keywords))
    cached = _KEYWORD_EMB_CACHE.get(cache_key)
    if cached is None:
        kw_embs = model.encode(keywords, convert_to_tensor=True)
        _KEYWORD_EMB_CACHE[cache_key] = (model, kw_embs)
    else:
        kw_embs = cached[1]

    rev_emb = model.encode(review, convert_to_tensor=True)
    return util.cos_sim(rev_emb, kw_embs).max().item()

# Best keyword match per row, using that row's own keyword list.
keyword_scores = []
for review_text, keyword_list in zip(df["reviewText"], df["relevantWords"]):
    keyword_scores.append(review_keyword_similarity(review_text, keyword_list, model))
df["similarity_keywords"] = keyword_scores

# -------- FINAL RELEVANCE SCORE --------
# Weighted sum of the two similarity signals.
category_component = WEIGHT_CAT * df["similarity_catReview"]
keyword_component = WEIGHT_KEYEMB * df["similarity_keywords"]
df["relevanceScore"] = category_component + keyword_component

# -------- SAVE RESULTS --------
df.to_excel(OUTPUT_FILE, index=False)
print(f"\nResults saved to {OUTPUT_FILE}")
print(f"Partial keyword progress saved to {PARTIAL_FILE}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading dataset...
Loading sentence transformer: sentence-transformers/all-MiniLM-L6-v2 ...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Connecting to Hugging Face Inference API: Qwen/Qwen3-4B-Instruct-2507 ...
['Tourist attraction' 'Restaurant' 'Observation deck' 'Water park'
 'Garden' 'Hotel' 'Park' 'Botanical garden' 'Railway services'
 'Scenic spot' 'Subway station' 'Non-profit organization' 'Cafe'
 'Department store' 'Theme park' 'International airport'
 'General practitioner' 'Archaeological site' 'Historical landmark'
 'Coffee store' 'Spa' 'Wildlife and safari park' 'Night market' 'Stadium'
 'Shopping mall' 'Hairdresser' 'Performing arts theater'
 'Historical place' 'Hospital' 'Art center' 'Furniture store'
 "Children's museum" 'Bank' 'Gym' 'Shopping Mall' 'Museum'
 'Airport Terminal' 'Supermarket' 'Train Station' 'Clinic' 'Library'
 'Theater' 'Pharmacy']
43 categories remaining to generate keywords...
    Tourist attraction → ['amazing', 'fascinating', 'must visit', 'popular', 'scenic', 'historic', 'unique', 'breathtaking', 'unforgettable', 'lively', 'cultural', 'picturesque', 'well-maintained', 'family-friendly