In [23]:
# --- Pipeline configuration ---
# Detection model whose output is being processed; also used as the key prefix
# in the final match JSON. One of: sam / gdino / manual.
MODEL = "gdino"

# Folder holding one sub-folder per item id, each containing Lens result JSONs.
LENS = "../gdinoOutput/lens/"
# Folder the final match JSONs are written to (one sub-folder per item id).
FINAL = "../gdinoOutput/final-original/"
# CSV whose 'id' column lists the item ids to process.
TEXT_PROMPT_CSV = "../zm_scraper/items-prompt.csv"

# A token is kept only if its count is at least this fraction (5%) of the number
# of results for the mask; the tokenizer additionally enforces a floor of 2.
RESULTS_THRESH = 0.05
# After frequency filtering, keep only this fraction (top 50%) of the sorted tokens.
TOKENS_LIMIT = 0.5

# CSV of known items used by the deterministic matcher.
MASTER_LIST_CSV = "../zm_scraper/master-list.csv"

## Parsing Lens data
1. Based on the kept data, keep the most frequent keywords.
2. Limit the list to the 10 most frequent keywords.
3. Embed this list of frequent keywords and find the closest item in our embedded list of items (by cosine similarity).


In [24]:
# NLTK data has been manually installed, so nothing is downloaded here.
# NOTE: the message below is printed unconditionally after the import; this cell
# does not itself verify that the required corpora are present.
import nltk
print("NLTK data ready!")

NLTK data ready!


In [25]:
# Filter each results array and extract the top substrings
# 1. Preprocess titles: make lower case, remove stopwords based on the Natural Language Toolkit (NLTK)
# 2. Count unigram and bigram tokens. Keep tokens whose count is at least RESULTS_THRESH (5%) of the
#    number of results obtained (minimum 2), then keep the top TOKENS_LIMIT (50%) of those
# 3. Store those tokens inside the json 'tokens' key
# Note to future self: remove the bad error checking in request when google lens does not return anything

import os
import json
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.util import ngrams
from collections import Counter

# Start from NLTK's English stopword list
stop_words = set(stopwords.words('english'))

# Marketplace names and listing boilerplate that carry no information about the item
custom_stopwords = {
    'ebay', 'amazon', 'homedepot', 'etsy', 'walmart', 'target',
    'shop', 'sale', 'brandnew', 'free', 'shipping', 'official', 'store',
}

# Final exclusion set: NLTK stopwords plus the custom ones
stop_words |= custom_stopwords
def preprocess_title(title):
    """Split a listing title into lowercase word tokens, dropping stopwords.

    Args:
        title: Raw title string.

    Returns:
        List of tokens in their original order. A token is any run of word
        characters (regex ``\\w+``) from the lowercased title that is not
        present in the module-level ``stop_words`` set.
    """
    words = re.findall(r'\b\w+\b', title.lower())
    kept = []
    for word in words:
        if word in stop_words:
            continue
        kept.append(word)
    return kept

def _collect_title_tokens(results):
    """Return the unigram and bigram tokens of every titled result dict in ``results``."""
    collected = []
    # A failed Lens lookup is stored as a one-element list holding this message.
    if not isinstance(results, list) or results == ["Google Lens didn't return any results."]:
        return collected
    for result in results:
        if not isinstance(result, dict):
            continue
        title = result.get("title", "").strip()
        if not title:
            continue
        tokens = preprocess_title(title)
        collected.extend(tokens)
        collected.extend(" ".join(bg) for bg in ngrams(tokens, 2))  # Add bigrams
    return collected


def _select_top_tokens(token_counts, min_count):
    """Return (token, count) pairs with count >= min_count, most frequent first, cut to TOKENS_LIMIT."""
    frequent = [(token, count) for token, count in token_counts.items() if count >= min_count]
    # list.sort is stable, so equally frequent tokens keep first-seen order
    frequent.sort(key=lambda pair: pair[1], reverse=True)
    top_n = max(1, int(len(frequent) * TOKENS_LIMIT))
    return frequent[:top_n]


def update_json_with_tokens(json_path):
    """Compute the most frequent title tokens for each mask and store them in the JSON file.

    The file maps mask ids to dicts. For every mask with a non-empty
    ``"results"`` value, the titles of the result dicts are tokenized
    (unigrams plus bigrams), counted, filtered by ``RESULTS_THRESH`` (with a
    floor of 2 occurrences), sorted by count and truncated to the top
    ``TOKENS_LIMIT`` fraction. The outcome is written to the mask's
    ``"tokens"`` and ``"token_counts"`` keys. Masks without results are left
    untouched.

    The file is rewritten when any mask's token data was set. This includes
    a reset to empty values that differs from what was stored, so stale
    tokens from an earlier run cannot survive on disk.

    NOTE: a later cell defines another function with this same name (the
    matching step); once that cell has run, this definition is shadowed.

    Args:
        json_path: Path of the Lens result JSON to read and update in place.
    """
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    updated = False

    for mask_id, mask_info in data.items():
        results = mask_info.get("results", [])
        if not results:
            continue

        mask_tokens = _collect_title_tokens(results)
        if not mask_tokens:
            # Only flag a write when the reset actually changes the stored values;
            # otherwise re-runs would rewrite identical files.
            if mask_info.get("tokens") != [] or mask_info.get("token_counts") != {}:
                updated = True
            mask_info["tokens"] = []
            mask_info["token_counts"] = {}
            continue

        # Threshold: minimum appearances based on n% of number of results
        min_num_results = max(2, int(len(results) * RESULTS_THRESH))
        top_tokens_with_counts = _select_top_tokens(Counter(mask_tokens), min_num_results)
        top_tokens = [token for token, _ in top_tokens_with_counts]

        # Save filtered tokens and their counts
        print(f"MASK {mask_id} TOKENS: {top_tokens}")
        mask_info["tokens"] = top_tokens
        mask_info["token_counts"] = dict(top_tokens_with_counts)
        updated = True

    if updated:
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=4, ensure_ascii=False)
        print(f"[UPDATED] {json_path}")
    else:
        print(f"[SKIPPED] {json_path}")

def process_jsons():
    """Run the tokenization step on every Lens JSON of every listed item.

    Item ids come from the 'id' column of ``TEXT_PROMPT_CSV``. For each id
    whose folder exists under ``LENS``, every ``*.json`` file in that folder
    is announced on stdout and passed to ``update_json_with_tokens``.
    Ids without a folder and non-JSON files are skipped silently.
    """
    prompt_df = pd.read_csv(TEXT_PROMPT_CSV)

    for item_id in prompt_df['id'].astype(str).tolist():
        item_folder = os.path.join(LENS, item_id)
        if os.path.exists(item_folder):
            json_names = [entry for entry in os.listdir(item_folder) if entry.endswith(".json")]
            for file_name in json_names:
                print(f"FILE: {file_name}")

                update_json_with_tokens(os.path.join(item_folder, file_name))

# Entry point for this cell: tokenize every item's Lens JSONs, then report completion.
if __name__ == "__main__":
    process_jsons()
    print("All COMPLETE!")



FILE: p1187246330.json
MASK 1 TOKENS: ['red', 'leather', 'bag', 'clutch', 'wallet', 'vintage', 'clutch bag', 'pouch', 'com', 'tray', 'x', 'case', 'red leather', 'patent', 'saint', 'laurent', 'saint laurent', 'set', 'new', '5', 'box', 'purse', 'serving', 'cartier', 'authentic']
[UPDATED] ../gdinoOutput/lens/1/p1187246330.json
FILE: c1186222082.json
MASK 1 TOKENS: ['nintendo', 'dsi', 'nintendo dsi', 'white', 'console', 'dsi white', 'ds', 'handheld', 'dsi console', 'nintendo ds', 'console white', 'tested', 'game', 'system', 'consoles', 'charger', 'japanese', 'game console', 'japan', 'dsi handheld', 'ntsc', 'j', 'ntsc j', 'pen', 'handheld game', 'white console', 'color', 'box', 'used', 'com', 'com nintendo', 'games', 'mario', 'handheld system', 'white handheld', 'youtube', 'ホワイト', 'w', 'touch', 'touch pen']
[UPDATED] ../gdinoOutput/lens/1/c1186222082.json
FILE: r1172860507.json
MASK 1 TOKENS: ['nintendo', 'dsi', 'nintendo dsi', 'console', 'black', 'japan', 'ds', 'nintendo ds', 'lite', 'ds 

### Embed token results 
1. Embed those tokens with an LLM.
2. Based on the embeddings, derive each item's cosine similarity from the item list.

**Embedding Concerns**
1. Multilinguality: 'rosa' (Español) = 'rose' (English). The embeddings need to be able to match across languages.
2. Match to item list: soft match or hard match?
3. 

## DETERMINISTIC MATCHING

In [26]:
# When True, match_mask() passes every token through translate_token() before matching.
TRANSLATE = False

In [27]:
import os
import json
import pandas as pd
from collections import defaultdict
from langdetect import detect
from deep_translator import GoogleTranslator

# Detect and translate individual token if needed
def translate_token(token):
    """Return ``token`` lowercased and stripped, translated to English when needed.

    The language is detected with ``langdetect.detect``; if it is not
    English the token is sent through ``GoogleTranslator``. This is
    best-effort: if detection or translation raises, or the translator does
    not return a string, the original token is returned (lowercased and
    stripped).

    NOTE(review): langdetect is documented as non-deterministic unless
    ``DetectorFactory.seed`` is set, so very short tokens may be classified
    differently between runs. Confirm whether a seed should be set.

    Args:
        token: Token string to normalize.

    Returns:
        The normalized (and possibly translated) token.
    """
    try:
        if detect(token) != "en":
            translated = GoogleTranslator(source='auto', target='en').translate(token)
            if isinstance(translated, str):
                return translated.lower().strip()
    except Exception:
        # Deliberate best-effort fallback. `Exception` (not a bare `except:`) so
        # that KeyboardInterrupt / SystemExit still stop a long-running loop.
        pass
    return token.lower().strip()  # fallback: return as-is (lowercased)

# -------------------
# Load Master List
# -------------------
# Read the master-list CSV and build a dict keyed by the first column (cast to
# str) with the second column as value. Code further down iterates it as
# (item_id, name) pairs. Because this is a plain dict built from zip(), a
# repeated key keeps the value of its last row.
master_list_df = pd.read_csv(MASTER_LIST_CSV)
master_hashmap = dict(zip(master_list_df.iloc[:, 0].astype(str), master_list_df.iloc[:, 1]))

# Normalize master list into token sets
def normalize(text):
    """Return the lowercase, whitespace-separated words of ``text``.

    Hyphens are treated as word separators, so "ds-lite" yields
    ["ds", "lite"].
    """
    dehyphenated = text.replace("-", " ")
    words = dehyphenated.lower().split()
    return words

# Per-item lookup: the name's token set, the "type" terms it contains, and the raw name
master_processed = {}
# Inverted index: token -> ids of the master-list items whose name contains that token
token_index = defaultdict(set)

for item_id, name in master_hashmap.items():
    tokens = normalize(name)
    master_processed[item_id] = dict(
        tokens=set(tokens),
        type_tokens={"console", "controller"} & set(tokens),  # type terms
        raw=name,
    )
    for token in tokens:
        token_index[token].add(item_id)

# -------------------
# Deterministic Matching Logic (rank-aware)
# -------------------
def match_mask(ranked_tokens):
    """Pick the master-list item that best matches a ranked token list.

    Each token gets a weight of ``len(ranked_tokens) - rank``, so earlier
    tokens count more. Every master item whose name contains a token
    accumulates that token's weight (looked up through ``token_index``). An
    item is only eligible if at least one of its ``type_tokens`` is among the
    query tokens. The eligible item with the highest score wins.

    Tokens are lowercased, or passed through ``translate_token`` when
    ``TRANSLATE`` is set. If that makes two tokens identical, the token keeps
    the weight of its best-ranked occurrence.

    Ties are broken by master-list order (first entry in
    ``master_processed`` wins), so the result does not depend on set
    iteration order.

    Args:
        ranked_tokens: Tokens ordered from most to least frequent.

    Returns:
        The matching item id, or "" when no eligible item scored.
    """
    if TRANSLATE:
        normalized = [translate_token(token) for token in ranked_tokens]
    else:
        normalized = [token.lower() for token in ranked_tokens]

    total = len(normalized)
    token_weights = {}
    for rank, token in enumerate(normalized):
        # setdefault keeps the first (highest) weight; a plain assignment would
        # let a later duplicate overwrite it with a lower one.
        token_weights.setdefault(token, total - rank)

    scores = defaultdict(int)
    for token, weight in token_weights.items():
        for item_id in token_index.get(token, ()):
            scores[item_id] += weight

    if not scores:
        return ""

    # Built once instead of once per candidate.
    query_tokens = set(token_weights)

    # Walk candidates in master-list order; the strict '>' means the earliest
    # entry wins a tie.
    best_match_id, best_score = "", 0
    for item_id, info in master_processed.items():
        score = scores.get(item_id, 0)
        # Apply type-token filter
        if score > best_score and info["type_tokens"] & query_tokens:
            best_match_id, best_score = item_id, score
    return best_match_id

# -------------------
# Read JSON and Match
# -------------------
def update_json_with_tokens(json_path, item_id, file_name):
    """Match each mask's tokens against the master list and save the result.

    Reads the per-mask ``"tokens"`` / ``"token_counts"`` entries from
    ``json_path``, runs ``match_mask`` on every mask that has tokens, and
    writes a JSON file to ``FINAL/<item_id>/<file_name>`` with three keys:
    ``MODEL`` (mask id -> matched item id, "" if none),
    ``f"{MODEL}_readable"`` (mask id -> item name or "No Match") and
    ``f"{MODEL}_token_counts"`` (mask id -> token counts).
    If no mask has tokens, nothing is written.

    Args:
        json_path: Path of the tokenized Lens JSON to read.
        item_id: Item id; used as the output sub-folder name.
        file_name: Name of the output file.
    """
    with open(json_path, "r", encoding="utf-8") as src:
        masks = json.load(src)

    # Keep only the masks that actually have tokens
    tokens_by_mask = {}
    counts_by_mask = {}
    for mask_id, mask_info in masks.items():
        mask_tokens = mask_info.get("tokens", [])
        if not mask_tokens:
            continue
        key = str(mask_id)
        tokens_by_mask[key] = mask_tokens
        counts_by_mask[key] = mask_info.get("token_counts", {})

    if not tokens_by_mask:
        print(f"[SKIPPED] No tokens in {json_path}")
        return

    # Run deterministic matching and resolve each match to a human-readable name
    matched_ids = {}
    readable_names = {}
    for mask_id, mask_tokens in tokens_by_mask.items():
        matched_id = match_mask(mask_tokens)
        matched_ids[mask_id] = matched_id
        if matched_id:
            readable_names[mask_id] = master_hashmap.get(matched_id, "No Match")
        else:
            readable_names[mask_id] = "No Match"

    output_data = {
        MODEL: matched_ids,
        f"{MODEL}_readable": readable_names,
        f"{MODEL}_token_counts": counts_by_mask,
    }

    # Save output
    output_folder = os.path.join(FINAL, item_id)
    print(output_folder)
    os.makedirs(output_folder, exist_ok=True)
    output_path = os.path.join(output_folder, file_name)

    with open(output_path, "w", encoding="utf-8") as dst:
        json.dump(output_data, dst, indent=4, ensure_ascii=False)

    print(f"[FINAL SAVED] {output_path}")

# -------------------
# Process All Files
# -------------------
def process_jsons():
    """Run the matching step on every Lens JSON of every listed item.

    Item ids come from the 'id' column of ``TEXT_PROMPT_CSV``. For each id
    whose folder exists under ``LENS``, every ``*.json`` file in it is passed
    to ``update_json_with_tokens`` together with the item id and file name.
    Ids without a folder and non-JSON files are skipped silently.
    """
    prompt_df = pd.read_csv(TEXT_PROMPT_CSV)

    for item_id in prompt_df['id'].astype(str).tolist():
        item_folder = os.path.join(LENS, item_id)
        if os.path.exists(item_folder):
            json_names = [entry for entry in os.listdir(item_folder) if entry.endswith(".json")]
            for file_name in json_names:
                update_json_with_tokens(os.path.join(item_folder, file_name), item_id, file_name)

# Entry point for this cell: match every item's tokenized JSONs, then report completion.
if __name__ == "__main__":
    process_jsons()
    print("ALL COMPLETE!")


../gdinoOutput/final-original/1
[FINAL SAVED] ../gdinoOutput/final-original/1/p1187246330.json
../gdinoOutput/final-original/1
[FINAL SAVED] ../gdinoOutput/final-original/1/c1186222082.json
../gdinoOutput/final-original/1
[FINAL SAVED] ../gdinoOutput/final-original/1/r1172860507.json
../gdinoOutput/final-original/1
[FINAL SAVED] ../gdinoOutput/final-original/1/x1190805798.json
../gdinoOutput/final-original/1
[FINAL SAVED] ../gdinoOutput/final-original/1/r1140105858.json
../gdinoOutput/final-original/1
[FINAL SAVED] ../gdinoOutput/final-original/1/q1183324759.json
../gdinoOutput/final-original/1
[FINAL SAVED] ../gdinoOutput/final-original/1/r1188366109.json
../gdinoOutput/final-original/1
[FINAL SAVED] ../gdinoOutput/final-original/1/n1184979547.json
../gdinoOutput/final-original/1
[FINAL SAVED] ../gdinoOutput/final-original/1/k1190934472.json
../gdinoOutput/final-original/1
[FINAL SAVED] ../gdinoOutput/final-original/1/n1181963058.json
../gdinoOutput/final-original/1
[FINAL SAVED] ../g