In [1]:
from transformers import T5Tokenizer
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from transformers import T5Tokenizer
from tqdm import tqdm

def is_url_semantically_rich(url_segments):
    """Decide whether most segments of a split URL carry descriptive text.

    A segment counts as descriptive when it is not purely numeric, is not one
    of a small set of generic path words, and is longer than two characters.

    Args:
        url_segments: Sequence of URL segment strings (whatever the caller
            produced by splitting the URL, domain included if present).

    Returns:
        True when strictly more than half of the segments are descriptive;
        False otherwise (including for an empty sequence).
    """
    segment_total = len(url_segments)
    boilerplate = {"index", "page", "item", "view", "default", "home"}

    def _is_descriptive(part):
        # Same three checks, in the same order, written as guard clauses.
        if part.isnumeric():
            return False
        if part in boilerplate:
            return False
        return len(part) > 2

    informative = sum(1 for part in url_segments if _is_descriptive(part))
    # "more than half" expressed without division.
    return 2 * informative > segment_total

def url_docid(input_data, max_docid_len=99, pretrain_model_path="t5-base"):
    """Build a tokenized URL- or title-based docid string for each document.

    For every item the URL is lowercased, has its ``http://``/``https://``
    text removed and hyphens turned into spaces, and is split on ``/``
    (empty segments dropped). If ``is_url_semantically_rich`` accepts the
    segments, the docid string is the reversed path followed by the domain
    (source ``"URL"``). Otherwise the title is used, followed by the domain
    when one exists (source ``"TITLE"``). The string is then encoded with a
    T5 tokenizer and terminated with the tokenizer's EOS id.

    Args:
        input_data: Iterable of dict-like documents; the keys read are
            ``docid``, ``url`` and ``title``. Non-string ``docid`` values are
            converted with ``str``; non-string ``url``/``title`` values are
            treated as empty.
        max_docid_len: Passed to the tokenizer as ``max_length`` (with
            truncation enabled) and used to cap the content tokens.
        pretrain_model_path: Name or path given to
            ``T5Tokenizer.from_pretrained``.

    Returns:
        dict mapping the stripped, lowercased docid to a dict with keys
        ``"final_string"``, ``"source"`` and ``"token_ids"``. Documents that
        could not be processed are not returned; they are collected with a
        reason and printed at the end.
    """
    tokenizer = T5Tokenizer.from_pretrained(pretrain_model_path)
    # Use the tokenizer's own EOS id rather than a hard-coded 1; fall back to
    # 1 only if the tokenizer reports none.
    eos_id = tokenizer.eos_token_id
    if eos_id is None:
        eos_id = 1
    results = {}
    skipped_docs = []

    for doc_item in tqdm(input_data, desc="Processing URL docids"):
        # Bound before the try block so the error handler never has to call
        # into doc_item again (it may not be a dict at all, which would make
        # the handler itself raise and abort the whole run).
        raw_docid = None
        try:
            raw_docid = doc_item.get('docid')
            # Accept non-string ids (e.g. integers) instead of failing on .strip().
            docid = "" if raw_docid is None else str(raw_docid).strip().lower()
            url = doc_item.get('url', '')
            title = doc_item.get('title', '')

            if not docid:
                skipped_docs.append({"reason": "Missing or empty docid", "doc_item": doc_item})
                continue

            # None or any other non-string value is treated as "not provided".
            url = url.strip().lower() if isinstance(url, str) else ""
            title = title.strip().lower() if isinstance(title, str) else ""

            if not url and not title:
                skipped_docs.append({"reason": "Missing both URL and title", "docid": docid})
                continue

            url = url.replace("http://", "").replace("https://", "").replace("-", " ")
            # Dropping empty pieces also collapses doubled slashes ("//").
            url_segments = [segment for segment in url.split('/') if segment]

            domain = url_segments[0] if url_segments else ""
            # Most specific path component first, domain last.
            reversed_path = " ".join(reversed(url_segments[1:])) if len(url_segments) > 1 else ""

            if url_segments and is_url_semantically_rich(url_segments):
                final_string = f"{reversed_path} {domain}".strip()
                source = "URL"
            elif title and domain:
                final_string = f"{title} {domain}".strip()
                source = "TITLE"
            elif title:
                final_string = title
                source = "TITLE"
            else:
                # URL present but not descriptive enough, and no title to fall back on.
                skipped_docs.append({"reason": "Unable to determine final string", "docid": docid})
                continue

            tokenized_ids = tokenizer(final_string, truncation=True, max_length=max_docid_len).input_ids
            # Drop the trailing token the tokenizer appended, cap the content,
            # then terminate with EOS explicitly.
            tokenized_ids = tokenized_ids[:-1][:max_docid_len] + [eos_id]

            results[docid] = {
                "final_string": final_string,
                "source": source,
                "token_ids": tokenized_ids
            }

        except Exception as e:
            # Best effort: record the failure and keep processing the rest.
            skipped_docs.append({
                "reason": f"Error: {str(e)}",
                "docid": 'unknown' if raw_docid is None else raw_docid,
            })

    if skipped_docs:
        print("Skipped documents:")
        for skipped in skipped_docs:
            print(skipped)

    return results




In [12]:
# Demo documents: a plain URL, a URL with doubled slashes and hyphens,
# a missing URL (None), an empty title, both fields empty, and a bare domain.
data = [
    dict(docid="1", url="https://example.com/products/item123", title="Product Page"),
    dict(docid="2", url="https://example.com//blog//how-to-code", title="How to Code"),
    dict(docid="3", url=None, title="Fallback Title"),
    dict(docid="4", url="https://example.com/index", title=""),
    dict(docid="5", url="", title=""),
    dict(docid="6", url="example.com", title="Example Domain"),
]

results = url_docid(data)

# One report per processed document, each followed by a blank line.
for docid, info in results.items():
    print(
        f"{docid}:\n"
        f"  Used:   {info['source']}\n"
        f"  String: {info['final_string']}\n"
        f"  Tokens: {info['token_ids']}\n"
    )

Processing URL docids: 100%|██████████| 6/6 [00:00<00:00, 11428.62it/s]

Skipped documents:
{'reason': 'Unable to determine final string', 'docid': '4'}
{'reason': 'Missing both URL and title', 'docid': '5'}
1:
  Used:   URL
  String: example.com products item123
  Tokens: [677, 5, 287, 494, 2118, 14574, 1]

2:
  Used:   URL
  String: example.com blog how to code
  Tokens: [677, 5, 287, 875, 149, 12, 1081, 1]

3:
  Used:   TITLE
  String: fallback title
  Tokens: [1590, 1549, 2233, 1]

6:
  Used:   URL
  String: example.com
  Tokens: [677, 5, 287, 1]






In [15]:
from transformers import T5Tokenizer

# Round-trip check: decode stored token ids back to text.
# Loads the "t5-base" tokenizer, the same checkpoint name url_docid uses by default.
tokenizer = T5Tokenizer.from_pretrained("t5-base")

# `results` must already exist from an earlier url_docid(...) cell; its keys are
# the docid strings and each value holds the ids under "token_ids".
docid_key = "1"
tokens = results[docid_key]["token_ids"]

# skip_special_tokens=True is passed so special tokens are left out of the decoded text.
decoded_string = tokenizer.decode(tokens, skip_special_tokens=True)
print(f"Decoded string for docid [{docid_key}]: {decoded_string}")


Decoded string for docid [1]: example.com products item123


In [13]:
# Example documents: a plain URL, a URL with doubled slashes and hyphens,
# a missing URL (None), an empty title, both fields empty, and a bare domain.
data = [
    dict(docid="1", url="https://example.com/products/item123", title="Product Page"),
    dict(docid="2", url="https://example.com//blog//how-to-code", title="How to Code"),
    dict(docid="3", url=None, title="Fallback Title"),
    dict(docid="4", url="https://example.com/index", title=""),
    dict(docid="5", url="", title=""),
    dict(docid="6", url="example.com", title="Example Domain"),
]

results = url_docid(data)

# One report per processed document, each followed by a blank line.
for docid, info in results.items():
    print(
        f"{docid}:\n"
        f"  Used:   {info['source']}\n"
        f"  String: {info['final_string']}\n"
        f"  Tokens: {info['token_ids']}\n"
    )

Processing URL docids: 100%|██████████| 6/6 [00:00<00:00, 10828.67it/s]

Skipped documents:
{'reason': 'Unable to determine final string', 'docid': '4'}
{'reason': 'Missing both URL and title', 'docid': '5'}
1:
  Used:   URL
  String: example.com products item123
  Tokens: [677, 5, 287, 494, 2118, 14574, 1]

2:
  Used:   URL
  String: example.com blog how to code
  Tokens: [677, 5, 287, 875, 149, 12, 1081, 1]

3:
  Used:   TITLE
  String: fallback title
  Tokens: [1590, 1549, 2233, 1]

6:
  Used:   URL
  String: example.com
  Tokens: [677, 5, 287, 1]






In [8]:
# Stand-ins for parsed JSONL records: two URLs whose paths are a generic word
# plus a number, and two whose paths are made of descriptive words.
sample_docs = [
    dict(docid="doc1", url="https://example.com/index/1234", title="Welcome to Example"),
    dict(docid="doc2", url="https://example.com/products/laptop/dell-xps", title="Buy Dell XPS Online"),
    dict(docid="doc3", url="https://example.com/page/456", title="Amazing Deals"),
    dict(docid="doc4", url="https://example.com/ai/transformers/bert", title="BERT Model Overview"),
]
results = url_docid(sample_docs)

# One report per processed document, each followed by a blank line.
for docid, info in results.items():
    print(
        f"{docid}:\n"
        f"  Used:   {info['source']}\n"
        f"  String: {info['final_string']}\n"
        f"  Tokens: {info['token_ids']}\n"
    )

Processing URL docids: 100%|██████████| 4/4 [00:00<00:00, 6929.87it/s]

doc1:
  Used:   TITLE
  String: welcome to example 1234
  Tokens: [2222, 12, 677, 586, 3710, 1]

doc2:
  Used:   URL
  String: example.com products laptop dell xps
  Tokens: [677, 5, 287, 494, 4544, 20, 195, 3, 226, 102, 7, 1]

doc3:
  Used:   TITLE
  String: amazing deals 456
  Tokens: [1237, 3694, 314, 4834, 1]

doc4:
  Used:   URL
  String: example.com ai transformers bert
  Tokens: [677, 5, 287, 3, 9, 23, 19903, 7, 3, 7041, 1]




