# Pastebin Similarity Search System

In [None]:
from pinecone import Pinecone
import json
import re
import os
from tqdm import tqdm
import time
from langdetect import detect, LangDetectException
import re, unicodedata

In [93]:
PINECONE_API_KEY=os.getenv("PINECONE_API_KEY")
PINECONE_ENVIRONMENT="us-east-1"

# Initialize Pinecone client
pc = Pinecone(api_key=PINECONE_API_KEY)


print("✅ Pinecone client initialized successfully!")
print(f"Available indexes: {[idx.name for idx in pc.list_indexes()]}")

✅ Pinecone client initialized successfully!
Available indexes: ['similarity-search']


In [None]:
file_path = 'paste_extract.json'

records = []

with open(file_path, 'r') as f:
    for line in f:
        line = line.strip()
        if line:
            try:
                obj = json.loads(line)
                title = obj.get('title')
                id = obj.get('id')
                language = obj.get('language')
                if title and id and title.lower() != "untitled" and language == "en":  # filter out untitled and non-english titles
                    records.append({'title': title, 'id': id}) # only append title and id to records
            except json.JSONDecodeError as e:
                print(f"Error parsing line: {e}")

In [None]:
def is_english(text):
    try:
        return detect(text) == 'en'
    except LangDetectException:
        return False

filtered_records = []

for i, record in enumerate(tqdm(records, desc="Processing records")):
    if not is_english(record['title']):
        filtered_records.append(record)

print(len(filtered_records))

Processing records: 100%|██████████| 67836/67836 [10:27<00:00, 108.07it/s]

41780





In [51]:
# Save non-English records
with open('non_english_records.json', 'w') as f:
    json.dump(filtered_records, f, indent=2)


filtered_ids = set(record['id'] for record in filtered_records)
english_records = [record for record in records if record['id'] not in filtered_ids]

# Save English records
with open('english_records.json', 'w') as f:
    json.dump(english_records, f, indent=2)


Saved 41780 non-English records to 'non_english_records.json'
Found 26056 English records
Total records: 67836
Non-English: 41780
English: 26056
Verification: True


In [72]:
# Create a dense index with integrated embedding
index_name = "similarity-search"
if not pc.has_index(index_name):
    pc.create_index_for_model(
        name=index_name,
        cloud="aws",
        region="us-east-1",
        embed={
            "model":"llama-text-embed-v2",
            "field_map":{"text": "chunk_text"}
        }
    )

In [73]:
def clean_title(title: str) -> str:
    t = unicodedata.normalize("NFKC", title)        # fix miss encoded glyphs
    t = t.lower().strip()                           # lowercase and strip whitespace
    t = re.sub(r"https?://\S+", " ", t)             # remove urls
    t = re.sub(r"\.\w{1,4}\b.*$", "", t)            # remove file extensions
    t = re.sub(r"^[^\w]+|[^\w]+$", "", t)           # drop leading and trailing punctuation
    t = re.sub(r"\s+", " ", t)                      # collapse multiple spaces
    return t

clean_english_records = [{'chunk_text': clean_title(record['title']), 'id': record['id']} for record in english_records]

In [75]:
# Batch the upsert
def batch_upsert(index, namespace, records, batch_size=90):

    total_batches = (len(records) + batch_size - 1) // batch_size
    
    for i in tqdm(range(0, len(records), batch_size), desc="Upserting batches"):
        batch = records[i:i + batch_size]
        try:
            index.upsert_records(namespace, batch)
            time.sleep(1)
        except Exception as e:
            print(f"Error upserting batch {i//batch_size + 1}: {e}")
            continue
    
    print(f"Completed upserting {len(records)} records in {total_batches} batches")

dense_index = pc.Index(index_name)

# Upsert the records in batches
batch_upsert(dense_index, "paste_data", clean_english_records)

# Wait for the upserted vectors to be indexed
print("Waiting for indexing to complete...")
time.sleep(15)

# stats for the index
stats = dense_index.describe_index_stats()
print("Index stats:", stats)

Upserting batches:   4%|▍         | 11/290 [00:15<05:00,  1.08s/it]

Error upserting batch 11: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Fri, 13 Jun 2025 16:37:15 GMT', 'Content-Type': 'text/plain; charset=utf-8', 'Content-Length': '185', 'Connection': 'keep-alive', 'x-envoy-upstream-service-time': '78', 'server': 'envoy'})
HTTP response body: {"error":{"code":"INVALID_ARGUMENT","message":"Embedding error: 'Input list must be non-empty and all elements must be non-empty.' occurred for model llama-text-embed-v2"},"status":400}



Upserting batches:   8%|▊         | 22/290 [00:29<04:44,  1.06s/it]

Error upserting batch 22: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Fri, 13 Jun 2025 16:37:29 GMT', 'Content-Type': 'text/plain; charset=utf-8', 'Content-Length': '185', 'Connection': 'keep-alive', 'x-envoy-upstream-service-time': '43', 'server': 'envoy'})
HTTP response body: {"error":{"code":"INVALID_ARGUMENT","message":"Embedding error: 'Input list must be non-empty and all elements must be non-empty.' occurred for model llama-text-embed-v2"},"status":400}



Upserting batches:  13%|█▎        | 39/290 [00:53<04:26,  1.06s/it]

Error upserting batch 39: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Fri, 13 Jun 2025 16:37:53 GMT', 'Content-Type': 'text/plain; charset=utf-8', 'Content-Length': '185', 'Connection': 'keep-alive', 'x-envoy-upstream-service-time': '38', 'server': 'envoy'})
HTTP response body: {"error":{"code":"INVALID_ARGUMENT","message":"Embedding error: 'Input list must be non-empty and all elements must be non-empty.' occurred for model llama-text-embed-v2"},"status":400}



Upserting batches:  19%|█▉        | 55/290 [01:16<04:28,  1.14s/it]

Error upserting batch 55: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Fri, 13 Jun 2025 16:38:16 GMT', 'Content-Type': 'text/plain; charset=utf-8', 'Content-Length': '185', 'Connection': 'keep-alive', 'x-envoy-upstream-service-time': '45', 'server': 'envoy'})
HTTP response body: {"error":{"code":"INVALID_ARGUMENT","message":"Embedding error: 'Input list must be non-empty and all elements must be non-empty.' occurred for model llama-text-embed-v2"},"status":400}



Upserting batches:  31%|███       | 90/290 [02:05<03:23,  1.02s/it]

Error upserting batch 90: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Fri, 13 Jun 2025 16:39:05 GMT', 'Content-Type': 'text/plain; charset=utf-8', 'Content-Length': '185', 'Connection': 'keep-alive', 'x-envoy-upstream-service-time': '39', 'server': 'envoy'})
HTTP response body: {"error":{"code":"INVALID_ARGUMENT","message":"Embedding error: 'Input list must be non-empty and all elements must be non-empty.' occurred for model llama-text-embed-v2"},"status":400}



Upserting batches:  38%|███▊      | 110/290 [02:32<03:08,  1.05s/it]

Error upserting batch 110: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Fri, 13 Jun 2025 16:39:32 GMT', 'Content-Type': 'text/plain; charset=utf-8', 'Content-Length': '185', 'Connection': 'keep-alive', 'x-envoy-upstream-service-time': '44', 'server': 'envoy'})
HTTP response body: {"error":{"code":"INVALID_ARGUMENT","message":"Embedding error: 'Input list must be non-empty and all elements must be non-empty.' occurred for model llama-text-embed-v2"},"status":400}



Upserting batches:  45%|████▍     | 130/290 [03:00<02:48,  1.05s/it]

Error upserting batch 130: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Fri, 13 Jun 2025 16:40:00 GMT', 'Content-Type': 'text/plain; charset=utf-8', 'Content-Length': '185', 'Connection': 'keep-alive', 'x-envoy-upstream-service-time': '44', 'server': 'envoy'})
HTTP response body: {"error":{"code":"INVALID_ARGUMENT","message":"Embedding error: 'Input list must be non-empty and all elements must be non-empty.' occurred for model llama-text-embed-v2"},"status":400}



Upserting batches:  62%|██████▏   | 181/290 [04:12<01:55,  1.06s/it]

Error upserting batch 181: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Fri, 13 Jun 2025 16:41:12 GMT', 'Content-Type': 'text/plain; charset=utf-8', 'Content-Length': '185', 'Connection': 'keep-alive', 'x-envoy-upstream-service-time': '80', 'server': 'envoy'})
HTTP response body: {"error":{"code":"INVALID_ARGUMENT","message":"Embedding error: 'Input list must be non-empty and all elements must be non-empty.' occurred for model llama-text-embed-v2"},"status":400}



Upserting batches:  90%|████████▉ | 260/290 [06:05<00:30,  1.03s/it]

Error upserting batch 260: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Fri, 13 Jun 2025 16:43:05 GMT', 'Content-Type': 'text/plain; charset=utf-8', 'Content-Length': '185', 'Connection': 'keep-alive', 'x-envoy-upstream-service-time': '46', 'server': 'envoy'})
HTTP response body: {"error":{"code":"INVALID_ARGUMENT","message":"Embedding error: 'Input list must be non-empty and all elements must be non-empty.' occurred for model llama-text-embed-v2"},"status":400}



Upserting batches:  97%|█████████▋| 280/290 [06:32<00:10,  1.05s/it]

Error upserting batch 280: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Fri, 13 Jun 2025 16:43:32 GMT', 'Content-Type': 'text/plain; charset=utf-8', 'Content-Length': '185', 'Connection': 'keep-alive', 'x-envoy-upstream-service-time': '47', 'server': 'envoy'})
HTTP response body: {"error":{"code":"INVALID_ARGUMENT","message":"Embedding error: 'Input list must be non-empty and all elements must be non-empty.' occurred for model llama-text-embed-v2"},"status":400}



Upserting batches: 100%|██████████| 290/290 [06:47<00:00,  1.40s/it]


Completed upserting 26056 records in 290 batches
Waiting for indexing to complete...
Index stats: {'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'paste_data': {'vector_count': 25156}},
 'total_vector_count': 25156,
 'vector_type': 'dense'}


In [94]:

def get_info(query, top_k = 10):
    # Search the dense index and rerank results
    results = dense_index.search(
        namespace="paste_data",
        query={
            "top_k": top_k,
            "inputs": {
                'text': query
            }
        }
    )

    # Print the results
    for hit in results['result']['hits']:
            print(f"id: {hit['_id']:<5} | score: {round(hit['_score'], 2):<5} | text: {hit['fields']['chunk_text']:<50}")
            
query = "money hacks"
print(get_info(query, top_k = 5))

id: 7f06c490-3b4c-4970-aff5-7d5e15f1d544 | score: 0.39  | text: western union transfer hack bank transfer hack paypal credit card hack
id: 45c9da7b-1ba4-4209-b8de-7705d7fdf8a3 | score: 0.39  | text: western union transfer hack bank transfer hack paypal credit card hack
id: 4df6db0b-cc05-42e2-b640-16df963d1e5b | score: 0.39  | text: western union transfer hack bank transfer hack paypal credit card hack
id: 6f923f4e-00b8-46b3-9df3-d5ff731cc93a | score: 0.39  | text: western union transfer hack bank transfer hack paypal credit card hack
id: c31993ef-f8ac-4174-9947-8c48dc62fc3d | score: 0.38  | text: western union transfer hack bank transfer hack paypal credit card hack transfer
None
