# Pastebin Similarity Search System

In [1]:
from pinecone import Pinecone
import json
import re
import os
from tqdm import tqdm
import time
from langdetect import detect, LangDetectException
import re, unicodedata

In [2]:
PINECONE_API_KEY=os.getenv("PINECONE_API_KEY")
PINECONE_ENVIRONMENT="us-east-1"

# Initialize Pinecone client
pc = Pinecone(api_key=PINECONE_API_KEY)


print("Pinecone client initialized successfully!")
print(f"Available indexes: {[idx.name for idx in pc.list_indexes()]}")

Pinecone client initialized successfully!
Available indexes: []


# Data Cleaning

- Extract the title and id from pastes with "en" language
- Filter out pastes with "untitled" title
- Filter out pastes with non-english titles using langdetect
- Save both english and non-english pastes to separate files

In [3]:
file_path = 'paste_extract.json'

records = []

with open(file_path, 'r') as f:
    for line in f:
        line = line.strip()
        if line:
            try:
                obj = json.loads(line)
                title = obj.get('title')
                id = obj.get('id')
                language = obj.get('language')
                if title and id and title.lower() != "untitled" and language == "en":  # filter out untitled and non-english titles
                    records.append({'title': title, 'id': id}) # only append title and id to records
            except json.JSONDecodeError as e:
                print(f"Error parsing line: {e}")

In [None]:
def is_english(text):
    try:
        return detect(text) == 'en'
    except LangDetectException:
        return False

filtered_records = []

for i, record in enumerate(tqdm(records, desc="Processing records")):
    if not is_english(record['title']):
        filtered_records.append(record)

print(len(filtered_records))

Processing records: 100%|██████████| 67836/67836 [10:27<00:00, 108.07it/s]

41780





In [98]:
# Save non-English records
with open('non_english_records.json', 'w') as f:
    json.dump(filtered_records, f, indent=2)


filtered_ids = set(record['id'] for record in filtered_records)
english_records = [record for record in records if record['id'] not in filtered_ids]

# Save English records
with open('english_records.json', 'w') as f:
    json.dump(english_records, f, indent=2)


# Vector Embedding

- Load the english pastes json file
- Preprocess the titles by performing unicode normalization, removing urls, file extensions, leading and trailing punctuation, and collapsing multiple spaces
- Create a dense index with a built in embedding model
- Batch the records and upsert them into the index

In [11]:
with open('english_records.json', 'r') as f:
    english_records = json.load(f)            

In [12]:
# Create a dense index with integrated embedding
index_name = "similarity-search"
if not pc.has_index(index_name):
    pc.create_index_for_model(
        name=index_name,
        cloud="aws",
        region="us-east-1",
        embed={
            "model":"llama-text-embed-v2",
            "field_map":{"text": "chunk_text"}
        }
    )

In [23]:
def clean_title(title):
    if not isinstance(title, str):
        return ""
    t = unicodedata.normalize("NFKC", title) # fix miss encoded glyphs
    t = t.lower().strip() # lowercase and strip whitespace
    t = re.sub(r"https?://\S+", " ", t) # remove urls
    t = re.sub(r"\.\w{1,4}\b.*$", "", t) # remove file extensions
    t = re.sub(r"^[^\w]+|[^\w]+$", "", t) # drop leading and trailing punctuation
    t = re.sub(r"\s+", " ", t)
    return t.strip()


clean_english_records = [
    {'chunk_text': cleaned, 'id': record['id']}
    for record in english_records
    if (cleaned := clean_title(record.get('title', '')))  # clean and check if not empty
]


In [24]:
# Batch the upsert
def batch_upsert(index, namespace, records, batch_size=90):

    total_batches = (len(records) + batch_size - 1) // batch_size
    
    for i in tqdm(range(0, len(records), batch_size), desc="Upserting batches"):
        batch = records[i:i + batch_size]
        try:
            index.upsert_records(namespace, batch)
            time.sleep(1)
        except Exception as e:
            print(f"Error upserting batch {i//batch_size + 1}: {e}")
            continue
    
    print(f"Completed upserting {len(records)} records in {total_batches} batches")

dense_index = pc.Index(index_name)

# Upsert the records in batches
batch_upsert(dense_index, "paste_data", clean_english_records)

# Wait for the upserted vectors to be indexed
print("Waiting for indexing to complete...")
time.sleep(15)

# stats for the index
stats = dense_index.describe_index_stats()
print("Index stats:", stats)

Upserting batches: 100%|██████████| 290/290 [07:18<00:00,  1.51s/it]


Completed upserting 26046 records in 290 batches
Waiting for indexing to complete...
Index stats: {'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'paste_data': {'vector_count': 26046}},
 'total_vector_count': 26046,
 'vector_type': 'dense'}


# Search Similarity

Use the get_info function to search the index for any desired query and it will return the top k most similar pastes with their ids and relevancy scores.

In [25]:

def get_info(query, top_k = 10):
    # Search the dense index and rerank results
    results = dense_index.search(
        namespace="paste_data",
        query={
            "top_k": top_k,
            "inputs": {
                'text': query
            }
        }
    )

    # Print the results
    for hit in results['result']['hits']:
            print(f"id: {hit['_id']:<5} | score: {round(hit['_score'], 2):<5} | text: {hit['fields']['chunk_text']:<50}")
            
query = "bank account hacks"
print(get_info(query, top_k = 10))

id: 6f923f4e-00b8-46b3-9df3-d5ff731cc93a | score: 0.46  | text: western union transfer hack bank transfer hack paypal credit card hack
id: c56a39c1-46a4-42a3-a0f0-01240dca0b83 | score: 0.45  | text: western union transfer hack bank transfer hack paypal credit card hack transfer
id: 45c9da7b-1ba4-4209-b8de-7705d7fdf8a3 | score: 0.44  | text: western union transfer hack bank transfer hack paypal credit card hack
id: 7f06c490-3b4c-4970-aff5-7d5e15f1d544 | score: 0.44  | text: western union transfer hack bank transfer hack paypal credit card hack
id: 4df6db0b-cc05-42e2-b640-16df963d1e5b | score: 0.44  | text: western union transfer hack bank transfer hack paypal credit card hack
id: d5904521-8219-4b99-97bb-736e52524f31 | score: 0.44  | text: western union transfer hack bank transfer hack paypal credit card hack transfer
id: 674f3e28-d4b0-4130-9c17-8740e89e99dc | score: 0.44  | text: western union transfer hack bank transfer hack paypal credit card hack transfer
id: 5395c315-f199-40ba-928a-

In [95]:
# function to delete index
pc.delete_index("similarity-search")