# 1. Init 

In [1]:
from qdrant_client import QdrantClient, models

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
client = QdrantClient("http://localhost:6333")

In [3]:
import pandas as pd 
import numpy as np 

In [4]:
reddit_df = pd.read_csv("/workspaces/reddit_search/data/reddit_posts_and_comments.csv")
reddit_df.head() 

Unnamed: 0,post_title,post_text,subreddit,post_author,post_url,post_upvotes,post_downvotes,comment_upvotes,comment_downvotes,comment_text,comment_author
0,[D] - NeurIPS'2025 Reviews,"Hey everyone,\n\nNeurIPS 2025 reviews should b...",MachineLearning,Proof-Marsupial-5367,https://www.reddit.com/r/MachineLearning/comme...,203,0,77,0,Friendly reminder that reviews this year are s...,ChoiceStranger2898
1,[D] - NeurIPS'2025 Reviews,"Hey everyone,\n\nNeurIPS 2025 reviews should b...",MachineLearning,Proof-Marsupial-5367,https://www.reddit.com/r/MachineLearning/comme...,203,0,36,0,I had a dream recently where my upcoming avera...,popeldo
2,[D] - NeurIPS'2025 Reviews,"Hey everyone,\n\nNeurIPS 2025 reviews should b...",MachineLearning,Proof-Marsupial-5367,https://www.reddit.com/r/MachineLearning/comme...,203,0,63,0,I will treat the scores as a divine interventi...,matcha-coconut
3,[D] - NeurIPS'2025 Reviews,"Hey everyone,\n\nNeurIPS 2025 reviews should b...",MachineLearning,Proof-Marsupial-5367,https://www.reddit.com/r/MachineLearning/comme...,203,0,33,0,"July 24, so as long as it's July 24 somewhere ...",SmolLM
4,[D] - NeurIPS'2025 Reviews,"Hey everyone,\n\nNeurIPS 2025 reviews should b...",MachineLearning,Proof-Marsupial-5367,https://www.reddit.com/r/MachineLearning/comme...,203,0,34,0,"Well, if you feel heart-broken, be assured tha...",Marionberry6886


In [5]:
# combine 
reddit_df['post_title_text'] = reddit_df['post_title'] + '-' + reddit_df['post_text'] 

In [6]:
client.delete_collection("reddit_post")
client.delete_collection("reddit_comment")

True

In [7]:
# set up collection 
from qdrant_client import models

# Define the collection name
collection_name = "reddit_post"

client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=512,  # for sentence-transformers embeddings
        distance=models.Distance.COSINE
    ),
    sparse_vectors_config={
        "bm25": models.SparseVectorParams(
            modifier=models.Modifier.IDF,
        )
    }
)

client.get_collections()


CollectionsResponse(collections=[CollectionDescription(name='reddit_post')])

In [8]:
# Create the collection with specified vector parameters

collection_name = "reddit_comment"

client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=512,  # for sentence-transformers embeddings
        distance=models.Distance.COSINE
    ),
    sparse_vectors_config={
        "bm25": models.SparseVectorParams(
            modifier=models.Modifier.IDF,
        )
    }
)

client.get_collections()


CollectionsResponse(collections=[CollectionDescription(name='reddit_post'), CollectionDescription(name='reddit_comment')])

In [12]:
# Truncate long text 
def truncate_text(text, max_length=4000):
    """Truncate text but try to end at sentence boundary."""
    if len(text) <= max_length:
        return text
    
    # Truncate and try to end at sentence
    truncated = text[:max_length]
    last_period = truncated.rfind('.')
    last_space = truncated.rfind(' ')
    
    # End at sentence if period found in last 200 chars
    if last_period > max_length - 200:
        return truncated[:last_period + 1]
    # Otherwise end at word boundary
    elif last_space > max_length - 50:
        return truncated[:last_space]
    else:
        return truncated

In [13]:
from qdrant_client import models
from datetime import datetime

# Decide which dense encoding model to use 
model_handle = "jinaai/jina-embeddings-v2-small-en"


filtered_points = []
skipped_empty = 0
truncated_count = 0
id = 0

for idx, row in reddit_df.iterrows():
    # Combine title and text
    title = str(row['post_title']) if pd.notna(row['post_title']) else ""
    text = str(row['post_text']) if pd.notna(row['post_text']) else ""
    combined_text = f"{title}. {text}".strip(". ")
    
    # Skip if essentially empty
    if not combined_text or combined_text == "No content":
        skipped_empty += 1
        continue
    
    # Truncate if too long
    original_length = len(combined_text)
    combined_text = truncate_text(combined_text, max_length=4000)
    
    if len(combined_text) < original_length:
        truncated_count += 1
    
    point = models.PointStruct(
        id=id,
        vector=models.Document(
            text=combined_text, 
            model=model_handle
        ),
        payload={
            "text": combined_text,
            "post_title": title,
            "post_text": str(row['post_text']) if pd.notna(row['post_text']) else "",
            "subreddit": str(row['subreddit']) if pd.notna(row['subreddit']) else "",
            "post_author": str(row['post_author']) if pd.notna(row['post_author']) else "",
            "post_url": str(row['post_url']) if pd.notna(row['post_url']) else "",
            "post_upvotes": int(row['post_upvotes']) if pd.notna(row['post_upvotes']) else 0,
            "post_downvotes": int(row['post_downvotes']) if pd.notna(row['post_downvotes']) else 0,
            # for trend analysis 
            "content_type": "post",
            "engagement_score": int(row['post_upvotes']) if pd.notna(row['post_upvotes']) else 0,
            "text_length": len(combined_text),
            "was_truncated": len(combined_text) < original_length,
        }
    )
    filtered_points.append(point)
    id += 1

print(f"Filtered points: {len(filtered_points)}")
print(f"Skipped empty: {skipped_empty}")
print(f"Truncated long texts: {truncated_count}")


Filtered points: 5708
Skipped empty: 0
Truncated long texts: 207


In [14]:
# Check how much data we're processing
print(f"Total points to upload: {len(points)}")
print(f"Average text length: {sum(len(p.payload['text']) for p in points) / len(points):.1f} chars")

# Check for very long texts that might cause kernel to die 
long_texts = [p for p in points if len(p.payload['text']) > 5000]
print(f"Posts with >5000 characters: {len(long_texts)}")

Total points to upload: 5708
Average text length: 1148.3 chars
Posts with >5000 characters: 122


In [15]:
# Step 3: Upload with smaller batches and error handling
# Memory usage is reset after each embedding call (Python releases it once the function returns and no references remain).
# take note that you can optimize this as qdrant support batch upsert 
from tqdm import tqdm
import time

batch_size = 25  # Smaller batches
successful_uploads = 0
failed_batches = []

print(f"\nUploading {len(filtered_points)} points in batches of {batch_size}...")

for i in tqdm(range(0, len(filtered_points), batch_size)):
    try:
        batch = filtered_points[i:i + batch_size]
        
        client.upsert(
            collection_name="reddit_post",
            points=batch
        )
        
        successful_uploads += len(batch)
        
        # Small delay to prevent overwhelming
        time.sleep(0.2)
        
    except Exception as e:
        print(f"\n❌ Error uploading batch {i//batch_size + 1}: {e}")
        failed_batches.append(i//batch_size + 1)
        continue

print(f"\n✅ Upload complete!")
print(f"Successful uploads: {successful_uploads}")
print(f"Failed batches: {len(failed_batches)}")

# Verify final count
collection_info = client.get_collection("reddit_post")
print(f"Collection now has {collection_info.points_count} points")


Uploading 5708 points in batches of 25...


100%|███████████████████████████████████████████████████████████████████████████████████████| 229/229 [19:09<00:00,  5.02s/it]


✅ Upload complete!
Successful uploads: 5708
Failed batches: 0
Collection now has 5708 points





In [25]:
# do search 
def search(query, limit=1):


    results = client.query_points(
        collection_name='reddit_post',
        query=models.Document( 
            text=query, # query must be text, qdrant will do the embedding for you 
            model=model_handle 
        ),
        limit=limit, # top closest matches
        with_payload=True #to get metadata in the results
    )

    return results

In [26]:
test_query = 'What is the most trending topic about Machine Learning?'

search(test_query)

QueryResponse(points=[ScoredPoint(id=260, version=10, score=0.87068844, payload={'text': '[D] What are the bottlenecks holding machine learning back?. I remember this being posted a long, long time ago. What has changed since then? What are the biggest problems holding us back?', 'post_title': '[D] What are the bottlenecks holding machine learning back?', 'post_text': 'I remember this being posted a long, long time ago. What has changed since then? What are the biggest problems holding us back?', 'subreddit': 'MachineLearning', 'post_author': 'jacobfa', 'post_url': 'https://www.reddit.com/r/MachineLearning/comments/1lywxnm/d_what_are_the_bottlenecks_holding_machine/', 'post_upvotes': 52, 'post_downvotes': 0, 'content_type': 'post', 'engagement_score': 52, 'text_length': 188, 'was_truncated': False}, vector=None, shard_key=None, order_value=None)])

size=512 distance=<Distance.COSINE: 'Cosine'> hnsw_config=None quantization_config=None on_disk=None datatype=None multivector_config=None
