In [1]:
from qdrant_client import QdrantClient, models
from datetime import datetime
import pandas as pd 
import numpy as np 
from tqdm import tqdm
import time


# Dense embedding model handle; the same value is passed to models.Document both when
# indexing points and when embedding search queries, so the two stay consistent.
model_handle = "jinaai/jina-embeddings-v2-small-en"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Client for the Qdrant instance at localhost:6333.
# NOTE(review): assumes a Qdrant server is already running there — confirm before Run All.
client = QdrantClient("http://localhost:6333")

In [3]:
# Load the Reddit posts-and-comments CSV into a DataFrame.
# NOTE(review): this is an absolute path, so it only resolves on machines with the same layout.
reddit_df = pd.read_csv("/workspaces/reddit_search/data/reddit_posts_and_comments.csv")

In [5]:
# Combine post title and body into one text column, separated by '-'.
# Missing titles/bodies are treated as empty strings: string concatenation with NaN
# yields NaN, which would otherwise blank out the whole combined value for that row.
reddit_df['post_title_text'] = (
    reddit_df['post_title'].fillna('') + '-' + reddit_df['post_text'].fillna('')
)

In [6]:
# Optional cleanup: uncomment to drop collections left over from earlier experiments.
# client.delete_collection("reddit_post")
# client.delete_collection("reddit_comment")

True

In [4]:
# Set up the collection that stores one point per (post, comment) row.

# Define the collection name
collection_name = "reddit_post_comment"

# Creating a collection that already exists raises an error, so only create it
# when it is missing. This keeps the cell safe to re-run (Restart & Run All).
if not client.collection_exists(collection_name):
    client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            # Must equal the output dimension of the embedding model in
            # `model_handle`; a mismatch makes upserts fail.
            size=512,
            distance=models.Distance.COSINE
        ),
        # Named sparse vector with IDF weighting. NOTE(review): the indexing and
        # search cells in this notebook only use the default dense vector, so
        # "bm25" is currently unused — presumably reserved for hybrid search.
        sparse_vectors_config={
            "bm25": models.SparseVectorParams(
                modifier=models.Modifier.IDF,
            )
        }
    )

# Display the collections known to the server as the cell output.
client.get_collections()


CollectionsResponse(collections=[CollectionDescription(name='reddit_post'), CollectionDescription(name='reddit_comment'), CollectionDescription(name='reddit_post_comment')])

In [5]:
# Truncate long text
def truncate_text(text, max_length=4000):
    """Truncate ``text`` to at most ``max_length`` characters, preferring a clean cut.

    Args:
        text: The string to shorten.
        max_length: Maximum number of characters to keep.

    Returns:
        ``text`` unchanged when it already fits. Otherwise the first
        ``max_length`` characters, cut back to:
          * the last period, if one occurs within the final 200 characters, else
          * the last space, if one occurs within the final 50 characters, else
          * the hard ``max_length`` boundary.
    """
    if len(text) <= max_length:
        return text

    truncated = text[:max_length]
    last_period = truncated.rfind('.')
    last_space = truncated.rfind(' ')

    # rfind() returns -1 when nothing is found. For small max_length the window
    # start (max_length - 200 / max_length - 50) goes negative, so the bare
    # comparison would accept that -1 and return an empty or clipped string.
    # Clamp the thresholds so only a real match can win.
    period_threshold = max(max_length - 200, -1)
    space_threshold = max(max_length - 50, 0)

    # End at sentence if a period was found in the last 200 chars
    if last_period > period_threshold:
        return truncated[:last_period + 1]
    # Otherwise end at a word boundary (never producing an empty result)
    if last_space > space_threshold:
        return truncated[:last_space]
    return truncated

In [6]:
def _as_text(value):
    """Return ``value`` as a string, mapping missing cells (NaN/None) to ""."""
    return str(value) if pd.notna(value) else ""


def _as_int(value):
    """Return ``value`` as an int, mapping missing cells (NaN/None) to 0."""
    return int(value) if pd.notna(value) else 0


# Build one Qdrant point per DataFrame row. Each point's vector is a
# models.Document, i.e. the raw text plus the embedding model handle.
filtered_points = []
skipped_empty = 0
truncated_count = 0

for _, row in reddit_df.iterrows():
    title = _as_text(row['post_title'])
    text = _as_text(row['post_text'])
    comment = _as_text(row['comment_text'])

    # Join only the non-blank parts so an empty body or comment does not leave
    # stray ". ." separators in the text that gets embedded. When all three
    # parts are present the result is the same as "title. text. comment".
    combined_text = ". ".join(
        part for part in (title, text, comment) if part.strip()
    ).strip(". ")

    # Skip if essentially empty
    if not combined_text or combined_text == "No content":
        skipped_empty += 1
        continue

    # Truncate if too long, remembering whether anything was cut
    original_length = len(combined_text)
    combined_text = truncate_text(combined_text, max_length=4000)
    was_truncated = len(combined_text) < original_length
    if was_truncated:
        truncated_count += 1

    point = models.PointStruct(
        # Sequential ids over the kept rows only (0, 1, 2, ...). Derived from the
        # list length so the builtin `id` is not shadowed by a counter variable.
        id=len(filtered_points),
        vector=models.Document(
            text=combined_text,
            model=model_handle
        ),
        payload={
            "text": combined_text,
            "post_title": title,
            "post_text": text,
            "post_comment": comment,
            "subreddit": _as_text(row['subreddit']),
            "post_author": _as_text(row['post_author']),
            "post_url": _as_text(row['post_url']),
            "post_upvotes": _as_int(row['post_upvotes']),
            "post_downvotes": _as_int(row['post_downvotes']),
            "text_length": len(combined_text),
            "was_truncated": was_truncated,
        }
    )
    filtered_points.append(point)

print(f"Filtered points: {len(filtered_points)}")
print(f"Skipped empty: {skipped_empty}")
print(f"Truncated long texts: {truncated_count}")


Filtered points: 5708
Skipped empty: 0
Truncated long texts: 336


In [8]:
# Step 3: Upload the prepared points in small batches with per-batch error handling.
# A failed batch is reported and recorded, and the loop moves on to the next one.
# Note: Qdrant supports batch upserts, so a larger batch size is a possible optimisation.

batch_size = 25  # Smaller batches
successful_uploads = 0
failed_batches = []  # 1-based numbers of the batches whose upsert raised

print(f"\nUploading {len(filtered_points)} points in batches of {batch_size}...")

for i in tqdm(range(0, len(filtered_points), batch_size)):
    batch_number = i // batch_size + 1
    batch = filtered_points[i:i + batch_size]

    try:
        # Reuse `collection_name` from the collection set-up cell so the upload
        # always targets the collection that was actually created.
        client.upsert(
            collection_name=collection_name,
            points=batch
        )
    except Exception as e:
        print(f"\n❌ Error uploading batch {batch_number}: {e}")
        failed_batches.append(batch_number)
        continue

    successful_uploads += len(batch)

    # Small delay to prevent overwhelming
    time.sleep(0.2)

# Only claim success when every batch went through.
if failed_batches:
    print(f"\n⚠️ Upload finished with errors in batches: {failed_batches}")
else:
    print("\n✅ Upload complete!")
print(f"Successful uploads: {successful_uploads}")
print(f"Failed batches: {len(failed_batches)}")

# Verify final count
collection_info = client.get_collection(collection_name)
print(f"Collection now has {collection_info.points_count} points")


Uploading 5708 points in batches of 25...


100%|███████████████████████████████████████████████████████████████████████████████████████| 229/229 [28:46<00:00,  7.54s/it]


✅ Upload complete!
Successful uploads: 5708
Failed batches: 0
Collection now has 5708 points





In [9]:
# do search
def search(query, limit=5, collection_name='reddit_post_comment'):
    """Return the closest matches for ``query`` from a Qdrant collection.

    The query is sent as text (``models.Document``) together with
    ``model_handle``, so it is embedded with the same model used at indexing
    time.

    Args:
        query: Free-text search query.
        limit: Maximum number of matches to return.
        collection_name: Collection to search; defaults to the collection
            populated by this notebook.

    Returns:
        A list of dicts, one per hit in result order, with the keys
        ``post_title``, ``post_text``, ``subreddit``, ``post_url``,
        ``post_upvotes`` and ``post_comment`` copied from the point payload.

    Raises:
        KeyError: If a returned point's payload lacks one of those keys.
    """
    results = client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=query,  # query must be text; it is embedded via model_handle
            model=model_handle
        ),
        limit=limit,        # top closest matches
        with_payload=True   # needed to get the metadata back with each hit
    )

    # Payload fields exposed to the prompt builder, in output order.
    payload_fields = (
        'post_title',
        'post_text',
        'subreddit',
        'post_url',
        'post_upvotes',
        'post_comment',
    )

    return [
        {field: point.payload[field] for field in payload_fields}
        for point in results.points  # hits live on the `.points` attribute
    ]

In [10]:
# Smoke-test the retrieval step on a sample question; the cell's last expression
# displays the formatted hits returned by search().
test_query = 'What is the most trending topic about Machine Learning?'

search(test_query)

[{'post_title': '[D] What are the bottlenecks holding machine learning back?',
  'post_text': 'I remember this being posted a long, long time ago. What has changed since then? What are the biggest problems holding us back?',
  'subreddit': 'MachineLearning',
  'post_url': 'https://www.reddit.com/r/MachineLearning/comments/1lywxnm/d_what_are_the_bottlenecks_holding_machine/',
  'post_upvotes': 52,
  'post_comment': 'All the time and attention going towards generative AI and LLMs instead of more useful things'},
 {'post_title': '[D] What are the bottlenecks holding machine learning back?',
  'post_text': 'I remember this being posted a long, long time ago. What has changed since then? What are the biggest problems holding us back?',
  'subreddit': 'MachineLearning',
  'post_url': 'https://www.reddit.com/r/MachineLearning/comments/1lywxnm/d_what_are_the_bottlenecks_holding_machine/',
  'post_upvotes': 52,
  'post_comment': 'Domain Knowledge driven dataset design.'},
 {'post_title': '[D] W

In [14]:
# Build Prompt
def build_prompt(query, search_results):
    """Assemble the LLM prompt from a question and the retrieved documents.

    Args:
        query: The user's question; substituted into the QUESTION section.
        search_results: Iterable of dicts with the keys ``post_title``,
            ``post_text``, ``post_comment``, ``post_url``, ``subreddit`` and
            ``post_upvotes`` (the shape produced by ``search``).

    Returns:
        The instruction text, the question and one labelled entry per document
        under CONTEXT, with surrounding whitespace stripped.
    """
    # Spelled out line by line so the trailing spaces that are part of the
    # prompt text stay visible inside the quotes.
    prompt_template = (
        "You're a reddit summariser. Answer user's question based on the CONTEXT given to you.\n"
        "If you did not spot useful information, then answer based on your own knowledge. \n"
        "Otherwise, use only the facts from the CONTEXT when answering the question.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    def render_entry(doc):
        # One labelled entry per document, terminated by a blank line.
        return (
            f"title: {doc['post_title']}\n"
            f"content: {doc['post_text']}\n"
            f"comment: {doc['post_comment']}\n"
            f"url: {doc['post_url']}\n"
            f"subreddit: {doc['subreddit']}\n"
            f"post_upvotes: {doc['post_upvotes']}\n\n"
        )

    context_block = "".join(render_entry(doc) for doc in search_results)
    return prompt_template.format(question=query, context=context_block).strip()

In [15]:
# Run retrieval and prompt construction for a sample question, then display the
# assembled prompt so it can be inspected before it is sent to the LLM.
test_query = 'What is the most trending topic about Machine Learning?'
search_results = search(test_query)
prompt = build_prompt(test_query, search_results)
prompt 

"You're a reddit summariser. Answer user's question based on the CONTEXT given to you.\nIf you did not spot useful information, then answer based on your own knowledge. \nOtherwise, use only the facts from the CONTEXT when answering the question.\n\nQUESTION: What is the most trending topic about Machine Learning?\n\nCONTEXT: \ntitle: [D] What are the bottlenecks holding machine learning back?\ncontent: I remember this being posted a long, long time ago. What has changed since then? What are the biggest problems holding us back?\ncomment: All the time and attention going towards generative AI and LLMs instead of more useful things\nurl: https://www.reddit.com/r/MachineLearning/comments/1lywxnm/d_what_are_the_bottlenecks_holding_machine/\nsubreddit: MachineLearning\npost_upvotes: 52\n\ntitle: [D] What are the bottlenecks holding machine learning back?\ncontent: I remember this being posted a long, long time ago. What has changed since then? What are the biggest problems holding us back?

In [None]:
# LLM 
import os 
import requests 
from getpass import getpass

# Never store the real key in the notebook. Reuse API_KEY from the environment
# when it is already set (instead of overwriting it with a placeholder);
# otherwise prompt for it without echoing the input.
if not os.environ.get("API_KEY"):
    os.environ["API_KEY"] = getpass("Groq API key: ")

def lamma3_groq(prompt, model="llama3-8b-8192", temperature=0.7, max_tokens=1024, timeout=30):
    """Send ``prompt`` as a single user message to Groq's chat completions endpoint.

    Args:
        prompt: Full prompt text, sent as one ``user`` message.
        model: Model identifier for the request (e.g. "llama3-70b-8192" for the
            larger model).
        temperature: Sampling temperature passed through to the API.
        max_tokens: Upper bound on generated tokens passed through to the API.
        timeout: Seconds to wait for the HTTP request before giving up. Without
            a timeout ``requests.post`` can block indefinitely on a stalled
            connection.

    Returns:
        The content of the first choice's message on HTTP 200. ``None`` when the
        ``API_KEY`` environment variable is missing or the API answers with a
        non-200 status (the reason is printed).

    Raises:
        Whatever ``requests.post`` raises for transport failures, including a
        timeout once ``timeout`` seconds have elapsed.
    """
    api_key = os.getenv('API_KEY')
    if not api_key:
        # Avoid sending a pointless "Bearer None" request.
        print("Error: API_KEY environment variable is not set")
        return None

    url = "https://api.groq.com/openai/v1/chat/completions"

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    data = {
        "model": model,
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "temperature": temperature,
        "max_tokens": max_tokens
    }

    response = requests.post(url, headers=headers, json=data, timeout=timeout)

    if response.status_code == 200:
        return response.json()['choices'][0]['message']['content']

    print(f"Error: {response.status_code}, {response.text}")
    return None

In [19]:
def rag_pipeline(query):
    """Answer ``query`` with retrieval-augmented generation.

    Retrieves matching Reddit posts via ``search``, turns them into a prompt
    with ``build_prompt`` and returns whatever ``lamma3_groq`` returns for
    that prompt.
    """
    retrieved_docs = search(query)
    llm_prompt = build_prompt(query, retrieved_docs)
    return lamma3_groq(llm_prompt)

In [21]:
# End-to-end check: retrieve, build the prompt and display the LLM's answer
# for a question about trending machine-learning topics.
test_query = 'What are some trending topics about machine learning?'
rag_pipeline(test_query) 

'Based on the provided context, some trending topics about machine learning that are mentioned as bottlenecks holding it back include:\n\n1. Generative AI and LLMs (Large Language Models) distracting from more useful applications.\n2. Domain Knowledge driven dataset design.\n3. Memory bandwidth and shuffling large models in and out of memory.\n4. Lack of understanding of the feature space and processing data effectively.\n5. Efficiency and densifying signals.\n6. Limited understanding of network architectures and how the brain works.\n7. Lack of mathematical understanding and the need for more rigorous testing and ablation.\n8. Hype and overselling of work to secure funding and resources.\n9. Benchmaxxing and paper treadmills, where reviews become consistently worse and innovation is stifled.\n10. Gatekeeping, where newcomers face significant barriers to entry due to requirements for published papers, experience, and compute resources.\n\nThese topics are discussed in the comments of t

In [22]:
# End-to-end check with a second question (commonly used models); the answer is
# displayed as the cell output.
test_query = 'What are some common machine learning models reddit users use?'
rag_pipeline(test_query) 

'Based on the provided context, I see that some common machine learning models mentioned by Reddit users include:\n\n* LLMs (Large Language Models)\n* Transformer architecture\n* Generative AI models\n* Federated Learning models (specifically mentioned in the comment section of the post "P Federated Learning on a decentralized protocol (CLI demo, no central server)")\n\nThese models are mentioned in various comments across different posts in the MachineLearning subreddit.'

In [23]:
# End-to-end check with a third question (current LLM discussions); the answer
# is displayed as the cell output.
test_query = 'What are people discussing about LLM now?'
rag_pipeline(test_query) 

'Based on the provided context, people in the MachineLearning subreddit are currently discussing Large Language Models (LLMs) and related topics. Specifically:\n\n1. In a post about the "Favorite ML paper of 2024", every paper discussed is related to LLMs.\n2. In a separate post about "The Big LLM Architecture Comparison", users are discussing issues with token representation and vocabulary sizes in LLMs.\n3. In another post, users are discussing "prompt routing" or "model routing", a concept that involves routing prompts to the most cost-effective LLM that can deliver a high-quality response.\n\nIt seems that LLMs are a dominant topic of discussion in the MachineLearning subreddit at this time.'