<h3>Step 1: Fetch CVPR Papers and create JSON dataset</h3>

In [None]:
import arxiv
import os
import pandas as pd
import pymupdf # PyMuPDF
import re
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm

# --- Setup: NLTK Stopwords ---
def _ensure_nltk_stopwords():
    """Make sure the NLTK English stopword list can be loaded.

    The list is probed by loading it once. If NLTK raises LookupError,
    the 'stopwords' resource is downloaded and progress is printed.
    """
    try:
        stopwords.words('english')
    except LookupError:
        print("Downloading NLTK stopwords...")
        nltk.download('stopwords')
        print("Download complete.")


# Run the check once so the cleaning function can rely on the stopword list.
_ensure_nltk_stopwords()

# --- Text Cleaning Function ---
# A robust function to clean the text extracted from PDFs.
def clean_text(text, stop_words=None):
    """
    Clean raw text extracted from a PDF (or an abstract).

    Steps, in order:
    - Removes arXiv identifiers and "Page X of Y" markers.
    - Strips URLs and numeric citation brackets (e.g., [1, 2]).
    - Drops every character that is not an ASCII letter, whitespace or one of
      . , ! ? -  (so digits are removed as well).
    - Collapses all whitespace to single spaces and lowercases the text.
    - Removes stopwords and words shorter than three letters.

    Args:
        text: The raw string to clean. None or an empty string yields "".
        stop_words: Optional iterable of lowercase words to drop. When omitted,
            NLTK's English stopword list is used.

    Returns:
        The cleaned text as a single space-separated string.
    """
    if not text:
        return ""

    # Remove arXiv identifiers and page numbers
    text = re.sub(r'arXiv:\S+', '', text)
    text = re.sub(r'Page \d+ of \d+', '', text)

    # Remove URLs
    text = re.sub(r'http\S+|www\.\S+', '', text, flags=re.MULTILINE)

    # Remove citation brackets like [1], [2, 3], etc.
    text = re.sub(r'\[\d+(?:,\s*\d+)*\]', '', text)

    # Keep only letters, whitespace and basic punctuation (digits are dropped too)
    text = re.sub(r'[^a-zA-Z\s.,!?-]', '', text)

    # Normalize whitespace (tabs, newlines, etc.) to a single space
    text = re.sub(r'\s+', ' ', text).strip()

    # Convert to lowercase
    text = text.lower()

    # Note: for modern embedding models stopword removal is often skipped,
    # because stopwords can carry context. It is kept here as a deliberate
    # preprocessing choice.
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_words = set(stop_words)

    kept_tokens = []
    for token in text.split():
        # Compare the bare word, not the token with punctuation attached;
        # otherwise "this," or "of." slips past the stopword and length filters.
        core = token.strip('.,!?-')
        if len(core) > 2 and core not in stop_words:
            kept_tokens.append(token)

    return ' '.join(kept_tokens)

# --- Main Data Collection Logic ---
# Downloads each matching arXiv paper as a PDF, extracts its text with PyMuPDF,
# cleans it with clean_text and collects one record per paper in all_papers_data.
pdf_dir = "cvpr_papers_pdf"
os.makedirs(pdf_dir, exist_ok=True)

# Upper bound on the number of papers fetched; also used as the progress-bar total.
max_results = 50

search_query = "cat:cs.CV AND (ti:CVPR OR abs:CVPR)"
search = arxiv.Search(
    query=search_query,
    max_results=max_results,
    sort_by=arxiv.SortCriterion.SubmittedDate,
)

# Search.results() is deprecated in the arxiv package (the earlier run of this
# cell emitted a warning on that call); results are fetched through a Client.
client = arxiv.Client()

# This list will hold the final, cleaned data for all papers
all_papers_data = []

print("Starting data collection and preprocessing pipeline...")

for result in tqdm(client.results(search), total=max_results, desc="Processing Papers"):
    try:
        paper_id = result.entry_id.split('/')[-1]
        pdf_filename = f"{paper_id}.pdf"
        pdf_path = os.path.join(pdf_dir, pdf_filename)

        # 1. Download the PDF
        result.download_pdf(dirpath=pdf_dir, filename=pdf_filename)

        # 2. Extract text page by page with PyMuPDF and join once
        with pymupdf.open(pdf_path) as doc:
            full_text = " ".join(page.get_text() for page in doc)

        if not full_text.strip():
            print(f"Warning: No text extracted for {paper_id}. Skipping.")
            continue

        # 3. Clean the extracted text; the summary is cleaned the same way for consistency
        cleaned_full_text = clean_text(full_text)
        cleaned_summary = clean_text(result.summary)

        # 4. Store the processed data
        all_papers_data.append({
            "id": paper_id,
            "title": result.title,
            "authors": [author.name for author in result.authors],
            "summary": cleaned_summary,
            "full_text": cleaned_full_text,
            "published_date": result.published.isoformat()
        })

    except Exception as e:
        # Best effort: one failing paper is reported and must not abort the whole run.
        print(f"Failed to process paper {result.entry_id}. Error: {e}")

print(f"\nSuccessfully processed {len(all_papers_data)} papers.")

if all_papers_data:
    print("\n--- Example of Processed Paper ---")
    print("Title:", all_papers_data[0]['title'])
    print("Cleaned Summary:", all_papers_data[0]['summary'][:500])
    print("Cleaned Full Text (start):", all_papers_data[0]['full_text'][:500])
    print("--- End of Example ---")


  for result in tqdm(search.results(), desc="Processing Papers"):


Starting data collection and preprocessing pipeline...


Processing Papers: 50it [04:10,  5.01s/it]


Successfully processed 50 papers.

--- Example of Processed Paper ---
Title: SLRTP2025 Sign Language Production Challenge: Methodology, Results, and Future Work
Cleaned Summary: sign language production slp task generating sign language video spoken language inputs. field seen range innovations last years, introduction deep learning-based approaches providing significant improvements realism naturalness generated outputs. however, lack standardized evaluation metrics slp approaches hampers meaningful comparisons across different systems. address this, introduce first sign language production challenge, held part third slrtp workshop cvpr competitions aims evaluate archi
Cleaned Full Text (start): slrtp sign language production challenge methodology, results, future work harry walsh, fish, ozge mercanoglu sincan, mohamed ilyes lakhal, richard bowden, neil fox, bencie woll, kepeng wu, zecheng li, weichao zhao, haodong wang, wengang zhou, houqiang li, shengeng tang, jiayi he, wang, ruobe




<h3>Step 2: Save the processed data to a JSONL file</h3>

In [None]:
# Cell 3: Save Processed Data to JSON Lines file
import json
import os

# Define the directory and filename for the output
output_dir = "processed_data"
output_filename = "cvpr_papers_cleaned.jsonl"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, output_filename)

print(f"Saving processed data to {output_path}...")

# Write the data to a JSON Lines file: one JSON object per line
with open(output_path, 'w', encoding='utf-8') as f:
    for paper_data in all_papers_data:
        # Convert each paper's dictionary to a JSON string and write it as a new line
        f.write(json.dumps(paper_data) + '\n')

print("Successfully saved the data.")


# Read the first record back as a sanity check.
print("\nVerifying the saved file...")
with open(output_path, 'r', encoding='utf-8') as f:
    first_line = f.readline()

# An empty file (no papers processed) has no first line; json.loads("") would raise.
if first_line.strip():
    first_paper = json.loads(first_line)
    print("Successfully read back the first paper's title:", first_paper['title'])
else:
    print("The saved file is empty: no papers were processed, so there is nothing to verify.")


Saving processed data to processed_data\cvpr_papers_cleaned.jsonl...
Successfully saved the data.

Verifying the saved file...
Successfully read back the first paper's title: SLRTP2025 Sign Language Production Challenge: Methodology, Results, and Future Work


<h3>Chunking the cleaned data to build the FAISS vector database</h3>

In [None]:
#Load Data and Chunk Documents
import json
from langchain.text_splitter import RecursiveCharacterTextSplitter
from tqdm import tqdm

# --- Read the cleaned papers back from disk (one JSON object per line) ---
processed_data_path = "processed_data/cvpr_papers_cleaned.jsonl"
papers = []
with open(processed_data_path, 'r', encoding='utf-8') as f:
    papers.extend(json.loads(line) for line in f)

print(f"Loaded {len(papers)} papers from {processed_data_path}")

# --- Configure the splitter ---
# Chunks hold at most 1000 characters (measured with len) and consecutive
# chunks share 200 characters; add_start_index records each chunk's offset
# within the text it was cut from.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    add_start_index=True,
)

# --- Split every paper and tag the resulting chunks ---
all_chunks = []
print("\nStarting to chunk documents...")

for paper in tqdm(papers, desc="Chunking Papers"):
    # Title, summary and body are combined so each part is labelled in the split text.
    combined_text = "\n\n".join([
        f"Title: {paper['title']}",
        f"Summary: {paper['summary']}",
        f"Full Text: {paper['full_text']}",
    ])

    paper_chunks = text_splitter.create_documents([combined_text])

    # Record which paper each chunk came from.
    for doc in paper_chunks:
        doc.metadata.update(paper_id=paper['id'], title=paper['title'])

    all_chunks += paper_chunks

print(f"\nCreated a total of {len(all_chunks)} chunks from {len(papers)} papers.")

# --- Verification (Optional) ---
# Show the first chunk, if there is one, as a quick visual check.
if all_chunks:
    sample = all_chunks[0]
    print("\n--- Example of a Chunk ---")
    print(f"Paper ID: {sample.metadata['paper_id']}")
    print(f"Title: {sample.metadata['title']}")
    print(f"Chunk Content (start): {sample.page_content[:500]}...")
    print("--- End of Example ---")


Loaded 50 papers from processed_data/cvpr_papers_cleaned.jsonl

Starting to chunk documents...


Chunking Papers: 100%|██████████| 50/50 [00:00<00:00, 124.56it/s]


Created a total of 2310 chunks from 50 papers.

--- Example of a Chunk ---
Paper ID: 2508.06951v1
Title: SLRTP2025 Sign Language Production Challenge: Methodology, Results, and Future Work
Chunk Content (start): Title: SLRTP2025 Sign Language Production Challenge: Methodology, Results, and Future Work...
--- End of Example ---





In [None]:
#Save the processed chunks to a file
import json
import os

# --- Flatten each Document into a plain dictionary ---
# Only the text and the metadata dict are kept, which json.dumps can serialize.
chunks_to_save = []
for chunk in all_chunks:
    chunks_to_save.append({
        "page_content": chunk.page_content,
        "metadata": chunk.metadata,
    })

# --- Write the chunks as JSON Lines ---
output_dir = "processed_data"
output_filename = "cvpr_papers_chunks.jsonl"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, output_filename)

print(f"Saving {len(chunks_to_save)} chunks to {output_path}...")

with open(output_path, 'w', encoding='utf-8') as f:
    # One JSON object per line.
    f.writelines(json.dumps(chunk_dict) + '\n' for chunk_dict in chunks_to_save)

print("Successfully saved the chunks.")

# --- Verification (Optional) ---
# Read the first record back; nothing is printed when the file is empty.
print("\nVerifying the saved chunks file...")
with open(output_path, 'r', encoding='utf-8') as f:
    first_line = f.readline()

if first_line:
    first_chunk = json.loads(first_line)
    print("Successfully read back the first chunk's metadata:", first_chunk['metadata'])


Saving 2310 chunks to processed_data\cvpr_papers_chunks.jsonl...
Successfully saved the chunks.

Verifying the saved chunks file...
Successfully read back the first chunk's metadata: {'start_index': 0, 'paper_id': '2508.06951v1', 'title': 'SLRTP2025 Sign Language Production Challenge: Methodology, Results, and Future Work'}


<h3>Building FAISS Vector Database</h3>

In [None]:
#Load Chunks, Embed, and Build FAISS Index
import json
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document # Needed to reconstruct Document objects

# --- Load the saved chunks from the JSONL file ---
chunks_path = "processed_data/cvpr_papers_chunks.jsonl"
loaded_chunks = []

print(f"Loading chunks from {chunks_path}...")
with open(chunks_path, 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines in the JSONL file
        data = json.loads(line)
        # Recreate the LangChain Document object from the dictionary
        loaded_chunks.append(
            Document(page_content=data['page_content'], metadata=data['metadata'])
        )

print(f"Successfully loaded {len(loaded_chunks)} chunks.")

# Fail early with a clear message instead of an obscure error inside the index build.
if not loaded_chunks:
    raise ValueError(f"No chunks found in {chunks_path}; run the chunking step first.")

# --- Initialize the Embedding Model ---
print("\nLoading embedding model...")
model_name = "sentence-transformers/all-MiniLM-L6-v2"

# Choose the device automatically so the cell also runs on machines without a GPU.
try:
    import torch
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
except ImportError:
    device = 'cpu'
print(f"Using device: {device}")

model_kwargs = {'device': device}
encode_kwargs = {'normalize_embeddings': False}
embeddings_function = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)
print("Embedding model loaded.")

# --- Build the FAISS Vector Store from the loaded chunks ---
print("\nBuilding FAISS index from loaded document chunks... (This will take a few minutes)")
vector_store = FAISS.from_documents(
    documents=loaded_chunks,
    embedding=embeddings_function
)

# --- Save the FAISS Index to Disk ---
faiss_index_path = "faiss_cvpr_index"
vector_store.save_local(faiss_index_path)

print(f"\nFAISS index built and saved to '{faiss_index_path}'")

# --- Verification (Optional) ---
# Run one similarity search against the fresh index and show the top hits.
print("\nVerifying the vector store with a test query...")
query = "What are some novel approaches to 3D human motion modeling?"
results = vector_store.similarity_search(query, k=3)

print(f"\nTest Query: '{query}'")
print("Top 3 similar chunks found:")
for i, doc in enumerate(results):
    print(f"\n--- Result {i+1} ---")
    print(f"Source Paper Title: {doc.metadata['title']}")
    print(f"Chunk Content: {doc.page_content[:600]}...")
    print("-" * 20)



Loading chunks from processed_data/cvpr_papers_chunks.jsonl...
Successfully loaded 2310 chunks.

Loading embedding model...


  embeddings_function = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


Embedding model loaded.

Building FAISS index from loaded document chunks... (This will take a few minutes)

FAISS index built and saved to 'faiss_cvpr_index'

Verifying the vector store with a test query...

Test Query: 'What are some novel approaches to 3D human motion modeling?'
Top 3 similar chunks found:

--- Result 1 ---
Source Paper Title: EventEgo3D++: 3D Human Motion Capture from a Head-Mounted Event Camera
Chunk Content: Title: EventEgo3D++: 3D Human Motion Capture from a Head-Mounted Event Camera...
--------------------

--- Result 2 ---
Source Paper Title: EventEgo3D++: 3D Human Motion Capture from a Head-Mounted Event Camera
Chunk Content: data fine-tuning evaluating method outdoor environments. thirdly, provide allocentric rgb views smpl loper al, body annotations real datasets, thereby providing comprehensive dataset advancing research. inclusion in-the-wild data ensures robustness real-world conditions, smpl body annotations provide dense human correspondences, making d