In [1]:
!pip install arxiv langchain langchain-community torch transformers pillow datasets

Collecting arxiv
  Downloading arxiv-2.1.3-py3-none-any.whl.metadata (6.1 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.19-py3-none-any.whl.metadata (2.4 kB)
Collecting feedparser~=6.0.10 (from arxiv)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting async-timeout<5.0.0,>=4.0.0 (from langchain)
  Downloading async_timeout-4.0.3-py3-none-any.whl.metadata (4.2 kB)
Collecting langchain-core<0.4.0,>=0.3.25 (from langchain)
  Downloading langchain_core-0.3.44-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain
  Downloading langchain-0.3.20-py3-none-any.whl.metadata (7.7 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.6 (from langchain)
  Downloading langchain_tex

In [2]:
!pip install pymupdf faiss-cpu PyMuPDF requests

Collecting pymupdf
  Downloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m92.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading faiss_cpu-1.10.0-cp310-cp310-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m59.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: pymupdf, faiss-cpu
Successfully installed faiss-cpu-1.10.0 pymupdf-1.25.3


In [3]:
# Step 1: Implement a Basic Text-Based RAG Model Using LangChain

# Import necessary libraries
import arxiv
import os
import requests
import json
from langchain.document_loaders import ArxivLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings  # Use Hugging Face for embeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline  # Use Hugging Face for text generation
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM  # For Hugging Face models
import torch
import fitz
from PIL import Image
from transformers import CLIPProcessor, CLIPModel, WhisperProcessor, WhisperForConditionalGeneration

In [4]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("Hugging_Face_Token")

In [45]:
# Step 1.1: Collect and Preprocess Text Data
def collect_and_preprocess_data(query: str, max_results: int = 10):
    """
    Collect scientific papers from arXiv based on a query and preprocess the text data.
    Args:
        query (str): The search query for arXiv (e.g., "quantum computing").
        max_results (int): Maximum number of papers to retrieve.
    Returns:
        List[Document]: A list of preprocessed documents.
    """
    # Use LangChain's ArxivLoader to fetch papers
    loader = ArxivLoader(query=query, max_results=max_results)
    documents = loader.load()

    # Preprocess the text data
    for doc in documents:
        # Remove LaTeX formatting 
        doc.page_content = doc.page_content.replace("\\", "")

        # Remove special characters and stopwords
        doc.page_content = " ".join([word for word in doc.page_content.split() if word.isalnum()])

    return documents


In [50]:
# Step 1.2: Set Up LangChain for Text Retrieval
def setup_rag_pipeline(documents):
    """
    Set up a basic RAG pipeline using LangChain and Hugging Face models.
    Args:
        documents (List[Document]): Preprocessed documents.
    Returns:
        RetrievalQA: A RAG pipeline for querying documents.
    """
    # Step 1.2.1: Split documents into smaller chunks for better retrieval
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_documents(documents)

    # Step 1.2.2: Generate embeddings for the text chunks using Hugging Face
    model_name = "sentence-transformers/all-MiniLM-L6-v2"  # Lightweight and efficient embedding model
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    vectorstore = FAISS.from_documents(texts, embeddings)

    # Step 1.2.3: Set up the retriever
    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})  # Retrieve top 5 most relevant chunks

    # Step 1.2.4: Integrate a Hugging Face language model for text generation
    tokenizer = AutoTokenizer.from_pretrained("gpt2")  # Use GPT-2 for text generation
    model = AutoModelForCausalLM.from_pretrained("gpt2")
    text_generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=100,
        temperature=0.7
    )
    llm = HuggingFacePipeline(pipeline=text_generator)

    # Step 1.2.5: Create the RAG pipeline
    rag_pipeline = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True
    )

    return rag_pipeline



In [51]:
# Step 1.3: Test the Text-Based RAG Model
def test_rag_pipeline(rag_pipeline, query: str):
    """
    Test the RAG pipeline by querying it with a user question.
    Args:
        rag_pipeline (RetrievalQA): The RAG pipeline.
        query (str): The user's query.
    """
    # Query the RAG pipeline
    result = rag_pipeline({"query": query})

    # Display the result
    print("Answer:", result["result"])
    print("\nSource Documents:")
    for doc in result["source_documents"]:
        print(f"Document: {doc.metadata['title']}")
        print(f"Content: {doc.page_content[:200]}...\n")  # Display first 200 characters


In [52]:
# Main Execution
if __name__ == "__main__":
    # Step 1.1: Collect and preprocess data
    query = "quantum computing"
    documents = collect_and_preprocess_data(query, max_results=5)

    # Step 1.2: Set up the RAG pipeline
    rag_pipeline = setup_rag_pipeline(documents)

    # Step 1.3: Test the RAG pipeline
    user_query = "What is quantum computing?"
    test_rag_pipeline(rag_pipeline, user_query)

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Answer: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

31 Mar 2000 Unconventional Quantum Computing Devices Seth Lloyd Mechanical Engineering MIT 02139 This paper investigates a variety of unconventional quantum computation including fermionic quantum computers and computers that exploit nonlinear tum It is shown that unconventional quantum computing devices can in ciple compute some quantities more rapidly than quantum Computers are what they can and cannot do is determined by the laws of When scientiﬁc progress augments or revises those our picture of what computers can do quantum mechanics is generally accepted as the fundamental dynamical theory of how physical systems Quantum computers can in principle exploit quantum coherence to perform computational tasks that classical puters cannot If someday quantum mechanics should turn out to be incomplete or then our pictur

KeyError: 'title'

In [5]:
# Step 2.1: Load Metadata
def load_metadata(metadata_path):
    """
    Load arXiv metadata from the JSON file.
    Args:
        metadata_path (str): Path to the metadata JSON file.
    Returns:
        List[dict]: List of paper metadata.
    """
    with open(metadata_path, "r") as f:
        metadata = [json.loads(line) for line in f]
    return metadata

# Step 2.2: Download PDFs from arXiv
def download_pdf(paper_id, output_folder):
    """
    Download a PDF from arXiv using the paper ID.
    Args:
        paper_id (str): arXiv paper ID (e.g., "0001.0001").
        output_folder (str): Folder to save the downloaded PDF.
    Returns:
        str: Path to the downloaded PDF.
    """
    pdf_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
    pdf_path = os.path.join(output_folder, f"{paper_id}.pdf")

    # Download the PDF
    response = requests.get(pdf_url)
    if response.status_code == 200:
        with open(pdf_path, "wb") as f:
            f.write(response.content)
        return pdf_path
    else:
        print(f"Failed to download PDF for paper {paper_id}.")
        return None


In [6]:
# Step 2.3: Extract Images from PDFs
def extract_images_from_pdf(pdf_path, output_folder):
    """
    Extract images from a PDF file and save them to the output folder.
    Args:
        pdf_path (str): Path to the PDF file.
        output_folder (str): Folder to save extracted images.
    Returns:
        List[str]: List of paths to extracted images.
    """
    # Open the PDF file
    pdf_document = fitz.open(pdf_path)
    image_paths = []

    # Iterate through pages and extract images
    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        image_list = page.get_images(full=True)

        # Save each image
        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = pdf_document.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            image_filename = f"page_{page_num + 1}_img_{img_index + 1}.{image_ext}"
            image_path = os.path.join(output_folder, image_filename)

            with open(image_path, "wb") as image_file:
                image_file.write(image_bytes)
            image_paths.append(image_path)

    return image_paths

# Step 2.4: Generate Image Embeddings Using CLIP
def extract_and_embed_images(image_paths):
    """
    Extract images and generate embeddings using OpenAI's CLIP model.
    Args:
        image_paths (List[str]): List of paths to images.
    Returns:
        List[torch.Tensor]: List of image embeddings.
    """
    # Load CLIP model and processor
    clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

    # Process images and generate embeddings
    image_embeddings = []
    for image_path in image_paths:
        image = Image.open(image_path)
        inputs = clip_processor(images=image, return_tensors="pt", padding=True)
        with torch.no_grad():
            image_features = clip_model.get_image_features(**inputs)
        image_embeddings.append(image_features)

    return image_embeddings

In [7]:
# Step 2.2: Incorporate Audio/Video Embeddings Using Whisper
def transcribe_and_embed_audio(audio_paths):
    """
    Transcribe audio and generate embeddings for the transcribed text.
    Args:
        audio_paths (List[str]): List of paths to audio files.
    Returns:
        List[torch.Tensor]: List of text embeddings.
    """
    # Load Whisper model and processor
    whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-small")
    whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

    # Load text embedding model
    text_embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    # Transcribe audio and generate embeddings
    text_embeddings = []
    for audio_path in audio_paths:
        # Transcribe audio
        inputs = whisper_processor.from_pretrained(audio_path, return_tensors="pt", sampling_rate=16000)
        with torch.no_grad():
            generated_ids = whisper_model.generate(inputs.input_features)
        transcription = whisper_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        # Generate text embeddings
        embedding = text_embedder.embed_documents([transcription])
        text_embeddings.append(embedding)

    return text_embeddings

In [8]:
# Step 2.3: Combine Multimodal Embeddings
def combine_multimodal_embeddings(image_embeddings, text_embeddings):
    """
    Combine image and text embeddings into a unified embedding space.
    Args:
        image_embeddings (List[torch.Tensor]): List of image embeddings.
        text_embeddings (List[torch.Tensor]): List of text embeddings.
    Returns:
        List[torch.Tensor]: Combined embeddings.
    """
    combined_embeddings = []
    for img_emb, txt_emb in zip(image_embeddings, text_embeddings):
        # Concatenate image and text embeddings (or use another fusion method)
        combined_embedding = torch.cat((img_emb, txt_emb), dim=1)
        combined_embeddings.append(combined_embedding)

    return combined_embeddings

In [9]:
# Main Execution for Step 2
if __name__ == "__main__":
    # Paths
    metadata_path = "/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json"  # Path to metadata
    pdf_output_folder = "/kaggle/working/pdfs"  # Folder to save downloaded PDFs
    image_output_folder = "/kaggle/working/extracted_images"  # Folder to save extracted images

    # Create output folders if they don't exist
    os.makedirs(pdf_output_folder, exist_ok=True)
    os.makedirs(image_output_folder, exist_ok=True)

    # Step 2.1: Load metadata
    metadata = load_metadata(metadata_path)
    print(f"Loaded metadata for {len(metadata)} papers.")

    # Step 2.2: Download PDF for the first paper (for demonstration)
    paper_id = metadata[0]["id"]  # Use the first paper in the metadata
    pdf_path = download_pdf(paper_id, pdf_output_folder)

    if pdf_path:
        print(f"Downloaded PDF for paper {paper_id} to {pdf_path}.")

        # Step 2.3: Extract images from PDF
        image_paths = extract_images_from_pdf(pdf_path, image_output_folder)
        print("Extracted images:", image_paths)

        # Step 2.4: Extract and embed images
        if image_paths:
            image_embeddings = extract_and_embed_images(image_paths)
            print("Image embeddings generated:", len(image_embeddings))
        else:
            print("No images found in the PDF.")
    else:
        print("Failed to download PDF.")


Loaded metadata for 2683176 papers.
Downloaded PDF for paper 0704.0001 to /kaggle/working/pdfs/0704.0001.pdf.
Extracted images: ['/kaggle/working/extracted_images/page_15_img_1.jpeg']


config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

Image embeddings generated: 1
