In [1]:
!pip install arxiv langchain langchain-community torch transformers pillow datasets pymupdf faiss-cpu PyMuPDF requests

Defaulting to user installation because normal site-packages is not writeable


In [7]:
import arxiv

def fetch_arxiv_metadata(arxiv_id):
    """Fetches title, authors, and abstract from arXiv using the arxiv_id."""
    search = arxiv.Search(id_list=[arxiv_id])
    paper = next(search.results(), None)
    
    if paper:
        metadata = {
            "title": paper.title,
            "authors": [author.name for author in paper.authors],
            "abstract": paper.summary,
            "doi": paper.doi if paper.doi else "DOI Not Available",
            "published": paper.published,
            "url": paper.entry_id
        }
        return metadata
    else:
        return {"error": "Paper not found on arXiv"}

# Extracted arXiv ID from the file (0704.0001)
arxiv_id = "0704.0001"

# Fetch and print metadata
metadata = fetch_arxiv_metadata(arxiv_id)


  paper = next(search.results(), None)


In [9]:
print(*metadata.items(), sep="\n")

('title', 'Calculation of prompt diphoton production cross sections at Tevatron and LHC energies')
('authors', ['C. Balázs', 'E. L. Berger', 'P. M. Nadolsky', 'C. -P. Yuan'])
('abstract', 'A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with data from the Fermilab Tevatron, and predictions are made for\nmore detailed tests with CDF and DO data. Predictions are shown for\ndistributions of diphoton pairs produced at the energy of the Large Hadron\nCollider (LHC). Distributions of the diphoton pairs from the decay of 

In [4]:


meta = PdfReader("tmp/0704.0001.pdf").metadata
print(*meta.items(), sep="\n")

('/Producer', 'dvips + GPL Ghostscript GIT PRERELEASE 9.22')
('/CreationDate', "D:20181023213259-04'00'")
('/ModDate', "D:20181023213259-04'00'")
('/Creator', 'LaTeX with hyperref package')
('/Title', '')
('/Subject', '')
('/Author', '')
('/Keywords', '')


In [None]:
import os
import json
import logging
logger = logging.getLogger(__name__)

PATH_SAVED_QUERIES = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "data",
    "saved_queries"
)

filename = query2filename(query)
filepath = os.path.join(PATH_SAVED_QUERIES, filename)
logger.info(f"Saving response to {filepath}")

print()

#### Import Libraries

In [1]:
import os
import json
import requests
from pypdf import PdfReader
import pymupdf  
import numpy as np
import torch
from tqdm import tqdm
from PIL import Image
from transformers import CLIPModel, CLIPProcessor, WhisperProcessor, WhisperForConditionalGeneration
from sentence_transformers import SentenceTransformer
import faiss
import gc

from IPython.display import Image, display


In [5]:
import os
import re
from pypdf import PdfReader
from PIL import Image

# Path to the uploaded PDF file
pdf_path = "tmp/0704.0001.pdf"
output_dir = "extracted_figures"
os.makedirs(output_dir, exist_ok=True)

# Open the PDF
reader = PdfReader(pdf_path)

# Loop through each page
for page_num, page in enumerate(reader.pages):
    text = page.extract_text()  # Extract text from page
    if not text:
        continue
    
    # Search for "Figure" (case insensitive)
    if re.search(r"\bFigure\b", text, re.IGNORECASE):
        print(f"Figure detected on page {page_num+1}")

        # Extract images from the page
        for img_index, img in enumerate(page.images):
            img_data = img.data
            img_ext = img.name.split(".")[-1] if "." in img.name else "png"

            # Save image
            image_path = os.path.join(output_dir, f"figure_p{page_num+1}_{img_index}.{img_ext}")
            with open(image_path, "wb") as img_file:
                img_file.write(img_data)

print(f"Figures extracted and saved in {output_dir}.")


Figures extracted and saved in extracted_figures.


#### Load Metadata

In [2]:
def load_metadata(metadata_path):
    """
    Load arXiv metadata from the JSON file.
    Args:
        metadata_path (str): Path to the metadata JSON file.
    Returns:
        List[dict]: List of paper metadata.
    """
    with open(metadata_path, "r") as f:
        metadata = [json.loads(line) for line in f]
    return metadata

#### Download PDFs from arXiv

In [3]:
def download_pdf(paper_id, output_folder):
    """
    Download a PDF from arXiv using the paper ID.
    Args:
        paper_id (str): arXiv paper ID (e.g., "0001.0001").
        output_folder (str): Folder to save the downloaded PDF.
    Returns:
        str: Path to the downloaded PDF.
    """
    pdf_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
    pdf_path = os.path.join(output_folder, f"{paper_id}.pdf")

    # Download the PDF
    response = requests.get(pdf_url)
    if response.status_code == 200:
        with open(pdf_path, "wb") as f:
            f.write(response.content)
        return pdf_path
    else:
        print(f"Failed to download PDF for paper {paper_id}.")
        return None

#### Projection Layer

In [4]:
import torch.nn as nn

# Define a projection layer to normalize embedding dimensions
class ProjectionLayer(nn.Module):
    def __init__(self, input_dim, output_dim=512):
        super(ProjectionLayer, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim, bias=False)

    def forward(self, x):
        return self.linear(x)

# Initialize projection layer for text embeddings
projection_layer_text = ProjectionLayer(input_dim=384, output_dim=512)  # Map text embeddings to 512 dimensions

#### Extract Images from PDFs

In [14]:
def extract_images_from_pdf(pdf_path, output_folder):
    """
    Extract images from a PDF file and save them to the output folder.
    Args:
        pdf_path (str): Path to the PDF file.
        output_folder (str): Folder to save extracted images.
    Returns:
        List[str]: List of paths to extracted images.
    """
    # Open the PDF file
    pdf_document = pymupdf.open(pdf_path)
    image_paths = []

    # Iterate through pages and extract images
    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        image_list = page.get_images(full=True)

        # Save each image
        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = pdf_document.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            image_filename = f"page_{page_num + 1}_img_{img_index + 1}.{image_ext}"
            image_path = os.path.join(output_folder, image_filename)

            with open(image_path, "wb") as image_file:
                image_file.write(image_bytes)
            image_paths.append(image_path)

    return image_paths

In [44]:
import fitz
import io
from PIL import Image
import os

def extract_all_possible_images(pdf_path, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    pdf = fitz.open(pdf_path)
    image_count = 0
    
    # Method 1: Extract images using get_images()
    for page_num in range(len(pdf)):
        page = pdf[page_num]
        image_list = page.get_images(full=True)
        
        for img_index, img in enumerate(image_list):
            try:
                xref = img[0]
                base_image = pdf.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]
                
                image_count += 1
                output_filename = f"{output_dir}/method1_page{page_num+1}_img{img_index}.{image_ext}"
                
                with open(output_filename, "wb") as image_file:
                    image_file.write(image_bytes)
                
                print(f"Method 1: Saved image {image_count} from page {page_num+1}")
            except Exception as e:
                print(f"Method 1: Failed on page {page_num+1}, img {img_index}: {e}")
    
    # Method 2: Extract images from page dictionaries
    for page_num in range(len(pdf)):
        page = pdf[page_num]
        
        try:
            # Process XObject resources
            if "XObject" in page.get_resources():
                xobjects = page.get_resources()["XObject"]
                for key, xobject in xobjects.items():
                    try:
                        if xobject.get("Subtype") == "Image":
                            image_count += 1
                            
                            # Try to extract image data
                            pix = fitz.Pixmap(pdf, xobject.get("SMask", 0))
                            output_filename = f"{output_dir}/method2_page{page_num+1}_{key}.png"
                            pix.save(output_filename)
                            print(f"Method 2: Saved image {image_count} from page {page_num+1}")
                    except Exception as e:
                        print(f"Method 2: Failed on page {page_num+1}, key {key}: {e}")
        except Exception as e:
            print(f"Method 2: Failed to process resources on page {page_num+1}: {e}")
    
    # Method 3: Render full pages at high resolution
    for page_num in range(len(pdf)):
        page = pdf[page_num]
        
        try:
            pix = page.get_pixmap(matrix=fitz.Matrix(3, 3))
            output_filename = f"{output_dir}/method3_page{page_num+1}.png"
            pix.save(output_filename)
            image_count += 1
            print(f"Method 3: Saved full page {page_num+1}")
        except Exception as e:
            print(f"Method 3: Failed to render page {page_num+1}: {e}")
    
    pdf.close()
    return image_count


In [45]:
# TEST extract_images_from_pdf
image_paths = extract_all_possible_images("tmp/0704.0001.pdf", "tmp/images")

for imageName in image_paths:
    display(Image(filename=imageName))

Method 1: Saved image 1 from page 15
Method 2: Failed to process resources on page 1: 'Page' object has no attribute 'get_resources'
Method 2: Failed to process resources on page 2: 'Page' object has no attribute 'get_resources'
Method 2: Failed to process resources on page 3: 'Page' object has no attribute 'get_resources'
Method 2: Failed to process resources on page 4: 'Page' object has no attribute 'get_resources'
Method 2: Failed to process resources on page 5: 'Page' object has no attribute 'get_resources'
Method 2: Failed to process resources on page 6: 'Page' object has no attribute 'get_resources'
Method 2: Failed to process resources on page 7: 'Page' object has no attribute 'get_resources'
Method 2: Failed to process resources on page 8: 'Page' object has no attribute 'get_resources'
Method 2: Failed to process resources on page 9: 'Page' object has no attribute 'get_resources'
Method 2: Failed to process resources on page 10: 'Page' object has no attribute 'get_resources'
Me

TypeError: 'int' object is not iterable

#### Generate Image Embeddings

In [18]:
def extract_and_embed_images(image_paths):
    """
    Extract images and generate embeddings using OpenAI's CLIP model.
    Args:
        image_paths (List[str]): List of paths to images.
    Returns:
        List[np.ndarray]: List of image embeddings as NumPy arrays.
    """
    import numpy as np
    from PIL import Image

    # Load CLIP model and processor
    clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

    # Process images and generate embeddings
    image_embeddings = []
    for image_path in image_paths:
        try:
            # Open the image and convert to RGB
            image = Image.open(image_path).convert("RGB")
            
            # Convert to numpy array and fix shape
            image_np = np.array(image)
            
            # Fix ambiguous shapes (e.g., (1, 477, 3) → (477, 3, 3))
            if image_np.ndim == 3 and image_np.shape[0] == 1:
                # Remove singleton dimension (e.g., shape becomes (477, 3))
                image_np = np.squeeze(image_np, axis=0)
                # Replicate to create 3 channels if needed
                if image_np.ndim == 2:
                    image_np = np.stack([image_np] * 3, axis=-1)
            
            # Ensure 3 channels
            if image_np.shape[-1] != 3:
                image_np = np.stack([image_np[..., 0]] * 3, axis=-1)
            
            # Convert back to PIL Image
            image = Image.fromarray(image_np)
            
            # Process the image with CLIP
            inputs = clip_processor(images=image, return_tensors="pt", padding=True)
            with torch.no_grad():
                image_features = clip_model.get_image_features(**inputs)
                
            image_embeddings.append(image_features.cpu().numpy())
        except Exception as e:
            print(f"Error processing image {image_path}: {e}")
            continue

    return image_embeddings

#### Generate Text Embeddings

In [19]:
def extract_and_embed_text(metadata, projection_layer=None):
    """
    Generate text embeddings for titles and abstracts using a sentence transformer.
    Args:
        metadata (List[dict]): List of paper metadata.
        projection_layer (nn.Module): Projection layer to normalize embedding dimensions.
    Returns:
        List[np.ndarray]: List of text embeddings as NumPy arrays.
    """
    # Load text embedding model
    text_embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

    # Generate text embeddings
    text_embeddings = []
    for paper in metadata:
        text = f"{paper['title']} {paper['abstract']}"
        embedding = text_embedder.encode(text)
        if projection_layer:
            embedding = torch.tensor(embedding, dtype=torch.float32)
            with torch.no_grad():
                embedding = projection_layer(embedding).numpy()
        text_embeddings.append(embedding)

    return text_embeddings

#### Save Embeddings

In [20]:
def save_embeddings(embeddings, output_folder, batch_id, prefix):
    """
    Save embeddings to disk as a NumPy file.
    Args:
        embeddings (List[np.ndarray]): List of embeddings.
        output_folder (str): Folder to save embeddings.
        batch_id (int): Batch ID for the filename.
        prefix (str): Prefix for the filename (e.g., "image" or "text").
    """
    # Convert embeddings to a 2D NumPy array
    embeddings_array = np.vstack(embeddings)

    # Save embeddings to disk
    embeddings_path = os.path.join(output_folder, f"{prefix}_embeddings_batch_{batch_id}.npy")
    np.save(embeddings_path, embeddings_array)
    print(f"Saved {prefix} embeddings to {embeddings_path}")

#### Build and Save Faiss Index

In [21]:
def build_faiss_index(embeddings, projection_layer=None):
    """
    Build a FAISS index from the embeddings.
    Args:
        embeddings (List[np.ndarray]): List of embeddings.
        projection_layer (nn.Module): Projection layer to normalize embedding dimensions.
    Returns:
        faiss.Index: FAISS index.
    """
    # Convert embeddings to a 2D NumPy array
    embeddings_array = np.vstack(embeddings)

    # Apply projection layer if provided
    if projection_layer:
        embeddings_array = torch.tensor(embeddings_array, dtype=torch.float32)
        with torch.no_grad():
            embeddings_array = projection_layer(embeddings_array).numpy()

    # Build FAISS index
    dimension = embeddings_array.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings_array)

    return index

# Save FAISS Index and Metadata
def save_faiss_index_and_metadata(index, id_to_doc, output_folder):
    """
    Save the FAISS index and id_to_doc mappings.
    Args:
        index (faiss.Index): FAISS index.
        id_to_doc (dict): Mapping from IDs to document metadata.
        output_folder (str): Folder to save the index and metadata.
    """
    # Save FAISS index
    faiss_index_path = os.path.join(output_folder, "faiss_index.index")
    faiss.write_index(index, faiss_index_path)
    print(f"Saved FAISS index to {faiss_index_path}")

    # Save id_to_doc mappings
    id_to_doc_path = os.path.join(output_folder, "id_to_doc.json")
    with open(id_to_doc_path, "w") as f:
        json.dump(id_to_doc, f)
    print(f"Saved id_to_doc mappings to {id_to_doc_path}")

#### Main Execution

In [23]:
def process_papers(metadata, output_base, num_papers=500, batch_size=10):
    # Create output directories
    os.makedirs(output_base, exist_ok=True)
    pdf_dir = os.path.join(output_base, "pdfs")
    img_dir = os.path.join(output_base, "images")
    emb_dir = os.path.join(output_base, "embeddings")
    index_dir = os.path.join(output_base, "faiss_index")
    
    for d in [pdf_dir, img_dir, emb_dir, index_dir]:
        os.makedirs(d, exist_ok=True)

    # Initialize models
    text_encoder = SentenceTransformer("all-MiniLM-L6-v2")
    clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    projection = ProjectionLayer(384, 512).eval()
    
    # Initialize FAISS and metadata
    id_to_doc = {}
    global_idx = 0
    index = faiss.IndexFlatL2(512)
    
    # Process papers
    for batch_idx in tqdm(range(0, num_papers, batch_size)):
        batch = metadata[batch_idx:batch_idx+batch_size]
        
        for paper in batch:
            paper_id = paper['id']
            pdf_path = download_pdf(paper_id, pdf_dir)
            if not pdf_path:
                continue

            # Text Embedding
            text = f"{paper['title']} {paper['abstract']}"
            text_emb = text_encoder.encode(text)
            with torch.no_grad():
                proj_emb = projection(torch.tensor(text_emb)).numpy()
            
            # Add to index and map ID
            index.add(proj_emb.reshape(1, -1))
            id_to_doc[global_idx] = {
                'type': 'text',
                'paper_id': paper_id,
                'path': pdf_path
            }
            global_idx += 1

            # Image Processing
            try:
                images = extract_images_from_pdf(pdf_path, img_dir)
                for img_path in images:
                    img = Image.open(img_path).convert('RGB')
                    inputs = clip_processor(images=img, return_tensors="pt")
                    with torch.no_grad():
                        img_emb = clip_model.get_image_features(**inputs).cpu().numpy()
                    
                    # Add to index
                    index.add(img_emb.reshape(1, -1))
                    id_to_doc[global_idx] = {
                        'type': 'image',
                        'paper_id': paper_id,
                        'path': img_path
                    }
                    global_idx += 1
            except Exception as e:
                print(f"Image error {paper_id}: {e}")

        # Save incremental state
        if (batch_idx // batch_size) % 10 == 0:
            faiss.write_index(index, os.path.join(index_dir, f"temp_index_{batch_idx}.index"))
            with open(os.path.join(index_dir, f"temp_mapping_{batch_idx}.json"), 'w') as f:
                json.dump(id_to_doc, f)

    # Final save
    faiss.write_index(index, os.path.join(index_dir, "final_index.index"))
    with open(os.path.join(index_dir, "final_mapping.json"), 'w') as f:
        json.dump(id_to_doc, f)
    torch.save(projection.state_dict(), os.path.join(index_dir, "projection.pt"))
    
    return index, id_to_doc

In [None]:
# Indexing
index, mapping = process_papers(
    metadata=load_metadata("/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json"), 
    output_base="/kaggle/working",
    num_papers=500
)


#### Retrieval Pipeline

In [24]:
class ArXivRetriever:
    def __init__(self, index_path, mapping_path, projection_path):
        """
        Initialize the retriever.
        
        Args:
            index_path (str): Path to the FAISS index.
            mapping_path (str): Path to the id_to_doc mapping JSON file.
            projection_path (str): Path to the projection layer weights.
        """
        # Load FAISS index
        self.index = faiss.read_index(index_path)
        
        # Load id_to_doc mapping
        with open(mapping_path) as f:
            self.id_to_doc = json.load(f)
        
        # Load projection layer
        self.projection = ProjectionLayer(384, 512)
        self.projection.load_state_dict(torch.load(projection_path))
        self.projection.eval()
        
        # Load text embedding model
        self.text_encoder = SentenceTransformer("all-MiniLM-L6-v2")
    
    def embed_query(self, text):
        """
        Embed and project the query text.
        
        Args:
            text (str): The query text.
        
        Returns:
            np.ndarray: The projected query embedding.
        """
        
        query_embedding = self.text_encoder.encode(text)
        with torch.no_grad():
            query_embedding = self.projection(torch.tensor(query_embedding)).numpy()
        # Ensure normalization
        faiss.normalize_L2(query_embedding.reshape(1, -1))  # <-- Critical fix
        return query_embedding
    
    def query(self, text, k=10):
        """
        Retrieve relevant text and image results for a query.
        
        Args:
            text (str): The query text.
            k (int): Number of results to retrieve.
        
        Returns:
            List[dict]: A list of results, each containing:
                - type: "text" or "image"
                - paper_id: The arXiv paper ID
                - path: Path to the PDF or image
                - score: Similarity score
                - text: Text content (for text results)
            """
        # Embed the query
        query_embedding = self.embed_query(text)
        
        # Reshape query_embedding to 2D (1 query, embedding_dimension)
        query_embedding = query_embedding.reshape(1, -1)
        
        # Search the FAISS index
        distances, indices = self.index.search(query_embedding, k)
        
        # Extract relevant metadata
        results = []
        for idx, dist in zip(indices[0], distances[0]):
            if str(idx) in self.id_to_doc:
                doc = self.id_to_doc[str(idx)]
                result = {
                    'type': doc['type'],
                    'paper_id': doc['paper_id'],  # Use 'paper_id' instead of 'paper'
                    #'path': doc['path'],
                    'score': float(dist),
                }
                if doc['type'] == 'text':
                    result['text'] = doc.get('text', '')  # Add text content for text results
                results.append(result)
        
        return results
       
    def display_results(self, query, k=10):
        """
        Display both text and image results for a query.
        
        Args:
            query (str): The query text.
            k (int): Number of results to retrieve.
        """
        # Retrieve results
        results = self.query(query, k)
        
        # Display results
        print(f"Results for query: '{query}'\n")
        for res in results:
            if res['type'] == 'text':
                print(f"Text from Paper {res['paper_id']} (Score: {res['score']:.4f}):")
                print(res.get('text', ''))  # Display text content
                print(f"PDF Path: {res['path']}\n")
            elif res['type'] == 'image':
                print(f"Image from Paper {res['paper_id']} (Score: {res['score']:.4f}):")
                print(f"Image Path: {res['path']}\n")

    def query_images(self, text, k=5):
        """
        Retrieve only image results.
        """
        results = self.query(text, k=k)
        print("All results (text + images):")
        for res in results[:10]:  # Print top 10 for inspection
            print(f"Type: {res['type']}, Score: {res['score']:.2f}")
        image_results = [res for res in results if res['type'] == 'image']
        return image_results
    
    def query_texts(self, text, k=5):
        """
        Retrieve only text results.
        """
        results = self.query(text, k=k)
        text_results = [res for res in results if res['type'] == 'text']
        return text_results

In [25]:

# Retrieval
retriever = ArXivRetriever(
    index_path="/kaggle/working/faiss_index/final_index.index",
    mapping_path="/kaggle/working/faiss_index/final_mapping.json",
    projection_path="/kaggle/working/faiss_index/projection.pt"
)

# Query
results = retriever.query("transformer neural networks", k=5)
for res in results:
    print(f"{res['type'].upper()} from {res['paper_id']} (Score: {res['score']:.2f})")
    print(f"Path: {res['path']}\n")

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json'

In [34]:
# Retrieval
retriever = ArXivRetriever(
    index_path="/kaggle/working/faiss_index/final_index.index",
    mapping_path="/kaggle/working/faiss_index/final_mapping.json",
    projection_path="/kaggle/working/faiss_index/projection.pt"
)

# Query
results = retriever.query("Quantum computing", k=5)
for res in results:
    print(f"{res['type'].upper()} from {res['paper_id']} (Score: {res['score']:.2f})")
    print(f"Path: {res['path']}\n")

  self.projection.load_state_dict(torch.load(projection_path))


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

TEXT from 0704.0202 (Score: 0.34)
Path: /kaggle/working/pdfs/0704.0202.pdf

TEXT from 0704.0323 (Score: 0.44)
Path: /kaggle/working/pdfs/0704.0323.pdf

TEXT from 0704.0268 (Score: 0.47)
Path: /kaggle/working/pdfs/0704.0268.pdf

TEXT from 0704.0482 (Score: 0.47)
Path: /kaggle/working/pdfs/0704.0482.pdf

TEXT from 0704.0051 (Score: 0.49)
Path: /kaggle/working/pdfs/0704.0051.pdf



In [35]:
# Retrieval
retriever = ArXivRetriever(
    index_path="/kaggle/working/faiss_index/final_index.index",
    mapping_path="/kaggle/working/faiss_index/final_mapping.json",
    projection_path="/kaggle/working/faiss_index/projection.pt"
)

# Query
results = retriever.query("Diagram", k=5)
for res in results:
    print(f"{res['type'].upper()} from {res['paper_id']} (Score: {res['score']:.2f})")
    print(f"Path: {res['path']}\n")

  self.projection.load_state_dict(torch.load(projection_path))


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

TEXT from 0704.0010 (Score: 0.68)
Path: /kaggle/working/pdfs/0704.0010.pdf

TEXT from 0704.0379 (Score: 0.68)
Path: /kaggle/working/pdfs/0704.0379.pdf

TEXT from 0704.0051 (Score: 0.73)
Path: /kaggle/working/pdfs/0704.0051.pdf

TEXT from 0704.0026 (Score: 0.73)
Path: /kaggle/working/pdfs/0704.0026.pdf

TEXT from 0704.0112 (Score: 0.77)
Path: /kaggle/working/pdfs/0704.0112.pdf



In [36]:
# Retrieval
retriever = ArXivRetriever(
    index_path="/kaggle/working/faiss_index/final_index.index",
    mapping_path="/kaggle/working/faiss_index/final_mapping.json",
    projection_path="/kaggle/working/faiss_index/projection.pt"
)

# Query
results = retriever.query("Model", k=5)
for res in results:
    print(f"{res['type'].upper()} from {res['paper_id']} (Score: {res['score']:.2f})")
    print(f"Path: {res['path']}\n")

  self.projection.load_state_dict(torch.load(projection_path))


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

TEXT from 0704.0445 (Score: 0.59)
Path: /kaggle/working/pdfs/0704.0445.pdf

TEXT from 0704.0271 (Score: 0.63)
Path: /kaggle/working/pdfs/0704.0271.pdf

TEXT from 0704.0107 (Score: 0.69)
Path: /kaggle/working/pdfs/0704.0107.pdf

TEXT from 0704.0444 (Score: 0.69)
Path: /kaggle/working/pdfs/0704.0444.pdf

TEXT from 0704.0131 (Score: 0.70)
Path: /kaggle/working/pdfs/0704.0131.pdf



In [20]:
import numpy as np
import os

def save_embeddings_from_faiss(faiss_index, output_folder):
    """
    Save embeddings from a FAISS index to disk.
    
    Args:
        faiss_index (faiss.Index): The FAISS index containing embeddings.
        output_folder (str): Folder to save the embeddings.
    """
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)
    
    # Get the total number of embeddings in the index
    num_embeddings = faiss_index.ntotal
    
    # Initialize a list to store embeddings
    embeddings = []
    
    # Extract embeddings from the FAISS index
    for idx in range(num_embeddings):
        embedding = faiss_index.reconstruct(idx)  # Reconstruct embedding by ID
        embeddings.append(embedding)
    
    # Convert embeddings to a NumPy array
    embeddings_array = np.array(embeddings)
    
    # Save embeddings to disk
    embeddings_path = os.path.join(output_folder, "faiss_embeddings.npy")
    np.save(embeddings_path, embeddings_array)
    print(f"Saved {num_embeddings} embeddings to {embeddings_path}")

# Example usage
if __name__ == "__main__":
    # Load the FAISS index
    faiss_index_path = "/kaggle/working/faiss_index/final_index.index"
    faiss_index = faiss.read_index(faiss_index_path)
    
    # Save embeddings
    save_embeddings_from_faiss(faiss_index, "/kaggle/working/embeddings")

Saved 2192 embeddings to /kaggle/working/embeddings/faiss_embeddings.npy


In [21]:
import subprocess
subprocess.run(["zip", "-r", "/kaggle/working/output__.zip", "/kaggle/working/"])


CompletedProcess(args=['zip', '-r', '/kaggle/working/output__.zip', '/kaggle/working/'], returncode=0)

In [23]:
import os
os.remove("/kaggle/working/output__.zip")

In [47]:
# Retrieval
retriever = ArXivRetriever(
    index_path="/kaggle/working/faiss_index/final_index.index",
    mapping_path="/kaggle/working/faiss_index/final_mapping.json",
    projection_path="/kaggle/working/faiss_index/projection.pt"
)

# Query
results = retriever.query("Plot", k=5)
for res in results:
    print(f"{res['type'].upper()} from {res['paper_id']} (Score: {res['score']:.2f})")
    print(f"Path: {res['path']}\n")

  self.projection.load_state_dict(torch.load(projection_path))


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

TEXT from 0704.0484 (Score: 0.72)
Path: /kaggle/working/pdfs/0704.0484.pdf

TEXT from 0704.0128 (Score: 0.76)
Path: /kaggle/working/pdfs/0704.0128.pdf

TEXT from 0704.0051 (Score: 0.78)
Path: /kaggle/working/pdfs/0704.0051.pdf

TEXT from 0704.0348 (Score: 0.80)
Path: /kaggle/working/pdfs/0704.0348.pdf

TEXT from 0704.0261 (Score: 0.80)
Path: /kaggle/working/pdfs/0704.0261.pdf



In [51]:
# Initialize the retriever
retriever = ArXivRetriever(
    index_path="/kaggle/working/faiss_index/final_index.index",
    mapping_path="/kaggle/working/faiss_index/final_mapping.json",
    projection_path="/kaggle/working/faiss_index/projection.pt"
)

# Query specifically for images
image_query = "tree"
image_results = retriever.query_images(image_query, k=50)

# Display image results
print(f"Image results for query: '{image_query}'\n")
for res in image_results:
    print(f"Image from Paper {res['paper_id']} (Score: {res['score']:.2f})")
    print(f"Path: {res['path']}\n")

  self.projection.load_state_dict(torch.load(projection_path))


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

All results (text + images):
Type: text, Score: 0.63
Type: text, Score: 0.73
Type: text, Score: 0.75
Type: text, Score: 0.76
Type: text, Score: 0.76
Type: text, Score: 0.77
Type: text, Score: 0.77
Type: text, Score: 0.77
Type: text, Score: 0.78
Type: text, Score: 0.78
Image results for query: 'tree'



#### Checking if Images Exist

In [42]:
# Load id_to_doc mapping
with open("/kaggle/working/faiss_index/final_mapping.json", "r") as f:
    id_to_doc = json.load(f)

# Count image entries
image_count = sum(1 for doc in id_to_doc.values() if doc["type"] == "image")
print(f"Total image entries: {image_count}")

# Check if image paths exist
sample_image_entry = next((doc for doc in id_to_doc.values() if doc["type"] == "image"), None)
if sample_image_entry:
    print("Sample image entry:", sample_image_entry)
    print("Image exists:", os.path.exists(sample_image_entry["path"]))
else:
    print("No image entries found.")

Total image entries: 1696
Sample image entry: {'type': 'image', 'paper_id': '0704.0001', 'path': '/kaggle/working/images/page_15_img_1.jpeg'}
Image exists: True


In [52]:
# Retrieval
retriever = ArXivRetriever(
    index_path="/kaggle/working/faiss_index/final_index.index",
    mapping_path="/kaggle/working/faiss_index/final_mapping.json",
    projection_path="/kaggle/working/faiss_index/projection.pt"
)

# Query
results = retriever.query("brain", k=5)
for res in results:
    print(f"{res['type'].upper()} from {res['paper_id']} (Score: {res['score']:.2f})")
    print(f"Path: {res['path']}\n")

  self.projection.load_state_dict(torch.load(projection_path))


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

TEXT from 0704.0392 (Score: 0.71)
Path: /kaggle/working/pdfs/0704.0392.pdf

TEXT from 0704.0304 (Score: 0.82)
Path: /kaggle/working/pdfs/0704.0304.pdf

TEXT from 0704.0301 (Score: 0.86)
Path: /kaggle/working/pdfs/0704.0301.pdf

TEXT from 0704.0093 (Score: 0.87)
Path: /kaggle/working/pdfs/0704.0093.pdf

TEXT from 0704.0051 (Score: 0.87)
Path: /kaggle/working/pdfs/0704.0051.pdf

