In [None]:
import requests
import xml.etree.ElementTree as ET
import json
import os
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# On-disk cache locations: paper metadata (JSON) and the embedding matrix (.npy).
EMBEDDINGS_FILE = "cached_embeddings.npy"
PAPERS_FILE = "cached_papers.json"

# Function to fetch all available papers from ArXiv for a given category
def fetch_all_arxiv_papers(category, batch_size=100, *, timeout=30, delay=3.0):
    """Page through the arXiv export API and collect the papers in *category*.

    Pages of ``batch_size`` entries are requested one after another until a
    page comes back without entries or a request fails. Fetching is
    best-effort: on any failure a message is printed and the papers collected
    so far are returned.

    Args:
        category: arXiv category code, e.g. ``"cs.AI"``.
        batch_size: Number of entries requested per page.
        timeout: Seconds to wait for each HTTP response before giving up.
        delay: Seconds to pause between consecutive page requests. Values
            less than or equal to zero disable the pause.

    Returns:
        A list of dicts with the keys ``"title"``, ``"abstract"`` and
        ``"link"``. A field that is missing or empty in the feed is stored as
        an empty string.
    """
    # Imported locally so this function does not rely on a module-level import.
    import time

    atom = "{http://www.w3.org/2005/Atom}"
    base_url = "http://export.arxiv.org/api/query"
    start = 0
    all_papers = []

    print(f"Fetching papers for category: {category}...")

    while True:
        # Let requests build (and URL-encode) the query string.
        params = {
            "search_query": f"cat:{category}",
            "start": start,
            "max_results": batch_size,
        }
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(base_url, params=params, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Failed to fetch data for {category}: {exc}")
            break

        if response.status_code != 200:
            print(f"Failed to fetch data for {category}")
            break

        try:
            root = ET.fromstring(response.content)
        except ET.ParseError as exc:
            print(f"Failed to parse response for {category}: {exc}")
            break

        # findtext() tolerates entries that lack one of the child elements,
        # where find(...).text would raise AttributeError.
        papers = [
            {
                "title": entry.findtext(f"{atom}title", default=""),
                "abstract": entry.findtext(f"{atom}summary", default=""),
                "link": entry.findtext(f"{atom}id", default=""),
            }
            for entry in root.findall(f"{atom}entry")
        ]

        if not papers:
            print(f"Finished fetching papers for {category}. Total: {len(all_papers)}")
            break

        all_papers.extend(papers)
        start += batch_size

        # The arXiv API manual asks clients to wait about 3 seconds between
        # repeated calls; only pause when another request will follow.
        if delay > 0:
            time.sleep(delay)

    return all_papers

# Load the pre-trained SPECTER weights and the matching tokenizer.
model_name = "allenai/specter"
model = AutoModel.from_pretrained(model_name)
model = model.to(device)  # move the weights onto the selected device
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to Generate Embeddings
def get_embedding(text):
    """Embed *text* by mean-pooling the model's final hidden states.

    Uses the module-level ``tokenizer``, ``model`` and ``device``. Inputs are
    truncated to 512 tokens. Padding positions are excluded from the average,
    so passing a list of texts of different lengths gives each text the same
    vector it would get when embedded on its own.

    Args:
        text: The text to embed. A list of texts is tokenized as one padded
            batch.

    Returns:
        A NumPy array with one pooled vector per input text (a single string
        therefore yields a leading dimension of 1).
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        output = model(**inputs)

    hidden = output.last_hidden_state
    attention_mask = inputs.get("attention_mask")
    if attention_mask is None:
        # No mask available: fall back to a plain mean over the token axis.
        return hidden.mean(dim=1).cpu().numpy()

    # Weight every token by its mask value (1 = real token, 0 = padding) so
    # padded positions do not dilute the average.
    mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
    summed = (hidden * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    return (summed / counts).cpu().numpy()

# Build the on-disk cache unless both cache files already exist
if os.path.exists(PAPERS_FILE) and os.path.exists(EMBEDDINGS_FILE):
    print("Cached data already exists.")
else:
    all_papers = []

    # A previous run may have saved the papers but stopped before the
    # embeddings were written; reuse that file instead of refetching.
    if os.path.exists(PAPERS_FILE):
        with open(PAPERS_FILE, encoding="utf-8") as f:
            all_papers = json.load(f)

    if all_papers:
        print(f"Reusing {len(all_papers)} papers from {PAPERS_FILE}.")
    else:
        print("Fetching data from ArXiv...")
        # Categories: AI, ML, and NLP (ArXiv categories)
        categories = ["cs.AI", "cs.LG", "cs.CL"]

        # Fetch all papers for each category. A paper cross-listed in several
        # categories is returned once per category, so keep only the first
        # occurrence of each link.
        seen_links = set()
        for category in categories:
            for paper in fetch_all_arxiv_papers(category):
                link = paper["link"]
                if link and link in seen_links:
                    continue
                seen_links.add(link)
                all_papers.append(paper)

        print(f"Total papers fetched: {len(all_papers)}")

        # Do not write an empty cache: later runs would treat it as complete.
        if not all_papers:
            raise RuntimeError("No papers were fetched from ArXiv; cache not written.")

        # Save papers to JSON
        with open(PAPERS_FILE, "w", encoding="utf-8") as f:
            json.dump(all_papers, f)

    # Convert fetched papers into text format
    papers_text = [f"{paper['title']}: {paper['abstract']}" for paper in all_papers]

    # Generate Paper Embeddings
    print("Generating embeddings...")
    # Each get_embedding() call returns one row with a leading batch axis;
    # joining along axis 0 keeps the result 2-D even for a single paper
    # (np.array(...).squeeze() would collapse that case to 1-D).
    paper_embeddings = np.concatenate(
        [get_embedding(paper) for paper in papers_text], axis=0
    )

    # Save embeddings to file
    np.save(EMBEDDINGS_FILE, paper_embeddings)

print("Data creation complete.")