In [1]:
import pandas as pd
# Lift the column-width cap so long text fields are shown in full when
# frames are displayed.
pd.set_option("display.max_colwidth", None) 

from promptx import load

# Initialise promptx for this notebook. Per the log output below, this loads
# the local app directory and its .env file.
load()

[32m2023-10-30 08:26:37.988[0m | [1mINFO    [0m | [36mpromptx[0m:[36mload[0m:[36m137[0m - [1mloading local app from /home/rjl/promptx/scratch/arxiv-reader[0m
[32m2023-10-30 08:26:37.990[0m | [1mINFO    [0m | [36mpromptx[0m:[36mload[0m:[36m140[0m - [1mloaded environment variables from /home/rjl/promptx/scratch/arxiv-reader/.env[0m
[32m2023-10-30 08:26:37.991[0m | [1mINFO    [0m | [36mpromptx[0m:[36mload[0m:[36m141[0m - [1mAPI KEY CQZm7[0m


[1m<[0m[1;95mApp[0m[39m local [0m[33mpath[0m[39m=[0m[35m/home/rjl/promptx/scratch/[0m[95marxiv-reader[0m[1m>[0m

In [2]:
import PyPDF2
import requests
import uuid

def load_pdf(filepath_or_url, timeout=30):
    """
    Load content of a PDF from either a file path or a remote URL.

    A remote PDF is first downloaded to ``./data/<uuid4>.pdf`` (the directory
    is created if it does not exist yet) and then parsed from that local copy.
    The downloaded file is left on disk.

    :param filepath_or_url: File path or URL to fetch the PDF from.
    :param timeout: Seconds to wait for the remote server before giving up.
        Only used when ``filepath_or_url`` is a URL.
    :return: Content of the PDF as a string (text of all pages concatenated).
    :raises requests.HTTPError: If the download returns an error status.
    """
    # Function-scope import keeps this block self-contained; the cell's
    # import lines do not bring in `os`.
    import os

    # Handle remote URL
    if filepath_or_url.startswith(("http://", "https://")):
        # Without a timeout, requests can block indefinitely on a stalled server.
        response = requests.get(filepath_or_url, timeout=timeout)
        response.raise_for_status()
        # A fresh checkout has no ./data directory; create it before writing.
        os.makedirs('./data', exist_ok=True)
        filepath_or_url = f'./data/{uuid.uuid4()}.pdf'
        with open(filepath_or_url, 'wb') as pdf:
            pdf.write(response.content)

    with open(filepath_or_url, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        # `or ''` guards against a page whose extraction yields nothing,
        # so the join never sees a non-string.
        text_content = ''.join(page.extract_text() or '' for page in pdf_reader.pages)
    return text_content


def batch(generator, bs=1, limit=None):
    """
    Group the items of an iterable into lists of ``bs`` items.

    :param generator: Any iterable; it is consumed lazily, one item at a time.
    :param bs: Batch size. Must be at least 1.
    :param limit: Maximum total number of items to take from ``generator``.
        ``None`` means no limit.
    :return: A generator of lists. Every list has ``bs`` items except possibly
        the last one, which holds whatever items remain.
    :raises ValueError: If ``bs`` is smaller than 1 (raised on first iteration).
    """
    if bs < 1:
        raise ValueError(f'bs must be a positive integer, got {bs!r}')

    current = []
    # `count` is the number of items already accepted, so `count >= limit`
    # stops after exactly `limit` items.
    for count, item in enumerate(generator):
        if limit is not None and count >= limit:
            break
        current.append(item)
        if len(current) == bs:
            yield current
            current = []
    # Yield any remaining items, whether or not a limit was given.
    if current:
        yield current

In [3]:
from typing import *
from pydantic import Field
import requests
from bs4 import BeautifulSoup

from promptx.collection import Entity


# Entity describing one scraped paper. In this notebook instances are built by
# extract_whitepaper_from_arxiv and persisted with promptx's `store`.
class Document(Entity):
    title: str
    abstract: str
    # In this notebook this holds the PDF download link, which is later passed
    # to load_pdf.
    url: str
    # Optional full text; defaults to None and is never set in this notebook.
    # `embed=False` is forwarded to Field -- presumably it tells promptx not to
    # embed this field; confirm against the promptx docs.
    # NOTE(review): the annotation is `str` but the default is None;
    # `Optional[str]` would describe the field more accurately.
    body: str = Field(None, embed=False)


def get_arxiv_urls(list_url='https://arxiv.org/list/cs.AI/recent', timeout=30):
    """
    Collect the abstract-page URLs shown on an arXiv listing page.

    :param list_url: Listing page to scrape. Defaults to the recent cs.AI
        submissions.
    :param timeout: Seconds to wait for the server before giving up.
    :return: List of absolute ``https://arxiv.org/...`` abstract URLs, in page
        order. Empty if the page contains no matching links.
    :raises requests.HTTPError: If the listing page returns an error status.
    """
    # Without a timeout, requests can block indefinitely on a stalled server.
    response = requests.get(list_url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    # href=True skips anchors that have the title but no link target, which
    # would otherwise raise KeyError below.
    abstract_links = soup.find_all('a', title='Abstract', href=True)
    return [f"https://arxiv.org{link['href']}" for link in abstract_links]


def extract_whitepaper_from_arxiv(url, timeout=30):
    """
    Scrape an arXiv abstract page into a Document.

    :param url: URL of the arXiv abstract page.
    :param timeout: Seconds to wait for the server before giving up.
    :return: Document with the page's title, abstract and the absolute URL of
        its PDF download link. ``body`` is left at its default.
    :raises requests.HTTPError: If the page returns an error status.
    :raises ValueError: If the page lacks the title, abstract or PDF link
        element this scraper relies on.
    """
    # Without a timeout, requests can block indefinitely on a stalled server.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    elements = {
        'h1.title': soup.find('h1', class_='title'),
        'blockquote.abstract': soup.find('blockquote', class_='abstract'),
        'a.download-pdf': soup.find('a', class_='download-pdf'),
    }
    # Fail with a message naming the missing element instead of an opaque
    # AttributeError on None when the page layout differs.
    missing = [selector for selector, tag in elements.items() if tag is None]
    if missing:
        raise ValueError(f"{url} is missing expected element(s): {', '.join(missing)}")

    # Remove only the first occurrence of each label so that the same words
    # appearing later in the actual text are preserved.
    title = elements['h1.title'].text.replace('Title:', '', 1)
    abstract = elements['blockquote.abstract'].text.replace('Abstract:', '', 1)
    pdf_url = f"https://arxiv.org{elements['a.download-pdf'].attrs['href']}"

    return Document(
        title=title,
        abstract=abstract,
        url=pdf_url,
    )

In [4]:
import random
from pprint import pprint

# Pick one random paper from the listing and scrape its abstract page.
# `url` is pre-set so the error message still works when the listing request
# itself fails before any URL has been chosen.
url = None
try:
    urls = get_arxiv_urls()
    url = random.choice(urls)
    paper = extract_whitepaper_from_arxiv(url)
except Exception as e:
    print(f'Error loading {url}: {e}')
    # Later cells need `paper`; stop here rather than continue with a missing
    # or stale value left over from an earlier run.
    raise

In [5]:
from promptx import store, query

# Persist the scraped paper in the 'arxiv' collection.
collection_name = 'arxiv'
store(paper, collection=collection_name)

# Read the collection back and display only the columns of interest.
stored_records = query(collection=collection_name)
stored_records[['title', 'abstract', 'url']]

Unnamed: 0,title,abstract,url
0,Learning to Search Feasible and Infeasible Regions of Routing Problems with Flexible Neural k-Opt,"\n In this paper, we present Neural k-Opt (NeuOpt), a novel learning-to-search\n(L2S) solver for routing problems. It learns to perform flexible k-opt\nexchanges based on a tailored action factorization method and a customized\nrecurrent dual-stream decoder. As a pioneering work to circumvent the pure\nfeasibility masking scheme and enable the autonomous exploration of both\nfeasible and infeasible regions, we then propose the Guided Infeasible Region\nExploration (GIRE) scheme, which supplements the NeuOpt policy network with\nfeasibility-related features and leverages reward shaping to steer\nreinforcement learning more effectively. Additionally, we equip NeuOpt with\nDynamic Data Augmentation (D2A) for more diverse searches during inference.\nExtensive experiments on the Traveling Salesman Problem (TSP) and Capacitated\nVehicle Routing Problem (CVRP) demonstrate that our NeuOpt not only\nsignificantly outstrips existing (masking-based) L2S solvers, but also\nshowcases superiority over the learning-to-construct (L2C) and\nlearning-to-predict (L2P) solvers. Notably, we offer fresh perspectives on how\nneural solvers can handle VRP constraints. Our code is available:\nthis https URL.\n\n",https://arxiv.org/pdf/2310.18264.pdf


In [6]:
# Display the Document that was passed to `store` above.
paper


[1;35mDocument[0m[1m([0m
    [33mid[0m=[32m'f3eb72a4-d4ed-427f-bdcd-32b4461eb2dc'[0m,
    [33mtype[0m=[32m'document'[0m,
    [33mtitle[0m=[32m'Learning to Search Feasible and Infeasible Regions of Routing Problems with Flexible Neural k-Opt'[0m,
    [33mabstract[0m=[32m'\n  In this paper, we present Neural k-Opt [0m[32m([0m[32mNeuOpt[0m[32m)[0m[32m, a novel learning-to-search\n[0m[32m([0m[32mL2S[0m[32m)[0m[32m solver for routing problems. It learns to perform flexible k-opt\nexchanges based on a tailored action factorization method and a customized\nrecurrent dual-stream decoder. As a pioneering work to circumvent the pure\nfeasibility masking scheme and enable the autonomous exploration of both\nfeasible and infeasible regions, we then propose the Guided Infeasible Region\nExploration [0m[32m([0m[32mGIRE[0m[32m)[0m[32m scheme, which supplements the NeuOpt policy network with\nfeasibility-related features and leverages reward shaping to steer

In [7]:
# Download the paper's PDF and extract its text. Despite the name, `pdf` holds
# the extracted text as a string (what load_pdf returns), not the raw PDF bytes.
pdf = load_pdf(paper.url)
print(f'Loaded pdf with {len(pdf)} characters')

Loaded pdf with 96105 characters


In [8]:
import spacy
import en_core_web_sm

# Load the "en_core_web_sm" spaCy pipeline and run it over the extracted text.
# The cells below iterate over `doc.sents` to get one span per sentence.
nlp = spacy.load("en_core_web_sm")
doc = nlp(pdf)

In [10]:
# Entity for one sentence taken from a paper. The cells below build one Quote
# per spaCy sentence span.
class Quote(Entity):
    text: str         # sentence text (filled from the span's `.text`)
    source: Document  # the paper the sentence was taken from
    start: int        # filled from the span's `start_char`
    end: int          # filled from the span's `end_char`


In [None]:
# Build one Quote per sentence, walking the sentences in batches of 10 and
# flattening the batches into a single list.
quotes = [
    Quote(
        text=sent.text,
        source=paper,
        start=sent.start_char,
        end=sent.end_char,
    )
    for sentences in batch(doc.sents, bs=10, limit=1000)
    for sent in sentences
]

quotes

In [None]:


# Store the sentences as Quote entities, one `store` call per batch of 10.
for sentences in batch(doc.sents, bs=10, limit=1000):
    chunk_quotes = [
        Quote(
            text=sent.text,
            source=paper,
            start=sent.start_char,
            end=sent.end_char,
        )
        for sent in sentences
    ]
    store(*chunk_quotes, collection=collection_name)

# Show only the quote rows of the collection.
query(collection=collection_name).query('type == "quote"')