In [1]:
import PyPDF2
import requests
import uuid

def load_pdf(filepath_or_url):
    """
    Load content of a PDF from either a file path or a remote URL.
    
    :param filepath_or_url: File path or URL to fetch the PDF from.
    :return: Content of the PDF as a string.
    """
    
    # Handle remote URL
    if filepath_or_url.startswith(("http://", "https://")):
        response = requests.get(filepath_or_url)
        response.raise_for_status()
        id = str(uuid.uuid4())
        filepath_or_url = f'./data/{id}.pdf'
        with open(filepath_or_url, 'wb') as pdf:
            pdf.write(response.content)
    
    with open(filepath_or_url, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        text_content = ''.join([page.extract_text() for page in pdf_reader.pages])
    return text_content


def batch(generator, bs=1, limit=None):
    b = []
    i = 0
    for item in generator:
        if limit and i > limit:
            break
        b.append(item)
        if len(b) == bs:
            yield b
            b = []
        i += bs
    if b and (limit and i <= limit):  # Yield any remaining items in the batch
        yield b

In [2]:
from typing import *
from pydantic import Field
import requests
from bs4 import BeautifulSoup

from promptx.collection import Entity


class Document(Entity):
    title: str
    abstract: str
    url: str


def get_arxiv_urls():
    response = requests.get('https://arxiv.org/list/cs.AI/recent')
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    urls = [f"https://arxiv.org{a.attrs['href']}" for a in soup.find_all('a', title='Abstract')]
    return urls


def extract_whitepaper_from_arxiv(url):
    response = requests.get(url)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    title = soup.find('h1', class_='title').text.replace('Title:', '')
    abstract = soup.find('blockquote', class_='abstract').text.replace('Abstract:', '')
    url = soup.find('a', class_='download-pdf').attrs['href']
    url = f"https://arxiv.org{url}"

    return Document(
        title=title,
        abstract=abstract,
        url=url,
    )

[32m2023-11-02 00:13:02.600[0m | [1mINFO    [0m | [36mpromptx[0m:[36mload[0m:[36m104[0m - [1mloading local app from /home/rjl/promptx/examples/arxiv-reader[0m
[32m2023-11-02 00:13:02.603[0m | [1mINFO    [0m | [36mpromptx[0m:[36mload[0m:[36m107[0m - [1mloaded environment variables from /home/rjl/promptx/examples/arxiv-reader/.env[0m
[32m2023-11-02 00:13:02.604[0m | [1mINFO    [0m | [36mpromptx[0m:[36mload[0m:[36m108[0m - [1mAPI KEY wMeGC[0m


In [3]:
import random
from pprint import pprint

try:
    urls = get_arxiv_urls()
    url = random.choice(urls)
    paper = extract_whitepaper_from_arxiv(url)
except Exception as e:
    print(f'Error loading {e}')

In [5]:
from promptx import store, query

collection_name = 'arxiv'
store(paper, collection=collection_name)
query(collection=collection_name)[['title', 'abstract', 'url']]

Unnamed: 0,title,abstract,url
0,Image Clustering Conditioned on Text Criteria,\nClassical clustering methods do not provide ...,https://arxiv.org/pdf/2310.18297.pdf
1,Learning to Search Feasible and Infeasible Reg...,"\n In this paper, we present Neural k-Opt (Ne...",https://arxiv.org/pdf/2310.18264.pdf
2,The Innovation-to-Occupations Ontology: Linkin...,\n The fast adoption of new technologies forc...,https://arxiv.org/pdf/2310.17909.pdf
3,Is Scaling Learned Optimizers Worth It? Evalua...,\n We analyze VeLO (versatile learned optimiz...,https://arxiv.org/pdf/2310.18191.pdf
4,Moments for Perceptive Narration Analysis Thro...,"\nIn this work, our goal is to develop a theor...",https://arxiv.org/pdf/2310.18273.pdf
...,...,...,...
734,,,
735,,,
736,,,
737,,,


In [6]:
from promptx import query

paper = query(collection=collection_name).query('type == "document"').sample().first
paper


[1;35mDocument[0m[1m([0m
    [33mid[0m=[32m'90bd06ba-604a-4d00-8c0d-422278338ff4'[0m,
    [33mtype[0m=[32m'document'[0m,
    [33mtitle[0m=[32m"Is[0m[32m Scaling Learned Optimizers Worth It? Evaluating The Value of VeLO's 4000 TPU Months"[0m,
    [33mabstract[0m=[32m'\n  We analyze VeLO [0m[32m([0m[32mversatile learned optimizer[0m[32m)[0m[32m, the largest scale attempt to\ntrain a general purpose "foundational" optimizer to date. VeLO was trained on\nthousands of machine learning tasks using over 4000 TPU months with the goal of\nproducing an optimizer capable of generalizing to new problems while being\nhyperparameter free, and outperforming industry standards such as Adam. We\nindependently evaluate VeLO on the MLCommons optimizer benchmark suite. We find\nthat, contrary to initial claims: [0m[32m([0m[32m1[0m[32m)[0m[32m VeLO has a critical hyperparameter that\nneeds problem-specific tuning, [0m[32m([0m[32m2[0m[32m)[0m[32m VeLO does not n

In [7]:
pdf = load_pdf(paper.url)
print(f'Loaded pdf with {len(pdf)} characters')

Loaded pdf with 46195 characters


In [8]:
import spacy
import en_core_web_sm

nlp = spacy.load("en_core_web_sm")
doc = nlp(pdf)

In [9]:
from promptx import store, query

class Quote(Entity):
    text: str
    source: Document
    start: int
    end: int

for chunk in batch(doc.sents, bs=10, limit=1000):
    store(
        *[
            Quote(
                text=sentence.text,
                source=paper,
                start=sentence.start_char,
                end=sentence.end_char,
            ) 
            for sentence in chunk
        ], 
        collection=collection_name
    )

query(collection=collection_name).query('type == "quote"')

Unnamed: 0,id,type,title,abstract,url,body,text,source,start,end,value,category,confidence
139,e59b876a-76f3-4554-8bf0-b9f2577f7ad2,quote,,,,,Under Review\nIMAGE CLUSTERING CONDITIONED ON ...,{'ids': ['ab51e267-273b-4329-9d6e-d7c5ab3e8471...,0.0,449.0,,,
140,5c85156c-d763-485f-8e2a-44b41948d0bc,quote,,,,,"In this work, we present a new methodology for...",{'ids': ['ab51e267-273b-4329-9d6e-d7c5ab3e8471...,450.0,633.0,,,
141,3c372465-b53f-4b08-a071-607d1c54bf5a,quote,,,,,We call our method\nImage Clustering Condition...,{'ids': ['ab51e267-273b-4329-9d6e-d7c5ab3e8471...,634.0,769.0,,,
142,a7472a5e-3c94-4870-80b8-6ddf37e93d50,quote,,,,,IC |TC requires a minimal and practical degree...,{'ids': ['ab51e267-273b-4329-9d6e-d7c5ab3e8471...,770.0,917.0,,,
143,c86deabc-0ec4-4b6b-9c28-c884df333017,quote,,,,,Our experiments show that IC |TC can effective...,{'ids': ['ab51e267-273b-4329-9d6e-d7c5ab3e8471...,918.0,1482.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
834,0ee98c30-6423-4eb4-a827-433c0018f287,quote,,,,,"In summary, for individual benchmarks wand alg...",{'ids': ['90bd06ba-604a-4d00-8c0d-422278338ff4...,14811.0,14899.0,,,
835,60d98342-05fb-4894-ba9e-8775ca4439d4,quote,,,,,We measure\nboth wall-clock-time to target (de...,{'ids': ['90bd06ba-604a-4d00-8c0d-422278338ff4...,14900.0,15022.0,,,
836,89aa8c38-d067-4f2b-9d4e-607bd6f9c497,quote,,,,,To aggregate across benchmarks we report the a...,{'ids': ['90bd06ba-604a-4d00-8c0d-422278338ff4...,15022.0,15099.0,,,
837,ea06325d-312d-4209-8228-e31e0bbd3e61,quote,,,,,Measuring Training Quality While MLCommons\nma...,{'ids': ['90bd06ba-604a-4d00-8c0d-422278338ff4...,15099.0,15269.0,,,


In [10]:
from enum import Enum


class ThoughtCategory(str, Enum):
    fact = 'fact'
    opinion = 'opinion'
    idea = 'idea'
    connection = 'connection'
    belief = 'belief'


class Thought(Entity):
    value: str
    category: ThoughtCategory
    confidence: float
    source: Entity = Field(None, generate=False)

In [13]:
def read_document(doc, bs=5, limit=1000, recall_limit=3, recent_limit=5):
    sentences = doc.sents
    recent_thoughts = []
    previous_passage = None
    for chunk in batch(sentences, bs=bs, limit=limit):
        passage = [sentence.text for sentence in chunk]
        recalled_thoughts = query(*passage, collection=collection_name, limit=recall_limit).query('type == "thought"').objects
        
        thoughts = prompt(
            '''
            Given a passage of text and some context, generate some new thoughts about the text.
            Make sure to not repeat any existing thoughts too closely.
            ''',
            input=dict(
                context=dict(
                    previous_passage=previous_passage,
                    recent_thoughts=recent_thoughts,
                    recalled_thoughts=recalled_thoughts,
                ),
                passage=passage,
            ),
            output=[Thought],
        )

        thoughts = [Thought(**{**dict(thought), 'source': paper}) for thought in thoughts.objects]
        recent_thoughts = (thoughts + recent_thoughts)[:recent_limit]
        previous_passage = passage
        
        print(f'Generated {len(thoughts)} thoughts')
        print([thought.value for thought in thoughts])

        store(*thoughts, collection=collection_name)

In [14]:
read_document(doc)

TypeError: TextInputSequence must be str

In [None]:

thoughts = query(collection=collection_name).query('type == "thought"')
thoughts