In [1]:
import pandas as pd
# Show full cell contents in DataFrame output instead of truncating long strings
# (the abstracts displayed further down are several hundred characters long).
pd.set_option("display.max_colwidth", None) 

from promptx import load

# Per the log lines this call emits, it loads the local promptx app found at the
# notebook's location together with the environment variables from its .env file.
load()

[32m2023-10-30 09:18:16.766[0m | [1mINFO    [0m | [36mpromptx[0m:[36mload[0m:[36m137[0m - [1mloading local app from /home/rjl/promptx/examples/arxiv-reader[0m
[32m2023-10-30 09:18:16.767[0m | [1mINFO    [0m | [36mpromptx[0m:[36mload[0m:[36m140[0m - [1mloaded environment variables from /home/rjl/promptx/examples/arxiv-reader/.env[0m
[32m2023-10-30 09:18:16.768[0m | [1mINFO    [0m | [36mpromptx[0m:[36mload[0m:[36m141[0m - [1mAPI KEY CQZm7[0m


[1m<[0m[1;95mApp[0m[39m local [0m[33mpath[0m[39m=[0m[35m/home/rjl/promptx/examples/[0m[95marxiv-reader[0m[1m>[0m

In [2]:
import PyPDF2
import requests
import uuid

def load_pdf(filepath_or_url, timeout=30):
    """
    Load content of a PDF from either a file path or a remote URL.

    A remote PDF is first downloaded to ``./data/<uuid4>.pdf`` (the directory
    is created when missing) and then parsed from that local copy.

    :param filepath_or_url: File path or URL to fetch the PDF from.
    :param timeout: Seconds to wait for the remote server before giving up.
        Only used when a URL is given.
    :return: Content of the PDF as a string (text of all pages concatenated).
    :raises requests.HTTPError: If the download answers with an error status.
    """
    import os

    # Handle remote URL
    if filepath_or_url.startswith(("http://", "https://")):
        # Without a timeout a stalled server would block the notebook forever.
        response = requests.get(filepath_or_url, timeout=timeout)
        response.raise_for_status()
        # A fresh checkout has no ./data directory; create it before writing.
        os.makedirs('./data', exist_ok=True)
        filepath_or_url = f'./data/{uuid.uuid4()}.pdf'
        with open(filepath_or_url, 'wb') as pdf:
            pdf.write(response.content)

    with open(filepath_or_url, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        # `or ''` keeps join() safe if a page yields no text at all.
        text_content = ''.join((page.extract_text() or '') for page in pdf_reader.pages)
    return text_content


def batch(generator, bs=1, limit=None):
    """
    Group the items of an iterable into lists of at most ``bs`` items.

    :param generator: Any iterable; it is consumed lazily.
    :param bs: Batch size, must be >= 1.
    :param limit: Maximum total number of items to emit across all batches.
        ``None`` (or ``0``) means no limit.
    :return: Generator of lists. Every list holds ``bs`` items except possibly
        the last one, which holds whatever is left over.
    :raises ValueError: If ``bs`` is smaller than 1.
    """
    from itertools import islice

    if bs < 1:
        raise ValueError(f'bs must be >= 1, got {bs}')

    items = iter(generator)
    if limit:
        # Cap the item count up front so the batching loop needs no counter.
        # A negative limit emits nothing.
        items = islice(items, max(limit, 0))

    b = []
    for item in items:
        b.append(item)
        if len(b) == bs:
            yield b
            b = []
    if b:  # Yield any remaining items in the batch
        yield b

In [3]:
from typing import *
from pydantic import Field
import requests
from bs4 import BeautifulSoup

from promptx.collection import Entity


# Entity describing one paper. extract_whitepaper_from_arxiv() builds instances
# from an arXiv abstract page, filling title, abstract and url and leaving body unset.
class Document(Entity):
    title: str
    abstract: str
    # Set by extract_whitepaper_from_arxiv() to the paper's PDF download link.
    url: str
    # Optional, defaults to None.
    # NOTE(review): `embed=False` presumably excludes this field from promptx's
    # embeddings; confirm against promptx's handling of Field extras.
    body: str = Field(None, embed=False)


def get_arxiv_urls(list_url='https://arxiv.org/list/cs.AI/recent', timeout=30):
    """
    Collect the abstract-page URLs shown on an arXiv listing page.

    :param list_url: Listing page to scrape; defaults to the recent cs.AI list.
    :param timeout: Seconds to wait for arXiv before giving up.
    :return: List of ``https://arxiv.org...`` URLs, one per anchor titled
        "Abstract" on the page, in page order.
    :raises requests.HTTPError: If arXiv answers with an error status.
    """
    # Without a timeout a stalled connection would hang the notebook indefinitely.
    response = requests.get(list_url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    anchors = soup.find_all('a', title='Abstract')
    # Skip anchors lacking an href instead of failing the whole scrape on KeyError.
    return [f"https://arxiv.org{a.attrs['href']}" for a in anchors if a.get('href')]


def extract_whitepaper_from_arxiv(url, timeout=30):
    """
    Scrape an arXiv abstract page into a Document.

    :param url: URL of the abstract page, e.g. one returned by get_arxiv_urls().
    :param timeout: Seconds to wait for arXiv before giving up.
    :return: Document with the title, the abstract and the absolute PDF URL;
        ``body`` is not set here.
    :raises requests.HTTPError: If arXiv answers with an error status.
    :raises ValueError: If the page lacks the title, abstract or PDF-link element.
    """
    from urllib.parse import urljoin

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    title_tag = soup.find('h1', class_='title')
    abstract_tag = soup.find('blockquote', class_='abstract')
    pdf_link = soup.find('a', class_='download-pdf')
    # find() returns None on a miss; fail with a message that names the page
    # rather than an opaque AttributeError on None further down.
    if title_tag is None or abstract_tag is None or pdf_link is None:
        raise ValueError(f'Unexpected arXiv page layout at {url}')

    return Document(
        # The visible labels are removed, then the surrounding newlines/spaces
        # that the page markup leaves around the text.
        title=title_tag.text.replace('Title:', '').strip(),
        abstract=abstract_tag.text.replace('Abstract:', '').strip(),
        # urljoin handles both site-relative and already-absolute hrefs.
        url=urljoin('https://arxiv.org', pdf_link.attrs['href']),
    )

In [4]:
import random
from pprint import pprint

# Pick one recent paper at random and scrape its abstract page.
# `url` is pre-set so the error message is always printable, even when the
# listing request fails before any paper has been chosen.
url = None
try:
    urls = get_arxiv_urls()
    url = random.choice(urls)
    paper = extract_whitepaper_from_arxiv(url)
except Exception as e:
    print(f'Error loading {url}: {e}')
    # Re-raise: later cells need `paper`, so carrying on would only defer the
    # failure to a less obvious NameError (or silently reuse a stale value).
    raise

In [5]:
from promptx import store, query

# Name of the promptx collection that this cell and the later quote-storing cell use.
collection_name = 'arxiv'
# Store the scraped paper in that collection.
store(paper, collection=collection_name)
# Last expression of the cell: display the collection's title/abstract/url columns.
query(collection=collection_name)[['title', 'abstract', 'url']]

Unnamed: 0,title,abstract,url
0,Learning to Search Feasible and Infeasible Regions of Routing Problems with Flexible Neural k-Opt,"\n In this paper, we present Neural k-Opt (NeuOpt), a novel learning-to-search\n(L2S) solver for routing problems. It learns to perform flexible k-opt\nexchanges based on a tailored action factorization method and a customized\nrecurrent dual-stream decoder. As a pioneering work to circumvent the pure\nfeasibility masking scheme and enable the autonomous exploration of both\nfeasible and infeasible regions, we then propose the Guided Infeasible Region\nExploration (GIRE) scheme, which supplements the NeuOpt policy network with\nfeasibility-related features and leverages reward shaping to steer\nreinforcement learning more effectively. Additionally, we equip NeuOpt with\nDynamic Data Augmentation (D2A) for more diverse searches during inference.\nExtensive experiments on the Traveling Salesman Problem (TSP) and Capacitated\nVehicle Routing Problem (CVRP) demonstrate that our NeuOpt not only\nsignificantly outstrips existing (masking-based) L2S solvers, but also\nshowcases superiority over the learning-to-construct (L2C) and\nlearning-to-predict (L2P) solvers. Notably, we offer fresh perspectives on how\nneural solvers can handle VRP constraints. Our code is available:\nthis https URL.\n\n",https://arxiv.org/pdf/2310.18264.pdf
1,Image Clustering Conditioned on Text Criteria,"\nClassical clustering methods do not provide users with direct control of the clustering results, and the clustering results may not be consistent with the relevant criterion that a user has in mind. In this work, we present a new methodology for performing image clustering based on user-specified text criteria by leveraging modern vision-language models and large language models. We call our method Image Clustering Conditioned on Text Criteria (IC$|$TC), and it represents a different paradigm of image clustering. IC$|$TC requires a minimal and practical degree of human intervention and grants the user significant control over the clustering results in return. Our experiments show that IC$|$TC can effectively cluster images with various criteria, such as human action, physical location, or the person's mood, while significantly outperforming baselines.\n",https://arxiv.org/pdf/2310.18297.pdf
2,The Innovation-to-Occupations Ontology: Linking Business Transformation Initiatives to Occupations and Skills,"\n The fast adoption of new technologies forces companies to continuously adapt\ntheir operations making it harder to predict workforce requirements. Several\nrecent studies have attempted to predict the emergence of new roles and skills\nin the labour market from online job ads. This paper aims to present a novel\nontology linking business transformation initiatives to occupations and an\napproach to automatically populating it by leveraging embeddings extracted from\njob ads and Wikipedia pages on business transformation and emerging\ntechnologies topics. To our knowledge, no previous research explicitly links\nbusiness transformation initiatives, like the adoption of new technologies or\nthe entry into new markets, to the roles needed. Our approach successfully\nmatches occupations to transformation initiatives under ten different\nscenarios, five linked to technology adoption and five related to business.\nThis framework presents an innovative approach to guide enterprises and\neducational institutions on the workforce requirements for specific business\ntransformation initiatives.\n\n",https://arxiv.org/pdf/2310.17909.pdf
3,Is Scaling Learned Optimizers Worth It? Evaluating The Value of VeLO's 4000 TPU Months,"\n We analyze VeLO (versatile learned optimizer), the largest scale attempt to\ntrain a general purpose ""foundational"" optimizer to date. VeLO was trained on\nthousands of machine learning tasks using over 4000 TPU months with the goal of\nproducing an optimizer capable of generalizing to new problems while being\nhyperparameter free, and outperforming industry standards such as Adam. We\nindependently evaluate VeLO on the MLCommons optimizer benchmark suite. We find\nthat, contrary to initial claims: (1) VeLO has a critical hyperparameter that\nneeds problem-specific tuning, (2) VeLO does not necessarily outperform\ncompetitors in quality of solution found, and (3) VeLO is not faster than\ncompeting optimizers at reducing the training loss. These observations call\ninto question VeLO's generality and the value of the investment in training it.\n\n",https://arxiv.org/pdf/2310.18191.pdf
4,Moments for Perceptive Narration Analysis Through the Emotional Attachment of Audience to Discourse and Story,"\nIn this work, our goal is to develop a theoretical framework that can eventually be used for analyzing the effectiveness of visual stories such as feature films to comic books. To develop this theoretical framework, we introduce a new story element called moments. Our conjecture is that any linear story such as the story of a feature film can be decomposed into a set of moments that follow each other. Moments are defined as the perception of the actions, interactions, and expressions of all characters or a single character during a given time period. We categorize the moments into two major types: story moments and discourse moments. Each type of moment can further be classified into three types, which we call universal storytelling moments. We believe these universal moments foster or deteriorate the emotional attachment of the audience to a particular character or the story. We present a methodology to catalog the occurrences of these universal moments as they are found in the story. The cataloged moments can be represented using curves or color strips. Therefore, we can visualize a character's journey through the story as either a 3D curve or a color strip. We also demonstrated that both story and discourse moments can be transformed into one lump-sum attraction parameter. The attraction parameter in time provides a function that can be plotted graphically onto a timeline illustrating changes in the emotional attachment of audience to a character or the story. By inspecting these functions the story analyst can analytically decipher the moments in the story where the attachment is being established, maintained, strengthened, or conversely where it is languishing.\n",https://arxiv.org/pdf/2310.18273.pdf
...,...,...,...
423,,,
424,,,
425,,,
426,,,


In [6]:
# Display the scraped Document as the cell output.
paper


[1;35mDocument[0m[1m([0m
    [33mid[0m=[32m'3a3b1607-1ca0-457f-9924-c43980373322'[0m,
    [33mtype[0m=[32m'document'[0m,
    [33mtitle[0m=[32m'Improving Intrinsic Exploration by Creating Stationary Objectives'[0m,
    [33mabstract[0m=[32m"\nExploration bonuses in reinforcement learning guide long-horizon exploration by defining custom intrinsic objectives. Count-based methods use the frequency of state visits to derive an exploration bonus. In this paper, we identify that any intrinsic reward function derived from count-based methods is non-stationary and hence induces a difficult objective to optimize for the agent. The key contribution of our work lies in transforming the original non-stationary rewards into stationary rewards through an augmented state representation. For this purpose, we introduce the Stationary Objectives For Exploration [0m[32m([0m[32mSOFE[0m[32m)[0m[32m framework. SOFE requires identifying sufficient statistics for different explorati

In [8]:
# paper.url is an https link, so load_pdf() first saves a copy under ./data/
# and then returns the PDF's text as a single string.
pdf = load_pdf(paper.url)
print(f'Loaded pdf with {len(pdf)} characters')

Loaded pdf with 47216 characters


In [10]:
import spacy
import en_core_web_sm

# Load spaCy's "en_core_web_sm" pipeline and run it over the whole PDF text.
# The resulting `doc.sents` is iterated in a later cell to get one sentence at a time.
nlp = spacy.load("en_core_web_sm")
doc = nlp(pdf)

In [11]:
# Entity for one sentence taken from a paper's PDF text. The loop below this
# class fills each field from a spaCy sentence span.
class Quote(Entity):
    text: str  # sentence text (`sentence.text`)
    source: Document  # the paper the sentence was extracted from
    start: int  # character offset of the sentence start (`sentence.start_char`)
    end: int  # character offset of the sentence end (`sentence.end_char`)

# Turn the paper's sentences into Quote entities and store them, issuing one
# store() call per batch of sentences.
for chunk in batch(doc.sents, bs=10, limit=1000):
    quotes_in_chunk = (
        Quote(text=s.text, source=paper, start=s.start_char, end=s.end_char)
        for s in chunk
    )
    store(*quotes_in_chunk, collection=collection_name)

# Cell output: only the rows of the collection whose type is "quote".
everything = query(collection=collection_name)
everything.query('type == "quote"')

In [None]:
from enum import Enum


# Closed set of labels for the `category` field of Thought. Mixing in `str`
# makes each member compare equal to its plain string value
# (e.g. ThoughtCategory.fact == 'fact').
class ThoughtCategory(str, Enum):
    fact = 'fact'
    opinion = 'opinion'
    idea = 'idea'
    connection = 'connection'
    belief = 'belief'


class Thought(Entity):
    value: str
    category: ThoughtCategory
    confidence: float
    source: Entity = Field(None, generate=False)