In [1]:
import pandas as pd
pd.set_option("display.max_colwidth", None) 

from promptx import load

load()

collection_name = 'arxiv'

[32m2023-10-31 03:58:06.959[0m | [1mINFO    [0m | [36mpromptx[0m:[36mload[0m:[36m137[0m - [1mloading local app from /home/rjl/promptx/examples/arxiv-reader[0m
[32m2023-10-31 03:58:06.966[0m | [1mINFO    [0m | [36mpromptx[0m:[36mload[0m:[36m140[0m - [1mloaded environment variables from /home/rjl/promptx/examples/arxiv-reader/.env[0m
[32m2023-10-31 03:58:06.968[0m | [1mINFO    [0m | [36mpromptx[0m:[36mload[0m:[36m141[0m - [1mAPI KEY wMeGC[0m


In [2]:
import PyPDF2
import requests
import uuid

def load_pdf(filepath_or_url):
    """
    Load content of a PDF from either a file path or a remote URL.
    
    :param filepath_or_url: File path or URL to fetch the PDF from.
    :return: Content of the PDF as a string.
    """
    
    # Handle remote URL
    if filepath_or_url.startswith(("http://", "https://")):
        response = requests.get(filepath_or_url)
        response.raise_for_status()
        id = str(uuid.uuid4())
        filepath_or_url = f'./data/{id}.pdf'
        with open(filepath_or_url, 'wb') as pdf:
            pdf.write(response.content)
    
    with open(filepath_or_url, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        text_content = ''.join([page.extract_text() for page in pdf_reader.pages])
    return text_content


def batch(generator, bs=1, limit=None):
    b = []
    i = 0
    for item in generator:
        if limit and i > limit:
            break
        b.append(item)
        if len(b) == bs:
            yield b
            b = []
        i += bs
    if b and (limit and i <= limit):  # Yield any remaining items in the batch
        yield b

In [3]:
from typing import *
from pydantic import Field
import requests
from bs4 import BeautifulSoup

from promptx.collection import Entity


class Document(Entity):
    title: str
    abstract: str
    url: str


def get_arxiv_urls():
    response = requests.get('https://arxiv.org/list/cs.AI/recent')
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    urls = [f"https://arxiv.org{a.attrs['href']}" for a in soup.find_all('a', title='Abstract')]
    return urls


def extract_whitepaper_from_arxiv(url):
    response = requests.get(url)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    title = soup.find('h1', class_='title').text.replace('Title:', '')
    abstract = soup.find('blockquote', class_='abstract').text.replace('Abstract:', '')
    url = soup.find('a', class_='download-pdf').attrs['href']
    url = f"https://arxiv.org{url}"

    return Document(
        title=title,
        abstract=abstract,
        url=url,
    )

In [4]:
import random
from pprint import pprint

try:
    urls = get_arxiv_urls()
    url = random.choice(urls)
    paper = extract_whitepaper_from_arxiv(url)
except Exception as e:
    print(f'Error loading {e}')

In [5]:
from promptx import store, query

store(paper, collection=collection_name)
query(collection=collection_name)[['title', 'abstract', 'url']]

Unnamed: 0,title,abstract,url
0,Learning to Search Feasible and Infeasible Regions of Routing Problems with Flexible Neural k-Opt,"\n In this paper, we present Neural k-Opt (NeuOpt), a novel learning-to-search\n(L2S) solver for routing problems. It learns to perform flexible k-opt\nexchanges based on a tailored action factorization method and a customized\nrecurrent dual-stream decoder. As a pioneering work to circumvent the pure\nfeasibility masking scheme and enable the autonomous exploration of both\nfeasible and infeasible regions, we then propose the Guided Infeasible Region\nExploration (GIRE) scheme, which supplements the NeuOpt policy network with\nfeasibility-related features and leverages reward shaping to steer\nreinforcement learning more effectively. Additionally, we equip NeuOpt with\nDynamic Data Augmentation (D2A) for more diverse searches during inference.\nExtensive experiments on the Traveling Salesman Problem (TSP) and Capacitated\nVehicle Routing Problem (CVRP) demonstrate that our NeuOpt not only\nsignificantly outstrips existing (masking-based) L2S solvers, but also\nshowcases superiority over the learning-to-construct (L2C) and\nlearning-to-predict (L2P) solvers. Notably, we offer fresh perspectives on how\nneural solvers can handle VRP constraints. Our code is available:\nthis https URL.\n\n",https://arxiv.org/pdf/2310.18264.pdf
1,Image Clustering Conditioned on Text Criteria,"\nClassical clustering methods do not provide users with direct control of the clustering results, and the clustering results may not be consistent with the relevant criterion that a user has in mind. In this work, we present a new methodology for performing image clustering based on user-specified text criteria by leveraging modern vision-language models and large language models. We call our method Image Clustering Conditioned on Text Criteria (IC$|$TC), and it represents a different paradigm of image clustering. IC$|$TC requires a minimal and practical degree of human intervention and grants the user significant control over the clustering results in return. Our experiments show that IC$|$TC can effectively cluster images with various criteria, such as human action, physical location, or the person's mood, while significantly outperforming baselines.\n",https://arxiv.org/pdf/2310.18297.pdf
2,The Innovation-to-Occupations Ontology: Linking Business Transformation Initiatives to Occupations and Skills,"\n The fast adoption of new technologies forces companies to continuously adapt\ntheir operations making it harder to predict workforce requirements. Several\nrecent studies have attempted to predict the emergence of new roles and skills\nin the labour market from online job ads. This paper aims to present a novel\nontology linking business transformation initiatives to occupations and an\napproach to automatically populating it by leveraging embeddings extracted from\njob ads and Wikipedia pages on business transformation and emerging\ntechnologies topics. To our knowledge, no previous research explicitly links\nbusiness transformation initiatives, like the adoption of new technologies or\nthe entry into new markets, to the roles needed. Our approach successfully\nmatches occupations to transformation initiatives under ten different\nscenarios, five linked to technology adoption and five related to business.\nThis framework presents an innovative approach to guide enterprises and\neducational institutions on the workforce requirements for specific business\ntransformation initiatives.\n\n",https://arxiv.org/pdf/2310.17909.pdf
3,Is Scaling Learned Optimizers Worth It? Evaluating The Value of VeLO's 4000 TPU Months,"\n We analyze VeLO (versatile learned optimizer), the largest scale attempt to\ntrain a general purpose ""foundational"" optimizer to date. VeLO was trained on\nthousands of machine learning tasks using over 4000 TPU months with the goal of\nproducing an optimizer capable of generalizing to new problems while being\nhyperparameter free, and outperforming industry standards such as Adam. We\nindependently evaluate VeLO on the MLCommons optimizer benchmark suite. We find\nthat, contrary to initial claims: (1) VeLO has a critical hyperparameter that\nneeds problem-specific tuning, (2) VeLO does not necessarily outperform\ncompetitors in quality of solution found, and (3) VeLO is not faster than\ncompeting optimizers at reducing the training loss. These observations call\ninto question VeLO's generality and the value of the investment in training it.\n\n",https://arxiv.org/pdf/2310.18191.pdf
4,Moments for Perceptive Narration Analysis Through the Emotional Attachment of Audience to Discourse and Story,"\nIn this work, our goal is to develop a theoretical framework that can eventually be used for analyzing the effectiveness of visual stories such as feature films to comic books. To develop this theoretical framework, we introduce a new story element called moments. Our conjecture is that any linear story such as the story of a feature film can be decomposed into a set of moments that follow each other. Moments are defined as the perception of the actions, interactions, and expressions of all characters or a single character during a given time period. We categorize the moments into two major types: story moments and discourse moments. Each type of moment can further be classified into three types, which we call universal storytelling moments. We believe these universal moments foster or deteriorate the emotional attachment of the audience to a particular character or the story. We present a methodology to catalog the occurrences of these universal moments as they are found in the story. The cataloged moments can be represented using curves or color strips. Therefore, we can visualize a character's journey through the story as either a 3D curve or a color strip. We also demonstrated that both story and discourse moments can be transformed into one lump-sum attraction parameter. The attraction parameter in time provides a function that can be plotted graphically onto a timeline illustrating changes in the emotional attachment of audience to a character or the story. By inspecting these functions the story analyst can analytically decipher the moments in the story where the attachment is being established, maintained, strengthened, or conversely where it is languishing.\n",https://arxiv.org/pdf/2310.18273.pdf
...,...,...,...
545,,,
546,,,
547,,,
548,,,


In [6]:
from promptx import query

paper = query(collection=collection_name).query('type == "document"').sample().first
paper


[1;35mDocument[0m[1m([0m
    [33mid[0m=[32m'ab51e267-273b-4329-9d6e-d7c5ab3e8471'[0m,
    [33mtype[0m=[32m'document'[0m,
    [33mtitle[0m=[32m'Image Clustering Conditioned on Text Criteria'[0m,
    [33mabstract[0m=[32m"\nClassical clustering methods do not provide users with direct control of the clustering results, and the clustering results may not be consistent with the relevant criterion that a user has in mind. In this work, we present a new methodology for performing image clustering based on user-specified text criteria by leveraging modern vision-language models and large language models. We call our method Image Clustering Conditioned on Text Criteria [0m[32m([0m[32mIC$|$TC[0m[32m)[0m[32m, and it represents a different paradigm of image clustering. IC$|$TC requires a minimal and practical degree of human intervention and grants the user significant control over the clustering results in return. Our experiments show that IC$|$TC can effectively clust

In [7]:
pdf = load_pdf(paper.url)
print(f'Loaded pdf with {len(pdf)} characters')

Loaded pdf with 89386 characters


In [8]:
import spacy
import en_core_web_sm

nlp = spacy.load("en_core_web_sm")
doc = nlp(pdf)

In [10]:
from promptx import store, query

class Quote(Entity):
    text: str
    source: Document
    start: int
    end: int

for chunk in batch(doc.sents, bs=10, limit=1000):
    store(
        *[
            Quote(
                text=sentence.text,
                source=paper,
                start=sentence.start_char,
                end=sentence.end_char,
            ) 
            for sentence in chunk
        ], 
        collection=collection_name
    )

query(collection=collection_name).query('type == "quote"')

Unnamed: 0,id,type,title,abstract,url,body,text,source,start,end,value,category,confidence
50,e59b876a-76f3-4554-8bf0-b9f2577f7ad2,quote,,,,,"Under Review\nIMAGE CLUSTERING CONDITIONED ON TEXT CRITERIA\nSehyun Kwon†♢, Jaeseung Park†♢, Minkyu Kim♢, Jaewoong Cho♢, Ernest K. Ryu†∗, Kangwook Lee♢♣∗\n†Seoul National University,♢KRAFTON,♣University of Wisconsin–Madison,∗Co-senior authors\nABSTRACT\nClassical clustering methods do not provide users with direct control of the clus-\ntering results, and the clustering results may not be consistent with the relevant\ncriterion that a user has in mind.","{'ids': ['ab51e267-273b-4329-9d6e-d7c5ab3e8471'], 'collection': 'arxiv', 'limit': 1}",0.0,449.0,,,
51,5c85156c-d763-485f-8e2a-44b41948d0bc,quote,,,,,"In this work, we present a new methodology for\nperforming image clustering based on user-specified text criteria by leveraging\nmodern vision-language models and large language models.","{'ids': ['ab51e267-273b-4329-9d6e-d7c5ab3e8471'], 'collection': 'arxiv', 'limit': 1}",450.0,633.0,,,
52,3c372465-b53f-4b08-a071-607d1c54bf5a,quote,,,,,"We call our method\nImage Clustering Conditioned on TextCriteria (IC |TC), and it represents a differ-\nent paradigm of image clustering.","{'ids': ['ab51e267-273b-4329-9d6e-d7c5ab3e8471'], 'collection': 'arxiv', 'limit': 1}",634.0,769.0,,,
53,a7472a5e-3c94-4870-80b8-6ddf37e93d50,quote,,,,,IC |TC requires a minimal and practical degree\nof human intervention and grants the user significant control over the clustering\nresults in return.,"{'ids': ['ab51e267-273b-4329-9d6e-d7c5ab3e8471'], 'collection': 'arxiv', 'limit': 1}",770.0,917.0,,,
54,c86deabc-0ec4-4b6b-9c28-c884df333017,quote,,,,,"Our experiments show that IC |TC can effectively cluster im-\nages with various criteria, such as human action, physical location, or the person’s\nmood, while significantly outperforming baselines.1\n1 I NTRODUCTION\nImage clustering has been studied as a prototypical unsupervised learning task, and it has been\nused to organize large volumes of visual data (Platt et al., 2003), to reduce the cost of labeling an\nunlabeled image dataset (Russell et al., 2008; Schmarje et al., 2022), and to enhance image retrieval\nsystems (Wu et al., 2000; J ´egou and Chum, 2012).","{'ids': ['ab51e267-273b-4329-9d6e-d7c5ab3e8471'], 'collection': 'arxiv', 'limit': 1}",918.0,1482.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
645,f3f4ce82-7ce1-4e75-a69c-f52f29a517dd,quote,,,,,CIFAR-10\nSTL-10\nCIFAR-100LLAVA,"{'ids': ['ab51e267-273b-4329-9d6e-d7c5ab3e8471'], 'collection': 'arxiv', 'limit': 1}",14694.0,14724.0,,,
646,757ee605-9373-4bf6-a697-2933ad57fe3e,quote,,,,,only\nLlama 2 (7B)\nLlama 2 (13B)\n,"{'ids': ['ab51e267-273b-4329-9d6e-d7c5ab3e8471'], 'collection': 'arxiv', 'limit': 1}",14725.0,14757.0,,,
647,e0471c26-f9a9-4c8f-88c4-7958902831e3,quote,,,,,Llama 2 (70B)\nGPT-3.5\nGPT-4Figure 3: Effect of LLM selection.\n,"{'ids': ['ab51e267-273b-4329-9d6e-d7c5ab3e8471'], 'collection': 'arxiv', 'limit': 1}",14757.0,14819.0,,,
648,f8d9ed1f-f122-493c-8fc4-6f770cda21ed,quote,,,,,"3.5 P RODUCING CLUSTER LABELS\nClassically, the unsupervised clustering task does not require the method to produce labels or de-\nscriptions of the output clusters.","{'ids': ['ab51e267-273b-4329-9d6e-d7c5ab3e8471'], 'collection': 'arxiv', 'limit': 1}",14819.0,14982.0,,,


In [12]:
from enum import Enum


class ThoughtCategory(str, Enum):
    fact = 'fact'
    opinion = 'opinion'
    idea = 'idea'
    connection = 'connection'
    belief = 'belief'


class Thought(Entity):
    value: str
    category: ThoughtCategory
    confidence: float
    source: Entity = Field(None, generate=False)

In [18]:
def read_document(doc, bs=5, limit=1000, recall_limit=3, recent_limit=5):
    sentences = doc.sents
    recent_thoughts = []
    previous_passage = None
    for chunk in batch(sentences, bs=bs, limit=limit):
        passage = [sentence.text for sentence in chunk]
        recalled_thoughts = query(text, collection=collection_name, limit=recall_limit).query('type == "thought"').objects
        
        thoughts = prompt(
            '''
            Given a passage of text and some context, generate some new thoughts about the text.
            Make sure to not repeat any existing thoughts too closely.
            ''',
            input=dict(
                context=dict(
                    previous_passage=previous_passage,
                    recent_thoughts=recent_thoughts,
                    recalled_thoughts=recalled_thoughts,
                ),
                passage=passage,
            ),
            output=[Thought],
        )

        thoughts = [Thought(**{**dict(thought), 'source': paper}) for thought in thoughts.objects]
        recent_thoughts = (thoughts + recent_thoughts)[:recent_limit]
        previous_passage = passage
        
        print(f'Generated {len(thoughts)} thoughts')
        print([thought.value for thought in thoughts])

        store(*thoughts, collection=collection_name)

In [None]:
read_document(doc)

In [20]:

thoughts = query(collection=collection_name).query('type == "thought"')
thoughts

Unnamed: 0,id,type,title,abstract,url,body,text,source,start,end,value,category,confidence
7,56f0181f-083f-42ef-9e91-f2b0412ac9a6,thought,,,,,,"{'ids': ['633eb8b2-7cd3-4154-86ad-172d76416dd5'], 'collection': 'arxiv', 'limit': 1}",,,The goal of this work is to develop a theoretical framework for analyzing the effectiveness of visual stories.,fact,0.9
8,b8241343-55b3-4690-8e41-1e1b7c978418,thought,,,,,,"{'ids': ['633eb8b2-7cd3-4154-86ad-172d76416dd5'], 'collection': 'arxiv', 'limit': 1}",,,The theoretical framework includes a new story element called moments.,fact,0.8
9,c77e00b8-9a34-4af7-a058-0e16a381e24e,thought,,,,,,"{'ids': ['633eb8b2-7cd3-4154-86ad-172d76416dd5'], 'collection': 'arxiv', 'limit': 1}",,,"Linear stories, like feature films, can be decomposed into a set of moments.",fact,0.7
10,79a260ea-90e0-422d-b5a8-503963788cd2,thought,,,,,,"{'ids': ['633eb8b2-7cd3-4154-86ad-172d76416dd5'], 'collection': 'arxiv', 'limit': 1}",,,The authors believe that moments can be used to analyze the effectiveness of visual stories.,belief,0.6
11,0bd4905c-1b39-4986-abea-c53b5174a5c8,thought,,,,,,"{'ids': ['633eb8b2-7cd3-4154-86ad-172d76416dd5'], 'collection': 'arxiv', 'limit': 1}",,,Visual stories include feature films and comic books.,fact,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,de074dee-d67a-49ef-9197-107b51276571,thought,,,,,,"{'ids': ['ab51e267-273b-4329-9d6e-d7c5ab3e8471'], 'collection': 'arxiv', 'limit': 1}",,,The user specifies a criterion expressed in natural language to guide the image clustering process.,fact,0.7
134,16b297de-5051-476a-aa19-892fb5158790,thought,,,,,,"{'ids': ['ab51e267-273b-4329-9d6e-d7c5ab3e8471'], 'collection': 'arxiv', 'limit': 1}",,,Recent image clustering methods find clusters that agree with pre-defined class labels for datasets such as CIFAR-10.,fact,0.6
135,0c0422d2-bad3-41a5-8b5e-ac3f8bea1a08,thought,,,,,,"{'ids': ['ab51e267-273b-4329-9d6e-d7c5ab3e8471'], 'collection': 'arxiv', 'limit': 1}",,,"The inductive biases of the neural networks and the loss function, data augmentations, and feature extractors used within the method influence the choice of clusters.",fact,0.6
136,9ae91878-be4b-483c-997f-37772120cc75,thought,,,,,,"{'ids': ['ab51e267-273b-4329-9d6e-d7c5ab3e8471'], 'collection': 'arxiv', 'limit': 1}",,,Classical clustering methods may not be consistent with the relevant criterion that a user has in mind.,opinion,0.7
