# Download Data

In [44]:
import logging
import arxiv

logging.basicConfig(level=logging.DEBUG)

client = arxiv.Client(page_size=10, delay_seconds=10.0, num_retries=20)
search = arxiv.Search(id_list=["2305.18290", "2304.15004", "2312.08782"])
results = client.results(search)

# Prints 1000 titles before needing to make another request.
for r in results:
    print(r.title)
    r.download_pdf(dirpath=".papers")

Direct Preference Optimization: Your Language Model is Secretly a Reward Model
Are Emergent Abilities of Large Language Models a Mirage?
Toward General-Purpose Robots via Foundation Models: A Survey and Meta-Analysis


In [47]:
search = arxiv.Search(
    query="emergence in large language models",
    max_results=5,
    sort_by=arxiv.SortCriterion.SubmittedDate,
)
results = client.results(search)

# Prints 1000 titles before needing to make another request.
for r in results:
    print(r.title)

MACS: Mass Conditioned 3D Hand and Object Motion Synthesis
Family Puzzle, Framing Topology, $c_-=24$ and 3(E8)$_1$ Conformal Field Theories: 48/16 = 45/15 = 24/8 =3
Model Reduction to Spectral Submanifolds in Non-Smooth Dynamical Systems
Topological Green's function zeros in an exactly solved model and beyond
A Survey of Reinforcement Learning from Human Feedback


In [1]:
import os

In [3]:
DIRECTORY = ".pipeline"
files = os.listdir(DIRECTORY)
files

[]

In [67]:
from llama_index import SimpleDirectoryReader
from llama_index.readers import PDFReader

# from llama_hub.file.pymu_pdf.base import PyMuPDFReader

loader = PDFReader()

documents = []
for i in files:
    documents.append(loader.load_data(f".papers/{i}"))

documents = [c for d in documents for c in d]



# Node Processing

In [37]:
import os
from llama_index.readers import PDFReader
import logging
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import (
    messages_to_prompt,
    completion_to_prompt,
)

from llama_index.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
    KeywordExtractor,
    EntityExtractor,
    BaseExtractor,
)

from llama_index.storage.docstore import SimpleDocumentStore
from llama_index.ingestion import IngestionPipeline
from llama_index.text_splitter import SentenceSplitter

DOCUMENT_PATH = ".papers"
QUANT_VERSION = "mistral-7b-instruct-v0.2.Q3_K_S.gguf"
MODEL_PATH = f"./models/{QUANT_VERSION}"

In [20]:
import nest_asyncio
import random

random.seed(42)

nest_asyncio.apply()

In [40]:
files = os.listdir(DOCUMENT_PATH)
loader = PDFReader()

documents = []
for i in files:
    documents.append(loader.load_data(f".papers/{i}"))

documents = [c for d in documents for c in d][:5]
# documents = random.sample(documents, 5)

print(f"Loaded {len(documents)} documents")

Loaded 5 documents


In [41]:
documents

[Document(id_='f4582705-ae42-4e1e-9641-ccda6c2408b0', embedding=None, metadata={'page_label': '1', 'file_name': '.papers/2304.15004v2.Are_Emergent_Abilities_of_Large_Language_Models_a_Mirage_.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='2f6c3dd94d8e6a1f97f07f2ce1df2512ca573ac0385a0ea64be4bf13b33a33e8', text='Are Emergent Abilities of Large Language Models a\nMirage?\nRylan Schaeffer, Brando Miranda, and Sanmi Koyejo\nComputer Science, Stanford University\nAbstract\nRecent work claims that large language models display emergent abilities , abil-\nities not present in smaller-scale models that are present in larger-scale models.\nWhat makes emergent abilities intriguing is two-fold: their sharpness , transition-\ning seemingly instantaneously from not present to present, and their unpredictabil-\nity, appearing at seemingly unforeseeable model scales. Here, we present an al-\nternative explanation for emergent abilities: that for a partic

In [None]:
llm = LlamaCPP(
    model_path=MODEL_PATH,
    temperature=0.1,
    max_new_tokens=256,
    context_window=3000,
    generate_kwargs={},
    model_kwargs={"n_gpu_layers": 30},
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=False,
)

In [5]:
st = SentenceSplitter(chunk_size=256)
_ = st.split_text(documents[0].text)
len(_)

7

In [85]:
prompt = """
### [INST] Context: {context_str}. Give a highly concise title that summarizes all of \
the unique themes found in the context, in no more than 20 words. \
Dont include descriptions of what you are doing or how you did it. Be as concise as possible. \

Title: [/INST]"""

from llama_index.prompts import PromptTemplate

llm.predict(PromptTemplate(template=prompt), context_str=documents[-2].text)

' "Explaining Emergent Abilities in Large Language Models: Metric Choice, Scale, and Resolution"'

In [193]:
from llama_index.llm_predictor.base import LLMPredictorType
from llama_index.bridge.pydantic import Field


class CustomLLMExtractor(BaseExtractor):
    llm: LLMPredictorType = Field(description="The LLM to use for generation.")
    prompt: PromptTemplate = Field(
        default="""[INST] [/INST]""",
        description="The prompt to extract titles with.",
    )

    def __init__(self, llm=None, prompt=None):
        super().__init__(llm=llm, prompt=PromptTemplate(template=prompt))

    # self.llm = kwargs.get("llm")
    # self.prompt = kwargs.get("prompt")  # PromptTemplate(template=prompt)

    async def aextract(self, nodes):
        metadata_list = [
            {"custom_title": self.llm.predict(self.prompt, context_str=node.text)}
            for node in nodes
        ]
        return metadata_list

In [194]:
extractors = [
    # TitleExtractor(nodes=1, llm=llm, node_template=prompt),
    # QuestionsAnsweredExtractor(questions=3, llm=llm),
    # EntityExtractor(prediction_threshold=0.5),
    # # SummaryExtractor(summaries=["prev", "self"], llm=llm),
    # KeywordExtractor(keywords=10, llm=llm),
    CustomLLMExtractor(llm=llm, prompt=prompt),
]

transformations = [st] + extractors

pipeline = IngestionPipeline(transformations=transformations)
nodes = pipeline.run(documents=documents)

  nodes = pipeline.run(documents=documents)


Exception in callback <TaskWakeupMethWrapper object at 0x7f9f282c51f0>(<Future finis...f60>, ...],))>)
handle: <Handle <TaskWakeupMethWrapper object at 0x7f9f282c51f0>(<Future finis...f60>, ...],))>)>
Traceback (most recent call last):
  File "/home/starscream/anaconda3/lib/python3.8/asyncio/events.py", line 81, in _run
    self._context.run(self._callback, *self._args)
RuntimeError: Leaving task <Task pending name='Task-2' coro=<Kernel.dispatch_queue() running at /media/starscream/wheeljack1/projects/casper/src/language/.venv/lib/python3.8/site-packages/ipykernel/kernelbase.py:529> wait_for=<Future pending cb=[<TaskWakeupMethWrapper object at 0x7f9f37104310>()]> cb=[_wrap_awaitable.<locals>.<lambda>() at /media/starscream/wheeljack1/projects/casper/src/language/.venv/lib/python3.8/site-packages/tornado/gen.py:852, IOLoop.add_future.<locals>.<lambda>() at /media/starscream/wheeljack1/projects/casper/src/language/.venv/lib/python3.8/site-packages/tornado/ioloop.py:699]> does not match t

In [195]:
for i, n in enumerate(nodes):
    print(i)
    print(n.metadata)

0
{'page_label': '1', 'file_name': '.papers/2304.15004v2.Are_Emergent_Abilities_of_Large_Language_Models_a_Mirage_.pdf', 'custom_title': ' "Emergent Abilities in Large Language Models: A Metric Illusion"'}
1
{'page_label': '1', 'file_name': '.papers/2304.15004v2.Are_Emergent_Abilities_of_Large_Language_Models_a_Mirage_.pdf', 'custom_title': ' "Explaining Emergent Abilities in Large Language Models: Metric Choice vs. Model Scaling"'}
2
{'page_label': '1', 'file_name': '.papers/2304.15004v2.Are_Emergent_Abilities_of_Large_Language_Models_a_Mirage_.pdf', 'custom_title': ' "Investigating the Illusion of Emergent Abilities in AI: Impact of Metric Choice on Performance"'}
3
{'page_label': '1', 'file_name': '.papers/2304.15004v2.Are_Emergent_Abilities_of_Large_Language_Models_a_Mirage_.pdf', 'custom_title': ' "Emergent Abilities in AI: Debunking Myths through Metric Choices in Vision Tasks"'}
4
{'page_label': '1', 'file_name': '.papers/2304.15004v2.Are_Emergent_Abilities_of_Large_Language_Mod

  for i, n in enumerate(nodes):
