In [1]:
%pip -q install langchain openai tiktoken
%pip install arxiv PyPDF2

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
!pip show langchain

Name: langchain
Version: 0.2.1
Summary: Building applications with LLMs through composability
Home-page: https://github.com/langchain-ai/langchain
Author: 
Author-email: 
License: MIT
Location: /Users/krishnamrith12/opt/anaconda3/envs/hfHoster/lib/python3.11/site-packages
Requires: aiohttp, langchain-core, langchain-text-splitters, langsmith, numpy, pydantic, PyYAML, requests, SQLAlchemy, tenacity
Required-by: langchain-community


# Arxiv Paper Summarizer

1. download the paper as pdf or latext
2. reads it in in one shot
3. makes a summary


### If getting a 403 error you will need to manually download the PDF

In [2]:
THE_PAPER = "https://arxiv.org/abs/2305.10601" #"https://arxiv.org/abs/2305.16291" #

In [3]:
import os
import requests

def download_arxiv_pdf(arxiv_url, directory="papers"):
    # extract paper id from URL
    paper_id = arxiv_url.split('/')[-1]
    pdf_url = f"https://arxiv.org/pdf/{paper_id}.pdf"

    # make directory if it doesn't exist
    if not os.path.exists(directory):
        os.makedirs(directory)

    response = requests.get(pdf_url)

    # ensure the request was successful
    if response.status_code == 200:
        pdf_path = f"{directory}/{paper_id}.pdf"
        with open(f"{directory}/{paper_id}.pdf", 'wb') as f:
            f.write(response.content)
        return pdf_path
    else:
        print(f"Error: Unable to download the paper. Status code {response.status_code}")


# Example usage
PAPER_PATH = download_arxiv_pdf(THE_PAPER)


In [4]:
PAPER_PATH= "papers/2305.10601.pdf"

## PDF Text Extraction

In [5]:
from PyPDF2 import PdfReader
import tiktoken

reader = PdfReader(PAPER_PATH)

In [6]:
print(f"Number of pages: {len(reader.pages)}")

Number of pages: 14


In [7]:
reader.pages[5].extract_text

<bound method PageObject.extract_text of {'/Type': '/Page', '/Annots': [IndirectObject(208, 0, 4437966736), IndirectObject(209, 0, 4437966736), IndirectObject(250, 0, 4437966736), IndirectObject(251, 0, 4437966736), IndirectObject(252, 0, 4437966736), IndirectObject(253, 0, 4437966736), IndirectObject(254, 0, 4437966736)], '/Contents': IndirectObject(257, 0, 4437966736), '/MediaBox': [0, 0, 612, 792], '/Parent': IndirectObject(81, 0, 4437966736), '/Resources': IndirectObject(255, 0, 4437966736)}>

In [8]:
parts = []

def visitor_body(text, cm, tm, fontDict, fontSize):
    y = tm[5]
    if y > 50 and y < 720:
        parts.append(text)

for page in reader.pages:
    # page = reader.pages[3]
    page.extract_text(visitor_text=visitor_body)

text_body = "".join(parts)

print(text_body)

Tree of Thoughts: Deliberate Problem Solving
with Large Language Models
Shunyu Yao
Princeton UniversityDian Yu
Google DeepMindJeffrey Zhao
Google DeepMindIzhak Shafran
Google DeepMind
Thomas L. Griffiths
Princeton UniversityYuan Cao
Google DeepMindKarthik Narasimhan
Princeton University
Abstract
Language models are increasingly being deployed for general problem solving
across a wide range of tasks, but are still confined to token-level, left-to-right
decision-making processes during inference. This means they can fall short in
tasks that require exploration, strategic lookahead, or where initial decisions play
a pivotal role. To surmount these challenges, we introduce a new framework for
language model inference, “Tree of Thoughts” (ToT), which generalizes over the
popular “Chain of Thought” approach to prompting language models, and enables
exploration over coherent units of text (“thoughts”) that serve as intermediate steps
toward problem solving. ToT allows LMs to perform deliberat

In [9]:
# count the tokens
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    # encoding = tiktoken.get_encoding(encoding_name)
    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
    num_tokens = len(encoding.encode(string))
    return num_tokens

num_tokens_from_string(text_body, "gpt-3.5-turbo")

19052

In [10]:
import re

def remove_citations(text):
    split_text = re.split(r'(\[\d+\].*?(?=\[\d+\]|$))', text, flags=re.DOTALL)
    no_citations = [chunk for i, chunk in enumerate(split_text) if i % 2 == 0]
    citations = [chunk for i, chunk in enumerate(split_text) if i % 2 != 0]
    return ''.join(no_citations), ''.join(citations)

def remove_after_references(text: str) -> str:
    """Remove everything after a line containing 'References'."""
    lines = text.split('\n')
    for i, line in enumerate(lines):
        if 'References' in line:
            return '\n'.join(lines[:i+1])
    return text


In [11]:
text_body = remove_after_references(text_body)


In [12]:
num_tokens_from_string(text_body, "gpt-3.5-turbo")

14796

In [13]:
type(text_body)

str

In [14]:
import textwrap
import os

def wrap_text(text: str, width: int = 120) -> str:
    """Wrap text to a specified width."""
    return '\n'.join(textwrap.wrap(text, width))

def write_string_to_file(filename: str, text: str) -> None:
    """Write a string to a file."""
    with open(filename, 'w') as f:
        f.write(text)

def extract_filename(path_string):
    """Extract filename without extension from a path string."""
    base_name = os.path.basename(path_string)  # Get the filename with extension
    file_name_without_ext = os.path.splitext(base_name)[0]  # Remove the extension
    return file_name_without_ext


## LangChain

In [15]:
from langchain.prompts import (
    ChatPromptTemplate,
    PromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)

from langchain_openai import ChatOpenAI

from langchain.chains import LLMChain


#### System prompt

In [16]:
context_template="You are a helpful AI Researcher that specializes in analysing ML, AI and LLM papers. Please use all your expertise to approach this task. Output your content in markdown format and include titles where relevant."

system_message_prompt = SystemMessagePromptTemplate.from_template(context_template)


#### Human Prompt


In [17]:
human_template="Please summarize this paper focusing the key important takeaways for each section. Expand the summary on methods so they can be clearly understood. \n\n PAPER: \n\n{paper_content}"

human_message_prompt = HumanMessagePromptTemplate(
        prompt=PromptTemplate(
            template=human_template,
            input_variables=["paper_content"],
        )
    )

In [18]:

chat_prompt_template = ChatPromptTemplate.from_messages([system_message_prompt,
                                                         human_message_prompt])

chat = ChatOpenAI(model_name="gpt-3.5-turbo-16k",
                  temperature=0.2)


In [19]:
summary_chain = LLMChain(llm=chat, prompt=chat_prompt_template)


  warn_deprecated(


In [20]:
#!export OPENAI_API_KEY="sk-proj-4SIe61YhBcsqajeNeRU8T3BlbkFJdhBECLtFb0rdn6vd8itI"

In [21]:
!echo $OPENAI_API_KEY

sk-proj-4SIe61YhBcsqajeNeRU8T3BlbkFJdhBECLtFb0rdn6vd8itI


## Full output

In [22]:
%%time
from langchain.callbacks import get_openai_callback

with get_openai_callback() as cb:
    output = summary_chain.run(text_body)

    print(f"Total Tokens: {cb.total_tokens}")
    print(f"Prompt Tokens: {cb.prompt_tokens}")
    print(f"Completion Tokens: {cb.completion_tokens}")
    print(f"Total Cost (USD): ${cb.total_cost}")
    print(f"Calc Total Cost (USD): ${(cb.prompt_tokens/1000)*0.003 + (cb.completion_tokens/1000)*0.004}")

  warn_deprecated(


Total Tokens: 15322
Prompt Tokens: 14880
Completion Tokens: 442
Total Cost (USD): $0.046408000000000005
Calc Total Cost (USD): $0.046408000000000005
CPU times: user 76.5 ms, sys: 38.7 ms, total: 115 ms
Wall time: 11.2 s


In [23]:
print(wrap_text(output))

# Tree of Thoughts: Deliberate Problem Solving with Large Language Models  ## Abstract Language models (LMs) are
powerful tools for problem-solving tasks, but they are limited by their token-level, left-to-right decision-making
process. This paper introduces a new framework called "Tree of Thoughts" (ToT) that allows LMs to perform deliberate
decision-making by considering multiple reasoning paths and self-evaluating choices. ToT enables exploration over
coherent units of text ("thoughts") that serve as intermediate steps toward problem-solving. The experiments show that
ToT significantly enhances LM's problem-solving abilities on tasks requiring planning or search.  ## Introduction This
section discusses the limitations of current language models in problem-solving tasks and introduces the concept of
"System 1" and "System 2" decision-making processes. It also highlights the need for a more deliberate planning process
in language models.  ## Background This section provides an overvie

In [24]:
write_string_to_file(f'{extract_filename(PAPER_PATH)}-summary.txt', output)

In [25]:
PAPER_PATH

'papers/2305.10601.pdf'