# About this Notebook

This note book outlines analysing the structure of text and associated patterns whilst laying the foundation of a chatbot that can read in and analyse a PDF. Of which you can query and discuss the content using vector search. 




In [10]:
#### SETTING UP THE ENVIRONMENT #####
%pip install -r requirements.txt

import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('punkt_tab')

Collecting langchain==0.0.316 (from -r requirements.txt (line 10))
  Using cached langchain-0.0.316-py3-none-any.whl.metadata (15 kB)
Collecting tokenizers==0.20.3 (from -r requirements.txt (line 11))
  Using cached tokenizers-0.20.3-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.7 kB)
Collecting gradio==4.44.1 (from -r requirements.txt (line 14))
  Using cached gradio-4.44.1-py3-none-any.whl.metadata (15 kB)
Collecting langsmith<0.1.0,>=0.0.43 (from langchain==0.0.316->-r requirements.txt (line 10))
  Using cached langsmith-0.0.92-py3-none-any.whl.metadata (9.9 kB)
Collecting gradio-client==1.3.0 (from gradio==4.44.1->-r requirements.txt (line 14))
  Using cached gradio_client-1.3.0-py3-none-any.whl.metadata (7.1 kB)
INFO: pip is looking at multiple versions of diversity to determine which version is compatible with other requirements. This could take a while.
Collecting diversity (from -r requirements.txt (line 5))
  Using cached diversity-0.1.22-py3-none-any.whl.metadata (4.5 kB)
  U

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/jamespotter/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [11]:
#### IMPORTING DUMMY ESSAY ####

with open('data/example_essay.txt', 'r') as file:

    
    essay_text = file.read()
    
split_text = essay_text.split('\n')


#### Extracting Syntactic Templates from a generated essay

In [12]:
from diversity import compression_ratio, homogenization_score, ngram_diversity_score, extract_patterns

cr = compression_ratio(split_text, 'gzip')
hs = homogenization_score(split_text, 'rougel')
# hs = homogenization_score(data_example, 'bertscore') 
nds = ngram_diversity_score(split_text, 4)

print(cr, hs, nds)


==> Scoring all pairs


100%|██████████| 9120/9120 [00:00<00:00, 86939.05it/s]

2.216 0.052 2.938





In [13]:
n = 5 
top_n = 100
patterns = extract_patterns(split_text, n, top_n)
patterns

{'IN JJ NNS IN NN': {'on massive amounts of text',
  'on relevant parts of input',
  'over long sequences of text'},
 'HYPH NN NNP : :': {'- Thought Prompting : -', '- shot Prompting : -'},
 'NNP : : VBZ DT': {'Prompting : - Assigns a',
  'Prompting : - Provides a',
  'Prompting : - Requires no'},
 'JJ NNS CD . NN': set(),
 '. NNP NNP : :': set(),
 'JJ NN HYPH VBN NNS': {'simple rule - based systems',
  'various language - related tasks'},
 'NNP -LRB- NNP NNP JJ': {'GPT ( Generative Pre -'},
 '-LRB- NNP NNP JJ VBN': {'( Generative Pre - trained'},
 'NNP NNP JJ VBN NNP': {'Generative Pre - trained Transformer'},
 'NNP JJ VBN NNP -RRB-': {'Pre - trained Transformer )'},
 'NN NNP : : VBZ': {'shot Prompting : - Provides',
  'shot Prompting : - Requires'},
 'VBZ DT NN TO VB': {'Allows the model to focus', 'asks the model to perform'},
 ': : VBZ DT JJ': {': - Assigns a specific', ': - Provides a few'},
 'CC JJ NNS CD .': {'and residual connections 3 .', 'or logical problems 4 .'},
 'NNS CD .

In [14]:
with open('data/human_essay.txt', 'r') as new_file:
    text = new_file.read()
    text_split_human = text.split('\n')

text_split_human
patterns = extract_patterns(text_split_human, n, top_n)
patterns


{'NN IN DT JJ NN': {'definition of a good developer',
  'democracy in the Western world',
  'indicator of a good design',
  'industry like no other shakedown',
  'model as the gold standard',
  'one with the architectural fit',
  'tech on a worldwide scale',
  'wave of the same category'},
 '. DT JJ NN IN': set(),
 'DT JJ NN IN DT': {'The 2nd wave of the',
  'The sole indicator of a',
  'The standard definition of a',
  'an auxiliary outlet in the'},
 'JJ NN IN DT JJ': {'2nd wave of the same',
  'big tech on a worldwide',
  'sole indicator of a good',
  'standard definition of a good'},
 'DT JJ NN IN NN': {'The bad thing about design',
  'a dominant point in management',
  'the direct line of fire',
  'the gold standard of app',
  'the high ground around privacy'},
 'DT JJ JJ NN .': {'a good mobile developer .', 'a great mobile developer .'},
 'NNP , NNP , CC': {'Apple , Android , and',
  'C++ , Java , and',
  'Cordova , Xamarin , and'},
 'DT NNS IN DT NN': {'the buzzwords on every tec

In [15]:
# Get patterns for both texts
human_patterns = extract_patterns(text_split_human, n, top_n)
example_patterns = extract_patterns(split_text, n, top_n)

# Sort patterns by frequency and get top 5
def get_top_5_patterns(patterns):
    # Convert patterns dict to list of tuples (pattern, examples)
    pattern_list = [(k, len(v)) for k,v in patterns.items() if len(v) > 0]
    # Sort by frequency (count of examples) in descending order
    pattern_list.sort(key=lambda x: x[1], reverse=True)
    # Return top 5 or all if less than 5
    return pattern_list[:5]

print("Top 5 patterns in human-written essay:")
for pattern, freq in get_top_5_patterns(human_patterns):
    print(f"{pattern}: {freq} occurrences")

print("\nTop 5 patterns in example essay:")
for pattern, freq in get_top_5_patterns(example_patterns):
    print(f"{pattern}: {freq} occurrences")


Top 5 patterns in human-written essay:
NN IN DT JJ NN: 8 occurrences
DT JJ NN IN NN: 5 occurrences
DT JJ NN IN DT: 4 occurrences
JJ NN IN DT JJ: 4 occurrences
NNP , NNP , CC: 3 occurrences

Top 5 patterns in example essay:
IN JJ NNS IN NN: 3 occurrences
NNP : : VBZ DT: 3 occurrences
HYPH NN NNP : :: 2 occurrences
JJ NN HYPH VBN NNS: 2 occurrences
NN NNP : : VBZ: 2 occurrences


#### Analysis of Pattern Differences Between Human and Example Essays

The pattern analysis reveals interesting differences in writing style between the human-written and example essays:

1. Pattern Frequency:
- Human essay has higher pattern frequencies (8, 5, 4, 4, 3 occurrences)
- Example essay has lower frequencies (3, 3, 2, 2, 2 occurrences)

2. Pattern Types:
- Human essay favors noun-preposition-adjective patterns (e.g. "NN IN DT JJ NN")
- Example essay uses more technical/structured patterns with colons and hyphens

3. Key Differences:
- Human writing shows more natural language flow with descriptive phrases
- Example essay has more formatted/templated structure typical of technical writing

4. Notable Patterns:
Human Essay:
- Uses more complex noun phrases with prepositions
- More varied sentence structures
- Natural language patterns

Example Essay: 
- More rigid formatting patterns
- Technical/documentation style
- Structured headings and lists

This suggests the human essay has a more natural writing style while the example essay follows a more structured technical format.


# Building First Chat BOT - PDF Reader
The user uploads a PDF of their choice through the user interface.

The application parses the PDF using a PDF parsing library and splits the extracted text into manageable chunks.

The chunks are converted into vector form, called embeddings.

When a user issues a query through the chat interface, the query is also converted into vector form.

The vector similarity between the query vector and each of the chunk vectors is calculated.

The text corresponding to the top-k most similar vectors are retrieved.

The retrieved text is fed along with the query and any other additional instructions to an LLM

The LLM uses the given information to generate an answer to the user query.

The response is displayed on the user interface. The user can now respond (clarification question, new question, gratitude etc.)

The entire conversation history is fed back to the LLM during each turn of the conversation.

In [16]:

from langchain_community.document_loaders import UnstructuredPDFLoader

loader = UnstructuredPDFLoader('data/gpt4_technical_report.pdf')
data = loader.load() # eta 5 mins 

In [18]:
# The data variable contains the parsed PDF that has been split into paragraphs. 
# We will refer to each paragraph as a chunk. 
# Each chunk is now converted into its vector representation using an embedding model. 
# LangChain supports a wide variety of embedding models. For this example, we will use the 
# all-MiniLM-L6-V2 variant of sentence-transformer embeddings, available through the HuggingFace platform.
from sentence_transformers import SentenceTransformer
from langchain_huggingface import HuggingFaceEmbeddings

# # Convert chunks into vector representation using embeddings
# model = SentenceTransformer('all-MiniLM-L6-V2')
# embeddings = model.encode(data)
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


In [19]:
# Now that we have loaded the embedding model, we can generate the vectors from the data and store them in a vector database. 
# Several vector database integrations are available on LangChain. We will use Chroma for this example, as it is the simplest to use
from langchain_community.vectorstores import Chroma

db = Chroma.from_documents(data, embeddings)


In [20]:
# Now, the vector database is ready with the vectors! We can ask queries and get responses. For instance,

query = "What is the main objective of the paper?"
docs = db.similarity_search(query, k=3)
print(docs[0].page_content)



arXiv:2303.08774v6 [cs.CL] 4 Mar 2024

GPT-4 Technical Report

OpenAI*

Abstract

We report the development of GPT-4, a large-scale, multimodal model which can accept image and text inputs and produce text outputs. While less capable than humans in many real-world scenarios, GPT-4 exhibits human-level performance on various professional and academic benchmarks, including passing a simulated bar exam with a score around the top 10% of test takers. GPT-4 is a Transformer- based model pre-trained to predict the next token in a document. The post-training alignment process results in improved performance on measures of factuality and adherence to desired behavior. A core component of this project was developing infrastructure and optimization methods that behave predictably across a wide range of scales. This allowed us to accurately predict some aspects of GPT-4’s performance based on models trained with no more than 1/1,000th the compute of GPT-4.

1 Introduction

This technical report p

In [21]:
# Now, the vector database is ready with the vectors! We can ask queries and get responses. For instance,

query = "What conclusions are made from the paper?"
docs = db.similarity_search(query)
print(docs[0].page_content)



arXiv:2303.08774v6 [cs.CL] 4 Mar 2024

GPT-4 Technical Report

OpenAI*

Abstract

We report the development of GPT-4, a large-scale, multimodal model which can accept image and text inputs and produce text outputs. While less capable than humans in many real-world scenarios, GPT-4 exhibits human-level performance on various professional and academic benchmarks, including passing a simulated bar exam with a score around the top 10% of test takers. GPT-4 is a Transformer- based model pre-trained to predict the next token in a document. The post-training alignment process results in improved performance on measures of factuality and adherence to desired behavior. A core component of this project was developing infrastructure and optimization methods that behave predictably across a wide range of scales. This allowed us to accurately predict some aspects of GPT-4’s performance based on models trained with no more than 1/1,000th the compute of GPT-4.

1 Introduction

This technical report p

In [22]:

import anthropic
import os 

from dotenv import load_dotenv
load_dotenv()
CLAUDE_API_KEY = os.getenv('CLAUDE_API_KEY')

client = anthropic.Anthropic(
    # defaults to os.environ.get("ANTHROPIC_API_KEY")
    api_key=CLAUDE_API_KEY
)
message = client.messages.create(
    model="claude-3-5-sonnet-20241022",
    max_tokens=1024,
    messages=[
        {"role": "user", "content": "Hello, Claude"}
    ]
)
print(message.content)


[TextBlock(text="Hi! I'm here to help. What would you like to discuss?", type='text')]


In [23]:
from langchain.chains import ConversationalRetrievalChain
import pdfsearch
from langchain_anthropic import ChatAnthropic

llm = ChatAnthropic(
    model="claude-3-5-sonnet-20240620",
    temperature=0,
    max_tokens=1024,
    timeout=None,
    max_retries=2,
    api_key=CLAUDE_API_KEY
)

messages = [
    (
        "system",
        "You are a helpful assistant that translates English to French. Translate the user sentence.",
    ),
    ("human", "I love programming."),
]
ai_msg = llm.invoke(messages)
ai_msg.content

"J'adore la programmation."

In [24]:
from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful assistant that translates {input_language} to {output_language}.",
        ),
        ("human", "{input}"),
    ]
)

chain = prompt | llm
chain.invoke(
    {
        "input_language": "English",
        "output_language": "German",
        "input": "I love programming.",
    }
)

AIMessage(content="Here's the German translation:\n\nIch liebe Programmieren.", additional_kwargs={}, response_metadata={'id': 'msg_011qwy17R4sqPkuZwC5nCChp', 'model': 'claude-3-5-sonnet-20240620', 'stop_reason': 'end_turn', 'stop_sequence': None, 'usage': {'cache_creation_input_tokens': 0, 'cache_read_input_tokens': 0, 'input_tokens': 23, 'output_tokens': 18}}, id='run-99ae96aa-feaf-42ba-8244-bca09d96d1e7-0', usage_metadata={'input_tokens': 23, 'output_tokens': 18, 'total_tokens': 41, 'input_token_details': {'cache_read': 0, 'cache_creation': 0}})

In [26]:
import gradio as gr

async def chat(message, history):
    # Get relevant documents from the knowledge base
    docs = db.similarity_search(message, k=3)
    
    # Format context from retrieved documents
    context = "\n".join([doc.page_content for doc in docs])
    
    # Create messages for the chat
    messages = [
        (
            "system",
            "You are a helpful assistant. Use the following context to answer questions. If you cannot answer from the context, say so.\n\nContext:\n" + context
        ),
        ("human", message)
    ]
    
    # Get response from LLM
    result = await llm.ainvoke(messages)
    return result.content

demo = gr.ChatInterface(
    fn=chat,
    title="PDF Knowledge Base Chat",
    description="Ask questions about the PDF content using Claude 3",
    examples=[
        "What is the main objective of the paper?",
        "What conclusions are made from the paper?"
    ],
    theme=gr.themes.Soft()
)

demo.launch()




* Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.


