## Familiarization with LLM packages such as `LangChain`, `OpenAI` & `tiktoken`

In [80]:
import credentials

### 1. Tiktoken

https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb


Encodings specify how text is converted into tokens. Different models use different encodings

`tiktoken` supports three encodings used by OpenAI models:

| Encoding name           | OpenAI models                                       |
|-------------------------|-----------------------------------------------------|
| `cl100k_base`           | `gpt-4`, `gpt-3.5-turbo`, `text-embedding-ada-002`  |
| `p50k_base`             | Codex models, `text-davinci-002`, `text-davinci-003`|
| `r50k_base` (or `gpt2`) | GPT-3 models like `davinci`                         |

In [81]:
import tiktoken

In [82]:
encoding = tiktoken.get_encoding("cl100k_base")
#encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

In [83]:
text = 'Tomorrow it will rain in Budapest.'
encoding.encode(text)

[91273, 433, 690, 11422, 304, 70695, 13]

In [84]:
text = 'TomorrowitwillraininBudapest.'
encoding.encode(text)

[91273, 275, 14724, 30193, 258, 33, 664, 28724, 13]

In [85]:
[encoding.decode_single_token_bytes(token) for token in [91273, 275, 14724, 30193, 258, 33, 664, 28724, 13]]

[b'Tomorrow', b'it', b'will', b'rain', b'in', b'B', b'ud', b'apest', b'.']

In [86]:
[bytes.decode(encoding.decode_single_token_bytes(token)) for token in [91273, 275, 14724, 30193, 258, 33, 664, 28724, 13]]

['Tomorrow', 'it', 'will', 'rain', 'in', 'B', 'ud', 'apest', '.']

In [87]:
encoding.decode([433])

' it'

In [88]:
encoding.decode([275])

'it'

### 2. `LangChain`, `Pinecone` & other vector databases / vector representers for Document Representation

Load PDF

In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
loader = PyPDFLoader("../docs/investment_report.pdf")
#doc = loader.load_and_split()
doc = loader.load_and_split(text_splitter = RecursiveCharacterTextSplitter(chunk_size = 4000, chunk_overlap = 200))

In [None]:
len(doc)

7

In [None]:
for i in doc:
    print(len(i.page_content))

2991
3986
549
3977
632
3869
2733


In [None]:
print(doc[0].page_content[:400])

Quarterly Investment Report 
April 2023
Despite banking instability and uncertain monetary policy expectations, the U.S. economy ended the quarter on  
a positive note. The S&P 500 Index rose 3.7% in March and 7.5% in the first quarter. The U.S. markets outperformed 
most major international markets. During the quarter, bond prices moved up, pushing Treasury yields in 1-year  and longer maturities


In [None]:
doc[0].metadata

{'source': '../docs/investment_report.pdf', 'page': 0}

In [None]:
page_num = doc[-1].metadata['page']
print('There are', page_num, 'pages in this PDF')

There are 3 pages in this PDF


Embeddings to use

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
openai_embedder = OpenAIEmbeddings(openai_api_key = credentials.openai_api)

In [None]:
openai_embed_result = openai_embedder.embed_query('Embed this text!')
len(openai_embed_result)

In [None]:
openai_embed_results = openai_embedder.embed_documents(['Embed this text!', 'This one too, please.'])
print(len(openai_embed_results))
print(len(openai_embed_results[0]))

In [None]:
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
hf_embedder = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

In [89]:
hf_embed_result = hf_embedder.embed_query('Embed this text!')
len(hf_embed_result)

768

In [None]:
hf_embed_results = hf_embedder.embed_documents(['Embed this text!', 'This one too, please.'])
print(len(hf_embed_results))
print(len(hf_embed_results[0]))

2
768


Set up environment

In [None]:
import pinecone
from langchain.vectorstores import Pinecone

pinecone.init(
    api_key = credentials.pinecone_api,  
    environment = credentials.pinecone_loc  
)

### 3. `OpenAI`

In [None]:
from langchain.llms import OpenAI

In [None]:
llm = OpenAI(model_name='text-davinci-003', openai_api_key=credentials.openai_api)

In [None]:
#need billing set-up for API to be live
#llm('Tell me a joke')

In [None]:
llm.get_num_tokens('Tell me a joke.')

5

In [None]:
llm.get_num_tokens('Tellmeajoke.')

5