## OpenAI Keys

In [1]:
import os
# read API key from text file
def read_api_key(file):
    """Read an API key from a text file.

    Parameters
    ----------
    file : str or path-like
        Path of a text file whose contents are the key.

    Returns
    -------
    str
        The file contents with leading and trailing whitespace removed.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If the file contains no key (empty or whitespace only).
    """
    # "utf-8-sig" decodes plain UTF-8 and also drops a leading byte-order mark,
    # which str.strip() would otherwise leave attached to the key.
    with open(file, "r", encoding="utf-8-sig") as f:
        key = f.read().strip()
    # Fail here rather than exporting an empty credential that only surfaces
    # later as an authentication error.
    if not key:
        raise ValueError(f"API key file {file!r} is empty")
    return key
    
# Export the key as the OPENAI_API_KEY environment variable for this process.
# "openai_key.txt" is a relative path, so it is resolved against the current
# working directory.
os.environ["OPENAI_API_KEY"] = read_api_key("openai_key.txt")

## Set up directory loader

In [2]:
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders.csv_loader import CSVLoader

# Alternative input kept for reference: the `city_files` folder.
# directory = '/Users/harshvardhan/Library/CloudStorage/Dropbox/Misc/Map of Tiny Perfect Things/map-of-tiny-perfect-things/master_data/city_files'
# NOTE: despite its name, `directory` currently holds the path of a single CSV
# file. The path is absolute and machine-specific; adjust it before running
# this notebook elsewhere.
directory = '/Users/harshvardhan/Library/CloudStorage/Dropbox/Misc/Map of Tiny Perfect Things/map-of-tiny-perfect-things/master_data/master_data.csv'

In [3]:
def load_docs(directory, csv=False):
    """Load documents from `directory` and return whatever the loader yields.

    When `csv` is truthy, `directory` is handed to `CSVLoader`; otherwise it
    is handed to `DirectoryLoader`. The result of the loader's `load()` call
    is returned unchanged.
    """
    loader_cls = CSVLoader if csv else DirectoryLoader
    return loader_cls(directory).load()

# `directory` holds the path of a CSV file here, hence csv=True.
documents = load_docs(directory, csv=True)
# Number of documents returned by the loader.
len(documents)

217

## Split docs

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_docs(documents, chunk_size=500, chunk_overlap=10):
    """Split `documents` with a `RecursiveCharacterTextSplitter`.

    `chunk_size` and `chunk_overlap` are forwarded to the splitter as keyword
    arguments; the value returned by its `split_documents` call is returned
    as-is.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)

# Split using the function's default chunk_size and chunk_overlap.
docs = split_docs(documents)
# Number of chunks produced by the split.
len(docs)

464

In [6]:
# Inspect the text of one chunk (index 3) to see what a split looks like.
docs[3].page_content

'google_maps_link: https://www.google.com/maps/search/?api=1&query=1797%20Shattuck%20Ave.%20Ste%20A,%20Berkeley,%20CA%2094709,%20United%20States&query_place_id=ChIJc4U-eZ9-hYAR5ih4d-bZpBI\nlat: 37.8751742\nlng: -122.2684613\nopening_hours: Monday: 9:00\u202fAM\u2009–\u200911:00\u202fPM\n Tuesday: 9:00\u202fAM\u2009–\u200911:00\u202fPM\n Wednesday: 9:00\u202fAM\u2009–\u200911:00\u202fPM\n Thursday: 9:00\u202fAM\u2009–\u200911:00\u202fPM\n Friday: 9:00\u202fAM\u2009–\u200911:00\u202fPM\n Saturday: 10:00\u202fAM\u2009–\u200911:00\u202fPM\n Sunday: 10:00\u202fAM\u2009–\u200910:00\u202fPM\ntype: cafe\ngoogle_place_id: ChIJc4U-eZ9-hYAR5ih4d-bZpBI'

Generally, one chunk contains the details of at most one place. Therefore, when querying the similarity search, we will retrieve several similar docs, not just one.

## Counting number of tokens
GPT-3.5 generally supports a context window of 4,096 tokens.

In [7]:
# https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
import tiktoken

In [8]:
# Load the "cl100k_base" tokenizer once; `encoding` is reused by later cells.
encoding = tiktoken.get_encoding("cl100k_base")
# Token count of the sample chunk inspected above.
len(encoding.encode(docs[3].page_content))

272

In [9]:
def get_num_tokens(string):
    """Return how many tokens `string` encodes to with the module-level `encoding`."""
    token_ids = encoding.encode(string)
    return len(token_ids)

## Creating and saving embeddings

In [10]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

In [11]:
# Location handed to Chroma as `persist_directory`.
persist_directory = "vectorstore"
# Presumably picks up OPENAI_API_KEY from the environment set earlier — confirm.
embeddings = OpenAIEmbeddings()
# NOTE(review): this indexes `documents` (the unsplit loader output), not the
# chunked `docs` returned by split_docs — confirm that this is intended.
vectorstore = Chroma.from_documents(documents, embeddings, persist_directory=persist_directory)
# NOTE(review): re-running this cell may add the same documents to an existing
# store again — verify against the Chroma persistence behaviour.
vectorstore.persist()

Using embedded DuckDB with persistence: data will be stored in: vectorstore


## Number of Similar Documents

In [12]:
def get_similiar_docs(query, k=1, score=False):
    """Query the module-level `vectorstore` for the `k` matches to `query`.

    With `score` truthy the lookup goes through
    `similarity_search_with_score`; otherwise it goes through
    `similarity_search`. The store's result is returned unchanged.
    """
    search = (
        vectorstore.similarity_search_with_score
        if score
        else vectorstore.similarity_search
    )
    return search(query, k=k)

In [13]:
# Example query; with the default k=1 only the single closest match is requested.
query = "Best coffee places in Knoxville"
similar_docs = get_similiar_docs(query)
similar_docs

[Document(page_content='name: Coffee and Chocolate\nlocation: Knoxville, TN\ncity: Knoxville\nstate: Tennessee\ncountry: United States of America\ncreators_rec: NA\nnotes: Known for heavenly chocolate cookies and the rare perk of being open until 10 pm, this cafe is a must-visit.\naddress: 327 Union Ave, Knoxville, TN 37902, United States\nrating: 4.7\nuser_ratings_total: 456\ngoogle_maps_link: https://www.google.com/maps/search/?api=1&query=327%20Union%20Ave,%20Knoxville,%20TN%2037902,%20United%20States&query_place_id=ChIJ71tZUcQXXIgRlNXT-k3IwC0\nlat: 35.9649171\nlng: -83.9189009\nopening_hours: Monday: 7:00\u202fAM\u2009–\u200910:00\u202fPM\n Tuesday: 7:00\u202fAM\u2009–\u200910:00\u202fPM\n Wednesday: 7:00\u202fAM\u2009–\u200910:00\u202fPM\n Thursday: 7:00\u202fAM\u2009–\u200910:00\u202fPM\n Friday: 7:00\u202fAM\u2009–\u200910:00\u202fPM\n Saturday: 7:00\u202fAM\u2009–\u200910:00\u202fPM\n Sunday: 7:00\u202fAM\u2009–\u200910:00\u202fPM\ntype: cafe\ngoogle_place_id: ChIJ71tZUcQXXIgRl

In [14]:
# Token count of the top match, for comparison with the model's token limit.
get_num_tokens(similar_docs[0].page_content)

4134