<a href="https://colab.research.google.com/github/jsmackie/TalkingToWALS/blob/mainline/TalkingToWALS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -qU \
    langchain \
    tiktoken \
    pinecone-client \
    langchain-openai \
    langchain-pinecone

In [None]:
import csv
import os
import re
from uuid import uuid4

import tiktoken

from google.colab import userdata, drive

from pinecone import Pinecone, PodSpec

from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter, HTMLHeaderTextSplitter
from langchain_openai import ChatOpenAI as LangChainChatOpenAI
from langchain_openai import OpenAIEmbeddings as LangChainOpenAIEmbeddings
from langchain_pinecone import Pinecone as LangChainPinecone



In [1]:
def tiktoken_len(text):
    """Return the length of ``text`` measured in tokens.

    The text is encoded with the module-level ``tokenizer`` (which must be
    defined before this function is called), passing ``disallowed_special=()``
    to the encoder; the number of resulting tokens is returned.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

In [None]:
def load_chapter_metadata(path):
  """Load per-chapter metadata from a CSV file.

  The first row is the header and must contain the columns ``Title``,
  ``Contributors`` and ``Area``. Rows are parsed with the ``csv`` module, so
  quoted fields that contain commas (e.g. several contributors in one cell)
  stay intact. Blank lines are ignored.

  Args:
    path: Path to the CSV file.

  Returns:
    A dict mapping the 0-based position of each data row to a dict with the
    keys ``'chapter number'`` (row position + 1), ``'title'``, ``'authors'``
    and ``'linguistic subfield'``. Empty when the file has no data rows.

  Raises:
    ValueError: if there is at least one data row and a required column is
      missing from the header.
    IndexError: if a data row has fewer fields than the header requires.
  """
  # newline='' lets the csv module handle line endings inside quoted fields.
  with open(path, newline='') as f:
    reader = csv.reader(f)
    headers = [header.strip() for header in next(reader, [])]
    rows = [row for row in reader if row]  # csv yields [] for blank lines

  if not rows:
    return dict()

  # Resolve the column positions once instead of once per row.
  title_col = headers.index('Title')
  authors_col = headers.index('Contributors')
  area_col = headers.index('Area')

  chapter_metadata = dict()
  for i, row in enumerate(rows):
    chapter_metadata[i] = {'chapter number': i+1,
                           'title': row[title_col].strip(),
                           'authors': row[authors_col].strip(),
                           'linguistic subfield': row[area_col].strip()}
  return chapter_metadata

In [None]:
def text_splitter_create_and_upsert_vectors(index,
                                            path,
                                            chapter_metadata,
                                            text_splitter,
                                            embeddings,
                                            batch_limit=100,
                                            subdirectory='chapters'):
  """Split every chapter file into chunks, embed them and upsert to ``index``.

  Each file in ``os.path.join(path, subdirectory)`` is read as text and split
  with ``text_splitter.split_text``. Every chunk becomes one vector whose
  metadata holds the chunk position (``"chunk"``), the chunk text (``"text"``)
  and the metadata of the chapter the file belongs to.

  The chapter is identified by the last number in the file name (without
  extension): ``chapter_12.html`` uses ``chapter_metadata[11]``, i.e. the
  entry whose ``'chapter number'`` is 12 when the dict comes from
  ``load_chapter_metadata``. Directory listing order is arbitrary, so the
  position in the listing is only used as a fallback for names without a
  number.

  Args:
    index: Object with an ``upsert(vectors=...)`` method (the Pinecone index).
    path: Base directory of the data.
    chapter_metadata: Dict of per-chapter metadata dicts keyed by 0-based
      chapter position.
    text_splitter: Object with a ``split_text(str) -> list[str]`` method.
    embeddings: Object with an ``embed_documents(list[str])`` method.
    batch_limit: Chunks are embedded and upserted once at least this many
      have accumulated; whatever remains after the last file is flushed too.
    subdirectory: Directory under ``path`` that contains the chapter files.

  Raises:
    KeyError: if no metadata entry exists for a file's chapter.
  """
  def upsert_batch(batch_texts, batch_metadatas):
    # One random id per chunk; embeddings come back in the same order.
    ids = [str(uuid4()) for _ in batch_texts]
    embeds = embeddings.embed_documents(batch_texts)
    index.upsert(vectors=list(zip(ids, embeds, batch_metadatas)))

  chapter_dir = os.path.join(path, subdirectory)
  texts = []
  metadatas = []
  for position, file in enumerate(os.listdir(chapter_dir)):
    numbers = re.findall(r'\d+', os.path.splitext(file)[0])
    chapter_key = int(numbers[-1]) - 1 if numbers else position
    metadata = chapter_metadata[chapter_key]
    with open(os.path.join(chapter_dir, file), mode='r') as f:
      text = f.read()
    record_texts = text_splitter.split_text(text)
    record_metadatas = [{"chunk": j, "text": chunk, **metadata}
                          for j, chunk in enumerate(record_texts)]
    texts.extend(record_texts)
    metadatas.extend(record_metadatas)
    if len(texts) >= batch_limit:
      upsert_batch(texts, metadatas)
      texts = []
      metadatas = []
  # Flush the final partial batch; without this the last chunks are lost.
  if texts:
    upsert_batch(texts, metadatas)


In [None]:
def html_splitter_create_and_upsert_vectors(index,
                                            path,
                                            chapter_metadata,
                                            html_splitter,
                                            embeddings,
                                            text_splitter=None):
  """Split every HTML chapter by header, embed the sections and upsert them.

  Every entry of ``path`` whose name ends in ``html`` is split with
  ``html_splitter.split_text_from_file``. Each resulting section becomes one
  vector whose metadata holds the section position (``"chunk"``), the section
  text (``"text"``), the metadata of the chapter the file belongs to and the
  section's *own* header metadata (which wins on key clashes).

  The chapter is identified by the last number in the file name (without
  extension): ``chapter_12.html`` uses ``chapter_metadata[11]``, i.e. the
  entry whose ``'chapter number'`` is 12 when the dict comes from
  ``load_chapter_metadata``. Directory listing order is arbitrary and may
  contain non-HTML entries, so the listing position is only a fallback for
  names without a number.

  Args:
    index: Object with an ``upsert(vectors=...)`` method (the Pinecone index).
    path: Directory that contains the HTML chapter files.
    chapter_metadata: Dict of per-chapter metadata dicts keyed by 0-based
      chapter position.
    html_splitter: Object with a ``split_text_from_file(path)`` method that
      returns items exposing ``page_content`` and ``metadata``.
    embeddings: Object with an ``embed_documents(list[str])`` method.
    text_splitter: Optional object with a ``split_documents(list)`` method.
      When given, the header sections are split again with it, which keeps
      individual chunks small; by default sections are stored whole.

  Raises:
    KeyError: if no metadata entry exists for a file's chapter.
  """
  for position, file in enumerate(os.listdir(path)):
    print(file)
    if not file.endswith('html'):
      continue
    numbers = re.findall(r'\d+', os.path.splitext(file)[0])
    chapter_key = int(numbers[-1]) - 1 if numbers else position
    chapter = chapter_metadata[chapter_key]
    html_splits = html_splitter.split_text_from_file(os.path.join(path, file))
    if text_splitter is not None:
      # Header sections can be very long; optionally cut them down further.
      html_splits = text_splitter.split_documents(html_splits)
    if not html_splits:
      # Nothing to embed or upsert for this file.
      continue
    ids = [str(uuid4()) for _ in range(len(html_splits))]
    embeds = embeddings.embed_documents([split.page_content for split in html_splits])
    # Each record keeps the header metadata of its own section only.
    record_metadatas = [{"chunk": j, "text": split.page_content, **chapter, **split.metadata}
                          for j, split in enumerate(html_splits)]
    index.upsert(vectors=list(zip(ids, embeds, record_metadatas)))

In [None]:
# --- Embedding / splitting configuration ---
text_embedding_model = 'text-embedding-ada-002'
# Count tokens with the encoding that belongs to the embedding model, so the
# token-based chunk_size below is measured the way the embedding API counts.
tokenizer = tiktoken.encoding_for_model(text_embedding_model)
# The OpenAI key is read from the Colab secret named 'OPENAI_KEY'.
embeddings = LangChainOpenAIEmbeddings(
    model = text_embedding_model,
    openai_api_key=userdata.get('OPENAI_KEY')
)
# Which splitter to build: 'text' (token-sized chunks) or 'html' (by header).
split_type = 'html'

if split_type == 'text':
  # Chunks of at most 400 tokens (as measured by tiktoken_len) with a
  # 20-token overlap, breaking on the coarsest separator that fits.
  splitter = RecursiveCharacterTextSplitter(
      chunk_size=400,
      chunk_overlap=20,
      length_function=tiktoken_len,
      separators=["\n\n", "\n", " ", ""]
      )
elif split_type == 'html':
  # (HTML tag, metadata key) pairs handed to the header splitter.
  headers = [("h2", "topic"), ("h5", "introduction")]#, ("h6", "example data")]
  splitter = HTMLHeaderTextSplitter(headers)
else:
  # Fail here instead of with a NameError on `splitter` in a later cell.
  raise ValueError(f"Unknown split_type {split_type!r}; expected 'text' or 'html'")


In [None]:
# Testing Cell
# headers = [("h2", "topic"),
#            ("table", "example data")]
# html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers)
# html_splits = splitter.split_text_from_file('/content/drive/MyDrive/WALS/chapters_html/chapter_135.html')


In [None]:
# Set to True to wipe the Pinecone index and rebuild it from the chapter
# files on Google Drive; leave False to reuse the existing index.
make_new_index = False

# The Pinecone key is read from the Colab secret named 'PINECONE_TOKEN'.
pc = Pinecone(api_key=userdata.get('PINECONE_TOKEN'))
index_name = 'starter-index'

if make_new_index:
  # Make the source files available and read the metadata *before* touching
  # the index, so a missing Drive mount or CSV cannot leave the index deleted.
  drive.mount('/content/drive')
  chapter_metadata = load_chapter_metadata('/content/drive/MyDrive/WALS/chapter_metadata.csv')
  # Only delete when the index exists; on a first run there is nothing to delete.
  if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)
  # NOTE(review): dimension must equal the embedding model's vector size
  # (1536 is assumed here for text-embedding-ada-002) — confirm if the model changes.
  pc.create_index(name=index_name, dimension=1536, metric="cosine", spec=PodSpec(environment="gcp-starter"))

index = pc.Index(index_name)

if make_new_index:
  if split_type == 'text':
    text_splitter_create_and_upsert_vectors(index, '/content/drive/MyDrive/WALS/', chapter_metadata, splitter, embeddings, subdirectory='chapters_html')
  elif split_type == 'html':
    html_splitter_create_and_upsert_vectors(index, '/content/drive/MyDrive/WALS/chapters_html', chapter_metadata, splitter, embeddings)
#See also: https://docs.pinecone.io/docs/manage-indexes#create-a-pod-based-index


chapter_135.html
chapter_134.html
chapter_133.html
chapter_131.html
chapter_132.html
chapter_127.html
chapter_128.html
chapter_130.html
chapter_129.html
chapter_13.html
chapter_123.html
chapter_126.html
chapter_124.html
chapter_121.html
chapter_122.html
chapter_12.html
chapter_120.html
chapter_118.html
chapter_119.html
chapter_111.html
chapter_108.html
chapter_11.html
chapter_110.html
chapter_117.html
chapter_115.html
chapter_112.html
chapter_113.html
chapter_114.html
chapter_116.html
chapter_109.html
chapter_106.html
chapter_104.html
chapter_105.html
chapter_103.html
chapter_102.html
chapter_10.html
chapter_1.html
chapter_101.html
chapter_107.html
chapter_100.html
chapter_136.html
chapter_22.html
chapter_2.html
chapter_21.html
chapter_17.html
chapter_19.html
chapter_27.html
chapter_26.html
chapter_24.html
chapter_25.html
chapter_20.html
chapter_23.html
chapter_15.html
chapter_18.html
chapter_16.html
chapter_144.html
chapter_143.html
chapter_142.html
chapter_141.html
chapter_140.html
c

In [None]:
# Wrap the Pinecone index as a LangChain vector store. text_key='text' names
# the metadata field under which the upsert helpers store each chunk's text.
vectorstore = LangChainPinecone(index, embeddings, text_key='text')

In [None]:
# Chat model that writes the answers; the OpenAI key is read from the Colab
# secret named 'OPENAI_KEY'.
# NOTE(review): temperature=0.8 is fairly high for factual question answering
# over retrieved text — confirm this is intended.
llm = LangChainChatOpenAI(
    openai_api_key=userdata.get('OPENAI_KEY'),
    model_name='gpt-3.5-turbo',
    temperature=0.8
)

In [None]:
# Question-answering chain: `llm` answers using documents fetched by the
# vector store's retriever (created here with its default settings).
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever()
)

In [None]:
# Ask one question through the chain and print the generated answer, which
# the chain returns under the 'result' key.
# NOTE(review): the recorded run of this cell failed with
# context_length_exceeded (7159 tokens sent, 4097-token limit). The retrieved
# chunks are presumably too large for the model's context window — consider
# smaller chunks, fewer retrieved documents, or a larger-context model.
query = 'Which chapters did Ian Maddieson write?'
response = qa.invoke(query)
print(response['result'])

BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 4097 tokens. However, your messages resulted in 7159 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}