<a href="https://colab.research.google.com/github/jsmackie/TalkingToWALS/blob/mainline/TalkingToWALS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
!pip install -qU \
    cohere \
    langchain \
    tiktoken \
    pinecone-client \
    langchain-openai \
    langchain-pinecone

In [42]:
import os
import json
from uuid import uuid4

import tiktoken

from google.colab import userdata, drive

from pinecone import Pinecone, PodSpec, PineconeApiException

from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter, HTMLHeaderTextSplitter
from langchain_openai import ChatOpenAI as LangChainChatOpenAI
from langchain_openai import OpenAIEmbeddings as LangChainOpenAIEmbeddings
from langchain_pinecone import Pinecone as LangChainPinecone



In [14]:
def tiktoken_len(text):
    tokens = tokenizer.encode(text, disallowed_special=())
    return len(tokens)

In [15]:
def load_chapter_metadata(path):
  with open(os.path.join(path)) as f:
    headers = f.readline().strip().split(',')
    lines = [line.strip().split(',') for line in f]
  chapter_metadata = dict()
  for i,line in enumerate(lines):
    chapter_metadata[i] = {'chapter number': i+1,
                          'title': line[headers.index('Title')],
                          'authors': line[headers.index('Contributors')],
                          'linguistic subfield': line[headers.index('Area')]}
  return chapter_metadata

In [16]:
def text_splitter_create_and_upsert_vectors(index,
                                            path,
                                            chapter_metadata,
                                            text_splitter,
                                            embeddings,
                                            batch_limit=100,
                                            subdirectory='chapters'):
  batch_limit = batch_limit
  texts = []
  metadatas = []
  for i,file in enumerate(os.listdir(os.path.join(path, subdirectory))):
    metadata = chapter_metadata[i]
    with open(os.path.join(path, subdirectory, file), mode='r') as f:
      text = ''.join([line for line in f])
    record_texts = text_splitter.split_text(text)
    record_metadatas = [{"chunk": j, "text": text, **metadata}
                          for j, text in enumerate(record_texts)]
    texts.extend(record_texts)
    metadatas.extend(record_metadatas)
    if len(texts) >= batch_limit:
      ids = [str(uuid4()) for _ in range(len(texts))]
      embeds = embeddings.embed_documents(texts)
      index.upsert(vectors=zip(ids, embeds, metadatas))
      texts = []
      metadatas = []


In [17]:
def html_splitter_create_and_upsert_vectors(index,
                                            path,
                                            chapter_metadata,
                                            html_splitter,
                                            embeddings):
  batch_limit=100
  texts = list()
  metadatas = list()
  for i,file in enumerate(os.listdir(path)):
    #print(file)
    if not file.endswith('html'):
      continue
    html_splits = html_splitter.split_text_from_file(os.path.join(path, file))
    #Note: this might be too large, and maybe should be split again with RecursiveTextSplitter
    ids = [str(uuid4()) for _ in range(len(html_splits))]
    embeds = embeddings.embed_documents([split.page_content for split in html_splits])
    #metadatas = [split.metadata for split in html_splits]
    metadatas = {key:value for key,value in chapter_metadata[i].items()}
    for split in html_splits:
      metadatas.update(split.metadata)
    record_metadatas = [{"chunk": j, "text": split.page_content, **metadatas}
                          for j, split in enumerate(html_splits)]
    index.upsert(vectors=zip(ids, embeds, record_metadatas))

In [81]:
def json_splitter_create_and_upsert_vectors(index, path, chapter_metadata, embeddings):
  texts = list()
  for i,file in enumerate(sorted(os.listdir(path))):
    if not file.endswith('.json'):
      continue
    print(i, file)
    metadatas = {key:value for key,value in chapter_metadata[i].items()}
    text_preamble = ''.join(['Hints: ',
                            ','.join([f'{k}={v}' for (k,v) in metadatas.items()]),
                            '\n'])
    with open(os.path.join(path, file), encoding='utf-8', mode='r') as f:
      json_file = json.load(f)
    for blob in json_file:
      chapter_metadata[i]['section'] = list()
      try:
        chapter_metadata[i]['section'].append(blob['section'].strip())
      except KeyError:
        chapter_metadata[i]['section'].append('1. Introduction')
      for p in blob['paragraphs']:
        texts.append('\n'.join([text_preamble, p['text']]))
      #texts.append(blob['paragraphs'][0]['text'])
    embeds = embeddings.embed_documents(texts)

    ids = [str(uuid4()) for _ in range(len(texts))]
    records = [{"chunk": j, "text": text, **metadatas}
                          for j, text in enumerate(texts)]
    try:
      index.upsert(vectors=zip(ids, embeds, records))
    except PineconeApiException:
      print('Pinecone exception occured. File is bigger than 2MB! Igoring.')

In [82]:
text_embedding_model = 'text-embedding-ada-002'
embeddings = LangChainOpenAIEmbeddings(
    model = text_embedding_model,
    openai_api_key=userdata.get('OPENAI_KEY'))

split_type = 'json'

if split_type == 'text':
  tokenizer = tiktoken.get_encoding('p50k_base')
  splitter = RecursiveCharacterTextSplitter(
      chunk_size=400,
      chunk_overlap=20,
      length_function=tiktoken_len,
      separators=["\n\n", "\n", " ", ""]
      )
elif split_type == 'html':
  headers = [("h2", "topic"), ("h5", "introduction")]#, ("h6", "example data")]
  splitter = HTMLHeaderTextSplitter(headers)
elif split_type == 'json':
  pass #no extra formatting required


In [83]:
make_new_index = True

pc = Pinecone(api_key=userdata.get('PINECONE_TOKEN'))
index_name = 'starter-index'

if make_new_index:
  pc.delete_index(index_name)
  pc.create_index(name=index_name, dimension=1536, metric="cosine", spec=PodSpec(environment="gcp-starter"))
  index = pc.Index(index_name)
  chapter_metadata = load_chapter_metadata('/content/drive/MyDrive/WALS/chapter_metadata.csv')
  if split_type == 'text':
    text_splitter_create_and_upsert_vectors(index, '/content/drive/MyDrive/WALS/', chapter_metadata, splitter, embeddings, subdirectory='chapters_html')
  elif split_type == 'html':
    html_splitter_create_and_upsert_vectors(index, '/content/drive/MyDrive/WALS/chapters_html', chapter_metadata, splitter, embeddings)
  elif split_type == 'json':
    json_splitter_create_and_upsert_vectors(index, '/content/drive/MyDrive/WALS/chapters_json', chapter_metadata, embeddings)
else:
  index = pc.Index(index_name)
#See also: https://docs.pinecone.io/docs/manage-indexes#create-a-pod-based-index


1 chapter_1.json
2 chapter_10.json
3 chapter_100.json
4 chapter_101.json
5 chapter_102.json
6 chapter_103.json
7 chapter_104.json
8 chapter_105.json
9 chapter_106.json
10 chapter_107.json
11 chapter_108.json
12 chapter_109.json
13 chapter_11.json
14 chapter_110.json
15 chapter_111.json
16 chapter_112.json
17 chapter_113.json
18 chapter_114.json
19 chapter_115.json
20 chapter_116.json
21 chapter_117.json
22 chapter_118.json
23 chapter_119.json
24 chapter_12.json
25 chapter_120.json
26 chapter_121.json
27 chapter_122.json
28 chapter_123.json
29 chapter_124.json
30 chapter_125.json
31 chapter_126.json
32 chapter_128.json
Pinecone exception occured. File is bigger than 2MB! Igoring.
33 chapter_129.json
Pinecone exception occured. File is bigger than 2MB! Igoring.
34 chapter_13.json
Pinecone exception occured. File is bigger than 2MB! Igoring.
35 chapter_130.json
Pinecone exception occured. File is bigger than 2MB! Igoring.
36 chapter_131.json
Pinecone exception occured. File is bigger than

JSONDecodeError: Expecting value: line 1 column 4138 (char 4137)

In [80]:
with open('/content/drive/MyDrive/WALS/chapter_metadata.csv', encoding='utf-8') as file:
    file_content = file.read()
embeds = embeddings.embed_documents([file_content])
vector = {
      "id":str(uuid4()),
      "values": embeds[0][0],
      "metadata": {"topic": "table of contents"}
    }
#index.upsert(vector)

In [55]:
vectorstore = LangChainPinecone(index, embeddings, text_key='text')

In [56]:
llm = LangChainChatOpenAI(
    openai_api_key=userdata.get('OPENAI_KEY'),
    model_name='gpt-3.5-turbo',
    temperature=0.8
)

In [57]:
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever()
)

In [60]:
query = 'Who wrote chapter 28?'
response = qa.invoke(query)
print(response['result'])

Chapter 28 titled "Reduplication" was written by Carl Rubino.
