<a href="https://colab.research.google.com/github/jsmackie/TalkingToWALS/blob/mainline/HidingBehindWALS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -qU \
    cohere \
    langchain \
    tiktoken \
    pinecone-client \
    langchain-openai \
    langchain-pinecone

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.2/52.2 kB[0m [31m983.8 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m816.1/816.1 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.4/207.4 kB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m70.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m246.4/246.4 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━

In [2]:
import os
import json
from uuid import uuid4

import tiktoken

from google.colab import userdata, drive

from pinecone import Pinecone, PodSpec, PineconeApiException

from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter, HTMLHeaderTextSplitter
from langchain_openai import ChatOpenAI as LangChainChatOpenAI
from langchain_openai import OpenAIEmbeddings as LangChainOpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore as LangChainPinecone



In [3]:
def tiktoken_len(text):
    tokens = tokenizer.encode(text, disallowed_special=())
    return len(tokens)

In [4]:
def load_chapter_metadata(path):
  with open(os.path.join(path)) as f:
    headers = f.readline().strip().split(',')
    lines = [line.strip().split(',') for line in f]
  chapter_metadata = dict()
  for i,line in enumerate(lines):
    chapter_metadata[i] = {'chapter number': i+1,
                          'title': line[headers.index('Title')],
                          'authors': line[headers.index('Contributors')],
                          'linguistic subfield': line[headers.index('Area')]}
  return chapter_metadata

In [5]:
def text_splitter_create_and_upsert_vectors(index,
                                            path,
                                            chapter_metadata,
                                            text_splitter,
                                            embeddings,
                                            batch_limit=100,
                                            subdirectory='chapters'):
  batch_limit = batch_limit
  texts = []
  metadatas = []
  for i,file in enumerate(os.listdir(os.path.join(path, subdirectory))):
    metadata = chapter_metadata[i]
    with open(os.path.join(path, subdirectory, file), mode='r') as f:
      text = ''.join([line for line in f])
    record_texts = text_splitter.split_text(text)
    record_metadatas = [{"chunk": j, "text": text, **metadata}
                          for j, text in enumerate(record_texts)]
    texts.extend(record_texts)
    metadatas.extend(record_metadatas)
    if len(texts) >= batch_limit:
      ids = [str(uuid4()) for _ in range(len(texts))]
      embeds = embeddings.embed_documents(texts)
      index.upsert(vectors=zip(ids, embeds, metadatas))
      texts = []
      metadatas = []


In [6]:
def html_splitter_create_and_upsert_vectors(index,
                                            path,
                                            chapter_metadata,
                                            html_splitter,
                                            embeddings):
  batch_limit=100
  texts = list()
  metadatas = list()
  for i,file in enumerate(os.listdir(path)):
    #print(file)
    if not file.endswith('html'):
      continue
    html_splits = html_splitter.split_text_from_file(os.path.join(path, file))
    #Note: this might be too large, and maybe should be split again with RecursiveTextSplitter?
    ids = [str(uuid4()) for _ in range(len(html_splits))]
    embeds = embeddings.embed_documents([split.page_content for split in html_splits])
    #metadatas = [split.metadata for split in html_splits]
    metadatas = {key:value for key,value in chapter_metadata[i].items()}
    for split in html_splits:
      metadatas.update(split.metadata)
    record_metadatas = [{"chunk": j, "text": split.page_content, **metadatas}
                          for j, split in enumerate(html_splits)]
    index.upsert(vectors=zip(ids, embeds, record_metadatas))

In [17]:
def json_splitter_create_and_upsert_vectors(index, path, chapter_metadata, embeddings):
  texts = list()
  for i,file in enumerate(sorted(os.listdir(path))):
    if not file.endswith('.json'):
      continue
    print(i, file)
    metadatas = {key:value for key,value in chapter_metadata[i].items()}
    text_preamble = ''.join(['Hints: ',
                            ','.join([f'{k}={v}' for (k,v) in metadatas.items()]),
                            '\n'])
    with open(os.path.join(path, file), encoding='utf-8', mode='r') as f:
      json_file = json.load(f)
    for blob in json_file:
      chapter_metadata[i]['section'] = list()
      try:
        chapter_metadata[i]['section'].append(blob['section'].strip())
      except KeyError:
        chapter_metadata[i]['section'].append('1. Introduction')
      for p in blob['paragraphs']:
        texts.append('\n'.join([text_preamble, p['text']]))
    embeds = embeddings.embed_documents(texts)

    ids = [str(uuid4()) for _ in range(len(texts))]
    records = [{"chunk": j, "text": text, **metadatas}
                          for j, text in enumerate(texts)]
    try:
      index.upsert(vectors=zip(ids, embeds, records))
    except PineconeApiException:
      print('Pinecone exception occured. File is bigger than 2MB! Ignoring.')

In [8]:
def upsert_wals_data(index, embeddings):
  #Get table of contents and other general metadata about WALS
  with open('/content/drive/MyDrive/WALS/chapter_metadata.csv', encoding='utf-8') as file:
    file_content = file.read()
  embeds = embeddings.embed_documents([file_content])
  vector = [{
      "id":str(uuid4()),
      "values": embeds[0],
      "metadata": {"topic": "table of contents"}
    }]
  index.upsert(vector)

In [14]:
def upsert_language_data(index, embeddings):
  with open('/content/drive/MyDrive/WALS/languages.csv') as f:
    headers = f.readline().strip().split(',')
    rows = [row.strip().split(',') for row in f]
  blobs = list()
  for row in rows:
    blobs.append({headers[j]:value for (j,value) in enumerate(row)})
  #There are ~3000 languages, which makes too many individual records
  #try grouping them by family

  families = list(set([language['Family'] for language in blobs]))
  for family in families:
    full_text = list()

    family_matches = [language for language in blobs if language['Family'] == family]
    genera = list(set([language['Genus'] for language in family_matches]))
    full_text.append(f'Language Family: {family} has {len(genera)} subdivisions/groups/genera/etc.\n')

    for genus in genera:
      full_text.append(f'- Genus: {genus} includes these languages: \n-- ')
      genus_matches = [language for language in family_matches if language['Genus'] == genus]

      language_text = list()
      for language in genus_matches:
        language_text.append('{} ISO code {} spoken in {}'.format(language['Name'], language['ISO639'], language['Macroarea'] ))
      language_text = '\n-- '.join(language_text)
      full_text.append(language_text)

    full_text = ''.join(full_text)
    #print(full_text)
    embeds = embeddings.embed_documents([full_text])
    vector = [{
      "id":str(uuid4()),
      "values": embeds[0],
      "metadata": {"topic": f"Data about the {family} family"}
      "text": full_text,
    }]

    index.upsert(vector)


In [10]:
def get_splitter(split_type='html'):
  #WALS documents are available in text, html, and json formats
  #Chunking plain text seems to give the worst results, likely becaues it's blind to context right now
  #html gives better results, because WALS is already structured by headings
  #json gives best results, because each text blob can be augemented with some metadata
  #this allows every paragraph in a section to carry information about the section name and topic

  if split_type == 'text':
    tokenizer = tiktoken.get_encoding('p50k_base')
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=400,
        chunk_overlap=20,
        length_function=tiktoken_len,
        separators=["\n\n", "\n", " ", ""]
        )
  elif split_type == 'html':
    headers = [("h2", "topic"), ("h5", "introduction")]
    splitter = HTMLHeaderTextSplitter(headers)
  elif split_type == 'json':
    return None

  return splitter


In [18]:
make_new_index = False
upsert_metadata = False
upsert_chapter_data = True

index_name = 'starter-index'
split_type = 'json'

pc = Pinecone(api_key=userdata.get('PINECONE_TOKEN'))
embeddings = LangChainOpenAIEmbeddings(
    model =  'text-embedding-ada-002',
    openai_api_key=userdata.get('OPENAI_KEY'))

if make_new_index:
  #https://docs.pinecone.io/docs/manage-indexes#create-a-pod-based-index
  pc.delete_index(index_name)
  pc.create_index(name=index_name, dimension=1536, metric="cosine", spec=PodSpec(environment="gcp-starter"))
index = pc.Index(index_name)

if upsert_metadata:
  upsert_wals_data(index, embeddings)
  upsert_language_data(index, embeddings)


if upsert_chapter_data:
  chapter_metadata = load_chapter_metadata('/content/drive/MyDrive/WALS/chapter_metadata.csv')
  splitter = get_splitter(split_type)
  if split_type == 'text':
    text_splitter_create_and_upsert_vectors(index, '/content/drive/MyDrive/WALS/', chapter_metadata, splitter, embeddings, subdirectory='chapters_html')
  elif split_type == 'html':
    html_splitter_create_and_upsert_vectors(index, '/content/drive/MyDrive/WALS/chapters_html', chapter_metadata, splitter, embeddings)
  elif split_type == 'json':
    json_splitter_create_and_upsert_vectors(index, '/content/drive/MyDrive/WALS/chapters_json', chapter_metadata, embeddings)

1 chapter_1.json
2 chapter_10.json
3 chapter_100.json
4 chapter_101.json
5 chapter_102.json
6 chapter_103.json
7 chapter_104.json
8 chapter_105.json
9 chapter_106.json
10 chapter_107.json
11 chapter_108.json
12 chapter_109.json
13 chapter_11.json
14 chapter_110.json
15 chapter_111.json
16 chapter_112.json
17 chapter_113.json
18 chapter_114.json
19 chapter_115.json
20 chapter_116.json
21 chapter_117.json
22 chapter_118.json
23 chapter_119.json
24 chapter_12.json
25 chapter_120.json
26 chapter_121.json
27 chapter_122.json
28 chapter_123.json
29 chapter_124.json
30 chapter_125.json
31 chapter_126.json
32 chapter_128.json
Pinecone exception occured. File is bigger than 2MB! Igoring.
33 chapter_129.json
Pinecone exception occured. File is bigger than 2MB! Igoring.
34 chapter_13.json
Pinecone exception occured. File is bigger than 2MB! Igoring.
35 chapter_130.json
Pinecone exception occured. File is bigger than 2MB! Igoring.
36 chapter_131.json
Pinecone exception occured. File is bigger than

In [19]:
vectorstore = LangChainPinecone(index, embeddings, text_key='text')

In [20]:
llm = LangChainChatOpenAI(
    openai_api_key=userdata.get('OPENAI_KEY'),
    model_name='gpt-3.5-turbo',
    temperature=0.9
)

In [24]:
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever()
)

In [56]:
preamble = """You are an expert on the World Atlas of Language Structures, also called WALS.
Assume that any questions you are asked are referring to WALS, so if someone says Chatper 12
they mean Chapter 12 in WALS, if they ask about verbal morphology they really mean what does
WALS say about verbal morphology. Act like a helpful librarian who knows about WALS."""
query = 'Help me find information about velar nasals'
response = qa.invoke('\n'.join([preamble, query]))
print(response['result'])

Chapter 10 of the World Atlas of Language Structures (WALS) is titled "The Velar Nasal" authored by Gregory D.S. Anderson. This chapter focuses on the linguistic subfield of phonology. If you are looking for information specifically about velar nasals in language structures, Chapter 10 of WALS would be a good place to start.
