<a href="https://colab.research.google.com/github/jsmackie/TalkingToWALS/blob/mainline/TalkingToWALS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install -qU \
    langchain \
    tiktoken \
    pinecone-client \
    langchain-openai \
    langchain-pinecone
    #langchain_pinecone

In [13]:
import csv
import os
from uuid import uuid4

import tiktoken

from google.colab import userdata, drive

from pinecone import Pinecone, PodSpec

from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI as LangChainChatOpenAI
from langchain_openai import OpenAIEmbeddings as LangChainOpenAIEmbeddings
from langchain_pinecone import Pinecone as LangChainPinecone



In [20]:
def load_chapter_metadata(path):
  """Load per-chapter metadata from a CSV file.

  The first row is the header and must contain the columns ``Name``,
  ``Contributor``, ``Citation`` and ``Topic`` (extra columns are ignored).
  The file is parsed with the ``csv`` module, so quoted fields that contain
  commas (e.g. a citation such as "Maddieson, Ian. 2013. ...") stay in one
  piece instead of shifting the following columns.

  Args:
    path: Path to the metadata CSV file.

  Returns:
    A dict mapping the 0-based position of each data row to a dict with the
    keys 'title', 'authors', 'source' and 'topic'. An empty file yields {}.

  Raises:
    ValueError: If the header lacks one of the required columns.
    IndexError: If a data row has fewer fields than the header requires.
  """
  # Output key -> CSV column it is read from.
  column_for_key = {'title': 'Name',
                    'authors': 'Contributor',
                    'source': 'Citation',
                    'topic': 'Topic'}
  # newline='' lets the csv module deal with line endings itself, including
  # newlines embedded in quoted fields.
  with open(path, newline='') as f:
    reader = csv.reader(f)
    header_row = next(reader, None)
    if header_row is None:
      return {}
    headers = [name.strip() for name in header_row]
    missing = [col for col in column_for_key.values() if col not in headers]
    if missing:
      raise ValueError(f"{path}: missing required column(s): {', '.join(missing)}")
    # Resolve the column positions once rather than once per row.
    positions = {key: headers.index(col) for key, col in column_for_key.items()}
    # Blank lines (commonly a trailing one) carry no record; skip them.
    rows = [row for row in reader if any(field.strip() for field in row)]
  return {i: {key: row[pos] for key, pos in positions.items()}
          for i, row in enumerate(rows)}

In [27]:
def create_and_upsert_vectors(path,
                              chapter_metadata,
                              text_splitter,
                              embeddings,
                              batch_limit=100,
                              index=None):
  """Split every chapter file into chunks, embed them and upsert them.

  Each file in ``<path>/chapters`` is read, split with ``text_splitter``,
  and every chunk is stored as ``(uuid4 id, embedding, metadata)`` where the
  metadata is the chapter's metadata plus ``chunk`` (position of the chunk
  within its chapter) and ``text`` (the chunk itself).

  Args:
    path: Directory that contains the ``chapters`` sub-directory.
    chapter_metadata: Mapping from the position of a file in the directory
      listing to that chapter's metadata dict.
    text_splitter: Object with a ``split_text(text)`` method returning a list
      of strings.
    embeddings: Object with an ``embed_documents(texts)`` method returning one
      vector per text.
    batch_limit: Chunks are buffered until at least this many are pending, and
      no single embed/upsert request carries more than this many. Values
      below 1 are treated as 1 for the request size.
    index: Object with an ``upsert(vectors=...)`` method. When omitted, the
      module-level name ``index`` is used, which is how this function
      originally found its target.

  Raises:
    NameError: If ``index`` is not passed and no module-level ``index`` exists.
  """
  if index is None:
    # Backward compatibility: older callers rely on a notebook-level `index`.
    index = globals().get('index')
    if index is None:
      raise NameError("name 'index' is not defined: pass index=... or create "
                      "the global `index` before calling create_and_upsert_vectors")

  request_size = max(1, batch_limit)

  def _flush(pending_texts, pending_metadatas):
    """Embed and upsert the pending chunks, at most request_size per request."""
    for start in range(0, len(pending_texts), request_size):
      chunk_texts = pending_texts[start:start + request_size]
      chunk_metadatas = pending_metadatas[start:start + request_size]
      ids = [str(uuid4()) for _ in chunk_texts]
      embeds = embeddings.embed_documents(chunk_texts)
      index.upsert(vectors=list(zip(ids, embeds, chunk_metadatas)))

  chapters_dir = os.path.join(path, 'chapters')
  texts = []
  metadatas = []
  # NOTE(review): os.listdir returns entries in arbitrary order, yet metadata
  # is paired with files purely by position. Confirm that the listing order
  # matches the row order of the metadata file (or key the metadata by file
  # name) -- otherwise chapters get the wrong title/authors/source.
  for i, file in enumerate(os.listdir(chapters_dir)):
    metadata = chapter_metadata[i]
    with open(os.path.join(chapters_dir, file), mode='r') as f:
      text = f.read()
    record_texts = text_splitter.split_text(text)
    texts.extend(record_texts)
    metadatas.extend({"chunk": j, "text": chunk, **metadata}
                     for j, chunk in enumerate(record_texts))
    if len(texts) >= batch_limit:
      _flush(texts, metadatas)
      texts = []
      metadatas = []
  # Whatever is left after the last chapter is below batch_limit and would
  # otherwise never reach the index.
  if texts:
    _flush(texts, metadatas)


In [25]:
def tiktoken_len(text):
    """Return how many tokens the module-level ``tokenizer`` produces for ``text``.

    ``disallowed_special=()`` is passed so that special-token strings in the
    text are encoded as plain text instead of raising.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

In [29]:
# Shared building blocks used by the cells below: tokenizer, text splitter, embeddings.

text_embedding_model = 'text-embedding-ada-002'
# NOTE(review): text-embedding-ada-002 tokenizes with cl100k_base, so the
# p50k_base counts used for chunk sizing are only approximate for this model.
tokenizer = tiktoken.get_encoding('p50k_base')

# chunk_size and chunk_overlap are measured with tiktoken_len, i.e. in tokens.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    length_function=tiktoken_len,
    chunk_size=400,
    chunk_overlap=20,
)

embeddings = LangChainOpenAIEmbeddings(
    openai_api_key=userdata.get('OPENAI_KEY'),
    model=text_embedding_model,
)


In [31]:
# Set to True to (re)build the Pinecone index from the chapter files on Google
# Drive; leave False to reuse an index that already exists.
new_index = False

pc = Pinecone(api_key=userdata.get('PINECONE_TOKEN'))
index_name = 'starter-index'

if new_index:
  # The chapter files are read from Google Drive, so it must be mounted first.
  drive.mount('/content/drive')
  # Only delete an index that is actually there, so a first-time build does
  # not fail on the delete call.
  if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)
  # dimension=1536 is the vector size produced by text-embedding-ada-002.
  pc.create_index(name=index_name, dimension=1536, metric="cosine", spec=PodSpec(environment="gcp-starter"))

# The handle has to exist before any upsert: create_and_upsert_vectors writes
# through the notebook-level `index`.
index = pc.Index(index_name)

if new_index:
  chapter_metadata = load_chapter_metadata('/content/drive/MyDrive/WALS/chapter_metadata.csv')
  create_and_upsert_vectors('/content/drive/MyDrive/WALS/', chapter_metadata, text_splitter, embeddings)
#See also: https://docs.pinecone.io/docs/manage-indexes#create-a-pod-based-index


In [32]:

# Wrap the Pinecone index in a LangChain vector store. text_key='text' names the
# metadata field in which create_and_upsert_vectors stores each chunk's text.
vectorstore = LangChainPinecone(index, embeddings, text_key='text')

In [36]:
# Chat model handed to the RetrievalQA chain below.
llm = LangChainChatOpenAI(
    model_name='gpt-3.5-turbo',
    temperature=0.5,
    openai_api_key=userdata.get('OPENAI_KEY'),
)

In [37]:
# Question-answering chain: the vector store acts as retriever, `llm` as the answering model.
qa = RetrievalQA.from_chain_type(retriever=vectorstore.as_retriever(), llm=llm)

In [42]:
# Ask the chain a sample question; the answer text is read from the 'result'
# entry of the response.
query = 'What is WALS?'
response = qa.invoke(query)
print(response['result'])

WALS stands for the World Atlas of Language Structures. It is an atlas that provides maps showing the geographical distribution of structural linguistic features. It is the first feature atlas on a worldwide scale and focuses on abstract features of language systems that can be compared across unrelated languages. WALS is used by linguists interested in linguistic typology to study the ways in which languages vary structurally and the limits to this variation.
