<a href="https://colab.research.google.com/github/jsmackie/TalkingToWALS/blob/mainline/TalkingToWALS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -qU \
    langchain \
    cohere \
    tiktoken \
    openai \
    pinecone-client \
    pinecone-datasets \
    tqdm \
    langchain-openai

In [None]:
!pip install langchain_pinecone



In [None]:
import csv
import os
from uuid import uuid4

import pinecone
import tiktoken
from google.colab import userdata, drive
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import Pinecone as LangChainPinecone
from openai import OpenAI
from pinecone import Pinecone, PodSpec


In [None]:
# Encoding used only to measure text length in tokens for chunking.
# NOTE(review): 'p50k_base' may not be the encoding used by the embedding
# model configured later in this notebook; counts could be approximate — confirm.
tokenizer = tiktoken.get_encoding('p50k_base')


def tiktoken_len(text):
    """Return how many tokens `text` encodes to under the module-level `tokenizer`.

    An empty `disallowed_special` tuple is passed, so no special-token string
    is treated as disallowed during encoding.
    """
    return len(tokenizer.encode(text, disallowed_special=()))


In [None]:
# Chunker for chapter text. Sizes are measured in tokens (via tiktoken_len),
# not characters: chunks of up to 400 tokens with a 20-token overlap.
# Separators are tried in order: paragraph break, line break, space, then
# the empty string as a last resort.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    length_function=tiktoken_len,
    chunk_size=400,
    chunk_overlap=20,
)

In [None]:
# Embedding model shared by the indexing step and the raw query step below.
model_name = 'text-embedding-ada-002'

# LangChain embedding wrapper; the API key is read from Colab's secret store.
openai_embeddings = OpenAIEmbeddings(
    openai_api_key=userdata.get('OPENAI_KEY'),
    model=model_name,
)

In [None]:
# Connect to Pinecone and open the index that the chunk embeddings are upserted into.
# Pod-based index reference:
# https://docs.pinecone.io/docs/manage-indexes#create-a-pod-based-index
pc = Pinecone(
    api_key=userdata.get('PINECONE_TOKEN')
)
index = pc.Index('starter-index')

# Index creation call, kept for reference (commented out):
# pc.create_index(
#   name="starter-index",
#   dimension=1536,
#   metric="cosine",
#   spec=PodSpec(
#     environment="gcp-starter"
#   )
# )


In [None]:

# Root folder of the WALS export on Google Drive.
# NOTE(review): assumes Drive is already mounted at /content/drive; no mount
# call is visible in this notebook — confirm before a fresh "Run all".
path = '/content/drive/MyDrive/WALS'

# Parse with the csv module rather than str.split(','): a quoted field that
# contains commas (e.g. a citation or a contributor list) must stay in one
# column, otherwise every later column in that row is shifted.
with open(os.path.join(path, 'chapter_metadata.csv'), newline='', encoding='utf-8') as f:
  reader = csv.reader(f)
  headers = next(reader, [])                 # first row holds the column names
  lines = [row for row in reader if row]     # remaining rows, blank ones skipped

# Resolve the column positions once; a missing column raises ValueError here
# instead of part-way through the loop.
name_col = headers.index('Name')
contributor_col = headers.index('Contributor')
citation_col = headers.index('Citation')
topic_col = headers.index('Topic')

# Metadata keyed by 0-based row position in the CSV.
chapter_metadata = {
    i: {'title': line[name_col],
        'authors': line[contributor_col],
        'source': line[citation_col],
        'topic': line[topic_col]}
    for i, line in enumerate(lines)
}

In [None]:
# Maximum number of vectors sent to Pinecone in a single upsert request, and
# the number of buffered chunks that triggers a flush.
batch_limit = 100


def upsert_chunks(chunk_texts, chunk_metadatas):
  """Embed text chunks and upsert them into the Pinecone index.

  Args:
    chunk_texts: list of chunk strings to embed.
    chunk_metadatas: list of metadata dicts, parallel to `chunk_texts`.

  The input is processed in slices of at most `batch_limit` items so that no
  single upsert request exceeds that size. Each vector gets a fresh UUID4 id.
  """
  for start in range(0, len(chunk_texts), batch_limit):
    batch_texts = chunk_texts[start:start + batch_limit]
    batch_metadatas = chunk_metadatas[start:start + batch_limit]
    ids = [str(uuid4()) for _ in batch_texts]
    embeds = openai_embeddings.embed_documents(batch_texts)
    index.upsert(vectors=list(zip(ids, embeds, batch_metadatas)))


texts = []
metadatas = []
chapters_dir = os.path.join(path, 'chapters')

# os.listdir returns entries in arbitrary order, so sort to make the
# file -> metadata pairing reproducible between runs.
# NOTE(review): files are matched to chapter_metadata purely by position;
# confirm that sorted filename order matches the row order of the CSV.
for i, file in enumerate(sorted(os.listdir(chapters_dir))):
  metadata = chapter_metadata[i]
  with open(os.path.join(chapters_dir, file), mode='r', encoding='utf-8') as f:
    chapter_text = f.read()
  record_texts = text_splitter.split_text(chapter_text)
  # Each chunk stores its own text in the metadata so it can be returned at query time.
  record_metadatas = [{"chunk": j, "text": chunk, **metadata}
                      for j, chunk in enumerate(record_texts)]
  texts.extend(record_texts)
  metadatas.extend(record_metadatas)
  if len(texts) >= batch_limit:
    upsert_chunks(texts, metadatas)
    texts = []
    metadatas = []

# Flush whatever is left after the last chapter; without this the final
# partial batch (fewer than batch_limit chunks) would never be indexed.
if texts:
  upsert_chunks(texts, metadatas)
  texts = []
  metadatas = []


In [None]:
# Plain OpenAI SDK client, used below to embed the query directly
# (separate from the LangChain embedding wrapper).
client = OpenAI(api_key=userdata.get('OPENAI_KEY'))

In [None]:
query = "What tone patterns are common?"

# Embed the query with the same model that was used for the indexed chunks.
xq = client.embeddings.create(input=query, model=model_name).data[0].embedding

# Query the index for the 5 most similar chunks. `vector` takes the
# embedding itself (a flat list of floats), not a list wrapping it.
res = index.query(vector=xq, top_k=5, include_metadata=True)

# Show each match's similarity score followed by the stored chunk text.
for match in res['matches']:
    print(f"{match['score']:.2f}: {match['metadata']['text']}")

0.83: 1. Introduction All languages make use of variations in the musical pitch of the voice as part of their sound systems, but they differ in the ways in which modifications of pitch are used and how many different types of functions are served by pitch variations. Linguists distinguish between two of the major uses of pitch as tone and intonation . Intonation is the term that is used to describe sentence types, such as question versus statement, or to indicate whether a speaker has finished or intends to continue speaking, or to show which parts of an utterance present new or highlighted information versus old or less significant information. Tone is the term used to describe the use of pitch patterns to distinguish individual words or the grammatical forms of words, such as the singular and plural forms of nouns or different tenses of verbs. In the simplest cases, each syllable of a language with tones will have its own characteristic tonal pattern, which may be a relatively flat p

In [None]:
# Embedding model for retrieval. It is passed explicitly so that queries are
# embedded with the same model as the indexed chunks, rather than relying on
# the library's default model name.
model_name = 'text-embedding-ada-002'

embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=userdata.get('OPENAI_KEY')
)
# Wrap the Pinecone index as a LangChain vector store; 'text' is the metadata
# field in which each chunk's raw text was stored at indexing time.
vectorstore = LangChainPinecone(index, embed, text_key='text')

In [None]:
# Chat model that writes the final answer from the retrieved chunks.
# A low temperature (0.1) is used for the answers.
llm = ChatOpenAI(
    model_name='gpt-3.5-turbo',
    temperature=0.1,
    openai_api_key=userdata.get('OPENAI_KEY'),
)

In [None]:
# Retrieval-augmented QA chain: the vector store's retriever supplies the
# context chunks and `llm` generates the answer.
retriever = vectorstore.as_retriever()
qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

In [None]:
# Run the QA chain on the query; the bare last expression displays the result as the cell output.
answer = qa.invoke(query)
answer

{'query': 'What tone patterns are common?',
 'result': 'The passage does not provide specific information about the common tone patterns in languages. It only mentions that many languages in East and Southeast Asia, including Chinese, Vietnamese, and Thai, have tone systems that include contour tones. However, it does not specify the common tone patterns within these languages.'}