## Overview

This notebook is used for the following:
- Test loading the vector database
- Retrieving Documents from the database

In [37]:
import os
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from pathlib import Path
from dotenv import load_dotenv


load_dotenv(override=True)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
LANGCHAIN_API_KEY = os.getenv("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_TRACING_V2"] = "true"
CHROMA_PATH = os.getenv("CHROMA_PATH")


In [3]:
import lingtypology.glottolog as glotto

glotto.get_iso_by_glot_id('arap1274')

def get_by_iso(iso_code):
    name = glotto.get_by_iso(iso_code)
    return name.replace(" ", "_")

## Load ChromaDB with Grammar Books

Languages represented in Grambank that are also in Back to School paper. Languages are looked up in languages.csv file
- min : Minangkabau
- lus : Mizo
- Wolof
- Dinka???
- Chuvash
- gug : Guarani
- kgv : Kalamang
- ilo : Iloko (aka Ilokano)
- kac : Kachin (aka Southern Jinghpaw)
- ntu : Natugu

### Setup

Setup database connections. Also prints out existing collections.

In [13]:
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
vectorstore = Chroma(embedding_function=embeddings, persist_directory=CHROMA_PATH)
chroma_client = vectorstore._client

collections = chroma_client.list_collections()

for collection in collections:
    print(f"{collection.name}: {collection.id}")

iloko: 0207435c-e88e-4741-bd15-141a00959a80
kalamang: 178a81d7-eb70-447b-897b-331a67ebbf28
mizo: 34524b0d-1ddc-4687-b51b-dc2ad329338b
southern_jinghpaw: 61b50fae-58de-4fa7-8b94-3c1042dc3038
minangkabau: 92e0fe92-eb4f-4eab-964f-d9e1f7f1096c
langchain: 9a25548c-b248-4c48-8e00-43178c24f3b7


### Delete Existing Database

In [None]:
# Delete a collection by name
# chroma_client.delete_collection('mizo')

for collection in collections:
    print(f"Deleting collection {collection.name}")
    chroma_client.delete_collection(collection.name)

Deleting collection langchain
Deleting collection minangkabau


### Load Grammar Books into Database

In [7]:
languages = ['min', 'lus', 'kac', 'ilo', 'kgv']

for language in languages:
    print(get_by_iso(language))

Minangkabau
Mizo
Southern_Jinghpaw
Iloko
Kalamang


In [None]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 2000,
    chunk_overlap = 200,
    length_function = len,
    is_separator_regex=False
)

for language in languages:
    language_name = get_by_iso(language)
    print(f"COLLECTION NAME: {language_name}")

    resource_directory = Path(f"./resources/{language_name.lower()}")
    resources = list(resource_directory.glob('*.txt'))

    for resource in resources:
        print(f"Loading resource: {resource}")
        # loader = TextLoader(TEXT_DOC_PATH)
        loader = TextLoader(resource)
        pages = loader.load()

        chunks = text_splitter.split_documents(pages)

        embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
        db_chroma = Chroma.from_documents(documents=chunks, collection_name=language_name.lower(), embedding=embeddings, persist_directory=CHROMA_PATH)

COLLECTION NAME: Minangkabau
Loading resource: resources/minangkabau/adelaar_proto-malayic1992v2_o.txt
Loading resource: resources/minangkabau/zarbaliev_minangkabau1987_o.txt
Loading resource: resources/minangkabau/reibaud_minangkabau2004_o.txt
Loading resource: resources/minangkabau/crouch_minangkabau2009.txt
COLLECTION NAME: Mizo
Loading resource: resources/mizo/weidert_lushai1975_o.txt
Loading resource: resources/mizo/subbarao_mizo1998_o.txt
Loading resource: resources/mizo/chhangte_mizo1993_o.txt
Loading resource: resources/mizo/chhangte_mizo1989_o.txt
COLLECTION NAME: Southern_Jinghpaw
Loading resource: resources/southern_jinghpaw/kurabe_jinghpaw2017_o.txt
Loading resource: resources/southern_jinghpaw/hertz_kachin1902_o.txt
Loading resource: resources/southern_jinghpaw/qingxia-diehl_jingpho2003_s.txt
COLLECTION NAME: Iloko
Loading resource: resources/iloko/espiritu_ilokano1984_o.txt
COLLECTION NAME: Kalamang
Loading resource: resources/kalamang/grammar_book_long.txt


## Retrieve From Database

Assumes database is already loaded

### Create a Custom Retriever

In [18]:
from typing import List, Tuple
from pydantic import PrivateAttr
from langchain.schema import Document  # Schema for document objects
from langchain.schema.retriever import BaseRetriever  # Base class for retrievers
from langchain.vectorstores import VectorStore  # VectorStore for similarity search

class CustomRetriever(BaseRetriever):
    _vector_store: VectorStore = PrivateAttr()

    def __init__(self, vector_store: VectorStore):
        super().__init__()
        self._vector_store = vector_store

    def _get_relevant_documents(self, query: str) -> List[Document]:
        """Retrieve relevant documents with scores and IDs."""
        results = self._vector_store.similarity_search_with_score(query)
        
        # Store scores and IDs in document metadata for future use
        retrieved_docs = []
        for doc, score in results:
            doc.metadata["score"] = score
            doc.metadata["id"] = doc.metadata.get("id", "unknown_id")
            retrieved_docs.append(doc)
        
        return retrieved_docs
    
    def retrieve_with_scores_and_ids(self, query: str, top_k: int = 5) -> List[Tuple[Document, float, str]]:
        """Retrieve documents with similarity scores and IDs."""
        results = self._vector_store.similarity_search_with_score(query, k=top_k)
        
        retrieved_docs = []
        for doc, score in results:
            doc_id = doc.metadata.get("id", "unknown_id")
            retrieved_docs.append((doc, score, doc_id))
        
        return retrieved_docs


### Query Vectorstore

In [19]:
language = 'min'
language_name = get_by_iso(language)
print(language_name)

Minangkabau


In [39]:
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
vectorstore = Chroma(embedding_function=embeddings, persist_directory=CHROMA_PATH)
chroma_client = vectorstore._client

collections = chroma_client.list_collections()

for collection in collections:
    print(collection.name)


iloko
kalamang
mizo
southern_jinghpaw
minangkabau
langchain


In [40]:
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
embedding_db = Chroma(collection_name=language_name.lower(),
                      embedding_function=embeddings,
                      persist_directory=CHROMA_PATH
)

retriever = embedding_db.as_retriever(search_kwargs={'k': 2})

In [41]:
retriever.invoke("Are there definite articles?")

[Document(metadata={'source': 'resources/minangkabau/adelaar_proto-malayic1992v2_o.txt'}, page_content='5.5.2.2 LOCATIVE PRONOUNS\nIn all isolects, locative pronouns are derived from demonstratives. In IBN this happened through prefixation of di-, and in the other isolects through prefixation of s-; in SM locative pronouns are usually precliticised by a locative preposition (viz. di ‘in, at’, kə ‘to(wards)’, and dari ‘from’). This leads to the following reconstructions:\n*(?)-(i)ni(?) ‘here’; ^l)-(i)tu(f) ‘there’; *(?)-(i)na(n) / *(?)-(a)na(?) ‘yonder’.'),
 Document(metadata={'source': 'resources/minangkabau/adelaar_proto-malayic1992v2_o.txt'}, page_content='apa mana mana siapa bila\n\nbar/a\n\nsa?apa\n\nindefinite\n\nanu\n\nanu\n\nanu\n\n(tu)apo mano -mano siapo kəbilo\nbaxapo\nanu\n\nnama ni -ni sapa kamaya\nbər/apa məsak\nanu?\n\napd mand -mani siapð kapan\nbər/apð\nanu/anð\n\n5.5.2.1 Demonstrative pronouns\nMost isolects have- a bipartite series of demonstratives, but it is likely 

### Using CustomRetriever

In [42]:
# Initialize with a vector store (like FAISS, ChromaDB, etc.)
my_retriever = CustomRetriever(embedding_db)

# Using the standard LangChain interface
relevant_docs = my_retriever.get_relevant_documents("Are there definite articles?")
for doc in relevant_docs:
    print(f"ID: {doc.metadata.get('id')}, Score: {doc.metadata.get('score')}, Content: {doc.page_content}")


  relevant_docs = my_retriever.get_relevant_documents("Are there definite articles?")


ID: unknown_id, Score: 0.4084357023239136, Content: 79 4.2.2.3 Specifiers................................................................................................. 80 4.2.2.4 Negators .................................................................................................. 80 4.2.2.5 TAM Adverbs.......................................................................................... 80 4.2.2.6 Pragmatic Particles .................................................................................. 81
ID: unknown_id, Score: 0.41273611783981323, Content: 5.5.2.2 LOCATIVE PRONOUNS
In all isolects, locative pronouns are derived from demonstratives. In IBN this happened through prefixation of di-, and in the other isolects through prefixation of s-; in SM locative pronouns are usually precliticised by a locative preposition (viz. di ‘in, at’, kə ‘to(wards)’, and dari ‘from’). This leads to the following reconstructions:
*(?)-(i)ni(?) ‘here’; ^l)-(i)tu(f) ‘there’; *(?)-(i)na(n)

## Random Stuff

Need to determine if I want to keep these cells

In [43]:
my_set = set()


collection = chroma_client.get_or_create_collection(name='minangkabau')
results = collection.get()
# print(results.keys())
hus = results['metadatas']
# List document IDs
# document_ids = results['ids']
# for doc_id in document_ids:
#     print(f"Document ID: {doc_id}")


# for metadata in results['metadatas']:
#     my_set.add(metadata['source'])
# print(my_set)

In [44]:
results.keys()

dict_keys(['ids', 'embeddings', 'metadatas', 'documents', 'uris', 'data', 'included'])

In [45]:
hus

[{'source': 'resources/minangkabau/zarbaliev_minangkabau1987_o.txt'},
 {'source': 'resources/minangkabau/zarbaliev_minangkabau1987_o.txt'},
 {'source': 'resources/minangkabau/reibaud_minangkabau2004_o.txt'},
 {'source': 'resources/minangkabau/adelaar_proto-malayic1992v2_o.txt'},
 {'source': 'resources/minangkabau/reibaud_minangkabau2004_o.txt'},
 {'source': 'resources/minangkabau/crouch_minangkabau2009.txt'},
 {'source': 'resources/minangkabau/reibaud_minangkabau2004_o.txt'},
 {'source': 'resources/minangkabau/adelaar_proto-malayic1992v2_o.txt'},
 {'source': 'resources/minangkabau/adelaar_proto-malayic1992v2_o.txt'},
 {'source': 'resources/minangkabau/crouch_minangkabau2009.txt'},
 {'source': 'resources/minangkabau/reibaud_minangkabau2004_o.txt'},
 {'source': 'resources/minangkabau/zarbaliev_minangkabau1987_o.txt'},
 {'source': 'resources/minangkabau/adelaar_proto-malayic1992v2_o.txt'},
 {'source': 'resources/minangkabau/adelaar_proto-malayic1992v2_o.txt'},
 {'source': 'resources/minan

In [None]:
def get_resources_list(lang_name):
    my_set = set()
    collection = chroma_client.get_or_create_collection(name=lang_name)
    results = collection.get()
    # print(results.keys())
    hus = results['metadatas']
    # List document IDs
    # document_ids = results['ids']
    # for doc_id in document_ids:
    #     print(f"Document ID: {doc_id}")


    for metadata in results['metadatas']:
        my_set.add(metadata['source'])
    return my_set