# Meter los datos en la base de datos

Puedes ejecutar este notebook desde la línea de comandos con:

```bash
papermill --cwd ./chat -k python3 --no-log-output chat/add_info_python.ipynb /tmp/output.log.json
```

In [None]:
import os
from tqdm.notebook import tqdm
import more_itertools
import chromadb
import chromadb.utils.embedding_functions as embedding_functions

print("Inicializando chromadb")

# Batch size used when upserting documents further down in the notebook.
num_chunks = 7

# On-disk store kept in the 'db' directory, relative to the working directory.
chroma_client = chromadb.PersistentClient(path='db')

# Embeddings are requested from an Ollama HTTP endpoint on localhost, which
# must already be running in the background.
llama3_embedding_function = embedding_functions.OllamaEmbeddingFunction(
    model_name="tinyllama",
    url="http://localhost:11434/api/embeddings",
)

# Both collections share the same embedding function; "glossary" is opened
# (or created) first, then "turras".
collection_glossary, collection_turras = (
    chroma_client.get_or_create_collection(
        name=collection_name,
        embedding_function=llama3_embedding_function,
    )
    for collection_name in ("glossary", "turras")
)


# Meter términos de glosario


In [None]:
import csv
print("Glosario loader")


def glosario():
    """Yield the glossary entries stored in ``../db/glosario.csv``.

    The path is resolved relative to the current working directory. The
    first CSV row is treated as a header and skipped; an empty file simply
    yields nothing.

    Yields:
        dict: ``{'word': ..., 'definition': ..., 'sources': ...}`` for each
        data row. A missing or empty ``sources`` column becomes ``''``.

    Raises:
        OSError: if the CSV file cannot be opened.
    """
    file_path = os.path.join(os.getcwd(), '..', 'db', 'glosario.csv')

    with open(file_path, mode='r', newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(
            csvfile,
            fieldnames=['word', 'definition', 'sources'],
            delimiter=',',
            quotechar='"',
        )
        # Skip the header row. The default value matters: a bare next() on
        # an empty file raises StopIteration, which inside a generator is
        # converted into RuntimeError (PEP 479).
        next(reader, None)
        for row in reader:
            yield {
                'word': row['word'],
                'definition': row['definition'],
                # DictReader fills absent trailing columns with None.
                'sources': row['sources'] or ''
            }

# Build two parallel lists: one "word: definition" document per glossary
# entry, and the word itself as that document's id.
glos_documents = []
glos_ids = []

for item in glosario():
    glos_documents.append(f"{item['word']}: {item['definition']}")
    glos_ids.append(item['word'])

# Upsert in batches of num_chunks so the progress bar advances per batch.
# Each call sends only its own batch (ids paired with their documents);
# sending the full lists on every iteration would redo all the embedding
# work once per batch.
glos_id_chunks = more_itertools.chunked(glos_ids, num_chunks)
glos_doc_chunks = more_itertools.chunked(glos_documents, num_chunks)
# Materialise the pairs so tqdm knows the total number of batches.
for ids, docs in tqdm(list(zip(glos_id_chunks, glos_doc_chunks))):
    collection_glossary.upsert(documents=docs, ids=ids)

## Meter las turras

In [None]:
import json
from pprint import pprint
import os
from tqdm import tqdm

print("Turra loader")
num_chunks = 7


def turras():
    """Load every thread from ``../db/tweets.json``.

    The path is resolved relative to the current working directory. The
    file is read as a JSON list of threads, each thread being a list of
    tweet objects that are indexed by ``'id'`` and ``'tweet'``.

    Returns:
        tuple: ``(urls, texts)`` as two lists of equal length. ``urls[i]``
        is the thread URL built from the id of the thread's first tweet,
        and ``texts[i]`` is the thread's tweets joined with newlines.

    Raises:
        Exception: whatever error a malformed thread triggers; the
        offending thread is pretty-printed before the error propagates.
    """
    tweets_path = os.path.join(os.getcwd(), '..', 'db', 'tweets.json')
    thread_urls, thread_texts = [], []

    with open(tweets_path, mode='r', encoding='utf-8') as jsonfile:
        for thread in json.load(jsonfile):
            try:
                first_id = str(thread[0]['id'])
                thread_urls.append("https://turrero.vercel.app/turra/" + first_id)
                thread_texts.append('\n'.join(tuit["tweet"] for tuit in thread))
            except Exception as e:
                # Show the thread that broke the loader, then fail loudly.
                pprint(thread)
                raise e

    assert len(thread_urls) == len(thread_texts), "Bug: no hay la misma cantidad de urls y turras"
    return (thread_urls, thread_texts)

# Split the data into batches so the progress bar can be seen filling up,
# since the ingestion takes a long time.
urls_list, turras_list = turras()

# Chunk ids and documents with the same batch size so they stay aligned.
urls_chunks, turras_chunks = (
    list(more_itertools.chunked(sequence, num_chunks))
    for sequence in (urls_list, turras_list)
)
batched = list(zip(urls_chunks, turras_chunks))

# One upsert per batch: thread URLs act as ids, full thread texts as documents.
for ids, docs in tqdm(batched):
    collection_turras.upsert(ids=ids, documents=docs)