In [1]:
import pandas as pd
import os, openai
import time
from langchain.document_loaders import DataFrameLoader
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings


import string
LETTERS = string.ascii_uppercase

from dotenv import load_dotenv
load_dotenv()


True

In [2]:
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
from langchain.storage import LocalFileStore
from langchain.embeddings import CacheBackedEmbeddings

if os.getenv("OPENAI_API_KEY") is not None:
    openai.api_key = os.getenv("OAI")
    print ("OPENAI_API_KEY is ready")
else:
    print ("OPENAI_API_KEY environment variable not found")

underlying_embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("OAI"))


store = LocalFileStore("./cache/")

embeddings = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings, store, namespace=underlying_embeddings.model
)


OPENAI_API_KEY is ready


  underlying_embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("OAI"))


In [3]:
if os.getenv("OAI") is not None:
    openai.api_key = os.getenv("OAI")
    print ("OPENAI_API_KEY is ready")
else:
    print ("OPENAI_API_KEY environment variable not found")

embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("OAI"))

df = pd.read_parquet('data/articles.parquet.gzip')
df.columns = ["src","content","LEN"]
df = df[(df.LEN > 1500) & (df.LEN < 30000)].reset_index(drop=True)
print(len(df),"articles of good lengths in the articles.partquet.gzip")
titles = pd.read_parquet("data/titles.parquet.gzip")
df = df.merge(titles, on="src",how="left")
mt = pd.read_parquet("data/metatags.parquet.gzip")
df = df.merge(mt,on="src",how="left")
df.to_parquet("data/consolidated.parquet.gzip",compression="gzip")
#df["text"] = df.content
df["source"] = df.url
df["author"] = df.origin.apply(lambda x: "kelu" if str(x).startswith("20") else "other")
for x in df.columns:
    df[x] = df[x].astype(str)
df_loader = DataFrameLoader(df, page_content_column="content")

df_document = df_loader.load()


OPENAI_API_KEY is ready
2117 articles of good lengths in the articles.partquet.gzip


In [4]:

text_splitter = CharacterTextSplitter(separator='\n\n',chunk_size=2000, chunk_overlap=200)
chunked_documents = text_splitter.split_documents(df_document)
print(len(chunked_documents),"docs to add")

base_path = "./DB/"
if os.getenv("OAI") is not None:
    openai.api_key = os.getenv("OAI")
    print ("OPENAI_API_KEY is ready")
else:
    print ("OPENAI_API_KEY environment variable not found")

embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("OAI"))


if not os.path.isfile(base_path+"chroma.sqlite3"):
    print("Start a new DB")
    vectordb = Chroma.from_documents(
        documents=[chunked_documents[0]],
        embedding=embeddings,
        persist_directory=base_path
    )
    vectordb.persist()
else:
    print("Continue on the DB")
    vectordb = Chroma(persist_directory=base_path,embedding_function=embeddings)
    print(len(vectordb.get()["ids"]),"elements already stored.")
    LSDOCS = vectordb.get()["documents"]

print("Already",len(vectordb.get()["documents"]),"documents.")
print("Adding",len(chunked_documents),"documents.")

if vectordb:
    LSDOCS = vectordb.get()["documents"]
else:
    LSDOCS = []
    
for doc in chunked_documents:
    # Check if the text already exists somewhere
    if not doc.page_content in LSDOCS:
        vectordb.add_documents(
            documents=[doc], 
            embedding=embeddings, 
            persist_directory=base_path
        )
        # Ugly hack to avoid reaching token per min limit 
        # So it sleeps 1s between page
        time.sleep(0.001)
        vectordb.persist()
    else:
        0
        #print("Item already in the DB",doc.page_content[:100].replace("\n"," "))
LSDOCS = vectordb.get()["documents"]
vectordb.persist()

2117 docs to add
OPENAI_API_KEY is ready
Continue on the DB


  vectordb = Chroma(persist_directory=base_path,embedding_function=embeddings)


2109 elements already stored.
Already 2109 documents.
Adding 2117 documents.


  vectordb.persist()


In [5]:
vectordb.get()

{'ids': ['28f9c348-7544-4628-be07-e598d8738ec8',
  'dd0115ac-f0d9-41dd-a893-5730eaf1fd22',
  'e820da38-3bbd-413a-97d9-17d2a681727b',
  'f8207dcb-1ce7-4cdc-beed-6e5b41ba8e19',
  'f041e059-ed0c-4ebf-a6e0-885343118bab',
  '30121147-d0b6-4ca1-8218-afa7e4602bd8',
  'ad90d8f4-6f05-4072-b6d3-24d28451d1a0',
  '0a728eec-d337-4d72-bd80-a894b59fcd28',
  'bb7c894b-0903-4cce-8665-7a65266158c5',
  '8145e256-5909-4a4c-a35a-daf7769d8222',
  'f65ec515-7244-48b1-b7c4-0ea170902d70',
  'dba756ea-17c0-49c8-926b-9598ba6dc3f0',
  'aad3428f-ec0c-4da8-9d77-89b2157b6267',
  '1c837a07-9f89-4996-a225-a412add789f0',
  '6cc13bc3-ccfe-4181-99f3-53e2ed425930',
  '387da3c9-2a2d-4049-900e-5da9d1f15126',
  '24b6996d-4e49-4b16-af39-ed2a441cb7da',
  'f57f3bc4-16d7-4494-8b3e-a55eacb3950d',
  '69c77185-16cc-4fa3-91d2-f60d69a352e6',
  '17ec3e3c-9fa6-4d9b-8e21-4f12ecb0829b',
  'd483af58-0383-49c6-af16-c7b80e2bf608',
  '054273f6-a7fc-4c89-b033-55faed41d2af',
  '4d93c329-b569-4954-ac4d-e7a3b7e2f7c1',
  'da7440f8-de52-4a17-82cf-