In [25]:
%load_ext autoreload
%autoreload 2

import helpers as s

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [43]:
import os, openai, datetime, hashlib, re
import time
from langchain.document_loaders import DataFrameLoader
from langchain.vectorstores import Chroma

from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Chroma
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

import pandas as pd


from dotenv import load_dotenv
load_dotenv()

In [31]:
# Raw articles: rename the three columns to the names used in the rest of the notebook.
df = pd.read_parquet('data/articles.parquet.gzip')
df.columns = ["src", "content", "LEN"]

# Side tables that are joined onto the articles through the shared `src` column.
titles = pd.read_parquet("data/titles.parquet.gzip")
mt = pd.read_parquet("data/metatags.parquet.gzip")

# Keep rows with 1500 < LEN < 30000 (both bounds exclusive), renumber the index,
# then left-join titles and metatags so every kept article row is preserved.
df = (
    df.loc[df["LEN"].gt(1500) & df["LEN"].lt(30000)]
    .reset_index(drop=True)
    .merge(titles, on="src", how="left")
    .merge(mt, on="src", how="left")
)

# Save the consolidated frame next to the inputs.
df.to_parquet("data/consolidated.parquet.gzip", compression="gzip")

In [32]:
# Convert every column of the frame to str in a single call.
df = df.astype(str)

In [33]:
# The key is read from the `OAI` environment variable, even though the printed
# messages mention OPENAI_API_KEY.
if os.getenv("OAI") is None:
    print ("OPENAI_API_KEY environment variable not found")
else:
    openai.api_key = os.getenv("OAI")
    print ("OPENAI_API_KEY is ready")

# The same `OAI` value is handed explicitly to the embeddings client.
embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("OAI"))


OPENAI_API_KEY is ready


In [34]:
# Wrap the DataFrame in a loader; the "content" column is passed as the page content.
# The remaining columns presumably end up in each document's metadata (later cells
# read metadata["src"], metadata["title"] and metadata["url"]) — TODO confirm.
df_loader = DataFrameLoader(df, page_content_column="content")

In [35]:
# Materialise the documents produced by the loader.
df_document = df_loader.load()
# Spot-check a single loaded document (index 1, i.e. the second one).
display(df_document[1])

Document(page_content='Today’s post is the final one in my series on goals. The rest of the posts in this series are here. Recent editions ICYMI:\nMetacognition: A Critical Leadership & Career Skill\nMost Performance Reviews are biased\nBeware of Existential Asymmetry\nIn my previous edition titled Good Goals Gone Bad, I highlighted how an exclusive focus on pre-determined, outcome-focused goals often blinds us to the emergent. At its worse, this can even lead to disengagement and burnout.\nSo I was gratified to come across a particularly poignant and revelatory piece of writing from Nobel laureate physicist Richard Feynman, which captures many of the key points I made about positivist vs constructionist approach to goals.\nA Nobel Prize for piddling around\nThe young Feynman was going through a depressive phase and was disengaged with his work at the time. It included some major imposter syndrome as well.\nDuring this period I would get offers from different places universities and in

In [36]:
# Splitter configured with a blank-line separator, chunk size 2000 and overlap 200.
text_splitter = CharacterTextSplitter(
    separator='\n\n',
    chunk_size=2000,
    chunk_overlap=200,
)
chunked_documents = text_splitter.split_documents(df_document)
# NOTE(review): the recorded run printed 1410 here, the same count the store already
# held, and the sample document uses single '\n' line breaks — verify that the
# '\n\n' separator actually splits these texts.
print(len(chunked_documents))

1410


In [38]:
base_path = "./DB/"

# The presence of chroma.sqlite3 inside the persist directory decides whether an
# existing store is reopened or a fresh one is created.
if os.path.isfile(os.path.join(base_path, "chroma.sqlite3")):
    print("Continue on the DB")
    vectordb = Chroma(persist_directory=base_path, embedding_function=embeddings)
    print(len(vectordb.get()["ids"]),"elements already stored.")
    # Texts currently held by the store.
    LSDOCS = vectordb.get()["documents"]
else:
    print("Start a new DB")
    # Seed the new store with the first chunk only; the remaining chunks are added later.
    vectordb = Chroma.from_documents(
        documents=[chunked_documents[0]],
        embedding=embeddings,
        persist_directory=base_path
    )
    vectordb.persist()

Continue on the DB
1410 elements already stored.


In [39]:
# Snapshot of every text currently in the store; fetched once and reused below
# instead of querying the whole collection several times.
LSDOCS = vectordb.get()["documents"]
print("Already",len(LSDOCS),"documents.")
print("Adding",len(chunked_documents),"documents.")

# A set makes the "already stored?" check O(1) per chunk instead of a scan over a
# list of full texts.
known_texts = set(LSDOCS)

for doc in chunked_documents:
    # Skip chunks whose exact text is already stored, or was added earlier in this
    # same loop (the set is updated after every insert).
    if doc.page_content in known_texts:
        continue
    vectordb.add_documents(
        documents=[doc],
        embedding=embeddings,
        persist_directory=base_path
    )
    known_texts.add(doc.page_content)
    # Crude throttle to avoid reaching the tokens-per-minute limit:
    # pause 0.2 s after each inserted chunk.
    time.sleep(0.2)
    # Persist after every insert so an interrupted run keeps what was already added.
    vectordb.persist()

# Refresh the snapshot so later cells see the newly added texts.
LSDOCS = vectordb.get()["documents"]
vectordb.persist()

Already 1410 documents.
Adding 1410 documents.


In [40]:
# Re-create the embeddings client and reopen the persisted store; this rebinds the
# `embeddings`, `base_path` and `vectordb` names assigned in earlier cells.
# NOTE(review): unlike the earlier cell, no `openai_api_key` is passed here, so the
# key presumably has to come from the library's default lookup rather than from
# the `OAI` variable — confirm.
embeddings = OpenAIEmbeddings()
base_path = "./DB/"
vectordb = Chroma(persist_directory=base_path,embedding_function=embeddings)
def create_agent_chain(llm):
    """Return a question-answering chain for `llm`, built with chain_type="stuff"."""
    return load_qa_chain(llm, chain_type="stuff")

In [57]:
def get_llm_response(query,vectordb,temperature=0.1,k=10,seed="",overwrite=False,tag={}):
    """Answer `query` from documents retrieved out of `vectordb`, with an on-disk cache.

    The cache file is ``cache/<md5>.md``, where the MD5 is computed over the query,
    temperature, k, seed and tag. A cached result is reused unless `overwrite` is true.

    Args:
        query: Question text; also used as the search string for retrieval.
        vectordb: Vector store exposing `similarity_search` and `as_retriever`.
        temperature: Temperature passed to the chat model.
        k: Number of documents to retrieve.
        seed: Only mixed into the cache key, so different seeds get separate cache entries.
        overwrite: When true, ignore any cached file and regenerate it.
        tag: Optional metadata filter handed to the retriever; empty means no filter.
            It is only read, never mutated, so the shared `{}` default is safe.

    Returns:
        Tuple ``(answer, docs)``: the answer text (stripped when it comes from the
        cache) and the list of source URLs of the retrieved documents.
    """
    # NOTE: k and seed are concatenated without a separator; this is kept unchanged
    # so that cache files written by earlier runs are still found.
    cache_key = query+","+str(temperature)+","+str(k)+str(seed)+str(tag)
    cache_key = hashlib.md5(cache_key.encode()).hexdigest()
    print(cache_key)
    cache_dir = "cache"
    cache_file = cache_dir+"/"+cache_key+".md"
    answer_sep = "\n\n---\n\n>A:\n"
    docs_sep = "\n\n---\n\nD:\n"

    if os.path.isfile(cache_file) and not overwrite:
        try:
            with open(cache_file,"r",encoding="utf-8") as f:
                cached = f.read()
        except UnicodeDecodeError:
            # Cache file written by an earlier version using the platform default encoding.
            with open(cache_file,"r") as f:
                cached = f.read()
        sections = cached.split(answer_sep)
        answer = sections[-1].strip()
        docs_repr = sections[-2].split(docs_sep)[-1].strip()
        # The URLs are recovered from the stored repr of the documents; the pattern
        # only matches when 'url' is the last key of each metadata dict.
        docs = re.findall("\'url\': \'(.*?)\'}", docs_repr, re.DOTALL)
    else:
        # Create the cache directory before calling the model, so a missing
        # directory cannot make the final write fail and lose the answer.
        os.makedirs(cache_dir, exist_ok=True)
        llm = ChatOpenAI(
            # models : https://platform.openai.com/docs/models
            temperature=temperature,
            model="gpt-3.5-turbo-0125"
        )
        chain = create_agent_chain(llm)
        if tag:
            # Filtered retrieval: restrict candidates to documents matching `tag`.
            retriever = vectordb.as_retriever(search_kwargs={"filter":tag,"k":k})
            matching_docs = retriever.get_relevant_documents(query)
        else:
            matching_docs = vectordb.similarity_search(query,k)
        answer = chain.run(input_documents=matching_docs, question=query)
        docs = [x.metadata["url"] for x in matching_docs]
        # Explicit UTF-8 so non-ASCII answers are written regardless of the locale.
        with open(cache_file,"w",encoding="utf-8") as f:
            f.write(">Q:\n"+query +docs_sep+str(matching_docs)+ answer_sep+answer)
    return answer, docs

In [58]:
# Ask one sample question with temperature 0.05 and k=5 retrieved documents.
# The second return value is the list of source URLs, not the documents themselves.
answer, matching_docs = get_llm_response("What are digital twins? ",vectordb,temperature=0.05,k=5)
# Print the answer followed by a bulleted list of those URLs.
print(answer,"\n","\n* "+"\n* ".join(matching_docs))

c24af503fe0ed08792f7f093aa96f8da
Digital twins are virtual models or replicas of physical objects, processes, or systems. They are created using real-time data from sensors and other sources to simulate the behavior and characteristics of the physical counterpart. Digital twins are used for monitoring, analyzing, and optimizing the performance of the physical object or system. They can help in predicting maintenance needs, improving efficiency, and testing different scenarios without impacting the actual physical object. In the context of the text provided, digital twins are being used to monitor the behavior of the 3D-printed steel bridge in Amsterdam and refine the design of similar structures in the future. 
 
* https://jyx.jyu.fi/handle/123456789/87497?locale-attribute=en
* https://www.absolutdata.com/blog/why-scaling-customer-driven-marketing-requires-digital-twins-absolutdata/
* https://www.mckinsey.com/capabilities/mckinsey-digital/our-insights/ten-unsung-digital-and-ai-ideas-sh

In [69]:
# Run a sample similarity search (k=5) and list the `src` and `title` metadata of each hit.
B = vectordb.similarity_search("biology innovation",5)
for hit in B:
    print(hit.metadata["src"],hit.metadata["title"])

62a5bae52266a680c6a13bd3ef8dc48c The Bio Revolution: Advances in Biology and Their Impact on Society
99e803820e0c09fbc3163a5a26ff49bb Paradigm Shift: Holobionts in Biology
21110afd93356cd60bb66cf8f6ffdfdb Exploring the Effectiveness of Living Labs
b1f6991b9b718672751c9427c3f5b794 Cyborg Botany: Augmented Plants as Sensors, Displays and Actuators
115b59fc3f0d7b148482545adb1a8038 Google DeepMind Discovers 2.2 Million Crystal Structures Through AI
