In [4]:
# All imports for the notebook live here: standard library first, then third-party.
import os

import openai
import pinecone
import streamlit as st
from langchain.embeddings.openai import OpenAIEmbeddings
from tqdm.auto import tqdm

st.title("CFRD Knowledge Base")
st.info("""I am a knowledgeable chatbot with extensive information about Concrete Face Rockfill Dams (CFRDs). 
        I can answer questions about CFRDs, and I can also summarize the contents of a PDF file.""")

# --- Pinecone set-up -------------------------------------------------------
# Credentials must never be committed with the notebook: the API key is read
# from the PINECONE_API_KEY environment variable (a KeyError here means it is
# not set). Any key that was previously hard-coded in this cell should be
# treated as compromised and rotated.
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ.get("PINECONE_ENVIRONMENT", 'us-central1-gcp'),
)

index_name = 'icold'

# Model name passed as `engine=` to openai.Embedding.create in later cells.
# NOTE(review): assumed to be text-embedding-ada-002 because the index stats
# report dimension 1536 -- confirm it matches the model used to build the index.
embed_model = "text-embedding-ada-002"

embeddings = OpenAIEmbeddings()

# pinecone.Index takes the index name only; the embeddings object is not an
# argument of the index handle (it was previously passed as a stray second
# positional argument).
index = pinecone.Index(index_name)

# Use the question typed by the user; fall back to a demo question only when
# the box is empty (e.g. when this cell runs outside a Streamlit session), so
# the user's input is no longer overwritten unconditionally.
query = st.text_input("Ask me a question about CFRDs", "") or "how to estimate leakage through a cfrd?"


In [5]:
# Touch the index only when a non-empty question is available.
if not (query == ""):
    index.describe_index_stats()

In [6]:
# Display the index statistics (dimension, fullness, vector counts) as the cell output.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 8100}},
 'total_vector_count': 8100}

In [7]:
# Embed the current question; the API expects a list of input strings,
# so the single query is wrapped in a one-element list.
query_vec = openai.Embedding.create(engine=embed_model, input=[query])

In [50]:
# Upper bound, in characters, on the joined context text placed in the prompt.
limit = 2000

def retrieve(query):
    """
    Retrieve the top 10 matches for `query` from the index and build a prompt.

    The query is embedded with `embed_model`, the index is searched with
    `top_k=10`, and the matched texts are added to the prompt in ranking
    order for as long as the joined context stays shorter than the
    module-level `limit` (measured in characters).

    Parameters
    ----------
    query : str
        The user's question.

    Returns
    -------
    tuple
        `(prompt, sources, contexts)` where `prompt` is the full prompt
        string, `sources` is the list of `metadata['source']` values and
        `contexts` is the list of `metadata['text']` values for ALL matches
        (not only the ones that fitted into the prompt).
    """
    # Create a vector from the query
    res = openai.Embedding.create(
        input=[query],
        engine=embed_model
    )
    xq = res['data'][0]['embedding']

    # Retrieve the nearest neighbours, with their metadata, from Pinecone
    res = index.query(xq, top_k=10, include_metadata=True)
    matches = res['matches']
    contexts = [x['metadata']['text'] for x in matches]
    sources = [x['metadata']['source'] for x in matches]

    # build our prompt with the retrieved contexts included
    prompt_start = (
        """
        Answer the question based on the context below. Be as detailed as possible but do not
        provide information that is not in the context. Do provide as much relevant context in the response as possible.\n\n"""
        +
        "Context:\n"
    )
    prompt_end = (
        f"\n\nQuestion: {query}\nAnswer:"
    )

    # Append contexts until the joined text would reach the limit. Tracking the
    # running length avoids re-joining the prefix on every step, and the prompt
    # is always defined -- including for zero or one match, and for the case
    # where only the complete set of contexts exceeds the limit.
    separator = "\n\n---\n\n"
    selected = []
    joined_len = 0
    for context in contexts:
        added = len(context) + (len(separator) if selected else 0)
        if joined_len + added >= limit:
            break
        selected.append(context)
        joined_len += added

    prompt = prompt_start + separator.join(selected) + prompt_end
    return prompt, sources, contexts

In [45]:
def complete(prompt):
    """
    Send `prompt` to the text-davinci-003 completion endpoint and return the answer.

    Parameters
    ----------
    prompt : str
        The full prompt, e.g. the first element returned by `retrieve`.

    Returns
    -------
    str
        The text of the first completion choice, stripped of surrounding
        whitespace.
    """
    # query text-davinci-003
    res = openai.Completion.create(
        engine='text-davinci-003',
        prompt=prompt,
        temperature=0,
        max_tokens=3000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None
    )
    return res['choices'][0]['text'].strip()


def run_retrieval_qa(query, lang='EN'):
    """
    Answer `query` with a LangChain RetrievalQA chain over the 'icold' index.

    This is the RetrievalQA experiment that used to be pasted inside
    `complete`, where it ran a hard-coded question on every call and failed
    on the un-imported `Pinecone` name. It is now a separate, explicitly
    called helper.

    Parameters
    ----------
    query : str
        The question, e.g. "Como se comportan los cfrd durange los terremotos?".
    lang : str
        Language code inserted into the answer instruction (default 'EN').

    Returns
    -------
    The chain output; the original code read the answer from its
    `"result"` key, and source documents are requested via
    `return_source_documents=True`.
    """
    # Function-scope imports keep this optional experiment self-contained.
    from langchain.chains import RetrievalQA
    from langchain.chat_models import ChatOpenAI
    from langchain.vectorstores import Pinecone

    llm = ChatOpenAI(temperature=0.5, max_tokens=400)

    # Create a vector store from the existing index
    docsearch = Pinecone.from_existing_index('icold', embeddings)
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=docsearch.as_retriever(),
        return_source_documents=True,
    )
    return qa({"query": query + f". Answer in {lang} language with examples of actual dams as possible from the context. Do not mention that there was a context provided.", "n": 1})

In [46]:
def pretty_print(response, width=150):
    """
    Print `response` in consecutive chunks of at most `width` characters.

    The text is cut at fixed character offsets (words may be split across
    lines); each chunk is printed on its own line. An empty string prints
    nothing.

    Parameters
    ----------
    response : str
        The text to print.
    width : int
        Maximum number of characters per printed line (default 150).

    Returns
    -------
    None
    """
    for start in range(0, len(response), width):
        print(response[start:start + width])

In [51]:
# Ask about the Campos Novos cracking: build the prompt from the retrieved
# contexts, get the completion, then print it wrapped.
query = "what caused the cracking of the campos novos dam?"
query_with_contexts, sources, contexts = retrieve(query)
answer = complete(query_with_contexts)
pretty_print(answer)

The cracking of the Campos Novos dam is believed to be caused by the differing support to the face slab and the collapse as a result of progressive er
osion within the Zone 2B. Additionally, the changing water level was the main cause of cracking in the face slab of the dam. Horizontal cracks were li
kely caused by a short period of rapid cooling or insulation failure.


In [55]:
# Show the text of the second retrieved context (index 1) to check the answer against it.
print(contexts[1])

of the concrete face connected. In addition, at this location, the foundation rock slopes

steeply downward. It is believed that the initial cause of the cracks is the differing

support to the face slab and the collapse as a result of progressive erosion within the

Zone 2B.

In November 2000, 16 years after first filling of the reservoir, leakage suddenly

increased from a stable 100 l/s to 900 l/s. Within two weeks, leakage had increased to

2 200 l/s. The leakage source was at mid-height of the dam where the dam is 90 m tall.


In [56]:
# Source recorded in the metadata of the second match (same index as the context shown above).
sources[1]

'G:\\.shortcut-targets-by-id\\1vE28d8xZuJXkpcinFbuku9FJgeaDd48K\\ICOLD - CFRD New Bulletin 2023\\B141.pdf'

In [27]:
# Ask about the leakage rate reported at Campos Novos.
query = "what leakage rate was reported in the campos novos dam upon impounding?"
# retrieve() returns three values (prompt, sources, contexts); unpacking only
# two raised ValueError when the notebook was run top to bottom.
query_with_contexts, sources, contexts = retrieve(query)
pretty_print(complete(query_with_contexts))

40 l/s


In [30]:
# Display the sources of all matches returned for the last query.
sources

['G:\\.shortcut-targets-by-id\\1vE28d8xZuJXkpcinFbuku9FJgeaDd48K\\ICOLD - CFRD New Bulletin 2023\\Structural Analysis\\(ASCE)GM.1943-5622.0000478.pdf',
 'G:\\.shortcut-targets-by-id\\1vE28d8xZuJXkpcinFbuku9FJgeaDd48K\\ICOLD - CFRD New Bulletin 2023\\B141.pdf',
 'G:\\.shortcut-targets-by-id\\1vE28d8xZuJXkpcinFbuku9FJgeaDd48K\\ICOLD - CFRD New Bulletin 2023\\Structural Analysis\\jmacr.16.00367.pdf',
 'G:\\.shortcut-targets-by-id\\1vE28d8xZuJXkpcinFbuku9FJgeaDd48K\\ICOLD - CFRD New Bulletin 2023\\Structural Analysis\\jmacr.16.00367.pdf',
 'G:\\.shortcut-targets-by-id\\1vE28d8xZuJXkpcinFbuku9FJgeaDd48K\\ICOLD - CFRD New Bulletin 2023\\Dam Response\\Monitoring System.pdf']