# Objectives
1. Use Cohere within the LangChain framework
  - https://python.langchain.com/docs/integrations/providers/cohere
  - https://colab.research.google.com/github/cohere-ai/notebooks/blob/main/notebooks/Multilingual_Search_with_Cohere_and_Langchain.ipynb?ref=txt.cohere.com#scrollTo=s12ZE7vcHRJI
    - This notebook helped me solve the Cohere embeddings problem
  - Interestingly, Cohere also offers its own RAG implementation (worth trying out!)

2. Use Weaviate as the Vector database
  - https://python.langchain.com/docs/integrations/vectorstores/weaviate
3. Create a conversational agent with citations (interesting)

Useful Information:
- Multi-query generation (could be replaced with Cohere's own implementation)
  - https://python.langchain.com/docs/modules/data_connection/retrievers/MultiQueryRetriever?ref=blog.langchain.dev

## Prerequisite


In [None]:
# Mount Google Drive into the Colab filesystem so the project data is readable.
from google.colab import drive
drive.mount('/content/drive')

# Root data folder on the mounted drive; the helper functions below read from it.
PATH = '/content/drive/MyDrive/Cohere Hackathon/data'

Mounted at /content/drive


In [None]:

# Install the third-party packages this notebook imports (IPython shell escape).
!pip install cohere langchain weaviate-client PyPDF2 chromadb faiss-cpu openai sentence_transformers unidecode

In [None]:
import os
from getpass import getpass


def _load_secret(name: str, required: bool = True) -> str:
    """Return the secret called *name* without ever storing it in the notebook.

    The value is read from the environment variable *name*; when that is unset
    or blank the user is prompted for it with ``getpass`` (input is not echoed).

    Args:
        name: Environment variable holding the secret.
        required: When True, an empty answer raises instead of being accepted.

    Returns:
        The secret, stripped of surrounding whitespace ("" if optional and unset).

    Raises:
        RuntimeError: If the secret is required and no value was supplied.
    """
    value = os.environ.get(name, "").strip()
    if not value:
        value = getpass("Enter {}: ".format(name)).strip()
    if required and not value:
        raise RuntimeError("{} is required but was not provided".format(name))
    return value


# SECURITY: credentials must never be committed in a notebook. Any key that was
# previously hard-coded here has been exposed and must be revoked and rotated.

# get from https://dashboard.cohere.com/api-keys
COHERE_API_KEY = _load_secret("COHERE_API_KEY")
# get from https://console.weaviate.cloud/dashboard
WEAVIATE_API_KEY = _load_secret("WEAVIATE_API_KEY")
# The cluster URL is not a credential, so it keeps a default but can be overridden.
WEAVIATE_URL = os.environ.get(
    "WEAVIATE_URL", 'https://cohere-hackathon-v9totjun.weaviate.network'
)
# Optional: only needed when the OpenAI embeddings are used.
OPENAI_API_KEY = _load_secret("OPENAI_API_KEY", required=False)

# Export the keys so libraries that read them from the environment can find them.
os.environ["COHERE_API_KEY"] = COHERE_API_KEY
os.environ["WEAVIATE_API_KEY"] = WEAVIATE_API_KEY
if OPENAI_API_KEY:
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY



In [None]:
import cohere

from weaviate.util import generate_uuid5

from langchain.llms import Cohere
from langchain.chat_models import ChatCohere
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.cohere import CohereEmbeddings


from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Weaviate
from langchain.vectorstores import Chroma, FAISS
import weaviate

from langchain.docstore.document import Document

from langchain.agents import initialize_agent
from langchain.agents import AgentType

from langchain.memory import ConversationBufferMemory

from langchain.retrievers import ContextualCompressionRetriever, CohereRagRetriever
from langchain.retrievers.document_compressors import CohereRerank
from langchain.chains import RetrievalQA, RetrievalQAWithSourcesChain

from unidecode import unidecode

import PyPDF2
import time
import os
import json

## Helper Functions

In [None]:
# 0. get files from google drive
def get_files(path: str, count=None) -> dict:
    """Open every file of every project found under ``path``.

    ``path`` must contain the sub-folders ``Proposal`` and ``RFP``; each of
    their sub-directories is treated as one project.

    Args:
        path: Root data folder containing ``Proposal`` and ``RFP``.
        count: Maximum number of projects to take from *each* sub-folder.
            ``None`` (the default) takes all of them.

    Returns:
        A dict mapping ``"<project> <subdir>"`` to a list of file objects opened
        in binary mode. The caller owns the handles and must close them.

    Raises:
        FileNotFoundError: If ``path`` lacks a ``Proposal`` or ``RFP`` folder.
    """
    files = {}
    for subdir in ('Proposal', 'RFP'):
        # Use the ``path`` argument (previously ignored in favour of a global).
        subpath = os.path.join(path, subdir)
        taken = 0
        # Sorted so that ``count`` selects the same projects on every run.
        for project in sorted(os.listdir(subpath)):
            full_path = os.path.join(subpath, project)
            if not os.path.isdir(full_path):
                # Stray files next to the project folders are not projects.
                continue

            candidates = (
                os.path.join(full_path, f) for f in sorted(os.listdir(full_path))
            )
            files['{} {}'.format(project, subdir)] = [
                open(file_path, 'rb')
                for file_path in candidates
                if os.path.isfile(file_path)
            ]

            taken += 1
            if count == taken:
                break
    return files

In [None]:
# 1. Collect and process data
# - index data appropriately using metadata
def get_document(fileobj: object, project: str) -> list:
    """Split one PDF into chunked ``Document`` objects tagged with their origin.

    Args:
        fileobj: An open binary file object for a PDF; its ``name`` attribute is
            recorded in the metadata of every chunk.
        project: Project label stored under the ``source`` metadata key.

    Returns:
        A list of ``Document`` objects, one per text chunk, in page order. Each
        carries ``source``, ``filename`` and ``page_number`` (1-based, as a
        string) in its metadata.
    """
    reader = PyPDF2.PdfReader(fileobj)

    # The splitter does not depend on the page, so build it once.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)

    doc_combined = []
    for page_number, page in enumerate(reader.pages, start=1):
        # Guard against pages that yield no text, then transliterate to ASCII.
        text = unidecode(page.extract_text() or "")

        for chunk in text_splitter.split_text(text):
            # A fresh dict per chunk so documents never share mutable metadata.
            metadata = {
                'source': project,
                'filename': fileobj.name,
                'page_number': str(page_number),
            }
            doc_combined.append(Document(page_content=chunk, metadata=metadata))
    return doc_combined

In [None]:
# 2. Set up the vector store
# Comments
# a) use metadata to keep track of proposal data: https://python.langchain.com/docs/modules/data_connection/indexing - okay done

def get_db(docs=None, create=False, use_cohere=False, device='cuda'):
    """Return a LangChain ``Weaviate`` vector store for the ``Cfa_proposal`` index.

    Args:
        docs: Documents to index when ``create`` is True. Defaults to no documents.
        create: When True, delete the existing schema and re-index ``docs``.
            When False, print the current schema and attach to the existing index.
        use_cohere: Use Cohere ``embed-english-v3.0`` embeddings instead of the
            local ``BAAI/bge-small-en`` model.
        device: Device passed to the HuggingFace model (ignored with Cohere).

    Returns:
        The ``Weaviate`` vector store.

    Raises:
        RuntimeError: If ``WEAVIATE_USERNAME`` or ``WEAVIATE_PASSWORD`` is unset.
    """
    # Avoid a shared mutable default argument.
    docs = [] if docs is None else docs

    # create vectorstore for retrieval --------------------------------
    if use_cohere:
        # NOTE: the original author found that Cohere embed v3.0 needs an
        # input_type to be specified; it works with this configuration.
        embeddings = CohereEmbeddings(model='embed-english-v3.0', cohere_api_key=COHERE_API_KEY)
    else:
        embeddings = HuggingFaceBgeEmbeddings(
            model_name="BAAI/bge-small-en",
            model_kwargs={'device': device},
            encode_kwargs={'normalize_embeddings': True},
        )

    # SECURITY: account credentials are read from the environment and must never
    # be hard-coded. Credentials that were committed here should be changed.
    username = os.environ.get("WEAVIATE_USERNAME")
    password = os.environ.get("WEAVIATE_PASSWORD")
    if not username or not password:
        raise RuntimeError(
            "Set WEAVIATE_USERNAME and WEAVIATE_PASSWORD before calling get_db()"
        )

    client = weaviate.Client(
        url=WEAVIATE_URL,
        auth_client_secret=weaviate.AuthClientPassword(
            username=username,
            password=password,
        ),
    )

    if create:
        # WARNING: destructive; this drops the schema on the target instance
        # before re-indexing. Confirm the target before running.
        client.schema.delete_all()
        return Weaviate.from_documents(
            docs,
            embeddings,
            weaviate_url=WEAVIATE_URL,
            by_text=False,
            index_name='Cfa_proposal',
            text_key='text',
        )

    # Show the current schema so the existing index can be inspected.
    schema = client.schema.get()
    print(json.dumps(schema, indent=4))
    return Weaviate(
        client,
        index_name='Cfa_proposal',
        text_key='text',
        embedding=embeddings,
        by_text=False,
    )

In [None]:
# 3. set up the retriever chain w/ Cohere rerank
def get_response(db, use_cohere_rag=False, docs=None):
    """Build a RetrievalQA chain whose retriever is reranked by Cohere.

    Inspired by the top-k / top-n semantic search approach: the base retriever
    fetches candidates and ``CohereRerank`` keeps the best five.

    Args:
        db: Vector store providing ``as_retriever``; only used when
            ``use_cohere_rag`` is False.
        use_cohere_rag: Use ``CohereRagRetriever`` as the base retriever instead
            of the vector store (experimental, still needs work).
        docs: Currently unused; kept for backward compatibility.

    Returns:
        A ``(chain, retriever)`` tuple: the QA chain (which also returns its
        source documents) and the base retriever before reranking.
    """
    if use_cohere_rag:
        retriever = CohereRagRetriever(llm=ChatCohere())  # need to work on!
    else:
        # Only build the vector-store retriever when it is actually used.
        retriever = db.as_retriever(search_kwargs={"k": 10})

    # Rerank the candidates and keep the top five.
    compressor = CohereRerank(top_n=5, user_agent="my-app")
    compression_retriever = ContextualCompressionRetriever(
        base_compressor=compressor, base_retriever=retriever
    )

    chain = RetrievalQA.from_chain_type(
        llm=ChatCohere(),
        retriever=compression_retriever,
        return_source_documents=True,
    )

    return chain, retriever


In [None]:
# purely thru co.chat end point
def get_response_cohere(query, chat_history=None, docs=None):
    """Answer ``query`` through Cohere's chat endpoint, grounded on ``docs``.

    Args:
        query: The user message.
        chat_history: Previous turns passed to ``co.chat``. Defaults to none.
        docs: Documents exposing ``metadata['source']``,
            ``metadata['page_number']`` and ``page_content``. Defaults to none.

    Returns:
        The response object returned by ``co.chat``.
    """
    # Avoid shared mutable default arguments.
    chat_history = [] if chat_history is None else chat_history
    docs = [] if docs is None else docs

    # Convert each document into the title/page_number/snippet dict sent to co.chat.
    docs_formatted = [
        {
            'title': d.metadata['source'],
            'page_number': str(d.metadata['page_number']),
            'snippet': d.page_content,
        }
        for d in docs
    ]

    co = cohere.Client(COHERE_API_KEY)
    return co.chat(
        chat_history=chat_history,
        message=query,
        documents=docs_formatted,
    )

## Main Code

In [None]:
# Open every project file, then turn each PDF into chunked documents.
files = get_files(PATH, count=None)

documents = []
for project, fileobjs in files.items():
    try:
        for fileobj in fileobjs:
            documents += get_document(fileobj, project)
    except Exception as exc:
        # Best effort: report the project that failed and keep going. Catching
        # Exception (not a bare except) lets Ctrl-C still interrupt the run.
        print(project)
        print('    skipped remaining files: {!r}'.format(exc))

104-92 Replacement of Noise Walls at Various Locations Proposal
104-65 Peel King St E Bridges over Humber RFP
105-100 Rehabilitation of Cochrane Street over CP Rail Bridge RFP
251-30 DNS Rainbow Bridge Replacement RFP




In [None]:
# Attach to the existing index (create=False) using Cohere embeddings.
database = get_db(docs=documents, create=False, use_cohere=True, device='cuda')

In [None]:
# Build the reranked QA chain on top of the vector store and ask one question.
retrieval_chain, retriever = get_response(database)
results = retrieval_chain(
    dict(query="Tell me about the past projects CFA done for the City of Brampton")
)

In [None]:
# Inspect the documents the chain retrieved as sources for its answer.
results['source_documents']
# ISSUE: the metadata was not coming back from the Weaviate database, cause unknown.
# NOTE(review): the recorded output below does include metadata - confirm whether
# this issue still applies.

[Document(page_content='2020-269P - DETAILED DESIGN, CONTRACT ADMINISTRATION\nAND CONSTRUCTION INSPECTION FOR CONVERSION OF\nPRIVATE NOISE WALLS ON BOVAIRD DRIVE, WEST OF HIGHWAY\n410, IN THE CITY OF BRAMPTON, PROJECT 19-4517\nDate Issued: March 5, 2020 12:00 PM\nVendor Details\nCompany Name: Chisholm Fleming & Associates\nAddress:317 Renfrew Dr., Suite 301\nMarkham, Ontario L3R 9S8\nContact: James Moffat\nEmail: rfp@chisholmfleming.com\nPhone: 905-474-1458 232\nFax: 905-474-1458\nHST#:\nSubmission Details', metadata={'filename': '/content/drive/MyDrive/Cohere Hackathon/data/Proposal/104-85 Noise Walls on Bovaird West of 410/2020-269P Final Submission Complete.pdf', 'page_number': '1', 'source': '104-85 Noise Walls on Bovaird West of 410 Proposal', 'relevance_score': 0.30725408}),
 Document(page_content="Created On: Wednesday March 18, 2020 13:37:31\nSubmitted On: Tuesday April 14, 2020 10:48:06\nSubmitted By: James Moffat\nEmail: rfp@chisholmfleming.com\nTransaction #: 76e0c396-be77-4