In [3]:
import os 
import json 
from dotenv import load_dotenv
load_dotenv()

OPENAI_API_KEY =os.getenv("OPENAI_API_KEY")
ACTIVELOOP_TOKEN=os.getenv("ACTIVELOOP_TOKEN")

In [10]:

from langchain import OpenAI 
from langchain.chains import RetrievalQA
from langchain.llms import OpenAIChat
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.prompts import PromptTemplate


In [5]:
import os
import requests
from bs4 import BeautifulSoup
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import TextLoader
import re

In [8]:
#DEEPLAKE_DATASET_PATH='hub://pravosnazna/dataset101'
dataset_path='https:/app.activeloop.ai/pravosnazna/dataset101'
embeddings = OpenAIEmbeddings()

def get_documentation_urls():
    # List of relative URLs for Hugging Face documentation pages, commented a lot of these because it would take too long to scrape all of them
    return [
    'https://zapier.com/resources/guides/quick-start/automation-basics',
    'https://zapier.com/resources/guides/quick-start/create-zap',
    'https://zapier.com/resources/guides/quick-start/more-to-know',
    'https://zapier.com/blog/maximize-productivity-with-multi-step-zaps/',
    'https://zapier.com/blog/how-ai-tools-reach-no-code-audiences/',
    'https://zapier.com/blog/new-product-features-april-2023/',
    
    ]


def construct_full_url(base_url, relative_url):
    # Construct the full URL by appending the relative URL to the base URL
    return base_url + relative_url


def scrape_page_content(url):
    # Send a GET request to the URL and parse the HTML response using BeautifulSoup
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Extract the desired content from the page (in this case, the body text)
    text=soup.body.text.strip()
    # Remove non-ASCII characters
    text = re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f-\xff]', '', text)
    # Remove extra whitespace and newlines
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def scrape_all_content(base_url, relative_urls, filename):
    # Loop through the list of URLs, scrape content, and add it to the content list
    content = []
    for relative_url in relative_urls:
        full_url = construct_full_url(base_url, relative_url)
        scraped_content = scrape_page_content(full_url)
        content.append(scraped_content.rstrip('\n'))

    # Write the scraped content to a file
    with open(filename, 'w', encoding='utf-8') as file:
        for item in content:
            file.write("%s\n" % item)
    
    return content

# Define a function to load documents from a file
def load_docs(root_dir,filename):
    # Create an empty list to hold the documents
    docs = []
    try:
        # Load the file using the TextLoader class and UTF-8 encoding
        loader = TextLoader(os.path.join(
            root_dir, filename), encoding='utf-8')
        # Split the loaded file into separate documents and add them to the list of documents
        docs.extend(loader.load_and_split())
    except Exception as e:
        # If an error occurs during loading, ignore it and return an empty list of documents
        pass
    # Return the list of documents
    return docs

  
def split_docs(docs):
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    return text_splitter.split_documents(docs) 

def load_vectors_into_deeplake(dataset_path, source_chunks):
    # Initialize the DeepLake database with the dataset path and embedding function
    deeplake_db = DeepLake(dataset_path=dataset_path, embedding_function=embeddings)
    # Add the text chunks to the database
    deeplakedb=deeplake_db.add_texts(source_chunks)
    return deeplakedb


# Define the main function
def main():
    base_url = 'https://zapier.com/'
    # Set the name of the file to which the scraped content will be saved
    filename='content.txt'
    # Set the root directory where the content file will be saved
    root_dir =''
    relative_urls = get_documentation_urls()
    # Scrape all the content from the relative urls and save it to the content file
    content = scrape_all_content(base_url, relative_urls,filename)
    # Load the content from the file
    docs = load_docs(root_dir,filename)
    # Split the content into individual documents
    texts = split_docs(docs)
    # Create a DeepLake database with the given dataset path and embedding function
    db = DeepLake(dataset_path=dataset_path, embedding_function=embeddings)
    # Add the individual documents to the database
    db.add_documents(texts)
    # Clean up by deleting the content file
    os.remove(filename)

# Call the main function if this script is being run as the main program
if __name__ == '__main__':
    main()

https:/app.activeloop.ai/pravosnazna/dataset101 loaded successfully.





Deep Lake Dataset in https:/app.activeloop.ai/pravosnazna/dataset101 already exists, loading from the storage
Dataset(path='https:/app.activeloop.ai/pravosnazna/dataset101', tensors=['embedding', 'ids', 'metadata', 'text'])

  tensor     htype    shape    dtype  compression
  -------   -------  -------  -------  ------- 
 embedding  generic   (0,)    float32   None   
    ids      text     (0,)      str     None   
 metadata    json     (0,)      str     None   
   text      text     (0,)      str     None   


Evaluating ingest: 100%|██████████| 1/1 [00:05<00:00


Dataset(path='https:/app.activeloop.ai/pravosnazna/dataset101', tensors=['embedding', 'ids', 'metadata', 'text'])

  tensor     htype     shape     dtype  compression
  -------   -------   -------   -------  ------- 
 embedding  generic  (2, 1536)  float32   None   
    ids      text     (2, 1)      str     None   
 metadata    json     (2, 1)      str     None   
   text      text     (2, 1)      str     None   




In [53]:
# Prepare a retriever from the deeplake_db
db = DeepLake(dataset_path=dataset_path, read_only=True, embedding_function=embeddings)
retriever = db.as_retriever()
retriever.search_kwargs['distance_metric'] = 'cos'
retriever.search_kwargs['k'] = 20
# Construct the prompt template for the chain
template = """
Given these portions of documents and the presented question, you're tasked to provide a comprehensive response with appropriate sources. If the answer is beyond my capabilities, I'll acknowledge that instead of guessing. Every response should include relevant sources.

QUESTION: {question}

DOCUMENT EXCERPTS:
{summaries}

RESPONSE AND SOURCES:
"""

prompt_template = PromptTemplate(
    template=template,
    input_variables=["summaries", "question"],
)

# Construct the RetrievalQAWithSourcesChain
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=OpenAI(temperature=0),
    chain_type="stuff", 
    retriever=retriever,
    chain_type_kwargs={
        "prompt": prompt_template,
    },
)



https:/app.activeloop.ai/pravosnazna/dataset101 loaded successfully.





Deep Lake Dataset in https:/app.activeloop.ai/pravosnazna/dataset101 already exists, loading from the storage
Dataset(path='https:/app.activeloop.ai/pravosnazna/dataset101', read_only=True, tensors=['embedding', 'ids', 'metadata', 'text'])

  tensor     htype    shape    dtype  compression
  -------   -------  -------  -------  ------- 
 embedding  generic   (0,)    float32   None   
    ids      text     (0,)      str     None   
 metadata    json     (0,)      str     None   
   text      text     (0,)      str     None   




In [54]:
chain("What is Zapier?")

{'question': 'What is Zapier?',
 'answer': '\nZapier is an online automation tool that allows users to connect different web applications and automate repetitive tasks. It is a web-based service that enables users to integrate the web applications they use, and automate workflows without any coding. Zapier allows users to create "Zaps" which are automated workflows that connect two or more apps together. For example, a user can create a Zap that automatically adds new contacts from a spreadsheet to their CRM. Zapier is used by millions of people around the world to automate their workflows and save time. \n\nSources: \n\n- https://zapier.com/\n- https://www.g2.com/products/zapier/reviews\n- https://www.capterra.com/p/170045/Zapier/',
 'sources': ''}

In [62]:
template = """
As a Question Answering Chatbot, I am tasked to provide comprehensive answers to specific questions based on the provided document excerpts. If the information is not contained within these excerpts, it is crucial to admit the absence of information rather than speculate. Each answer MUST include reference to the sources it was derived from. Please ensure that all answers end with a 'SOURCES' section detailing from where the information was obtained.

QUESTION: {question}

DOCUMENT EXCERPTS:
{summaries}

RESPONSE:
"""


In [63]:
prompt_template = PromptTemplate(
    template=template,
    input_variables=["summaries", "question"],
)

chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=OpenAI(temperature=0),
    chain_type="stuff", # Replace with the correct chain type
    retriever=retriever,
    chain_type_kwargs={
        "prompt": prompt_template,
    },
)


In [64]:
chain("How Zapier connects two apps together?")

{'question': 'How Zapier connects two apps together?',
 'answer': 'Zapier is a web-based automation tool that connects two apps together. It allows users to create automated workflows, called Zaps, that can be triggered by a specific event in one app and then perform an action in another app. For example, a Zap can be set up to automatically create a new task in a project management app when a new customer is added to a CRM. ',
 'sources': 'https://zapier.com/learn/getting-started-guide/what-is-zapier/'}