In [None]:
pip install langchain requests openai transformers faiss-cpu

In [None]:
from getpass import getpass

# Read the key interactively without echoing it, so it is never stored in the notebook.
OPENAI_API_KEY = getpass(prompt='Enter your OpenAI key: ')

In [None]:
import os

# Publish the key through the process environment under the OPENAI_API_KEY variable.
os.environ.update({"OPENAI_API_KEY": OPENAI_API_KEY})

In [None]:
from langchain.llms import OpenAI
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS
import requests

In [None]:
def get_wiki_data(title, first_paragraph_only, timeout=10):
    """
    Fetch the plain-text extract of an English Wikipedia article as a Document.
    :param title: Title of the article. It is sent as a query parameter, so characters
        such as spaces, '&', '#' or '+' are URL-encoded by requests instead of
        corrupting the query string.
    :param first_paragraph_only: When truthy, ``exintro=1`` is added to the query so only
        the introductory part of the article is requested.
    :param timeout: Number of seconds to wait for the HTTP response. Default is 10.
    :return: A Document whose page_content is the extract and whose metadata["source"]
        is ``https://en.wikipedia.org/wiki/{title}``.
    :raises requests.exceptions.RequestException: On network failure, timeout or an HTTP
        error status.
    :raises KeyError: If the API response contains no extract for the title
        (for example when the page does not exist).
    """
    params = {
        "format": "json",
        "action": "query",
        "prop": "extracts",
        "explaintext": 1,
        "titles": title,
    }
    if first_paragraph_only:
        params["exintro"] = 1

    response = requests.get(
        "https://en.wikipedia.org/w/api.php", params=params, timeout=timeout
    )
    # Fail on HTTP errors here rather than later while decoding an error page as JSON.
    response.raise_for_status()

    pages = response.json()["query"]["pages"]
    # The response maps page ids to page records; a single title yields a single record.
    page = next(iter(pages.values()), {})
    if "extract" not in page:
        raise KeyError(f"No Wikipedia extract found for title {title!r}")

    return Document(
        page_content=page["extract"],
        metadata={"source": f"https://en.wikipedia.org/wiki/{title}"},
    )


In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from time import sleep

def _same_site_links(soup, page_url):
    """Return the de-duplicated absolute http(s) URLs linked from `soup` on `page_url`'s host."""
    from urllib.parse import urldefrag, urljoin

    host = urlparse(page_url).netloc
    links = []
    seen = set()
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if not href:
            continue
        # Resolve relative hrefs against the current page and drop the #fragment,
        # which never reaches the server and would only cause duplicate downloads.
        absolute = urldefrag(urljoin(page_url, href)).url
        parts = urlparse(absolute)
        # Compare hosts exactly; a substring test would accept off-site URLs that
        # merely mention this domain somewhere in their path or query.
        if parts.scheme in ("http", "https") and parts.netloc == host and absolute not in seen:
            seen.add(absolute)
            links.append(absolute)
    return links


def _crawl_page(url, depth, visited_links, remaining, cache, timeout):
    """Fetch `url`, cache its Document and follow its links while `remaining[0]` allows."""
    # Check if the link has already been visited
    if url in visited_links:
        print("Hit in visited_links set: ", url)
        return None
    # Check if the link is in the cache
    if url in cache:
        print("Hit in cache set: ", url)
        return cache[url]

    visited_links.add(url)
    # Send a GET request to the URL and handle common errors
    try:
        print("Retrieving: ", url)
        page = requests.get(url, timeout=timeout)
        page.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error retrieving the webpage {url}: {str(e)}")
        return None

    # Parse the HTML and keep only its text
    soup = BeautifulSoup(page.text, 'html.parser')
    # Keyword arguments, matching how Document is built in get_wiki_data.
    document = Document(page_content=soup.get_text(), metadata={"source": url})
    cache[url] = document

    if depth <= 0:
        return document

    for link in _same_site_links(soup, url):
        if remaining[0] <= 0:
            break
        # Skip known pages up front so they cost neither a sleep nor part of the budget.
        if link in visited_links or link in cache:
            continue
        remaining[0] -= 1
        # Space out the requests to avoid throttling
        sleep(timeout)
        _crawl_page(link, depth - 1, visited_links, remaining, cache, timeout)
    return document


def get_page_text(url, depth=2, visited_links=None, max_links=10, cache=None, timeout=3):
    """
    Download a webpage, recursively follow its same-site links, and return the Document
    for the starting page. Documents for every page fetched along the way are stored in
    `cache`, keyed by URL; pass your own dict to collect them.
    :param url: The URL of the webpage to scrape
    :param depth: The number of levels deep to recursively follow links. Default is 2.
    :param visited_links: A set of links that have already been visited to prevent revisiting links.
        It is updated in place.
    :param max_links: The maximum total number of links to follow during this call, across
        all levels. Default is 10.
    :param cache: A dict of links and their corresponding documents to prevent unnecessary
        web requests. It is updated in place.
    :param timeout: Number of seconds to wait before timing out a request; the same value is
        used as the pause between consecutive requests. Default is 3.
    :return: The Document for `url`, or None if `url` was already in `visited_links` or
        could not be retrieved.
    """
    # Initialize the visited links set and cache if not provided
    if visited_links is None:
        visited_links = set()
    if cache is None:
        cache = {}
    # One-element list so every recursion level draws from the same link budget.
    remaining = [max_links]
    return _crawl_page(url, depth, visited_links, remaining, cache, timeout)



In [None]:

#visited_links = set()
#doc = get_page_text("https://www.betterup.com/en/about-us?hsLang=en", depth=1, visited_links=visited_links)
#print(doc.page_content)
#print(doc.metadata)


In [None]:
# (start URL, crawl depth) for every page that should be indexed.
seed_pages = [
    ("https://www.betterup.com/en/about-us?hsLang=en", 0),
    ("https://www.betterup.com/about-us/leadership-team?hsLang=en", 0),
    ("https://www.betterup.com/about-us/careers", 1),
    ("https://www.betterup.com/blog/page/1", 1),
]

# get_page_text returns only the Document of the start page; the pages it reaches by
# following links are stored in `cache`. Sharing one cache across the calls keeps those
# pages and avoids downloading the same URL twice.
page_cache = {}
for seed_url, crawl_depth in seed_pages:
    get_page_text(seed_url, depth=crawl_depth, cache=page_cache)

# Only successfully downloaded pages are in the cache, so `sources` never contains None.
sources = list(page_cache.values())

In [None]:
source_chunks = []
# Split on spaces into chunks of at most 1024 characters, without overlap.
splitter = CharacterTextSplitter(separator=" ", chunk_size=1024, chunk_overlap=0)
for source in sources:
    # get_page_text returns None when a page cannot be retrieved; skip such entries
    # instead of failing on `None.page_content`.
    if source is None:
        continue
    # Every chunk keeps the metadata (source URL) of the page it was cut from.
    source_chunks.extend(
        Document(page_content=chunk, metadata=source.metadata)
        for chunk in splitter.split_text(source.page_content)
    )

# Embed all chunks with OpenAI embeddings and build the FAISS index used for retrieval.
search_index = FAISS.from_documents(source_chunks, OpenAIEmbeddings())

In [None]:
# Question-answering-with-sources chain driven by an OpenAI LLM at temperature 0.
chain = load_qa_with_sources_chain(OpenAI(temperature=0))

def print_answer(question):
    """Print the chain's "output_text" for `question`.

    The 4 chunks of `search_index` most similar to the question are passed to the
    chain as its input documents.
    """
    relevant_docs = search_index.similarity_search(question, k=4)
    chain_inputs = {
        "input_documents": relevant_docs,
        "question": question,
    }
    result = chain(chain_inputs, return_only_outputs=False)
    print(result["output_text"])

In [None]:
# Question about the VPs, answered from the indexed pages.
print_answer(question="Who are all of the VPs at BetterUP?")

In [None]:
# Same topic, asking for the answer formatted as a markdown table.
print_answer(question="How many VPs are men vs women? List the woman, list the men. Emit as a markdown table")

In [None]:
# Question about the benefits of coaching.
print_answer(question="What are all of the ways coaching can help people?")

In [None]:
# General question about the company.
print_answer(question="What is Better UP?")