In [None]:
!pip3 install -U readabilipy langchain openai bs4 requests chromadb tiktoken

In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
from urllib.parse import unquote

In [2]:
# Search term sent to Google in the next cell.
query = "langchain"

In [3]:
# Pass the query via `params` so requests URL-encodes it (spaces, '&', '#',
# non-ASCII) instead of pasting it into the URL by hand.
response = requests.get(
    "https://www.google.com/search",
    params={"q": query},
    timeout=10,  # requests never times out on its own
)
response.raise_for_status()  # Fail loudly instead of parsing an HTTP error page
soup = BeautifulSoup(response.text, "html.parser")  # Parse the HTML
links = soup.find_all("a")  # Find all the links in the HTML

In [4]:
def filter_links(links):
    """Extract external result URLs from the anchors of a Google results page.

    Only anchors whose href starts with Google's ``/url?q=`` redirect prefix
    are considered. Google Scholar redirect URLs are unwrapped to their target;
    other google.com links and links ending in ``.pdf`` are dropped. URL
    fragments (``#...``) are removed.

    Args:
        links: Iterable of anchor elements that support ``.get("href")``
            (for example BeautifulSoup tags).

    Returns:
        List of absolute http(s) URLs in encounter order. Duplicates are not
        removed here.
    """
    redirect_prefix = "/url?q="
    scholar_prefix = "https://scholar.google.com/scholar_url?url="

    urls = []
    for link in links:
        # Anchors without an href (e.g. <a name="...">) must be skipped;
        # indexing with link["href"] would raise KeyError on them.
        href = link.get("href")
        if not href or not href.startswith(redirect_prefix):
            continue

        # Strip only the leading redirect prefix, drop Google's trailing
        # tracking parameters, then decode the percent-encoded target.
        url = unquote(href[len(redirect_prefix):].split("&sa=")[0])

        if url.startswith(scholar_prefix + "http"):
            url = url.replace(scholar_prefix, "").split("&")[0]
        elif 'google.com/' in url or url.endswith('.pdf'):
            continue

        url = url.split('#')[0]
        if url.startswith(('http://', 'https://')):
            urls.append(url)
    return urls

urls = filter_links(links)
# Dedupe the urls after removing anchors. sorted(set(...)) gives a stable
# order and keeps every item a plain `str`.
urls = sorted(set(urls))
urls

['https://api.python.langchain.com/',
 'https://aws.amazon.com/what-is/langchain/',
 'https://en.wikipedia.org/wiki/LangChain',
 'https://github.com/langchain-ai/langchain',
 'https://js.langchain.com/docs/get_started/introduction',
 'https://pypi.org/project/langchain/',
 'https://python.langchain.com/docs/get_started/introduction',
 'https://python.langchain.com/docs/get_started/quickstart',
 'https://twitter.com/langchainai?lang=en',
 'https://www.ibm.com/topics/langchain',
 'https://www.langchain.com/',
 'https://www.techtarget.com/searchenterpriseai/definition/LangChain']

In [5]:
# HTML -> text
from readabilipy import simple_json_from_html_string 
from langchain.schema import Document
from typing import Optional 

In [6]:
def scrape_and_parse(url: str, timeout: float = 10) -> Optional[Document]:
    """Scrape a webpage and parse it into a Document object.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        A Document whose ``page_content`` is the extracted text blocks joined
        by blank lines, with ``source`` and ``page_title`` metadata. Returns
        None (after printing a message) when the download fails, the server
        answers with an HTTP error status, parsing raises, or no plain text
        is extracted.
    """
    try:
        # requests never times out by default; a stalled server would
        # otherwise block this call indefinitely.
        req = requests.get(url, timeout=timeout)
        # Do not treat the body of a 4xx/5xx response as article content.
        req.raise_for_status()
        article = simple_json_from_html_string(req.text, use_readability=True)

        if not article['plain_text']:
            print(f"No plain text found in the article from {url}")
            return None

        return Document(
            page_content='\n\n'.join(a['text'] for a in article['plain_text']),
            # NOTE(review): a missing title is stored as '' so that metadata
            # values stay strings - confirm this suits the vector store.
            metadata={'source': url, 'page_title': article['title'] or ''},
        )
    except Exception as e:
        # Deliberately best-effort: one bad page must not abort the whole
        # crawl. RequestException is a subclass of Exception, so a single
        # handler covers network and parsing failures alike.
        print(f"Error occurred while processing {url}: {e}")
        return None


In [7]:
from concurrent.futures import ThreadPoolExecutor, as_completed

# nodejs must be installed for this step.
# You can install it like so: curl -sL https://deb.nodesource.com/setup_lts.x | sudo -E bash - && sudo apt-get install -y nodejs
documents = []
with ThreadPoolExecutor(max_workers=10) as pool:
    # Map every submitted job back to the URL it is fetching.
    pending = {}
    for candidate in urls:
        pending[pool.submit(scrape_and_parse, candidate)] = candidate

    # Collect results in completion order, keeping only pages that parsed.
    for finished in as_completed(pending):
        url = pending[finished]
        try:
            doc = finished.result()
        except Exception as exc:
            print(f'{url} generated an exception: {exc}')
            continue
        if doc:
            documents.append(doc)


No plain text found in the article from https://pypi.org/project/langchain/
No plain text found in the article from https://python.langchain.com/docs/get_started/quickstart
No plain text found in the article from https://python.langchain.com/docs/get_started/introduction
No plain text found in the article from https://www.langchain.com/


In [8]:
from langchain.text_splitter import CharacterTextSplitter

# Split on spaces into chunks of up to 1000 characters, overlapping by 200.
text_splitter = CharacterTextSplitter(
    separator=' ',
    chunk_size=1000,
    chunk_overlap=200,
)
texts = text_splitter.split_documents(documents)
len(texts)

955

In [9]:
import openai
import os
from langchain.embeddings.openai import OpenAIEmbeddings
from dotenv import load_dotenv, find_dotenv

# Load variables from the .env file located by find_dotenv() into the environment.
_ = load_dotenv(find_dotenv())
# Raises KeyError if OPENAI_API_KEY is not set.
openai.api_key = os.environ['OPENAI_API_KEY']

In [10]:
# Embedding model used to index the chunks in the vector store below.
embeddings = OpenAIEmbeddings()

In [11]:
import pprint
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.schema import StrOutputParser

def format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by a blank line."""
    separator = "\n\n"
    contents = (item.page_content for item in docs)
    return separator.join(contents)

# Embed every chunk and index it in a Chroma vector store.
# NOTE(review): re-running this cell in the same kernel may add the chunks a
# second time to the same collection - confirm before relying on re-runs.
db = Chroma.from_documents(texts, embeddings)
# Retriever interface over the store, used for the similarity lookups below.
retriever = db.as_retriever()

In [16]:
# Fetch the chunks the retriever returns for a sample question.
docs = retriever.invoke("Who created langchain?")

In [18]:
# Show the retrieved chunks joined into a single string.
format_docs(docs)

"What is LangChain? LangChain is an open source framework that lets software developers working with artificial intelligence (AI) and its machine learning subset combine large language models with other external components to develop LLM-powered applications. The goal of LangChain is to link powerful LLMs, such as OpenAI's GPT-3.5 and GPT-4, to an array of external data sources to create and reap the benefits of natural language processing (NLP) applications. Developers, software engineers and data scientists with experience in the Python, JavaScript or TypeScript programming languages can make use of LangChain's packages offered in those languages. LangChain was launched as an open source project by co-founders Harrison Chase and Ankush Gola in 2022; the initial version was released that same year. Why is LangChain important? LangChain is a framework that simplifies the process of creating generative AI application interfaces. Developers working on these types of interfaces use variou

In [19]:
from langchain.chains import RetrievalQA

# Answer a question with a "stuff" RetrievalQA chain over the retriever.
# NOTE: this rebinds `query`, which earlier held the web-search term.
llm = ChatOpenAI()
query = "Why is langchain useful?"
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)
result = chain.invoke(query)

In [24]:
# Pretty-print the value stored under the 'result' key of the chain output.
pprint.pprint(result['result'])

('LangChain is useful for several reasons:\n'
 '\n'
 '1. Simplifies the development process: LangChain provides a framework that '
 'simplifies the process of creating generative AI application interfaces. It '
 'streamlines the development of advanced natural language processing (NLP) '
 'applications by providing tools and packages that developers can use.\n'
 '\n'
 '2. Combines powerful language models with external data sources: LangChain '
 "allows developers to link powerful language models, such as OpenAI's GPT-3.5 "
 'and GPT-4, with external data sources. This enables the creation of '
 'LLM-powered applications that can leverage the benefits of natural language '
 'processing.\n'
 '\n'
 '3. Enables the creation of NLP applications: LangChain empowers developers, '
 'software engineers, and data scientists with the ability to create NLP '
 'applications. These applications can understand and process natural '
 'language, enabling tasks such as diagnosing medical conditions, au