<a href="https://colab.research.google.com/github/FMurray/hyperdemocracy/blob/main/hyper_democracy_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Dependencies

In [None]:
%%capture
!pip install datasets
!pip install langchain==0.0.193
!pip install chromadb
!pip install openai
!pip install tiktoken
!pip install rich

In [None]:
import os
import random
import re

from datasets import load_dataset
import rich

# Setup Keys

In [None]:
# Alternative (avoid committing a real key to the notebook):
# os.environ["OPENAI_API_KEY"] = "YOUR KEY HERE"
import getpass
# Prompt for the key interactively; getpass does not echo the typed value,
# so the key never appears in the saved cell output.
key = getpass.getpass()

In [None]:
# Export the key through the environment variable the OpenAI client reads by default.
os.environ['OPENAI_API_KEY'] = key

# Load Demo Records

We are going to use a small subset of records provided by https://assembled.app/.

For the purposes of this workshop, we have created a Hugging Face dataset, https://huggingface.co/datasets/hacdc/hyperdemocracy, which we can load with the `load_dataset` function; it returns a Hugging Face `Dataset`. For more information, see the [datasets](https://huggingface.co/docs/datasets/index) package documentation.

In [None]:
ds = load_dataset("hacdc/hyperdemocracy", split="train")

In [None]:
ds

# Convert to Pandas DataFrame

We could do all the processing using the HF dataset, but sometimes it is simpler to explore with a DataFrame.

In [None]:
df_all = ds.to_pandas()

In [None]:
df_all.shape

In [None]:
df_all.head()

# Sponsor Graph Sidequest

[Notebook here](https://github.com/FMurray/hyperdemocracy/blob/main/sidequests/sponsor_graph.ipynb)

In [None]:
import networkx as nx

In [None]:
# Build an undirected graph with one node per legislative record and one node
# per sponsor; every (record, sponsor) pair becomes an edge whose "kind"
# attribute is taken from the third field of the sponsor entry.
G = nx.Graph()
for _, rec in df_all.iterrows():
    rec_key = rec['key']
    G.add_node(rec_key, kind="record", name=rec["name"])
    for person in rec['sponsors']:
        person_id = person[0]
        G.add_node(person_id, name_tag=person[1], kind="person")
        G.add_edge(rec_key, person_id, kind=person[2])

In [None]:
for node in list(G.nodes.data())[0:5]:
    print(node)

In [None]:
for edge in list(G.edges)[0:5]:
    print(G.edges[edge], G.nodes[edge[0]], G.nodes[edge[1]])

In [None]:
nx.draw(G)

# Remove Records with no Text

In [None]:
# Keep only the records whose 'body' text is not the empty string.
df = df_all.loc[df_all['body'].ne('')]
df.shape

In [None]:
def split_key(key):
    """Split a record key into (congress_num, legis_class, legis_num).

    The key is read as a run of digits (congress number), followed by a run
    of non-digits (legislation class, e.g. "HCONRES"), followed by a run of
    digits (legislation number). For example "118HCONRES17" gives
    ("118", "HCONRES", "17"). All three parts are returned as strings.

    TODO: add a link explaining this notation and variable names

    Raises:
        ValueError: if `key` does not start with that digits/non-digits/digits
            pattern.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    match = re.match(r"(\d+)(\D+)(\d+)", key)
    if match is None:
        raise ValueError(f"Unrecognized record key: {key!r}")
    congress_num, legis_class, legis_num = match.groups()
    return congress_num, legis_class, legis_num

In [None]:
def url_from_key(key):
    """Return congress.gov url from key.

    The key is split with `split_key`; the legislation class is mapped to the
    path segment used in the URL, and the congress number is rendered as an
    English ordinal ("101st", "102nd", "103rd", "111th", "118th").

    Raises:
        KeyError: if the legislation class in `key` is not one of the classes
            listed in `url_map`.
    """
    url_map = {
        "HR": "house-bill",
        "HCONRES": "house-concurrent-resolution",
        "HRES": "house-resolution",
        "HJRES": "house-joint-resolution",
        "S": "senate-bill",
        "SCONRES": "senate-concurrent-resolution",
        "SRES": "senate-resolution",
        "SJRES": "senate-joint-resolution",
    }
    congress_num, legis_class, legis_num = split_key(key)
    url_legis_class = url_map[legis_class]

    # Ordinal suffix: 11, 12 and 13 always take "th"; otherwise the last digit decides.
    number = int(congress_num)
    if 11 <= number % 100 <= 13:
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(number % 10, "th")

    url = f"https://www.congress.gov/bill/{congress_num}{suffix}-congress/{url_legis_class}/{legis_num}"
    return url

# From Pandas Dataframe to Langchain Document

TODO: Document me!

TODO: Try loading directly from source with langchain HTML Document loader

Langchain makes use of [Unstructured](https://unstructured-io.github.io/unstructured/examples.html) in several of its document loaders.

In [None]:
from bs4 import BeautifulSoup
from langchain.schema import Document 

In [None]:
# Turn every DataFrame row into a langchain Document: the markup-stripped body
# is the page content, and identifying fields go into the metadata.
docs = []
for _, row in df.iterrows():
    congress_num, legis_class, legis_num = split_key(row['key'])
    sponsors = row['sponsors']
    themes = row['themes']
    doc = Document(
        page_content=BeautifulSoup(row['body']).get_text(),
        metadata={
            'key': row['key'],
            'congress_num': congress_num,
            'legis_class': legis_class,
            'legis_num': legis_num,
            'name': row['name'],
            # Strip any markup from the summary as well.
            'summary': BeautifulSoup(row['summary']).get_text(),
            'source': url_from_key(row['key']),

            # Note: chroma can only filter on float, str, or int
            # https://docs.trychroma.com/usage-guide#using-where-filters

            # Only the first entry's id is kept; fall back to "" when a record
            # lists no sponsors instead of raising IndexError.
            'sponsor': sponsors[0][0] if len(sponsors) > 0 else "",

            # TODO: figure out how to break theme list up in a better way
            'theme0': themes[0] if len(themes) > 0 else "",
        },
    )
    docs.append(doc)
# `doc` stays bound to the last Document built; the following cells inspect it.

In [None]:
doc

In [None]:
print(doc.page_content)

In [None]:
print(len(docs))

In [None]:
doc.metadata

# Document QA Quickstart

https://python.langchain.com/en/latest/modules/indexes/getting_started.html

TODO: What is DocumentQA? 

In [None]:
import langchain
langchain.verbose = False

# Introducing indexes

TODO: What are indexes?

In [None]:
from langchain.indexes import VectorstoreIndexCreator

In [None]:
index = VectorstoreIndexCreator().from_documents(docs)

In [None]:
index

In [None]:
query = "What are the primary themes around energy policy?"

Try some other questions for yourself!

In [None]:
out = index.query(query)
out

In [None]:
out = index.query_with_sources(query)

In [None]:
out

In [None]:
print(out['sources'])

In [None]:
query = "Describe in 100 words the proposed solutions to climate change?"
out = index.query_with_sources(query)
out

# Step by step explanation of the DocumentQA

## Langchain Text Splitters

https://python.langchain.com/en/latest/modules/indexes/text_splitters.html

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Split each Document into overlapping chunks. Sizes are measured with the
# splitter's length function (characters by default, per the langchain docs),
# so consecutive chunks share up to 128 of their at most 512 units.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=128)
split_docs = text_splitter.split_documents(docs)

In [None]:
print("The length before splitting: ", len(docs))
print("The length after splitting: ", len(split_docs))

TODO drill in on chunk size and chunk overlap

In [None]:
split_docs[0]

## Embed and Index Doc Chunks

### Intro to embeddings

TODO: Gabe to do intro to embeddings

In [None]:
import openai

In [None]:
# Embed the first document's full text directly through the OpenAI client.
# NOTE: `openai.Embedding` is the pre-1.0 client API; it is not available in openai>=1.0.
oai_embd = openai.Embedding.create(input=docs[0].page_content, model='text-embedding-ada-002')['data'][0]['embedding']
# Show the embedding's length and its first ten components.
len(oai_embd), oai_embd[0:10]

In [None]:
from langchain.embeddings import OpenAIEmbeddings

In [None]:
embeddings = OpenAIEmbeddings()

In [None]:
lc_embd = embeddings.embed_documents([docs[0].page_content])
len(lc_embd[0]), lc_embd[0][0:10]

## Index Embeddings in a Vector Database

In [None]:
from langchain.vectorstores import Chroma

In [None]:
db = Chroma.from_documents(split_docs, embeddings)

In [None]:
db

In [None]:
# TODO Forrest can implement vector search in numpy and explain what it's doing

# we can just do a search of the vectors (text embeddings)
# and some vectorstores allow us to do different types of search
# https://python.langchain.com/en/latest/modules/indexes/vectorstores.html

In [None]:
# explain similarity types: cosine, inner-product, squared L2
# looks like chroma uses hnswlib which supports 3 distances (default cosine) [TODO confirm default]
# https://github.com/hwchase17/langchain/blob/master/langchain/vectorstores/chroma.py
# https://docs.trychroma.com/usage-guide#changing-the-distance-function
# https://github.com/nmslib/hnswlib/tree/master#supported-distances

# in addition langchain offers maximal marginal relevance on top of cosine
# https://github.com/hwchase17/langchain/blob/master/langchain/vectorstores/utils.py#L10

# Ask for up to 10 nearest chunks, restricted by metadata filter to one bill.
ret_docs = db.similarity_search_with_score(
    "nuclear power",
    k=10,
    filter={"source": "https://www.congress.gov/bill/118th-congress/house-concurrent-resolution/17"},
)

# Each hit is a (Document, score) pair. A dedicated loop name is used so the
# global `doc` Document inspected in earlier cells is not overwritten.
for doc_and_score in ret_docs:
    print(doc_and_score)

In [None]:
# Count the chunks whose source is the filtered bill, to show that the search
# above returned all of them.
sum(
    1
    for chunk in split_docs
    if chunk.metadata['source'] == 'https://www.congress.gov/bill/118th-congress/house-concurrent-resolution/17'
)

# What are retrievers?

TODO: TL;DR 

In [None]:
from langchain.chains import RetrievalQA
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.llms import OpenAI

In [None]:
retriever = db.as_retriever(search_kwargs={'k':10})

In [None]:
retriever

Compare the chains in the original DocumentQA quickstart with the chains here

In [None]:
# Question-answering chain on top of the retriever defined above.
# chain_type="stuff": per the langchain docs, all retrieved chunks are placed
# into a single prompt for the LLM.
# return_source_documents=True: the retrieved documents are included in the
# chain output alongside the answer.
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(), 
    chain_type="stuff", 
    retriever=retriever, 
    return_source_documents=True,
)

# Questions

* what are the components of the RetrievalQA chain?
* what is the QA prompt?
* how would you modify the QA prompt?
* what is the difference between the following qa chain types?
    * stuff
    * map_reduce
    * map_rerank
    * refine

# Resources

* https://github.com/hwchase17/langchain/tree/master/langchain/chains/retrieval_qa
* https://github.com/hwchase17/langchain/tree/master/langchain/chains/question_answering

In [None]:
rich.print(qa)

## How many ways can we print a prompt? 

In [None]:
prompt_template = qa.combine_documents_chain.llm_chain.prompt
prompt_template

In [None]:
print(prompt_template.template)

In [None]:
import textwrap

In [None]:
# Re-flow the template to textwrap's default width (70 columns). Note that
# textwrap.wrap also replaces the template's own line breaks with spaces, so
# the original paragraph structure is not preserved in this view.
print('\n'.join(textwrap.wrap(prompt_template.template)))

In [None]:
rich.print(prompt_template.format(context='[CONTEXT]', question='[QUESTION]'))

In [None]:
answer = qa("What is the solution to climate change?")

In [None]:
answer.keys()

In [None]:
answer['result']

In [None]:
qaws = RetrievalQAWithSourcesChain.from_chain_type(
    llm=OpenAI(), 
    chain_type="stuff", 
    retriever=retriever, 
    return_source_documents=True,
)

In [None]:
rich.print(qaws)

In [None]:
pt = qaws.combine_documents_chain.llm_chain.prompt

In [None]:
rich.print(pt.format(summaries='[SUMMARIES]', question='[QUESTION]'))

In [None]:
answer = qaws("What is the solution to climate change?")

In [None]:
answer.keys()

In [None]:
answer['answer'], answer['sources']

# TODO

Try alternatives to stuff

Figure out how to pass all the options to the high level constructor. 

https://github.com/hwchase17/langchain/blob/master/langchain/indexes/vectorstore.py

In [None]:
# Same high-level constructor as in the quickstart, but with the vector store,
# embedding model and text splitter spelled out explicitly (the splitter
# settings mirror the step-by-step section above).
# Not used further in this notebook; build an index with
# `index_creator.from_documents(docs)`.
index_creator = VectorstoreIndexCreator(
    vectorstore_cls=Chroma, 
    embedding=OpenAIEmbeddings(),
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=128)
)


Sticking this here to decide if we want to use this in the course content

https://xml.house.gov/

TODO: Sidequest on implementing a langchain document loader using this XML schema ^^^

https://www.everycrsreport.com/

# Let's make it a conversation

https://python.langchain.com/en/latest/modules/chains/index_examples/chat_vector_db.html

In [None]:
from langchain.chains import ConversationalRetrievalChain

In [None]:
db

In [None]:
from langchain.memory import ConversationBufferMemory
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

In [None]:
# Conversational variant of the QA chain: documents come from the Chroma
# retriever, and `memory` is passed in to hold the running chat history so
# that follow-up questions do not need to supply it explicitly.
# temperature=0 is used to make the LLM's answers as repeatable as possible.
qachat = ConversationalRetrievalChain.from_llm(
    OpenAI(temperature=0), 
    db.as_retriever(), 
    memory=memory
)

In [None]:
query = "What is the solution to climate change?"
answer = qachat(query)

In [None]:
rich.print(answer)

In [None]:
follow_up = "How certain is the 350 number?"
result = qachat({"question": follow_up})

In [None]:
rich.print(result)