In [None]:
## This function will help to process websites to provide structured represenations of the listed content within them. 
# It will first take take an list of string urls. 
# For each url, it will download the content of the website
# If it is a github, it will use the README.md (downloading the github repo if it is not downloaded, and pulling it if it is already present in the saved directory)
# It will then use gpt3 to identify the main columns to be extracted for that website. 
# It will suggest these columns to the user. 
# The user may use these columns or not. The columns will then be used in a prompt template that we will use to extract information. 

# The html content will then be 'scanned through' in blocks.   
# For each block, we will run the prompt with gpt4 extract the information into the columns.


In [25]:
%load_ext autoreload
%autoreload 2

from genai.memory.downloader import url_downloader
output_path = '../../downloads/'
dry_run = False
verbose = True
overwrite = False

link = 'https://github.com/a16z-infra/llm-app-stack'

local_path = url_downloader(link, output_path, overwrite=overwrite, dry_run=dry_run, verbose=verbose)

if 'github' in local_path:
    # add readme.md
    local_path = local_path + '/README.md'

# open content
with open(local_path, 'r') as f:
    content = f.read()

# print(content)




The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
local_path=../../downloads/github/a16z-infra/llm-app-stack
Document ../../downloads/github/a16z-infra/llm-app-stack already exists


In [31]:
from langchain.output_parsers import CommaSeparatedListOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

output_parser = CommaSeparatedListOutputParser()

format_instructions = output_parser.get_format_instructions()

In [72]:
## Extract the information from the input. 
template = """
    Please survey the following information and extract the column names that you would like to extract from the content.
    First focus on columns that are already present in any tables. USE THOSE AS STARTING COLUMNS.
    Next, determine if tables are separated by headings subheadings. 
    Add any necessary column names to describe those headings/subheadings for better classification.
    These could include 'type', 'category', 'tags', etc... 
    For instance for the the following Content would yield the Results
    
    === Content
    ## section 1
    ...
    ### subjectA
    ...
    <table with column col1, col2, col3>
    
    === Result
    col1, col2, col3, DescriptorOf(subjectA)

    === 
    An example of DescriptorOf(subjectA) if subjectA was 'Data Pipelines' would be 'category'
    {subject}
    """
prompt = PromptTemplate(
    template=template+".\n{format_instructions}",
    input_variables=["subject"],
    partial_variables={"format_instructions": format_instructions},
)

model = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0.9)

chain = prompt | model | output_parser

columns = chain.invoke({"subject": content})
print(columns)



In [73]:
columns.append('Category')

In [68]:
# Go through the content in blocks, as separated by the headings.
# For each block, we will run the prompt with gpt4 extract the information into the columns.

# for block in content.split('#'):
#     print(block)
    # print('\n

import re
# re.split("^#.*$", content, flags=re.MULTILINE)
# This splits wrong. It should append the the heading as part of the block.
headings = re.findall("^#.*$", content, flags=re.MULTILINE)
# get information between the headings

blocks = []
for i in range(len(headings)-1):
    block = content.split(headings[i])[1].split(headings[i+1])[0]
    blocks.append(headings[i] + block)
    # print(block)
    # print('\n





In [86]:
# For each block, we will run the prompt with gpt4 extract the information into the columns.
template = """
 
    Use any of the headers to help fill in any columns that are not explicitly mentioned in tables. 
    {block}
    """

prompt = PromptTemplate(
    template="You will be presented with content, that may or may not have relevant information to the following columns: {columns}\n. If there is no relevant content, do not output anything. If there is multiple elemnts of relevant content, output a table with the columns specified." + template ,
    input_variables=["block"],
    partial_variables={"columns": columns},
)

model = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0.0)

chain = prompt | model | output_parser

column_results = []
for block in blocks:
    if len(block) < 20:
        continue
    results = chain.invoke({"block": block})
    column_results.append(results)

In [88]:
for c in column_results:
    print(len(c))

3
2
18
6
5
3
5
4
1
18
5
2
4
3
4
9


In [85]:
len(blocks[2])

17