In [1]:
%pip install accelerate transformers[torch] torch sentencepiece --user

Note: you may need to restart the kernel to use updated packages.


In [2]:
from transformers import BigBirdForQuestionAnswering, AutoTokenizer

model = BigBirdForQuestionAnswering.from_pretrained("google/bigbird-base-trivia-itc", block_size=16, num_random_blocks=2)
tokenizer = AutoTokenizer.from_pretrained("google/bigbird-base-trivia-itc")

Downloading (…)lve/main/config.json:   0%|          | 0.00/789 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/527M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/943 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/846k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/775 [00:00<?, ?B/s]

In [3]:
import os
from pathlib import Path

models_path = Path(f"{os.getcwd()}/models/bigbird-base-trivia")
if not models_path.exists():
 models_path.mkdir(parents=True, exist_ok=True)

# Storing tokenizer locally
tokenizer.save_pretrained(str(models_path))
print("Tokenizer saved successfully!")
# Storing model locally
model.save_pretrained(str(models_path))
print("Model saved successfully!")

Tokenizer saved successfully!
Model saved successfully!


## Loading stored model

In [4]:
import os
from pathlib import Path
from transformers import BigBirdForQuestionAnswering, AutoTokenizer

models_path = Path(f"{os.getcwd()}/models/bigbird-base-trivia")
model = BigBirdForQuestionAnswering.from_pretrained(str(models_path))
tokenizer = AutoTokenizer.from_pretrained(str(models_path))

## Documents URL Scraping

In [5]:
%pip install beautifulsoup4 requests

Note: you may need to restart the kernel to use updated packages.


In [22]:
from pathlib import Path
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import requests

root_url = "https://airflow.apache.org/docs/apache-airflow/stable/"
root_response = requests.get(root_url)
root_html = root_response.content.decode("utf-8")
soup = BeautifulSoup(root_html, 'html.parser')

root_url_parts = urlparse(root_url)
root_links = soup.find_all("a", attrs={"class": "reference internal"})

result = set()
for root_link in root_links:
    path = root_url_parts.path + root_link.get("href")
    path = str(Path(path).resolve())
    path = urlparse(path).path
    url = f"{root_url_parts.scheme}://{root_url_parts.netloc}{path}"
    result.add(url)
urls = list(result)
print(*urls, sep="\n")

https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/index.html
https://airflow.apache.org/docs/apache-airflow/stable/license.html
https://airflow.apache.org/docs/apache-airflow/stable/migrations-ref.html
https://airflow.apache.org/docs/apache-airflow/stable/
https://airflow.apache.org/docs/apache-airflow/stable/deprecated-rest-api-ref.html
https://airflow.apache.org/docs/apache-airflow/stable/best-practices.html
https://airflow.apache.org/docs/apache-airflow/stable/templates-ref.html
https://airflow.apache.org/docs/apache-airflow/stable/configurations-ref.html
https://airflow.apache.org/docs/apache-airflow/stable/integration.html
https://airflow.apache.org/docs/apache-airflow/stable/database-erd-ref.html
https://airflow.apache.org/docs/apache-airflow/stable/release_notes.html
https://airflow.apache.org/docs/apache-airflow/stable/ui.html
https://airflow.apache.org/docs/apache-airflow/stable/operators-and-hooks-ref.html
https://airflow.apache.org/docs/apac

## Embeddings Database

In [7]:
%pip install langchain

Note: you may need to restart the kernel to use updated packages.


In [89]:
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

loader = WebBaseLoader(urls)
documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
splitted_documents = text_splitter.split_documents(documents)
print("Total documents: ", len(splitted_documents))

Total documents:  1459


In [90]:
splitted_documents[0]

Document(page_content='Administration and Deployment — Airflow Documentation\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n                            Community\n                        \n\n                            Meetups\n                        \n\n                            Documentation\n                        \n\n                            Use-cases\n                        \n\n                            Announcements\n                        \n\n                            Blog\n                        \n\n                            Ecosystem\n                        \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n                                Community\n                            \n\n                                Meetups\n                            \n\n                                Documentation\n                            \n\n                                Use-cases\n        

In [34]:
%pip install pandas

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need to restart the kernel to use updated packages.


In [91]:
import pandas as pd

page_contents = []
sources = []
titles = []
languages = []

for document in splitted_documents:
    page_contents.append(document.page_content)
    if document.metadata:
        sources.append(document.metadata.get('source', "Unknown"))
        titles.append(document.metadata.get('title', "Unknown"))
        languages.append(document.metadata.get('language', "Unknown"))

documents_df = pd.DataFrame({
    'page_content': page_contents,
    'source': sources,
    'title': titles,
    'language': languages
})
documents_df.fillna("Unknown", inplace=True)
documents_df.head()

Unnamed: 0,page_content,source,title,language
0,Administration and Deployment — Airflow Docume...,https://airflow.apache.org/docs/apache-airflow...,Administration and Deployment — Airflow Docume...,en
1,Announcements\n \n\...,https://airflow.apache.org/docs/apache-airflow...,Administration and Deployment — Airflow Docume...,en
2,Internal DB details\n\nDatabase Migrations\nDa...,https://airflow.apache.org/docs/apache-airflow...,Administration and Deployment — Airflow Docume...,en
3,Production Deployment\nDatabase backend\nMulti...,https://airflow.apache.org/docs/apache-airflow...,Administration and Deployment — Airflow Docume...,en
4,DAG Serialization\nDag Serialization Settings\...,https://airflow.apache.org/docs/apache-airflow...,Administration and Deployment — Airflow Docume...,en


In [92]:
# Replace \n and \t with a space
documents_df["page_content"] = documents_df["page_content"].replace('\n', ' ', regex=True)
documents_df["page_content"] = documents_df["page_content"].replace('\t', ' ', regex=True)
# Remove leading and trailing spaces
documents_df["page_content"] = documents_df["page_content"].str.strip()

In [93]:
documents_df.head()

Unnamed: 0,page_content,source,title,language
0,Administration and Deployment — Airflow Docume...,https://airflow.apache.org/docs/apache-airflow...,Administration and Deployment — Airflow Docume...,en
1,Announcements ...,https://airflow.apache.org/docs/apache-airflow...,Administration and Deployment — Airflow Docume...,en
2,Internal DB details Database Migrations Datab...,https://airflow.apache.org/docs/apache-airflow...,Administration and Deployment — Airflow Docume...,en
3,Production Deployment Database backend Multi-N...,https://airflow.apache.org/docs/apache-airflow...,Administration and Deployment — Airflow Docume...,en
4,DAG Serialization Dag Serialization Settings L...,https://airflow.apache.org/docs/apache-airflow...,Administration and Deployment — Airflow Docume...,en


In [94]:
documents_df.isnull().sum()

page_content    0
source          0
title           0
language        0
dtype: int64

In [38]:
%pip install pyarrow fastparquet

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting fastparquet
  Downloading fastparquet-2023.7.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting cramjam>=2.3 (from fastparquet)
  Downloading cramjam-2.6.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: cramjam, fastparquet
Successfully installed cramjam-2.6.2 fastparquet-2023.7.0
Note: you may need to restart the kernel to use updated packages

In [95]:
documents_df.to_parquet('./documents.parquet')

In [12]:
%pip install chromadb

Collecting chromadb
  Downloading chromadb-0.3.26-py3-none-any.whl (123 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.6/123.6 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
Collecting hnswlib>=0.7 (from chromadb)
  Downloading hnswlib-0.7.0.tar.gz (33 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting clickhouse-connect>=0.5.7 (from chromadb)
  Downloading clickhouse_connect-0.6.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting duckdb>=0.7.1 (from chromadb)
  Downloading duckdb-0.8.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (14.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.8/14.8 MB[0m [31m16.2 MB/s

In [14]:
import chromadb
from chromadb.config import Settings
client = chromadb.Client(Settings(
    chroma_db_impl="duckdb+parquet",
    persist_directory="./db/"
))

In [96]:
client.delete_collection(name="airflow_docs_stable")
collection = client.get_or_create_collection(name="airflow_docs_stable")

In [97]:
for index, row in documents_df.iterrows():
    if pd.notnull(row['source']) and pd.notnull(row['title']) and pd.notnull(row['language']):
        metadata = {
            'source': row['source'],
            'title': row['title'],
            'language': row['language']
        }
    collection.add(
        documents=[row['page_content']],
        metadatas=[metadata],
        ids=[str(index)],
    )

In [98]:
client.persist()

True

In [101]:
question = "How to create a DAG?"
results = collection.query(
    query_texts=[question],
    n_results=3,
)
print(results)

{'ids': [['1212', '1313', '87']], 'embeddings': None, 'documents': [['Using the Public Interface for DAG Authors¶  DAGs¶ The DAG is Airflow’s core entity that represents a recurring workflow. You can create a DAG by instantiating the DAG class in your DAG file. You can also instantiate them via :class::~airflow.models.dagbag.DagBag class that reads DAGs from a file or a folder. DAGs can also have parameters specified via :class::~airflow.models.param.Param class. Airflow has a set of example DAGs that you can use to learn how to write DAGs   airflow.example_dags   You can read more about DAGs in DAGs. References for the modules used in DAGs are here:   airflow.models.dag airflow.models.dagbag airflow.models.param     Operators¶ Operators allow for generation of certain types of tasks that become nodes in the DAG when instantiated. There are 3 main types of operators:', 'Positional Arguments¶  dag_id The id of the dag  execution_date The execution date of the DAG (optional)     Named Ar

In [102]:
formatted_result = "\n\n".join(results["documents"][0])
print(formatted_result)

Using the Public Interface for DAG Authors¶  DAGs¶ The DAG is Airflow’s core entity that represents a recurring workflow. You can create a DAG by instantiating the DAG class in your DAG file. You can also instantiate them via :class::~airflow.models.dagbag.DagBag class that reads DAGs from a file or a folder. DAGs can also have parameters specified via :class::~airflow.models.param.Param class. Airflow has a set of example DAGs that you can use to learn how to write DAGs   airflow.example_dags   You can read more about DAGs in DAGs. References for the modules used in DAGs are here:   airflow.models.dag airflow.models.dagbag airflow.models.param     Operators¶ Operators allow for generation of certain types of tasks that become nodes in the DAG when instantiated. There are 3 main types of operators:

Positional Arguments¶  dag_id The id of the dag  execution_date The execution date of the DAG (optional)     Named Arguments¶  -c, --conf JSON string that gets pickled into the DagRun’s con

In [113]:
question = f"Q: {question}?"
context = f"Context: {formatted_result}"
encoded_input = tokenizer(question, context, return_tensors='pt')
output = model(**encoded_input)
print(f"Response Tokens: {output}")

Response Tokens: BigBirdForQuestionAnsweringModelOutput(loss=None, start_logits=tensor([[-1.1608e+01, -1.0000e+06, -1.0000e+06, -1.0000e+06, -1.0000e+06,
         -1.0000e+06, -1.0000e+06, -1.0000e+06, -1.0000e+06, -1.0000e+06,
         -1.0000e+06, -1.0000e+06, -1.0000e+06, -1.0000e+06, -1.0000e+06,
         -1.0000e+06, -1.0000e+06, -1.0000e+06, -1.0000e+06, -1.0000e+06,
         -1.0000e+06, -1.0000e+06, -1.0000e+06, -1.0000e+06, -1.0000e+06,
         -1.0000e+06, -1.0000e+06, -1.0000e+06, -1.0000e+06, -1.0000e+06,
         -1.0000e+06, -1.0000e+06, -6.0038e+00, -1.3273e+01, -4.2253e+00,
         -1.0556e+01, -4.7226e+00, -8.4252e+00, -7.3219e+00, -1.3025e+00,
         -7.5367e+00, -5.9061e+00, -1.0644e+01, -1.2375e+00, -7.3016e+00,
         -9.5997e+00, -1.1638e+01, -2.0047e+00, -8.3847e+00, -1.3064e+01,
         -5.5648e+00, -7.6803e+00, -1.0448e+01, -1.2911e+01, -7.7571e+00,
         -9.4308e+00, -1.3540e+01, -1.2837e+01, -1.3776e+01, -1.0777e+01,
         -8.7497e+00, -1.1388e+0