# Using T5 Small 80M for Airflow Docs Q&A

## Dependencies

In [1]:
%pip install accelerate transformers[torch] torch sentencepiece --user

Note: you may need to restart the kernel to use updated packages.


## Model Setup

In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small")

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

<pad> Wie ich er bitten?</s>




In [17]:
import os
from pathlib import Path

models_path = Path(f"{os.getcwd()}/models/t5-small")
if not models_path.exists():
 models_path.mkdir(parents=True, exist_ok=True)

# Storing tokenizer locally
tokenizer.save_pretrained(str(models_path))
print("Tokenizer saved successfully!")
# Storing model locally
model.save_pretrained(str(models_path))
print("Model saved successfully!")

Tokenizer saved successfully!
Model saved successfully!


## Loading stored model

In [18]:
import os
from pathlib import Path
from transformers import T5Tokenizer, T5ForConditionalGeneration

models_path = Path(f"{os.getcwd()}/models/t5-small")
tokenizer = T5Tokenizer.from_pretrained(str(models_path))
model = T5ForConditionalGeneration.from_pretrained(str(models_path))

## Documents URL Scraping

In [19]:
%pip install beautifulsoup4 requests

Note: you may need to restart the kernel to use updated packages.


In [20]:
from pathlib import Path
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import requests

root_url = "https://airflow.apache.org/docs/apache-airflow/stable/index.html"
root_response = requests.get(root_url)
root_html = root_response.content.decode("utf-8")
soup = BeautifulSoup(root_html, 'html.parser')

root_url_parts = urlparse(root_url)
root_links = soup.find_all("a", attrs={"class": "reference internal"})

result = set()
for root_link in root_links:
    path = root_url_parts.path + root_link.get("href")
    path = str(Path(path).resolve())
    path = urlparse(path).path  # remove the hashtag
    url = f"{root_url_parts.scheme}://{root_url_parts.netloc}{path}"
    if not url.endswith("/"):
        url = url + "/"
    result.add(url)
print(list(result))

['https://airflow.apache.org/docs/apache-airflow/stable/index.htmlhowto/index.html/', 'https://airflow.apache.org/docs/apache-airflow/stable/index.htmltemplates-ref.html/', 'https://airflow.apache.org/docs/apache-airflow/stable/index.htmldeprecated-rest-api-ref.html/', 'https://airflow.apache.org/docs/apache-airflow/stable/index.htmltutorial/index.html/', 'https://airflow.apache.org/docs/apache-airflow/stable/index.htmlconfigurations-ref.html/', 'https://airflow.apache.org/docs/apache-airflow/stable/index.html/', 'https://airflow.apache.org/docs/apache-airflow/stable/index.htmlfaq.html/', 'https://airflow.apache.org/docs/apache-airflow/stable/index.htmllicense.html/', 'https://airflow.apache.org/docs/apache-airflow/stable/index.htmlintegration.html/', 'https://airflow.apache.org/docs/apache-airflow/stable/index.htmlstable-rest-api-ref.html/', 'https://airflow.apache.org/docs/apache-airflow/stable/index.htmlui.html/', 'https://airflow.apache.org/docs/apache-airflow/stable/index.htmlrele

## Embeddings Database

In [21]:
%pip install langchain

Collecting langchain
  Downloading langchain-0.0.220-py3-none-any.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain)
  Downloading aiohttp-3.8.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dataclasses-json<0.6.0,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.5.9-py3-none-any.whl (26 kB)
Collecting langchainplus-sdk>=0.0.17 (from langchain)
  Downloading langchainplus_sdk-0.0.19-py3-none-any.whl (25 kB)
Collecting openapi-schema-pydantic<2.0,>=1.2 (from langchain)
  Downloading openapi_schema_pydantic-1.2.4-py3-none-any.whl (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.0/90.0 kB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyda

In [22]:
import os
from pathlib import Path
from langchain import HuggingFacePipeline

models_path = Path(f"{os.getcwd()}/models/t5-small")
llm = HuggingFacePipeline.from_model_id(
    model_id=str(models_path),
    task="text-generation",
    model_kwargs={"temperature": 0, "max_length": 64},
)

ValueError: Unrecognized configuration class <class 'transformers.models.t5.configuration_t5.T5Config'> for this kind of AutoModel: AutoModelForCausalLM.
Model type should be one of BartConfig, BertConfig, BertGenerationConfig, BigBirdConfig, BigBirdPegasusConfig, BioGptConfig, BlenderbotConfig, BlenderbotSmallConfig, BloomConfig, CamembertConfig, CodeGenConfig, CpmAntConfig, CTRLConfig, Data2VecTextConfig, ElectraConfig, ErnieConfig, GitConfig, GPT2Config, GPT2Config, GPTBigCodeConfig, GPTNeoConfig, GPTNeoXConfig, GPTNeoXJapaneseConfig, GPTJConfig, LlamaConfig, MarianConfig, MBartConfig, MegaConfig, MegatronBertConfig, MvpConfig, OpenLlamaConfig, OpenAIGPTConfig, OPTConfig, PegasusConfig, PLBartConfig, ProphetNetConfig, QDQBertConfig, ReformerConfig, RemBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoCBertConfig, RoFormerConfig, RwkvConfig, Speech2Text2Config, TransfoXLConfig, TrOCRConfig, XGLMConfig, XLMConfig, XLMProphetNetConfig, XLMRobertaConfig, XLMRobertaXLConfig, XLNetConfig, XmodConfig.

In [None]:
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma


CHROMA_DB_DIRECTORY = "chroma_db/ask_django_docs"

loader = WebBaseLoader(urls)
documents = loader.load()

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
splitted_documents = text_splitter.split_documents(documents)

embeddings = OpenAIEmbeddings()

db = Chroma.from_documents(
    splitted_documents,
    embeddings,
    collection_name="ask_django_docs",
    persist_directory=CHROMA_DB_DIRECTORY,
)
db.persist()