In [None]:
%load_ext jupyter_black

In [None]:
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
# Directory that holds this model's training files (the cells below write
# training-data.txt and training-data.jsonl under it).
model_root = Path("store/gpt2-bullets-base1")


In [None]:
# Download the site's index page; the per-AFSC pages are linked from here.
url = "http://www.eprbulletsafsc.com/"
# requests has no default timeout, so without one a stalled server would
# hang the kernel indefinitely.
r = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page and silently
# ending up with zero links.
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")

1. Scrape the main page for all of the links to the individual AFSC bullet pages

In [None]:
# Build the absolute URL of every linked page whose href starts with "epr".
# href=True restricts the search to anchors that actually carry an href
# attribute; indexing a["href"] on an anchor without one raises KeyError.
urls = {
    url + a["href"]
    for a in soup.find_all("a", href=True)
    if a["href"].startswith("epr")
}
print(f"{len(urls)=}")

2. iterate through each link and scrape all of the bullets

In [None]:
def get_bullets(soup: BeautifulSoup) -> list[str]:
    """Collect the bullet lines found in the ``<p>`` elements of a page.

    A paragraph is treated as a block of bullets when its text, ignoring
    surrounding whitespace, starts with ``-``. Such a paragraph is split on
    newlines and every non-blank line is returned with surrounding whitespace
    removed (the leading ``-`` is kept).

    Args:
        soup: Parsed page; only ``soup.find_all("p")`` and each element's
            ``.text`` are used.

    Returns:
        The bullets in document order. Empty when no paragraph qualifies.
    """
    bullets: list[str] = []
    for p in soup.find_all("p"):
        # Strip first so a paragraph that begins with a newline or indent
        # before its first "-" is still recognised.
        text = p.text.strip()
        if not text.startswith("-"):
            continue
        for line in text.split("\n"):
            bullet = line.strip()
            # Test emptiness *after* stripping so whitespace-only lines do
            # not end up as empty-string bullets.
            if bullet:
                bullets.append(bullet)
    return bullets


# Visit every linked page and accumulate the bullets found on it.
results = []
# sorted() gives a deterministic crawl order (set iteration order varies
# between runs), so the order of `results` is reproducible.
# Loop-specific names are used so the notebook-level `url`, `r` and `soup`
# from the index-page cell are not overwritten; re-running the link
# collection cell afterwards would otherwise build URLs from the wrong base.
for page_url in sorted(urls):
    print(page_url)
    # Without a timeout one stalled page would block the whole crawl.
    page_response = requests.get(page_url, timeout=30)
    print(page_response)
    if not page_response.ok:
        # Keep the crawl best-effort: skip pages that return an HTTP error
        # rather than parsing the error document for bullets.
        print(f"skipping {page_url}: HTTP {page_response.status_code}")
        continue
    page_soup = BeautifulSoup(page_response.text, "html.parser")
    results.extend(get_bullets(page_soup))

In [None]:
# Tidy the scraped bullets:
#   1. remove the leading "-" / space characters,
#   2. discard bullets at least 1.5x as long as the mean bullet length,
#   3. discard repeated bullets and renumber the index from zero.

s = pd.Series(results).str.lstrip("- ")
length = s.str.len().astype(float)
# A boolean mask keeps exactly the rows whose length is below the cutoff;
# rows with a missing length compare False and are dropped as well.
s = s[length < 1.5 * length.mean()].drop_duplicates().reset_index(drop=True)
print(f"unique valid bullets: {len(s):,}")
s

4. write the training data to the store

In [None]:
# Write the cleaned bullets to <model_root>/training-data.txt, one bullet per
# record with a "###" line between records.
model_root = Path("store/gpt2-bullets-base1")
# exist_ok=True makes the cell safely re-runnable and avoids the
# check-then-create race of testing exists() before calling mkdir().
model_root.mkdir(parents=True, exist_ok=True)
training_data = model_root / "training-data.txt"


print(f"unique bullets: {len(s):,}")
with training_data.open("w") as f:
    f.write("\n###\n".join(s))

In [None]:
from pathlib import Path
import re
# Load the stored bullets back from <model_root>/training-data.txt.
model_root = Path("store/gpt2-bullets-base1")
training_data = model_root / "training-data.txt"

# Records are separated by a line containing only "###"; the separator has no
# regex metacharacters, so a plain string split yields the same pieces.
data = training_data.read_text().split("\n###\n")

In [None]:
from fite.util import SpecialTokens
# Keep only the bullets shaped like "<action>;<impact>--<result>" and split
# them into three named columns. Bullets that do not match the pattern come
# back as all-NaN rows from str.extract and are removed by dropna().
df = (
    pd.Series(data)
    .str.extract(r"(?P<action>^.*);(?P<impact>.*)--(?P<result>.*$)")
    .dropna()
    .reset_index(drop=True)
)
# Wrap each bullet in the project's special tokens (presumably begin- and
# end-of-sequence markers; confirm against fite.util.SpecialTokens).
df["action"] = SpecialTokens.bos_token + df.action
df["result"] = df.result + SpecialTokens.eos_token
print(f"bullets that follow '<action>;<impact>--<result>': {len(df):,}")

# Two training examples per bullet:
#   "<action>;"          -> "<impact>--<result>"
#   "<action>;<impact>"  -> "<result>"
# NOTE(review): in the second pair neither side contains the "--" separator,
# so prompt + completion does not reproduce the original bullet. Confirm
# whether that is intended.
training_pairs = pd.DataFrame(
    dict(
        prompt=(df.action + ";").to_list() + (df.action + ";" + df.impact).to_list(),
        completion=(df.impact + "--" + df.result).to_list() + df.result.to_list(),
    )
)
# DataFrame.to_json returns None when given a path, so the frame is kept in
# its own variable instead of assigning the (always None) return value.
training_pairs.to_json(
    model_root / "training-data.jsonl", orient="records", lines=True
)

# Preview what was written.
training_pairs.head()


5. update the pyproject.toml and run `python -m fite.train --verbose gpt2-bullets-base1`

In [2]:
from fite.pipeline import PipelineEngine, Strategys

# Build the pipeline engine from the project's pyproject.toml
# (presumably the model definitions are declared there — confirm against fite.pipeline).
engine = PipelineEngine.load_from_pyproject("pyproject.toml")
# Last expression of the cell: displays the model names the engine reports.
engine.list_models()

['gpt2-taf-base1', 'gpt2-bullets-base1']

In [3]:
# Partial bullets used as seed text for generation.
prompts = [
    "Augmented manning shortfall; ",
    "Displayed expert system knowledge; ",
    "Advanced professional education; graduated Agile Team Management/NCOA",
    "Fueled 726 patrols;",
    "Devoted",
]

# Generate from the same prompts once per member of Strategys and print each
# prompt next to its generated text for comparison.
for strategy in Strategys:
    results = engine.generate("gpt2-bullets-base1", prompts, strategy=strategy)
    for prompt, pieces in zip(prompts, results):
        # Each result is joined with single spaces into one display string.
        generated_text = " ".join(pieces)
        print(f"{prompt=} strategy={strategy} \n{generated_text=}\n")

prompt='Augmented manning shortfall; ' strategy=GREEDY 
generated_text='Augmented manning shortfall; reduced AF $2M'

prompt='Displayed expert system knowledge; ' strategy=GREEDY 
generated_text='Displayed expert system knowledge; 100% compliant'

prompt='Advanced professional education; graduated Agile Team Management/NCOA' strategy=GREEDY 
generated_text='Advanced professional education; graduated Agile Team Management/NCOA'

prompt='Fueled 726 patrols;' strategy=GREEDY 
generated_text='Fueled 726 patrols; $1M in assets'

prompt='Devoted' strategy=GREEDY 
generated_text='Devoted $1M in assets to supply'

prompt='Augmented manning shortfall; ' strategy=BEAM_SEARCH 
generated_text='Augmented manning shortfall; saved $2K'

prompt='Displayed expert system knowledge; ' strategy=BEAM_SEARCH 
generated_text='Displayed expert system knowledge; 100% compliant'

prompt='Advanced professional education; graduated Agile Team Management/NCOA' strategy=BEAM_SEARCH 
generated_text='Advanced profess