# LLM

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
from pprint import pprint
import re
from peft import LoraConfig, TaskType, get_peft_model
import json

In [None]:
# Load the TinyLlama instruct checkpoint with bfloat16 weights; placement of
# the weights is delegated to `device_map="auto"`.
load_kwargs = {"torch_dtype": torch.bfloat16, "device_map": "auto"}
model = AutoModelForCausalLM.from_pretrained(
    "Doctor-Shotgun/TinyLlama-1.1B-32k-Instruct", **load_kwargs
)

# Optional LoRA wrapping, currently disabled:
# peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM,
#                          inference_mode=False, r=8,
#                          lora_alpha=32, lora_dropout=0.1)
# model = get_peft_model(model, peft_config)
 

In [None]:
# Load the tokenizer that belongs to the TinyLlama instruct checkpoint.
tokenizer = AutoTokenizer.from_pretrained("Doctor-Shotgun/TinyLlama-1.1B-32k-Instruct")

In [None]:
# Show the device reported by the loaded model (notebook cell output).
model.device

In [None]:
# Load the test samples. The encoding is pinned to UTF-8 so the result does
# not depend on the platform's default text encoding.
with open('test-ds.json', encoding='utf-8') as f:
    ds = json.load(f)
# Show the available sample names (notebook cell output).
ds.keys()

In [None]:
# Pick the single sample used by the cells below.
sample = ds['nachos-recipe']

In [None]:
# Chat-format prompt: the system turn carries the article text, the user turn
# asks for the ingredient list.
system_turn = {
    'role': 'system',
    'content': f'You are a kitchen assistant chatbot for a chef. The chef wants to make the recipe from this article: {sample["article"]}',
}
user_turn = {
    'role': 'user',
    'content': "Provide only a bulleted list of the recipe's ingredients without an introduction.",
}
x = [system_turn, user_turn]
# Minimal alternative prompt for a quick smoke test:
# x = [{'role': 'user', 'content': 'Who are you?'}]

# Reference answer for the prompt above.
y = sample['ingredients']

In [None]:
# Preview the full training text: rendered chat prompt followed by the
# reference answer. `return_tensors` is not passed because it has no effect
# when `tokenize=False` (the template is returned as a plain string).
prompt_text = tokenizer.apply_chat_template(x, tokenize=False, add_generation_prompt=True)
print(prompt_text + y)

In [None]:
# Tokenize the prompt and the reference answer separately so the answer length
# is known when the labels are built.
x = tokenizer.apply_chat_template(x, return_tensors='pt', add_generation_prompt=True)
# The answer is a continuation of the prompt, so it must not receive its own
# special tokens (e.g. a BOS token in the middle of the sequence and labels).
y = tokenizer.encode(y, add_special_tokens=False, return_tensors='pt')
# Show both shapes (notebook cell output).
x.shape, y.shape

In [None]:
# Build the training pair: `x` becomes prompt + answer, `y` becomes the label
# tensor of the same length with every prompt position set to -100
# (the label value Hugging Face causal-LM losses skip).
prompt_len = x.size(-1)
response_len = y.size(-1)
x = torch.cat((x, y), dim=-1)
ignore_prefix = torch.full((1, prompt_len), -100, dtype=torch.long)
y = torch.cat((ignore_prefix, y), dim=-1)
# Show both shapes (notebook cell output); they should now match.
x.shape, y.shape

In [None]:
# Training-style forward pass: passing `labels` makes the model return a loss.
# No optimizer step is taken here.
batch = {
    "input_ids": x.to(model.device),
    "labels": y.to(model.device),
}
model_output = model(**batch)
model_output.loss

In [None]:
# Teacher-forced prediction for every answer token. The logit at position t
# scores token t + 1, so the predictions for the answer sit one step to the
# left of the answer itself; slicing `[-response_len:]` would be off by one.
predicted_ids = model_output.logits[0, -response_len - 1:-1].argmax(-1)
print(tokenizer.decode(predicted_ids))

In [None]:
# Inference/Generation: greedy decoding, one token per forward pass.
# At this point `x` holds prompt + reference answer, so the answer is cut off
# first; otherwise decoding would simply continue after the gold answer.
generated_text = x[:, :x.size(-1) - response_len].to(model.device)
with torch.no_grad():  # decoding needs no autograd graph
    for _ in range(20):
        # NOTE: the whole sequence is re-encoded on every step; passing
        # `use_cache=True` / `past_key_values` would reuse the KV cache.
        model_output = model(generated_text)
        # Highest-scoring token at the last position, kept as shape (batch, 1).
        next_tokens = model_output.logits[:, -1:, :].argmax(dim=-1)
        generated_text = torch.cat([generated_text, next_tokens], dim=-1)
        # `.item()` requires a single sequence in the batch.
        if next_tokens.item() == tokenizer.eos_token_id:
            break
print(tokenizer.decode(generated_text[0]))

In [None]:
import os

# Switch between the gated Llama 3 checkpoint and the small TinyLlama one.
USE_LLAMA3 = True

if USE_LLAMA3:
    # Credentials must never be committed with the notebook. The token is read
    # from the environment instead; the token that used to be hard-coded here
    # has to be treated as leaked and revoked.
    # NOTE(review): when HF_TOKEN is unset this passes None, which should fall
    # back to a cached `huggingface-cli login` token — confirm for your setup.
    access_token = os.environ.get('HF_TOKEN')
    pipe = pipeline("text-generation", model="meta-llama/Meta-Llama-3-8B-Instruct",
                    model_kwargs={"torch_dtype": torch.bfloat16}, device_map="auto", token=access_token)
else:
    # Named `model_id` so the already loaded `model` object is not overwritten
    # by a string.
    model_id = "Doctor-Shotgun/TinyLlama-1.1B-32k-Instruct"
    pipe = pipeline("text-generation", model=model_id, model_kwargs={"torch_dtype": torch.bfloat16}, device_map="auto")

In [None]:
# Show the device reported by the pipeline (notebook cell output).
pipe.device

# Web Scraper

In [None]:
from urllib.parse import urlparse


In [None]:
# Split a sample recipe URL into its components.
recipe_url = 'https://joyfoodsunshine.com/the-most-amazing-chocolate-chip-cookies/'
u = urlparse(recipe_url)

In [None]:
# Show the parsed URL (notebook cell output).
u

In [None]:
# Recipe slug: the URL path without its surrounding slashes. `strip('/')` also
# works when the path has no trailing slash, where `[1:-1]` would drop the
# last character of the slug.
u.path.strip('/')

In [None]:
# Imports
from bs4 import BeautifulSoup
import cloudscraper

#url = 'https://joyfoodsunshine.com/the-most-amazing-chocolate-chip-cookies/'
#scraper = cloudscraper.create_scraper()
#soup = BeautifulSoup(scraper.get(url).text, 'html.parser')
#text = soup.get_text()


# Grabbing URLs:
# Site pages that pass the "single path segment" filter below but are not
# recipes, so they are excluded explicitly.
misc = ["https://dinnerthendessert.com/contact-us/",
        "https://dinnerthendessert.com/recipe-index/",
        "https://dinnerthendessert.com/videos/",
        "https://dinnerthendessert.com/privacy-and-copyright/",
        "https://dinnerthendessert.com/start-here/",
        "https://dinnerthendessert.com/feed/",
        "https://dinnerthendessert.com/work-with-me/",
        "https://dinnerthendessert.com/website-accessibility-policy/",
        "https://dinnerthendessert.com/dinner-then-dessert-cookbook/",
        ]

# One scraper session is reused for every listing page.
scraper = cloudscraper.create_scraper()
excluded = set(misc)
urls = set()
# NOTE(review): pagination starts at page 2 — confirm page 1 is skipped on purpose.
for i in range(2, 63):
    url = f'https://dinnerthendessert.com/category/dinner/page/{i}/'
    soup = BeautifulSoup(scraper.get(url).text, 'html.parser')
    # `href=True` skips anchors without an href attribute; for those
    # `link.get('href')` returns None and `.startswith` would raise.
    for link in soup.find_all('a', href=True):
        href = link['href']
        # Keep URLs with exactly one path segment:
        # 'https:', '', host, slug, '' -> 5 parts.
        on_site = href.startswith('https://dinnerthendessert.com')
        if on_site and len(href.split('/')) == 5 and href not in excluded:
            urls.add(href)

# Sorted so the printed list is stable between runs.
for url in sorted(urls):
    print(f'"{url}",')

In [None]:
# Grabbing URLs:
# Site pages that pass the "single path segment" filter below but are not
# recipes, so they are excluded explicitly.
misc = [
    "https://joyfoodsunshine.com/contact/",
    "https://joyfoodsunshine.com/instant-updates/",
    "https://joyfoodsunshine.com/work-with-me/",
    "https://joyfoodsunshine.com/privacy-policy/",
    "https://joyfoodsunshine.com/recipe-index/",
    "https://joyfoodsunshine.com/about-laura/",
]

# One scraper session is reused for every listing page.
scraper = cloudscraper.create_scraper()
excluded = set(misc)
urls = set()
# NOTE(review): pagination starts at page 2 — confirm page 1 is skipped on purpose.
for i in range(2, 6):
    url = f'https://joyfoodsunshine.com/category/appetizer/page/{i}/'
    soup = BeautifulSoup(scraper.get(url).text, 'html.parser')
    # `href=True` skips anchors without an href attribute; for those
    # `link.get('href')` returns None and `.startswith` would raise.
    for link in soup.find_all('a', href=True):
        href = link['href']
        # Keep URLs with exactly one path segment:
        # 'https:', '', host, slug, '' -> 5 parts.
        on_site = href.startswith('https://joyfoodsunshine.com')
        if on_site and len(href.split('/')) == 5 and href not in excluded:
            urls.add(href)

# Sorted so the printed list is stable between runs.
for url in sorted(urls):
    print(f'"{url}",')

In [None]:
import json

# Load the saved URL collection. The encoding is pinned to UTF-8 so the result
# does not depend on the platform's default text encoding.
with open('urls.json', encoding='utf-8') as fp:
    ds = json.load(fp)

In [None]:
# Remove duplicate URLs while keeping first-seen order. Going through `set()`
# would reorder the list differently on each run and churn the saved file.
ds['joyfoodsunshine.com'] = list(dict.fromkeys(ds['joyfoodsunshine.com']))

In [None]:
# Write the URL collection back to disk as indented JSON.
serialized = json.dumps(ds, indent=2)
with open('urls.json', 'w') as fp:
    fp.write(serialized)

In [None]:
# `text` is not defined on a fresh top-to-bottom run (its only definition is
# commented out in the scraper cell), so the article is fetched here.
article_url = 'https://joyfoodsunshine.com/the-most-amazing-chocolate-chip-cookies/'
article_html = cloudscraper.create_scraper().get(article_url).text
text = BeautifulSoup(article_html, 'html.parser').get_text()

# Collapse every run of whitespace (newlines, non-breaking spaces, and also
# tabs / carriage returns) into a single space and trim both ends.
text_clean = ' '.join(text.split())

In [None]:
#pprint(text_clean)

In [None]:
# First turn: ask the model for the ingredient list of the cleaned article.
messages = [
    {"role": "system", "content": f"You are a kitchen assistant chatbot for a chef. The chef wants to make the recipe from this article: {text_clean}"},
    # The trailing ".." typo is fixed so the wording matches the other prompts
    # in this notebook; the f-prefix is dropped because nothing is interpolated.
    {"role": "user", "content": "Provide only a bulleted list of the recipe's ingredients without an introduction."}
]

# Stop generation on the tokenizer's EOS token or on the "<|eot_id|>" token.
terminators = [
    pipe.tokenizer.eos_token_id,
    pipe.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

outputs = pipe(
    messages,
    max_new_tokens=1024,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9
)
# The last message of the returned conversation is the model's reply.
pprint(outputs[0]['generated_text'][-1]['content'])

In [None]:
# Second turn: continue the returned conversation and ask for the instructions.
messages = outputs[0]['generated_text']
messages.append(
    {"role": "user", "content": "Provide only a bulleted list of the recipe's instructions without an introduction."}
)
sampling = {
    "max_new_tokens": 1024,
    "eos_token_id": terminators,
    "do_sample": True,
    "temperature": 0.6,
    "top_p": 0.9,
}
outputs = pipe(messages, **sampling)
# The last message of the returned conversation is the model's reply.
reply = outputs[0]['generated_text'][-1]
pprint(reply['content'])

## Batched Input

In [None]:
def prompt_gen():
    """Yield one chat-formatted prompt string per recipe section.

    Reads the module-level ``text_clean`` (article text) and ``pipe`` (for its
    tokenizer's chat template). The sections are 'ingredients' and
    'instructions', in that order.
    """
    for section in ('ingredients', 'instructions'):
        system_turn = {
            "role": "system",
            "content": f"You are a kitchen assistant chatbot for a chef. The chef wants to make the recipe from this article: {text_clean}",
        }
        user_turn = {
            "role": "user",
            "content": f"Provide only a bulleted list of the recipe's {section}.",
        }
        # Render to text only; tokenization is left to the consumer.
        yield pipe.tokenizer.apply_chat_template(
            [system_turn, user_turn],
            tokenize=False,
            add_generation_prompt=True,
        )

In [None]:
# Batching requires a pad token. The id is taken from the tokenizer, which
# holds a single EOS id, whereas a model config may list several.
pipe.tokenizer.pad_token_id = pipe.tokenizer.eos_token_id
# Decoder-only models continue from the last position of each row, so shorter
# prompts must be padded on the left rather than the right.
pipe.tokenizer.padding_side = "left"
# Stop generation on the tokenizer's EOS token or on the "<|eot_id|>" token.
terminators = [
    pipe.tokenizer.eos_token_id,
    pipe.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

# Iterating the pipeline call gives one result per prompt; collect them all.
outs = list(pipe(prompt_gen(), batch_size=2,
                 max_new_tokens=1024,
                 eos_token_id=terminators,
                 do_sample=True,
                 temperature=0.6,
                 top_p=0.9))