# Frontend notebook

## Imports

In [1]:
!pip install datasets
!pip install sentencepiece
!pip install git+https://github.com/huggingface/transformers.git
!pip install git+https://github.com/huggingface/accelerate.git
!pip install bitsandbytes
!pip install git+https://github.com/huggingface/peft.git
!pip install gradio

Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-xiriwmfa
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-xiriwmfa
  Resolved https://github.com/huggingface/transformers.git to commit a691ccb0c224f6f76ef585535eec26456236b2e3
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting tokenizers<0.22,>=0.21 (from transformers==4.48.0.dev0)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hBuilding wheels for collected p

In [2]:
import torch
from peft import PeftModel
import transformers
import gradio as gr
assert (
    "LlamaTokenizer" in transformers._import_structure["models.llama"]
), "LLaMA is now in HuggingFace's main branch.\nPlease reinstall it: pip uninstall transformers && pip install git+https://github.com/huggingface/transformers.git"
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig

## Setup

In [3]:
# Load the LLaMA tokenizer; the repo id is the same string that BASE_MODEL
# holds, so prompts are tokenized with the base checkpoint's vocabulary.
tokenizer = LlamaTokenizer.from_pretrained("baffo32/decapoda-research-llama-7B-hf")

tokenizer_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message


In [4]:
# Repo id of the base LLaMA checkpoint handed to LlamaForCausalLM.from_pretrained.
BASE_MODEL = "baffo32/decapoda-research-llama-7B-hf"
# Repo id of the fine-tuned adapter weights that PeftModel.from_pretrained
# applies on top of the base model.
LORA_WEIGHTS = "kunchum/capstone-llama-finetuned"

In [5]:
# Select the compute device: CUDA when available, otherwise CPU; an available
# Apple MPS backend takes precedence over either.
device = "cuda" if torch.cuda.is_available() else "cpu"
try:
    if torch.backends.mps.is_available():
        device = "mps"
except AttributeError:
    # PyTorch builds without the MPS backend do not expose
    # `torch.backends.mps`; keep the device chosen above. Only this case is
    # swallowed, so KeyboardInterrupt and genuine errors still propagate.
    pass

In [6]:
# Load the base model on the selected device, then attach the fine-tuned
# adapter. `model` is defined on every device so later cells do not fail with
# a NameError on CPU/MPS machines.
if device == "cuda":
    # 8-bit quantized load. `quantization_config` replaces the deprecated
    # `load_in_8bit=True` keyword argument.
    model = LlamaForCausalLM.from_pretrained(
        BASE_MODEL,
        quantization_config=transformers.BitsAndBytesConfig(load_in_8bit=True),
        device_map="auto",
    )
else:
    # No 8-bit quantization outside CUDA: place the whole unquantized model on
    # the chosen device.
    model = LlamaForCausalLM.from_pretrained(
        BASE_MODEL,
        device_map={"": device},
        low_cpu_mem_usage=True,
    )
# force_download=True re-fetches the adapter files instead of using the cache.
model = PeftModel.from_pretrained(
    model, LORA_WEIGHTS, force_download=True
)

config.json:   0%|          | 0.00/428 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


pytorch_model.bin.index.json:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/33 [00:00<?, ?it/s]

pytorch_model-00001-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

pytorch_model-00002-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00003-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00004-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00005-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00006-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00007-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00008-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00009-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00010-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00011-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00012-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00013-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00014-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00015-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00016-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00017-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00018-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00019-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00020-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00021-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00022-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00023-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00024-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00025-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00026-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00027-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00028-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00029-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00030-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00031-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00032-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00033-of-00033.bin:   0%|          | 0.00/524M [00:00<?, ?B/s]



Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

Some weights of the model checkpoint at baffo32/decapoda-research-llama-7B-hf were not used when initializing LlamaForCausalLM: ['model.layers.0.self_attn.rotary_emb.inv_freq', 'model.layers.1.self_attn.rotary_emb.inv_freq', 'model.layers.10.self_attn.rotary_emb.inv_freq', 'model.layers.11.self_attn.rotary_emb.inv_freq', 'model.layers.12.self_attn.rotary_emb.inv_freq', 'model.layers.13.self_attn.rotary_emb.inv_freq', 'model.layers.14.self_attn.rotary_emb.inv_freq', 'model.layers.15.self_attn.rotary_emb.inv_freq', 'model.layers.16.self_attn.rotary_emb.inv_freq', 'model.layers.17.self_attn.rotary_emb.inv_freq', 'model.layers.18.self_attn.rotary_emb.inv_freq', 'model.layers.19.self_attn.rotary_emb.inv_freq', 'model.layers.2.self_attn.rotary_emb.inv_freq', 'model.layers.20.self_attn.rotary_emb.inv_freq', 'model.layers.21.self_attn.rotary_emb.inv_freq', 'model.layers.22.self_attn.rotary_emb.inv_freq', 'model.layers.23.self_attn.rotary_emb.inv_freq', 'model.layers.24.self_attn.rotary_emb.inv

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]



adapter_model.bin:   0%|          | 0.00/8.41M [00:00<?, ?B/s]

In [7]:
def generate_prompt(instruction, input=None):
    """Build the text prompt sent to the model.

    Args:
        instruction: Task description placed under "### Instruction:".
        input: Optional extra context. When truthy it is placed under an
            additional "### Input:" section and the preamble mentions it.

    Returns:
        str: The prompt, ending with "### Response:" so the model continues
        with its answer.
    """
    if not input:
        return (
            "Below is an instruction that describes a task. "
            "Write a response that appropriately completes the request.\n"
            "### Instruction:\n"
            f"{instruction}\n"
            "### Response:"
        )
    return (
        "Below is an instruction that describes a task, paired with an input "
        "that provides further context. "
        "Write a response that appropriately completes the request.\n"
        "### Instruction:\n"
        f"{instruction}\n"
        "### Input:\n"
        f"{input}\n"
        "### Response:"
    )

In [8]:
# Cast floating-point parameters and buffers to fp16, then switch to
# evaluation mode.
model.half()
model.eval()
# Compare the major version as an integer: the string comparison
# `torch.__version__ >= "2"` is lexicographic and would be False for "10.x".
if int(torch.__version__.split(".")[0]) >= 2:
    model = torch.compile(model)

## Evaluate function setup

In [9]:
def evaluate(
    instruction,
    input=None,
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    num_beams=4,
    max_new_tokens=128,
    repetition_penalty=1.15,
    **kwargs,
):
    """Generate the model's answer to an instruction.

    Builds the prompt with ``generate_prompt``, tokenizes it, runs
    ``model.generate`` and returns only the text produced after the prompt.

    Args:
        instruction: Task description inserted into the prompt.
        input: Optional extra context inserted into the prompt.
        temperature: Sampling temperature. A value of 0 or below selects
            greedy decoding instead of sampling.
        top_p: Nucleus-sampling threshold (only used when sampling).
        top_k: Accepted but not forwarded to ``GenerationConfig``.
            NOTE(review): confirm whether it was dropped on purpose.
        num_beams: Accepted but not forwarded to ``GenerationConfig``.
            NOTE(review): confirm whether it was dropped on purpose.
        max_new_tokens: Passed to ``model.generate``.
        repetition_penalty: Forwarded to ``GenerationConfig``.
        **kwargs: Extra fields forwarded to ``GenerationConfig``.

    Returns:
        str: The generated text with special tokens removed, cut at the first
        "### Response:" marker the model itself emits, and stripped of
        surrounding whitespace.
    """
    prompt = generate_prompt(instruction, input)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)

    # transformers rejects a non-positive temperature when sampling, and the
    # UI slider can produce 0 — treat that as a request for greedy decoding.
    if temperature is not None and temperature <= 0:
        generation_config = GenerationConfig(
            do_sample=False,
            repetition_penalty=repetition_penalty,
            **kwargs,
        )
    else:
        generation_config = GenerationConfig(
            temperature=temperature,
            do_sample=True,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            **kwargs,
        )

    with torch.autocast("cuda"):
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )

    # The returned sequence is prompt + continuation. Decode only the
    # continuation, so user text that happens to contain "### Response:"
    # cannot confuse the parsing and special tokens such as the end-of-sequence
    # marker do not leak into the answer.
    new_tokens = generation_output.sequences[0][input_ids.shape[1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)
    return response.split("### Response:")[0].strip()

## Gradio Interface

In [None]:
# import gradio as gr

# g = gr.Interface(
#     fn=evaluate,
#     inputs=[
#         gr.Textbox(
#             lines=2, label="Instruction", placeholder="When should I get a second credit card?"
#         ),
#         gr.Textbox(lines=2, label="Input", placeholder="none"),
#         gr.Dropdown(
#             choices=["LLaMa 7B", "Mistral 7B"],
#             label="Model",
#             value="LLaMa 7B",
#         ),
#         gr.Slider(minimum=0, maximum=1, value=0.6, label="Temperature"),
#         gr.Slider(minimum=0, maximum=1, value=0.95, label="Top p"),
#         # gr.Slider(minimum=0, maximum=100, step=1, value=40, label="Top k"),
#         # gr.Slider(minimum=1, maximum=4, step=1, value=4, label="Beams"),
#         gr.Slider(minimum=1, maximum=512, step=1, value=128, label="Max tokens"),
#         gr.Slider(minimum=0.1, maximum=2, step=0.1, value=1.15, label="Repetition Penalty"),
#     ],
#     outputs=[
#         gr.Textbox(
#             lines=10,
#             label="Output",
#         )
#     ],
#     title="FinAdvisor - Financial Advisory Tool for contextually-aware personal finance assistance",
#     description="FinAdvisor LLM based project to generate quick, personal advice focused on wealth/finance."
# )

# # Remove concurrency_count argument
# g.queue()
# g.launch(share=True, debug=True)

In [None]:
def dummy_model_selection(model_name):
    """Return a placeholder status message naming the selected model."""
    message = "Selected model: {}".format(model_name)
    return message

# Gradio Interface
def _evaluate_from_ui(
    instruction_text,
    input_text,
    temperature_value,
    top_p_value,
    max_tokens_value,
    repetition_penalty_value,
):
    """Map the UI widget values onto ``evaluate`` by keyword.

    ``evaluate`` has ``top_k`` and ``num_beams`` between ``top_p`` and
    ``max_new_tokens``, so passing the six widget values positionally would
    send "Max tokens" to ``top_k`` and "Repetition Penalty" to ``num_beams``.
    """
    return evaluate(
        instruction_text,
        input_text,
        temperature=temperature_value,
        top_p=top_p_value,
        # The slider uses step=1; coerce in case the value arrives as a float.
        max_new_tokens=int(max_tokens_value),
        repetition_penalty=repetition_penalty_value,
    )


g = gr.Blocks()

with g:
    gr.Markdown("# FinCoach - Financial Advisory Tool for Personal Finance")
    gr.Markdown("Generate quick, personal advice focused on finance.")

    with gr.Row():
        instruction = gr.Textbox(
            lines=2, label="Instruction", placeholder="When should I get a second credit card?"
        )
        input_box = gr.Textbox(lines=2, label="Input", placeholder="none")

    with gr.Row():
        # Display-only for now: the selected value is not passed to the handler.
        model_dropdown = gr.Dropdown(
            choices=["LLaMa 7B"],
            label="Model",
            value="LLaMa 7B",
        )

    with gr.Row():
        temperature = gr.Slider(minimum=0, maximum=1, value=0.6, label="Temperature")
        top_p = gr.Slider(minimum=0, maximum=1, value=0.95, label="Top p")
        max_tokens = gr.Slider(minimum=1, maximum=512, step=1, value=128, label="Max tokens")
        repetition_penalty = gr.Slider(minimum=0.1, maximum=2, step=0.1, value=1.15, label="Repetition Penalty")

    with gr.Row():
        output = gr.Textbox(lines=10, label="Output")

    evaluate_button = gr.Button("Evaluate")

    evaluate_button.click(
        fn=_evaluate_from_ui,
        inputs=[instruction, input_box, temperature, top_p, max_tokens, repetition_penalty],
        outputs=[output],
    )

# Launch interface. share=True also publishes a temporary public URL.
g.queue()
g.launch(share=True, debug=True)

* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://f37607aef99a695a2f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.
