In [None]:
import os

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer

In [None]:
# Hugging Face Hub repository id of the checkpoint to load.
name = "meta-llama/Meta-Llama-3-8B"

# Never hardcode credentials in a notebook: read the access token from the
# HF_TOKEN environment variable instead. An unset or empty variable becomes
# None, which lets the Hub client fall back to its own default credential
# lookup (e.g. a token saved by a prior login).
auth_token = os.environ.get("HF_TOKEN") or None

In [None]:
# Download (or reuse from ./model/) the tokenizer for the checkpoint.
# `token` is the current name of the deprecated `use_auth_token` argument.
tokenizer = AutoTokenizer.from_pretrained(
    name,
    cache_dir='./model/',
    token=auth_token,
)

# Load the causal LM with 8-bit quantized weights (half precision for the
# remaining modules) and dynamic RoPE scaling. The scaling factor is written
# as a float because the rope_scaling config expects a float factor.
model = AutoModelForCausalLM.from_pretrained(
    name,
    cache_dir='./model/',
    token=auth_token,
    torch_dtype=torch.float16,
    rope_scaling={"type": "dynamic", "factor": 2.0},
    load_in_8bit=True,
)

In [None]:
# Build the prompt from adjacent string literals. A backslash continuation
# inside a single literal would pull the next line's indentation into the
# string, leaving long runs of spaces in the text sent to the model.
prompt = (
    "### User:What is the fastest car in "
    "the world and how much does it cost? "
    "### Assistant:"
)
# Streamer that prints generated text as it is produced, leaving out the
# prompt and any special tokens.
streamer = TextStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True)

# Tokenize the prompt into PyTorch tensors and move them to the model's device.
encoded_prompt = tokenizer(prompt, return_tensors="pt")
inputs = encoded_prompt.to(model.device)

In [None]:
# Upper bound on the number of generated tokens. `max_new_tokens` must be a
# finite integer: float('inf') is not an int and removes any bound, so
# generation would only stop if the model happens to emit an end-of-sequence
# token. Raise this value if longer answers are needed.
MAX_NEW_TOKENS = 512

# Generate a completion for the tokenized prompt, streaming text as it is produced.
output = model.generate(
    **inputs,
    streamer=streamer,
    use_cache=True,
    max_new_tokens=MAX_NEW_TOKENS,
)

In [None]:
# Take the first sequence returned by generate() and turn its token ids back
# into text, dropping special tokens.
first_sequence = output[0]
output_text = tokenizer.decode(first_sequence, skip_special_tokens=True)

In [None]:
from llama_index.prompts.prompts import SimpleInputPrompt
# System prompt in the [INST] / <<SYS>> chat layout: the instruction block is
# opened here and closed by the "[/INST]" suffix of the query wrapper.
# The system-message delimiters must be the literal <<SYS>> and <</SYS>> tags;
# a bare "<>" is not a recognized delimiter.
system_prompt = """[INST] <<SYS>>
Provide Info of Financial Performance
<</SYS>>
"""
# Template applied to every user query: the query text followed by the
# "[/INST]" marker that closes the instruction block opened in system_prompt.
wrapper_template = "{query_str} [/INST]"
query_wrapper_prompt = SimpleInputPrompt(wrapper_template)

In [None]:
# Sanity check: render the wrapper with a dummy query so the cell output
# shows the exact text that will be appended to the system prompt.
sample_query = 'hello'
query_wrapper_prompt.format(query_str=sample_query)

In [None]:
from llama_index.llms import HuggingFaceLLM

# Wrap the already-loaded model and tokenizer in a llama_index LLM object,
# attaching the prompt templates and the context/generation limits.
llm = HuggingFaceLLM(
    model=model,
    tokenizer=tokenizer,
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    context_window=4096,
    max_new_tokens=256,
)

In [None]:
from llama_index.embeddings import LangchainEmbedding

from langchain.embeddings.huggingface import HuggingFaceEmbeddings

In [None]:
# Create the LangChain Hugging Face embedder first, then adapt it to the
# llama_index embedding interface.
hf_embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
embeddings = LangchainEmbedding(hf_embedder)

In [None]:
from llama_index import set_global_service_context
from llama_index import ServiceContext

In [None]:
# Bundle the LLM, the embedding model and the chunk size into one service
# context, then register it as the global default.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embeddings,
    chunk_size=1024,
)

set_global_service_context(service_context)

In [None]:
from llama_index import VectorStoreIndex, download_loader
from pathlib import Path

In [None]:
# Location of the report to index, relative to the notebook's working directory.
report_path = Path('./HanaBank Report.pdf')

# Fetch the PyMuPDF-based reader class, instantiate it and load the PDF
# with metadata enabled.
PyMuPDFReader = download_loader("PyMuPDFReader")
loader = PyMuPDFReader()
documents = loader.load(file_path=report_path, metadata=True)

In [None]:
# Build a vector store index from the loaded documents.
# NOTE(review): presumably this picks up the LLM / embedding model registered
# via set_global_service_context earlier in the notebook; confirm.
index = VectorStoreIndex.from_documents(documents)

In [None]:
# Obtain a query engine from the index, using the default settings
# (no arguments are passed).
query_engine = index.as_query_engine()

In [None]:
# Ask the query engine a question about the indexed report; the answer object
# is kept in `response` for inspection in a later cell.
question = "what was the FY2022 return on equity?"
response = query_engine.query(question)