In [7]:
from torch import cuda, bfloat16
from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig, BitsAndBytesConfig, AutoConfig
import transformers

from langchain_huggingface import HuggingFaceEmbeddings

from pinecone import Pinecone
from pinecone import ServerlessSpec

from torch import cuda
from dotenv import load_dotenv, find_dotenv

import os

In [2]:
# Locate the project's .env file; find_dotenv() searches parent directories for it.
dotenv_path = find_dotenv()

# Export the variables defined in that file into the process environment.
# Kept as the cell's final expression so its boolean result is displayed.
load_dotenv(dotenv_path=dotenv_path)

True

# Loading Embedding Model

In [3]:

# Sentence-transformers checkpoint used to produce the text embeddings.
embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

# Run on the currently selected CUDA device when one is available, otherwise on the CPU.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

# The same device string is handed to both model construction and encoding;
# encoding is requested in batches of 32.
embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs=dict(device=device),
    encode_kwargs=dict(device=device, batch_size=32),
)




# Connect to Pinecone

In [8]:
# Name of the Pinecone index this notebook connects to.
index_name = 'llama-2-fin-rag-proto'

# Build the Pinecone client from the API key stored in the environment.
pinecone_api_key = os.getenv("PINECONE_API_KEY")
pc = Pinecone(api_key=pinecone_api_key)

# Serverless deployment settings (AWS, us-east-1).
# Defined here but not passed to any call in this cell.
spec = ServerlessSpec(cloud="aws", region="us-east-1")

# Open a handle to the index and print its statistics.
index = pc.Index(index_name)
print(index.describe_index_stats())

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 17268}},
 'total_vector_count': 17268}


## Load the Llama 2 Model

In [None]:
# Hugging Face Hub id of the chat model to load.
model_id = 'meta-llama/Llama-2-7b-chat-hf'

# Use the currently selected CUDA device when one is available, otherwise the CPU.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# Hugging Face access token, read once from the environment and passed to both
# `from_pretrained` calls below.
hf_auth = os.environ.get("HUGGING_FACE_AUTH")

# Disabled alternative: 4-bit quantization, which requires the `bitsandbytes` library.
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type='nf4',
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_compute_dtype=bfloat16
# )

# int8 weight-quantization settings.
# NOTE: this config is currently NOT applied. The `quantization_config` argument
# in the model load below is commented out, so the model is loaded unquantized.
quanto_config = QuantoConfig(
    weights='int8'
)

# Fetch the model configuration from the Hub (the token is passed for authentication).
model_config = AutoConfig.from_pretrained(
    model_id,
    token=hf_auth
)

# `trust_remote_code=True` is intentionally not passed: the Llama architecture ships
# with `transformers`, so there is no need to allow execution of Hub-hosted code.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    config=model_config,
    # quantization_config=quanto_config,
    token=hf_auth,
    device_map=device
)

# Put the model in evaluation mode; as the cell's last expression the returned
# module is also displayed.
model.eval()