In [17]:
from torch import cuda
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import os

# Hugging Face checkpoint used to turn text into embedding vectors.
embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

# Run on the currently selected CUDA device when one is available,
# otherwise fall back to the CPU.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

# The same device is passed both when the model is loaded and when it encodes;
# documents are encoded 32 at a time.
embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs=dict(device=device),
    encode_kwargs=dict(device=device, batch_size=32),
)

In [18]:
import json

json_file_path = '/home/hb/LLM-research/finetuning_dataset/BGP/PyBGPStream/PyBGPStream_main10K.json'

# Pin the encoding explicitly: without it, open() uses the platform's locale
# encoding, which can mis-decode (or fail on) non-ASCII text in the JSON file.
with open(json_file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Concatenate instruction and output for each item so both parts are embedded
# together as a single document.
docs = [item['instruction'] + " " + item['output'] for item in data]

# Fail early with a clear message instead of an opaque IndexError on
# `embeddings[0]` below when the file contains no records.
if not docs:
    raise ValueError(f"No records found in {json_file_path}")

embeddings = embed_model.embed_documents(docs)

print(f"We have {len(embeddings)} doc embeddings, each with "
      f"a dimensionality of {len(embeddings[0])}.")


We have 10110 doc embeddings, each with a dimensionality of 384.


In [19]:
from pinecone import Pinecone
from pinecone import PodSpec
from pinecone import ServerlessSpec, PodSpec

# Initialize the connection to Pinecone (get an API key at app.pinecone.io).
# Both settings are read from the process environment so that no credential
# is stored in the notebook itself.
api_key = os.environ.get('PINECONE_API_KEY')
environment = os.environ.get('PINECONE_ENVIRONMENT')

# Fail fast on missing configuration. Previously a missing environment fell
# back to the literal placeholder string 'PINECONE_ENVIRONMENT', which only
# surfaced later as a confusing error when the index was created.
missing_settings = [
    name
    for name, value in (
        ('PINECONE_API_KEY', api_key),
        ('PINECONE_ENVIRONMENT', environment),
    )
    if not value
]
if missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(missing_settings)
    )

# Configure the client and the pod-based index spec.
pc = Pinecone(api_key=api_key)
spec = PodSpec(environment=environment)
# Serverless alternative to the pod-based spec above:
# spec = ServerlessSpec(
#     cloud='aws',
#     region='us-west-2'
# )

In [20]:
import time

index_name = 'llama-2-rag'

# Upper bound (in seconds) on how long to wait for the new index to report
# ready. Without a bound the polling loop below can spin forever and has to be
# interrupted by hand.
index_ready_timeout_s = 300

# Start from a clean slate: drop any existing index with the same name.
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

# Create a new index whose dimension matches the embedding vectors.
pc.create_index(
    index_name,
    dimension=len(embeddings[0]),
    metric='cosine',
    spec=spec
)

# Poll once per second until the index reports ready, giving up with a clear
# error once the deadline passes.
index_ready_deadline = time.monotonic() + index_ready_timeout_s
while not pc.describe_index(index_name).status['ready']:
    if time.monotonic() >= index_ready_deadline:
        raise TimeoutError(
            f"Index '{index_name}' was not ready after "
            f"{index_ready_timeout_s} seconds"
        )
    time.sleep(1)

KeyboardInterrupt: 

In [None]:
# Obtain a handle to the index named by `index_name` from the Pinecone client.
index = pc.Index(index_name)
# Last expression in the cell, so the notebook displays the returned stats.
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [None]:
from datasets import load_dataset

# JSON file holding the records to index; loaded as the "train" split.
realtime_cases_path = "/home/hb/LLM-research/finetuning_dataset/BGP/PyBGPStream/BGP_real_cases_realtime_3500.json"

data = load_dataset(
    "json",
    data_files=realtime_cases_path,
    split="train",
)
# Trailing expression so the notebook displays the dataset summary.
data

Found cached dataset json (/home/hb/.cache/huggingface/datasets/json/default-37672de3b108d008/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


Dataset({
    features: ['instruction', 'input', 'output', 'most_similar_instructions', 'avg_similarity_score'],
    num_rows: 3532
})

In [None]:
from datasets import DatasetDict

data_df = data.to_pandas()

batch_size = 32
total_rows = len(data_df)
# Columns stored alongside each vector as Pinecone metadata.
metadata_columns = ['instruction', 'input', 'output']

# Embed and upload the dataset in fixed-size batches.
for i in range(0, total_rows, batch_size):
    i_end = min(total_rows, i + batch_size)
    batch = data_df.iloc[i:i_end]

    # Sequential, 1-based ids derived from the row position in `data_df`.
    ids = [f"id_{j+1}" for j in range(i, i_end)]

    # Read the column directly instead of iterating row by row; this also
    # avoids a loop variable named `index` next to the Pinecone `index` handle.
    instructions = batch['instruction'].tolist()
    embeds = embed_model.embed_documents(instructions)

    # One metadata dict per row, in the same order as `ids` and `embeds`.
    metadata = batch[metadata_columns].to_dict(orient='records')

    # Materialise the (id, vector, metadata) tuples into a list rather than
    # handing the client a one-shot zip iterator.
    index.upsert(vectors=list(zip(ids, embeds, metadata)))

['id_1', 'id_2', 'id_3', 'id_4', 'id_5', 'id_6', 'id_7', 'id_8', 'id_9', 'id_10', 'id_11', 'id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_32']
['id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'id_39', 'id_40', 'id_41', 'id_42', 'id_43', 'id_44', 'id_45', 'id_46', 'id_47', 'id_48', 'id_49', 'id_50', 'id_51', 'id_52', 'id_53', 'id_54', 'id_55', 'id_56', 'id_57', 'id_58', 'id_59', 'id_60', 'id_61', 'id_62', 'id_63', 'id_64']
['id_65', 'id_66', 'id_67', 'id_68', 'id_69', 'id_70', 'id_71', 'id_72', 'id_73', 'id_74', 'id_75', 'id_76', 'id_77', 'id_78', 'id_79', 'id_80', 'id_81', 'id_82', 'id_83', 'id_84', 'id_85', 'id_86', 'id_87', 'id_88', 'id_89', 'id_90', 'id_91', 'id_92', 'id_93', 'id_94', 'id_95', 'id_96']
['id_97', 'id_98', 'id_99', 'id_100', 'id_101', 'id_102', 'id_103', 'id_104', 'id_105', 'id_106', 'id_107', 'id_108', 'id_109', 'id_110', 'id_

In [None]:
from torch import cuda, bfloat16
import transformers

# Checkpoint to load; the commented line is an alternative that was tried.
model_id = 'meta-llama/Llama-2-13b-chat-hf'
# model_id = 'codellama/CodeLlama-7b-hf'

# Device label used only in the status message at the end of this cell.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

# Need auth token for these; it is read from the `hf_token` environment variable.
hf_auth = os.environ.get('hf_token')

# Load the weights in 8-bit via bitsandbytes.
bnb_config = transformers.BitsAndBytesConfig(load_in_8bit=True)

model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth,
)

# Alternative explicit placement that was tried: device_map = {"": 0}
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    config=model_config,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    use_auth_token=hf_auth,
)

# Switch to evaluation mode.
model.eval()
print(f"Model loaded on {device}")