In [4]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import accelerate
from langchain import HuggingFacePipeline

from langchain.chains import LLMChain, ConversationChain, SimpleSequentialChain
from langchain.prompts import PromptTemplate
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
from langchain.schema import BaseOutputParser
from langchain.schema import AIMessage, HumanMessage

import warnings
warnings.filterwarnings("ignore")

# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print(DEVICE)

cuda


In [5]:
# Hugging Face Hub repository to load the model and tokenizer from.
model_name = "TheBloke/Llama-2-7B-Chat-GPTQ"

# trust_remote_code is disabled on purpose: enabling it lets Python files stored
# in the Hub repository run locally at load time. The Llama architecture is
# implemented inside transformers itself, so no repository code is needed.
# NOTE(review): revision="main" tracks a moving branch; pin a commit hash here
# if the notebook must reproduce exactly the same weights later.
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             device_map="auto",
                                             trust_remote_code=False,
                                             revision="main")

# Tokenizer from the same repository, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

model.safetensors: 100%|██████████| 3.90G/3.90G [00:41<00:00, 93.4MB/s]
generation_config.json: 100%|██████████| 137/137 [00:00<00:00, 1.12MB/s]
tokenizer_config.json: 100%|██████████| 727/727 [00:00<00:00, 5.18MB/s]
tokenizer.model: 100%|██████████| 500k/500k [00:00<00:00, 673kB/s]
tokenizer.json: 100%|██████████| 1.84M/1.84M [00:00<00:00, 1.85MB/s]
special_tokens_map.json: 100%|██████████| 411/411 [00:00<00:00, 2.63MB/s]


In [6]:
# Text-generation pipeline built around the already-loaded model and tokenizer.
# All decoding settings live here: nucleus sampling (top_p) with a mild
# repetition penalty and a cap of 1024 newly generated tokens.
# NOTE(review): return_full_text=True keeps the prompt in the pipeline output;
# presumably the LangChain wrapper strips it again — confirm for the installed
# langchain version.
text_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,
    max_new_tokens=1024,
    top_p=0.95,
    do_sample=True,
    repetition_penalty=1.1,
)

# Do not pass model_kwargs={"temperature": 0} here. With a ready-made pipeline
# the wrapper does not forward model_kwargs to generation, so the setting has
# no effect, and a zero temperature contradicts do_sample=True (transformers
# requires a strictly positive temperature when sampling).
# For deterministic output, set do_sample=False (and drop top_p) in the
# pipeline(...) call instead.
llm = HuggingFacePipeline(pipeline=text_pipeline)

In [None]:
# Prompt text with a single placeholder, {category}, filled in at call time.
# The leading and trailing newlines are part of the prompt sent to the model.
template = (
    "\n"
    "Return all the subcategories of the following category\n"
    "\n"
    "{category}\n"
)

# Wrap the raw text in a PromptTemplate that declares its one input variable.
prompt = PromptTemplate(template=template, input_variables=["category"])

# Bare expression so the notebook displays the constructed template.
prompt