In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, pipeline

In [2]:
# --- Model setup -----------------------------------------------------------
# Load Phi-3.5-mini-instruct once and wrap it in a text-generation pipeline.
model_name = "microsoft/Phi-3.5-mini-instruct"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One dtype for both the checkpoint load and the pipeline. The original cell
# loaded the weights in bfloat16 but told the pipeline float16, which is at
# best ignored and at worst triggers a second, unintended cast.
model_dtype = torch.bfloat16
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=model_dtype).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=model_dtype,
)

# --- Generation settings ---------------------------------------------------
# Sampling is enabled, so repeated runs may produce different answers.
generation_args = {
    "max_new_tokens": 50,
    "return_full_text": False,
    "temperature": 0.9,
    "do_sample": True,
}
# The offloaded KV cache moves cache layers between GPU and CPU memory, so it
# is only requested when the model actually lives on a CUDA device.
# NOTE(review): transformers is expected to reject this cache on CPU-only
# machines -- confirm against the installed version.
if device.type == "cuda":
    generation_args["cache_implementation"] = "offloaded"

# --- Context documents -----------------------------------------------------
documents = [
    "The visa application process requires a DS-160 form and an appointment.",
    "International students can apply for OPT after completing their degree.",
    "Northeastern offers various co-op opportunities in tech and finance.",
    "Travel restrictions may apply based on your country of origin."
]

# All documents are concatenated into the system prompt (no retrieval step).
document_context = "\n".join(documents)
full_prompt = (
    "You are an expert on creating helpful answers to user queries using the information here in your Website Information.\n"
    "Your task is to create a single response to the single User Query below.\n"
    "Use only the **Website Information** to answer the question below.\n"
    "**Do not include any information from outside of this Website Information**\n"
    "**Only answer the query, do not provide unrelated information.**\n"
    "If the answer is not found, say so clearly.\n\n"
    "---\n"
    f"**Website Information:**\n{document_context}\n"
)

# --- Query -----------------------------------------------------------------
# Only one query is sent per run. The earlier back-to-back assignments
# (student visa / healthcare co-op) were dead stores overwritten by this one.
query = "Does Northeastern have coop opportunities in the tech sector?"
user_query = (
    "Generate a sentence that answers the following Query.\n"
    f"**User Query:** {query}\n"
)

# Chat-style input: the pipeline applies the tokenizer's chat template.
messages = [
    {"role": "system", "content": full_prompt},
    {"role": "user", "content": user_query},
]
output = pipe(messages, **generation_args)
print("System Content:", full_prompt)
print("User Query:", user_query)
print("Model Output:", output[0]['generated_text'])

config.json:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [2]:
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

def get_documents():
    """Return the hard-coded knowledge snippets that serve as model context.

    Returns:
        list[str]: A newly built list on every call, so callers can modify
        the result without affecting later calls.
    """
    snippets = (
        "The visa application process requires a DS-160 form and an appointment.",
        "International students can apply for OPT after completing their degree.",
        "Northeastern offers various co-op opportunities in tech and finance.",
        "Travel restrictions may apply based on your country of origin.",
    )
    return list(snippets)
    
class CAGGenerator:
    """Answer queries with a causal LM whose system prompt holds all documents.

    Every document is concatenated into the system prompt once, at
    construction time; each call to ``get_answer`` reuses that prompt and only
    varies the user message.

    Attributes set by ``__init__``: ``device``, ``model``, ``tokenizer``,
    ``pipe``, ``generation_args``, ``documents``, ``document_context`` and
    ``full_prompt``.
    """

    def __init__(self, docs=None, model_name="microsoft/Phi-3.5-mini-instruct"):
        """Load the model and build the document-grounded system prompt.

        Args:
            docs: Iterable of document strings (or a single string) to place
                in the system prompt. When ``None``, ``get_documents()`` is
                used. Previously this argument was required but ignored.
            model_name: Hugging Face model identifier to load.
        """
        # Load the model and tokenizer, then wrap them in a pipeline.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # One dtype for both the checkpoint load and the pipeline; the
        # original mixed bfloat16 (load) with float16 (pipeline).
        model_dtype = torch.bfloat16
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name, torch_dtype=model_dtype
        ).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.pipe = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            torch_dtype=model_dtype,
        )

        # Sampling is enabled, so answers can differ between calls.
        self.generation_args = {
            "max_new_tokens": 50,
            "return_full_text": False,
            "temperature": 0.9,
            "do_sample": True,
        }
        # Only request the offloaded KV cache when running on CUDA.
        # NOTE(review): transformers is expected to reject this cache on
        # CPU-only machines -- confirm against the installed version.
        if self.device.type == "cuda":
            self.generation_args["cache_implementation"] = "offloaded"

        # Resolve the documents and concatenate them into one context string.
        self.documents = self._resolve_documents(docs)
        self.document_context = "\n".join(self.documents)

        # System prompt that carries the instructions plus the full context.
        self.full_prompt = self._build_system_prompt(self.document_context)

    @staticmethod
    def _resolve_documents(docs):
        """Return ``docs`` as a list of strings, or the defaults if ``None``."""
        if docs is None:
            return get_documents()
        if isinstance(docs, str):
            # A bare string is one document, not an iterable of characters.
            return [docs]
        return list(docs)

    @staticmethod
    def _build_system_prompt(document_context):
        """Return the instruction block followed by ``document_context``."""
        return (
            "You are an expert on creating helpful answers to user queries using the information here in your Website Information.\n"
            "Your task is to create a single response to the single User Query below.\n"
            "Use only the **Website Information** to answer the question below.\n"
            "**Do not include any information from outside of this Website Information**\n"
            "**Only answer the query, do not provide unrelated information.**\n"
            "If the answer is not found, say so clearly.\n\n"
            "---\n"
            f"**Website Information:**\n{document_context}\n"
        )

    @staticmethod
    def _build_user_prompt(query):
        """Return the user message that wraps ``query``."""
        return (
            "Generate a sentence that answers the following Query.\n"
            f"**User Query:** {query}\n"
        )

    def get_answer(self, query):
        """Generate an answer to ``query`` from the preloaded documents.

        Args:
            query: The user's question as plain text.

        Returns:
            The ``generated_text`` of the first pipeline result.
        """
        # Chat-style input: system prompt with the context, then the query.
        messages = [
            {"role": "system", "content": self.full_prompt},
            {"role": "user", "content": self._build_user_prompt(query)},
        ]
        output = self.pipe(messages, **self.generation_args)
        return output[0]['generated_text']


# Build the generator once; loading the model is the expensive step.
# The documents are passed explicitly because `docs` is a required argument
# of CAGGenerator.__init__ -- calling CAGGenerator() raises TypeError.
generator = CAGGenerator(get_documents())

# First query: the context states that tech co-ops exist.
query = "Does Northeastern have coop opportunities in the tech sector?"
answer = generator.get_answer(query)

print("System Content:", generator.full_prompt)
print("User Query:", query)
print("Model Output:", answer)

# Second query: the context does not mention healthcare co-ops.
query = "Does Northeastern have coop opportunities in the healthcare sector?"
answer = generator.get_answer(query)

print("-"*20)
print("User Query:", query)
print("Model Output:", answer)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


System Content: You are an expert on creating helpful answers to user queries using the information here in your Website Information.
Your task is to create a single response to the single User Query below.
Use only the **Website Information** to answer the question below.
**Do not include any information from outside of this Website Information**
**Only answer the query, do not provide unrelated information.**
If the answer is not found, say so clearly.

---
**Website Information:**
The visa application process requires a DS-160 form and an appointment.
International students can apply for OPT after completing their degree.
Northeastern offers various co-op opportunities in tech and finance.
Travel restrictions may apply based on your country of origin.

User Query: Does Northeastern have coop opportunities in the tech sector?
Model Output:  Yes, Northeastern offers various co-op opportunities in the tech sector.
System Content: You are an expert on creating helpful answers to user qu