In [6]:
import torch

BERT_CONFIGURATION={
    "model" : "../inputs/bert-finetuned-math-prob-classification",
    "device"  'cuda' if torch.cuda.is_available() else 'cpu'
    "api_key": "EMPTY", 
    "max_tokens":1000,
    # "base_url":f"http://localhost:11434/v1"
    }

In [7]:
# custom client with custom model loader
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, AutoModelForSequenceClassification
import random

class BertModelClient:
    def __init__(self, config, **kwargs):
        print(f"CustomModelClient config: {config}")
        self.device = config.get("device", "cpu")
        self.model = AutoModelForCausalLM.from_pretrained(config["model"]).to(self.device)
        self.model_name = config["model"]
        self.tokenizer = AutoTokenizer.from_pretrained(config["model"], use_fast=False)
        self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
        self.label_mapping = {
            0: "Algebra",
            1: "Counting & Probability",
            2: "Geometry",
            3: "Intermediate Algebra",
            4: "Number Theory",
            5: "Prealgebra",
            6: "Precalculus"
        }

        # params are set by the user and consumed by the user since they are providing a custom model
        # so anything can be done here
        gen_config_params = config.get("params", {})
        self.max_length = gen_config_params.get("max_length", 256)

        print(f"Loaded model {config['model']} to {self.device}")

    def create(self, params):
        if params.get("stream", False) and "messages" in params:
            raise NotImplementedError("Local models do not support streaming.")
        else:
            num_of_responses = params.get("n", 1)

            # can create my own data response class
            # here using SimpleNamespace for simplicity
            # as long as it adheres to the ClientResponseProtocol

            response = SimpleNamespace()
            
    #         # Tokenize the input text
    # inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    
    # # Move inputs to GPU if available
    # inputs = {key: val.to(device) for key, val in inputs.items()}
    
    # # Make prediction
    # with torch.no_grad():
    #     outputs = model(**inputs)
    
    # # Get the predicted class
    # predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    # predicted_class = torch.argmax(predictions, dim=-1).item()

    # predicted_label = label_mapping[predicted_class]
    

            inputs = self.tokenizer.apply_chat_template(
                params["messages"], return_tensors="pt", add_generation_prompt=True
            ).to(self.device)
            inputs_length = inputs.shape[-1]

            # add inputs_length to max_length
            max_length = self.max_length + inputs_length
            generation_config = GenerationConfig(
                max_length=max_length,
                eos_token_id=self.tokenizer.eos_token_id,
                pad_token_id=self.tokenizer.eos_token_id,
            )

            response.choices = []
            response.model = self.model_name

            for _ in range(num_of_responses):
                outputs = self.model.generate(inputs, generation_config=generation_config)
                # Decode only the newly generated text, excluding the prompt
                text = self.tokenizer.decode(outputs[0, inputs_length:])
                choice = SimpleNamespace()
                choice.message = SimpleNamespace()
                choice.message.content = text
                choice.message.function_call = None
                response.choices.append(choice)

            return response

    def message_retrieval(self, response):
        """Retrieve the messages from the response."""
        choices = response.choices
        return [choice.message.content for choice in choices]

    def cost(self, response) -> float:
        """Calculate the cost of the response."""
        response.cost = 0
        return 0
    
    def predictMathType(self, text):
        try:
            inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
            inputs = {key: val.to(self.device) for key, val in inputs.items()}

            with torch.no_grad():
                outputs = self.model(**inputs)

            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            predicted_class = torch.argmax(predictions, dim=-1).item()
            return self.label_mapping[predicted_class]
        except Exception as e:
            predicted_class = random.randint(0,6)
            return self.label_mapping[predicted_class]

    @staticmethod
    def get_usage(response):
        # returns a dict of prompt_tokens, completion_tokens, total_tokens, cost, model
        # if usage needs to be tracked, else None
        return {}

In [8]:
bert = BertModelClient(BERT_CONFIGURATION)

CustomModelClient config: {'model': '../inputs/bert-finetuned-math-prob-classification', 'cpuapi_key': 'EMPTY', 'max_tokens': 1000}


If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`
Some weights of BertLMHeadModel were not initialized from the model checkpoint at ../inputs/bert-finetuned-math-prob-classification and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded model ../inputs/bert-finetuned-math-prob-classification to cpu


In [11]:
# bert.predictMathType("What is 5 + 5")
bert.predictMathType("What is the sum of all the prime numbers between 0 and 100?")


'Counting & Probability'