In [1]:
pip install --upgrade transformers


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
from transformers import AutoTokenizer

# Directory (relative to the notebook) passed to the tokenizer loader.
model_path = "./flan"

# Load the tokenizer stored at `model_path`.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_path,
)


In [7]:
import requests
import torch
from transformers import AutoTokenizer

# Greedy decoding against a remotely served seq2seq model.
#
# The prompt is tokenized locally, then the inference endpoint is called once
# per generated token with the encoder inputs plus the decoder prefix built so
# far. The arg-max of the last position's logits is appended to the prefix
# until EOS is produced or `max_length` tokens have been generated.

# Initialize tokenizer
model_path = "./flan"
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Define inference URL
infer_url = "http://modelmesh-serving.llm.svc.cluster.local:8008/v2/models/t5-pytorch/infer"

# Upper bound (seconds) for each HTTP round trip, so a stalled server raises
# a timeout error instead of hanging the cell forever.
request_timeout = 60

# Test input text
input_text = "Instruction: Explain the feature.\nInput: ANNUAL_INCOME_POSITIVE"

# Tokenize the input (padded/truncated to a fixed length of 128 tokens)
tokenized_input = tokenizer(
    input_text,
    return_tensors="pt",
    max_length=128,
    padding="max_length",
    truncation=True
)

# Extract tokenized tensors
input_ids = tokenized_input["input_ids"]
attention_mask = tokenized_input["attention_mask"]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)

# Initialize decoder input IDs with <pad> token
decoder_input_ids = torch.tensor([[tokenizer.pad_token_id]], device=device)

# Iterative decoding
max_length = 80  # Maximum generation length
generated_ids = []

# The encoder-side inputs never change between steps, so serialize them once
# instead of flattening the tensors on every iteration.
encoder_inputs = [
    {
        "name": "input_ids",
        "shape": list(input_ids.shape),
        "datatype": "INT64",
        "data": input_ids.flatten().tolist()
    },
    {
        "name": "attention_mask",
        "shape": list(attention_mask.shape),
        "datatype": "INT64",
        "data": attention_mask.flatten().tolist()
    }
]
headers = {"Content-Type": "application/json"}

# A Session reuses the underlying connection across the per-token requests.
# NOTE: `verify=False` was dropped on purpose; it only affects HTTPS and the
# endpoint above is plain HTTP. If the endpoint moves to HTTPS, pass a CA
# bundle via `verify=` rather than disabling certificate checks.
with requests.Session() as session:
    for _ in range(max_length):
        # Prepare payload for the current step: fixed encoder inputs plus the
        # decoder prefix generated so far.
        payload = {
            "inputs": encoder_inputs + [
                {
                    "name": "decoder_input_ids",
                    "shape": list(decoder_input_ids.shape),
                    "datatype": "INT64",
                    "data": decoder_input_ids.flatten().tolist()
                }
            ]
        }

        # Send POST request
        response = session.post(
            infer_url, json=payload, headers=headers, timeout=request_timeout
        )

        if response.status_code != 200:
            print(f"Error {response.status_code}: {response.text}")
            break

        response_data = response.json()
        output = response_data["outputs"][0]
        logits = output["data"]

        # Take the vocabulary size from the shape reported by the server
        # rather than hard-coding it; if no shape is reported, derive it from
        # the flat length (batch size is 1, one row per decoder position).
        reported_shape = output.get("shape")
        if reported_shape:
            vocab_size = reported_shape[-1]
        else:
            vocab_size = len(logits) // decoder_input_ids.shape[-1]
        logits_tensor = torch.tensor(logits, device=device).view(1, -1, vocab_size)

        # Get the next token (greedy: arg-max over the last position)
        next_token_id = torch.argmax(logits_tensor[:, -1, :], dim=-1)
        generated_ids.append(next_token_id.item())

        # Break if EOS token is generated
        if next_token_id.item() == tokenizer.eos_token_id:
            break

        # Update decoder_input_ids for the next iteration
        decoder_input_ids = torch.cat([decoder_input_ids, next_token_id.unsqueeze(0)], dim=-1)

# Decode the generated IDs to text
generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
print("Generated Output:", generated_text)


Generated Output: The feature reflects that the client has a low annual income. This means that the client does not have a good financial stability and cash flow, which increases default risk.
