# Deploy LLM for DEMO

## Table of Contents

1. [Download and Save a Model Locally](#Download-and-Save-a-Model-Locally)
2. [Set Up MLflow Credential and Environment values](#Set-Up-MLflow-Credential-and-Environment-values)
3. [Logging the Downloaded Model as an Artifact](#Logging-the-Downloaded-Model-as-an-Artifact)
4. [Register the model to MLflow Model registry](#Register-the-model-to-MLflow-Model-registry)


In [1]:
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer 
import requests
import mlflow
import mlflow.pyfunc
from mlflow.tracking import MlflowClient
import torch
import ipywidgets as widgets

from IPython.display import display

# Download and Save a Model Locally

Download a model from Hugging Face and save it locally.

In [5]:
# Values captured from the form below; both stay None until Submit is clicked.
model_name, model_dir = None, None

# Section title shown above the form.
heading = widgets.HTML(value="<h2>Model Name and Local Directory to Store</h2>")
display(heading)

# Free-text inputs. The placeholders are display hints only; they are not
# used as values when a field is left empty.
modelname_input = widgets.Text(
    description='Model name:',
    placeholder="sentence-transformers/all-MiniLM-L6-v2",
)
modeldir_input = widgets.Text(
    description='Save to:',
    placeholder="./models/model",
)

# Submit control and the output area that shows the confirmation message.
submit_button = widgets.Button(description='Submit')
success_message = widgets.Output()

def submit_button_clicked(b):
    """Store the submitted model name and target directory in the notebook globals.

    Reads ``modelname_input`` and ``modeldir_input``, strips surrounding
    whitespace and writes the results to the globals ``model_name`` and
    ``model_dir``. If either field is empty, a hint is printed and the button
    stays enabled so the form can be corrected and resubmitted.

    Args:
      b: The button widget that was clicked (passed in by ``on_click``).
    """
    global model_name, model_dir
    entered_name = modelname_input.value.strip()
    entered_dir = modeldir_input.value.strip()
    with success_message:
        success_message.clear_output()
        if not entered_name or not entered_dir:
            print("Please enter both a model name and a directory to save to.")
            return
        model_name = entered_name
        model_dir = entered_dir
        print("Configuration submitted successfully!")
    # Disable the button that was actually clicked. The global name
    # ``submit_button`` is rebound by the credentials form later in the
    # notebook, so looking it up here could disable the wrong button.
    b.disabled = True

# Vertical spacing around the submit button.
submit_button.layout.margin = '20px 0 20px 0'

# Run the handler whenever the button is clicked.
submit_button.on_click(submit_button_clicked)

# Render the form: both inputs, the button, then the message area.
model_form_widgets = (modelname_input, modeldir_input, submit_button, success_message)
display(*model_form_widgets)

HTML(value='<h2>Model Name and Local Directory to Store</h2>')

Text(value='', description='Model name:', placeholder='sentence-transformers/all-MiniLM-L6-v2')

Text(value='', description='Save to:', placeholder='./models/model')

Button(description='Submit', layout=Layout(margin='20px 0 20px 0'), style=ButtonStyle())

Output()

In [6]:
# Show the values captured by the form (prints "None None" before Submit is clicked).
print(f"{model_name} {model_dir}")

yanolja/EEVE-Korean-10.8B-v1.0 ./models/model


In [13]:
#model_name = "gpt2"
#model_dir = "./models/"+model_name

In [7]:
# Download the tokenizer and model, then save a local copy under model_dir.
# Fail early with a clear message if the form above was never submitted.
if not model_name or not model_dir:
    raise ValueError(
        "model_name and model_dir are not set; fill in the form above and click Submit first."
    )

tokenizer = AutoTokenizer.from_pretrained(model_name)
# torch_dtype="auto" keeps the dtype stored in the checkpoint. Per the
# transformers documentation the default is to load weights as float32, which
# for a half-precision checkpoint doubles both the memory needed here and the
# size of the copy written by save_pretrained below.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")

os.makedirs(model_dir, exist_ok=True)

model.save_pretrained(model_dir)
# Kept as the last expression so the notebook shows the written tokenizer files.
tokenizer.save_pretrained(model_dir)

tokenizer_config.json:   0%|          | 0.00/1.86k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.18M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

('./models/model/tokenizer_config.json',
 './models/model/special_tokens_map.json',
 './models/model/tokenizer.json')

# Set Up MLflow Credential and Environment values

Set up MLflow for model tracking and register the model. Include authentication if necessary.

In [8]:
# Values captured from the form below; all stay None until Submit is clicked.
domain, mlflow_username, mlflow_password = None, None, None

# Section title shown above the form.
heading = widgets.HTML(value="<h2>MLflow Credentials</h2>")
display(heading)

# Input fields; the password field uses the masked Password widget.
domain_input = widgets.Text(
    description='Domain:',
    placeholder="ua.ezm.host",
)
username_input = widgets.Text(description='Username:')
password_input = widgets.Password(description='Password:')

# Submit control and the output area that shows the confirmation message.
submit_button = widgets.Button(description='Submit')
success_message = widgets.Output()

def submit_button_clicked(b):
    """Store the submitted MLflow credentials in the notebook globals.

    Reads ``domain_input``, ``username_input`` and ``password_input`` and
    writes the results to the globals ``domain``, ``mlflow_username`` and
    ``mlflow_password``. Domain and username are stripped of surrounding
    whitespace; the password is taken as typed. If any field is empty, a hint
    is printed and the button stays enabled so the form can be resubmitted.

    Args:
      b: The button widget that was clicked (passed in by ``on_click``).
    """
    global domain, mlflow_username, mlflow_password
    entered_domain = domain_input.value.strip()
    entered_username = username_input.value.strip()
    entered_password = password_input.value
    with success_message:
        success_message.clear_output()
        if not (entered_domain and entered_username and entered_password):
            print("Please fill in the domain, username and password.")
            return
        domain = entered_domain
        mlflow_username = entered_username
        mlflow_password = entered_password
        print("Credentials submitted successfully!")
    # Disable the clicked button itself rather than whatever the global name
    # ``submit_button`` refers to when the click happens; that name is
    # assigned by more than one cell in this notebook.
    b.disabled = True

# Vertical spacing around the submit button.
submit_button.layout.margin = '20px 0 20px 0'

# Run the handler whenever the button is clicked.
submit_button.on_click(submit_button_clicked)

# Render the form: the three inputs, the button, then the message area.
credential_form_widgets = (
    domain_input,
    username_input,
    password_input,
    submit_button,
    success_message,
)
display(*credential_form_widgets)

HTML(value='<h2>MLflow Credentials</h2>')

Text(value='', description='Domain:', placeholder='ua.ezm.host')

Text(value='', description='Username:')

Password(description='Password:')

Button(description='Submit', layout=Layout(margin='20px 0 20px 0'), style=ButtonStyle())

Output()

In [9]:
# Request an access token from the Keycloak token endpoint using the
# username/password entered in the credentials form (password grant).
token_url = f"https://keycloak.{domain}/realms/UA/protocol/openid-connect/token"

data = {
    "username" : mlflow_username,
    "password" : mlflow_password,
    "grant_type" : "password",
    "client_id" : "ua-grant",
}

# NOTE(review): verify=False turns off TLS certificate verification, so the
# password is sent to a server whose identity is not checked. Presumably the
# endpoint uses a self-signed certificate; prefer passing the CA bundle path
# via verify=... once one is available.
# The timeout keeps the cell from hanging indefinitely on an unreachable host.
token_responce = requests.post(
    token_url,
    data=data,
    allow_redirects=True,
    verify=False,
    timeout=30,
)

# Surface a rejected login (4xx/5xx) as an HTTPError here instead of as a
# KeyError on "access_token" below.
token_responce.raise_for_status()

token = token_responce.json()["access_token"]



In [10]:
# S3-compatible endpoint; the same URL is used for the AWS and MLflow settings.
s3_endpoint_url = 'http://local-s3-service.ezdata-system.svc.cluster.local:30000'

# Export everything in one step. The access token is used both as the MLflow
# tracking token and as the S3 access key id.
os.environ.update({
    "MLFLOW_TRACKING_TOKEN": token,
    "AWS_ACCESS_KEY_ID": token,
    "AWS_SECRET_ACCESS_KEY": "s3",
    "AWS_ENDPOINT_URL": s3_endpoint_url,
    "MLFLOW_S3_ENDPOINT_URL": s3_endpoint_url,
    "MLFLOW_S3_IGNORE_TLS": "true",
    "MLFLOW_TRACKING_INSECURE_TLS": "true",
    "MLFLOW_TRACKING_URI": "http://mlflow.mlflow.svc.cluster.local:5000",
})

# Logging the Downloaded Model as an Artifact

First, create a new experiment (or reuse an existing one) and log the model as an artifact of that
experiment. Then retrieve the URI that points to the artifact's location and provide it to the custom
predictor component, so that the component knows where to fetch the artifact from and can serve it
effectively.

In [11]:
def get_or_create_experiment(exp_name):
    """Make ``exp_name`` the active MLflow experiment.

    Delegates to ``mlflow.set_experiment``, which (per the MLflow
    documentation) creates the experiment if it does not exist yet.

    Args:
      exp_name (str): The name of the experiment.

    Raises:
      RuntimeError: If ``mlflow.set_experiment`` fails. The original
        exception is attached as ``__cause__``.
    """
    try:
        mlflow.set_experiment(exp_name)
    except Exception as e:
        # Chain explicitly so the traceback shows the underlying failure as
        # the direct cause of this error.
        raise RuntimeError(f"Failed to set the experiment: {e}") from e

In [12]:
# Activate the experiment the model run is logged under
# (a new MLflow experiment is created, or an existing one is re-used).
experiment_name = 'text-generation-models'
get_or_create_experiment(experiment_name)

In [13]:
#  Log the downloaded model as an artifact of the experiment
class LLMModelWrapper(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc wrapper around a saved causal language model and its tokenizer."""

    def load_context(self, context):
        """Load the tokenizer and model from the ``model_dir`` artifact.

        Args:
          context: MLflow model context; ``context.artifacts["model_dir"]``
            is the local path of the logged model directory.
        """
        # Imported inside the method, presumably so the wrapper does not
        # depend on notebook-level imports when it is loaded for serving.
        from transformers import AutoModelForCausalLM, AutoTokenizer

        model_path = context.artifacts["model_dir"]
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForCausalLM.from_pretrained(model_path)

    def predict(self, context, model_input):
        """Generate text for ``model_input`` and return the decoded result.

        Args:
          context: MLflow model context (unused).
          model_input: Text passed directly to the tokenizer.

        Returns:
          str: The first generated sequence, decoded with special tokens
          removed. Any further sequences in the batch are not returned.
        """
        inputs = self.tokenizer(model_input, return_tensors="pt")
        outputs = self.model.generate(
            inputs["input_ids"],
            # Pass the tokenizer's attention mask (None if it did not produce
            # one) so generate() does not have to infer it from the pad token.
            attention_mask=inputs.get("attention_mask"),
            # Per the transformers docs, max_length counts the prompt tokens
            # as well as the newly generated ones.
            max_length=50,
        )
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
    
# Artifact path of the logged model inside the run; also the last segment of
# the runs:/ URI built below.
uri_path = "model"

# Start a run and log the wrapper together with the locally saved model
# directory as a pyfunc model.
with mlflow.start_run() as run:
    mlflow.pyfunc.log_model(
        python_model=LLMModelWrapper(),
        artifacts={"model_dir": model_dir},
        artifact_path=uri_path,
    )

    # URI that refers to the logged model through the run id.
    model_uri = f"runs:/{run.info.run_id}/{uri_path}"

Downloading artifacts:   0%|          | 0/15 [00:00<?, ?it/s]

2024/06/07 06:59:42 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false


# Register the model to MLflow Model registry

In [14]:
# Register the logged model in the MLflow Model Registry under the model's
# name and report the version that was assigned.
result = mlflow.register_model(model_uri=model_uri, name=model_name)

registered_version = result.version
print(f"Model registered with name: {model_name} and version: {registered_version}")

Registered model 'yanolja/EEVE-Korean-10.8B-v1.0' already exists. Creating a new version of this model...
2024/06/07 07:27:19 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: yanolja/EEVE-Korean-10.8B-v1.0, version 1


Model registered with name: yanolja/EEVE-Korean-10.8B-v1.0 and version: 1


Created version '1' of model 'yanolja/EEVE-Korean-10.8B-v1.0'.


In [None]:
#loaded_model = mlflow.pyfunc.load_model(model_uri)
#sample_input = ["This is a sample input for the gpt2 model."]
#predicted_probs = loaded_model.predict(sample_input)
#print(predicted_probs)