In [2]:
# Create the project directory used by the following `%cd`; `-p` keeps this cell
# idempotent, so it is safe to re-run when the directory already exists.
!mkdir -p service_project

In [7]:
%cd service_project

/home/jupyter/workshop/2.service_image/service_project


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [8]:
%%writefile main.py
import os
from typing import Dict, Any, List
import logging
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import torch
from transformers import AutoTokenizer
from unsloth import FastLanguageModel
import asyncio
from google.cloud import storage
import shutil
import subprocess

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def parse_dtype_env(raw):
    """Turn the DTYPE environment value into something usable as a model dtype.

    Args:
        raw: The raw environment string, or None when the variable is unset.

    Returns:
        None when the variable is unset, blank, or one of "none"/"null"/"auto"
        (case-insensitive); otherwise the ``torch.dtype`` attribute named by the
        value, with an optional ``torch.`` prefix accepted.

    Raises:
        ValueError: If the value does not name a ``torch.dtype``.
    """
    if raw is None:
        return None
    name = raw.strip()
    # Environment variables are always strings, so a literal "None" must be
    # mapped back to the Python None object explicitly.
    if name.lower() in ("", "none", "null", "auto"):
        return None
    if name.startswith("torch."):
        name = name[len("torch."):]
    candidate = getattr(torch, name, None)
    if not isinstance(candidate, torch.dtype):
        raise ValueError(f"Unsupported DTYPE value: {raw!r}")
    return candidate


# Get environment variables
AIP_HTTP_PORT = int(os.getenv("AIP_HTTP_PORT", "8080"))
AIP_HEALTH_ROUTE = os.getenv("AIP_HEALTH_ROUTE", "/health")
AIP_PREDICT_ROUTE = os.getenv("AIP_PREDICT_ROUTE", "/predict")
AIP_STORAGE_URI = os.getenv("AIP_STORAGE_URI", "")  # GCS path to model
LOCAL_MODEL_PATH = "/tmp/model"  # Local path to save the model

# Model configuration
MAX_SEQ_LENGTH = int(os.getenv("MAX_SEQ_LENGTH", "8192"))
DTYPE = parse_dtype_env(os.getenv("DTYPE"))  # None unless a torch dtype name is given
LOAD_IN_4BIT = os.getenv("LOAD_IN_4BIT", "True").lower() == "true"

class PredictRequest(BaseModel):
    # Request body accepted by the prediction route.
    # instances: one dict per input; the predict handler reads each instance's "prompt" key.
    instances: List[Dict[str, Any]]
    # parameters: generation settings for the whole request; the predict handler looks up
    # "temperature" and "max_output_tokens" in this dict.
    parameters: Dict[str, Any] = {
        "temperature": 0.2,
        "max_output_tokens": 512
    }

class PredictResponse(BaseModel):
    # Response body returned by the prediction route.
    # predictions: one dict per request instance, in request order; the predict handler
    # stores the text under the "content" key.
    predictions: List[Dict[str, Any]]
    # The three fields below are filled by the predict handler from the
    # AIP_DEPLOYED_MODEL_ID, AIP_MODEL_VERSION and AIP_MODEL_NAME environment
    # variables, and stay empty strings when those are unset.
    deployed_model_id: str = ""
    model_version_id: str = ""
    model_resource_name: str = ""

def download_model_from_gcs():
    """Copy the model files referenced by AIP_STORAGE_URI into LOCAL_MODEL_PATH.

    For ``gs://`` URIs the objects under the given folder are downloaded with the
    google-cloud-storage client, preserving their relative paths. Any other
    non-empty URI is handed to ``gsutil cp -r``.

    Raises:
        ValueError: If AIP_STORAGE_URI is empty (there is nothing to download).
        Exception: Any error from the storage client or gsutil is logged and re-raised.
    """
    try:
        logger.info(f"Downloading model from {AIP_STORAGE_URI} to {LOCAL_MODEL_PATH}")

        # Fail fast: with an empty URI the gsutil fallback would otherwise expand
        # to copying "/*" (the container's root filesystem) into the model directory.
        if not AIP_STORAGE_URI:
            raise ValueError("AIP_STORAGE_URI is not set; cannot locate model artifacts")

        # Create local model directory if it doesn't exist
        os.makedirs(LOCAL_MODEL_PATH, exist_ok=True)

        if AIP_STORAGE_URI.startswith("gs://"):
            # Parse bucket and folder prefix from the GCS path
            bucket_name, _, prefix = AIP_STORAGE_URI[len("gs://"):].partition("/")
            prefix = prefix.strip("/")
            # A trailing slash restricts the listing to objects inside this folder;
            # a bare prefix "model" would also match "model_v2/...".
            list_prefix = f"{prefix}/" if prefix else ""

            storage_client = storage.Client()
            bucket = storage_client.bucket(bucket_name)

            downloaded = 0
            for blob in bucket.list_blobs(prefix=list_prefix):
                relative_path = blob.name[len(list_prefix):]
                # Skip zero-length "folder" placeholder objects: they have no file
                # name and would make the download target a directory.
                if not relative_path or relative_path.endswith("/"):
                    continue

                local_file_path = os.path.join(LOCAL_MODEL_PATH, relative_path)
                os.makedirs(os.path.dirname(local_file_path), exist_ok=True)

                blob.download_to_filename(local_file_path)
                downloaded += 1
                logger.info(f"Downloaded {blob.name} to {local_file_path}")

            if downloaded == 0:
                logger.warning(f"No objects found under {AIP_STORAGE_URI}")
        else:
            # Not a gs:// path: fall back to gsutil. Arguments are passed as a list
            # (no shell) so the URI cannot inject shell syntax.
            # NOTE(review): this relies on gsutil expanding the "*" wildcard itself — confirm.
            subprocess.run(
                [
                    "gsutil", "-m", "cp", "-r",
                    f"{AIP_STORAGE_URI.rstrip('/')}/*",
                    f"{LOCAL_MODEL_PATH}/",
                ],
                check=True,
            )

        logger.info("Model download completed successfully")
    except Exception as e:
        logger.error(f"Error downloading model: {str(e)}")
        raise

def format_prompt(instruction: str, input_text: str = None) -> str:
    """Assemble the instruction-style prompt text.

    Args:
        instruction: Text placed under the "### Instruction:" heading.
        input_text: Optional text placed under a "### Input:" heading; the
            heading is left out when this is None or empty.

    Returns:
        The prompt, ending with "### Response:" followed by a newline.
    """
    sections = [
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.",
        f"### Instruction:\n{instruction}",
    ]
    if input_text:
        sections.append(f"### Input:\n{input_text}")
    sections.append("### Response:\n")
    # Sections are separated by exactly one blank line.
    return "\n\n".join(sections)

# Initialize FastAPI app
app = FastAPI(title="Vertex AI PEFT Model Prediction Service")

@app.on_event("startup")
async def startup_event():
    """Fetch the model artifacts and load model + tokenizer into module globals.

    Any failure is logged and re-raised.
    """
    global model, tokenizer

    try:
        logger.info("Starting model initialization...")

        # Step 1: pull the artifacts down to LOCAL_MODEL_PATH
        download_model_from_gcs()

        # Step 2: load from the local copy using the env-driven settings
        logger.info("Loading model and tokenizer...")
        load_options = {
            "model_name": LOCAL_MODEL_PATH,
            "max_seq_length": MAX_SEQ_LENGTH,
            "dtype": DTYPE,
            "load_in_4bit": LOAD_IN_4BIT,
        }
        model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

        # Step 3: switch the loaded model to inference mode
        FastLanguageModel.for_inference(model)

        logger.info("Model and tokenizer loaded successfully")
    except Exception as exc:
        logger.error(f"Error during startup: {str(exc)}")
        raise

@app.on_event("shutdown")
async def shutdown_event():
    """Remove the local model directory on shutdown; errors are logged, not raised."""
    try:
        model_dir_present = os.path.exists(LOCAL_MODEL_PATH)
        if model_dir_present:
            # Delete the downloaded model files
            shutil.rmtree(LOCAL_MODEL_PATH)
        logger.info("Cleaned up model files")
    except Exception as exc:
        logger.error(f"Error during cleanup: {str(exc)}")

@app.get(AIP_HEALTH_ROUTE)
async def health():
    """Health check route (path taken from AIP_HEALTH_ROUTE); always reports healthy."""
    status_payload = dict(status="healthy")
    return status_payload

@app.post(AIP_PREDICT_ROUTE, response_model=PredictResponse)
async def predict(request: PredictRequest):
    """Generate one completion per request instance.

    Each instance must carry a non-empty "prompt"; instances without one get a
    placeholder message instead of a generation. Generation settings come from
    ``request.parameters`` ("temperature", "max_output_tokens") and apply to
    every instance in the request.

    Returns:
        PredictResponse with one {"content": ...} dict per instance, plus the
        deployment identifiers read from the AIP_* environment variables.

    Raises:
        HTTPException: status 500 with the error text for any failure.
    """
    try:
        # Log incoming request
        logger.info(f"Received prediction request with {len(request.instances)} instances")

        # Resolve generation settings once; an explicit null falls back to the default.
        raw_max_tokens = request.parameters.get("max_output_tokens")
        raw_temperature = request.parameters.get("temperature")
        max_new_tokens = 512 if raw_max_tokens is None else int(raw_max_tokens)
        temperature = 0.2 if raw_temperature is None else float(raw_temperature)

        generation_kwargs = {"max_new_tokens": max_new_tokens}
        if temperature > 0:
            # temperature only takes effect when sampling is enabled; without
            # do_sample=True generation is greedy and the value is ignored.
            generation_kwargs["do_sample"] = True
            generation_kwargs["temperature"] = temperature
        else:
            # A temperature of 0 (or below) means deterministic greedy decoding.
            generation_kwargs["do_sample"] = False

        predictions = []

        for instance in request.instances:
            prompt = instance.get("prompt", None)

            if not prompt:
                predictions.append({
                    "content": "Need to use 'prompt' parameter"
                })
                continue

            # Tokenize input
            inputs = tokenizer(
                [prompt],
                return_tensors="pt"
            ).to("cuda")

            # Generate response
            outputs = model.generate(**inputs, **generation_kwargs)

            # Decode the full sequence; the returned text starts with the prompt
            # itself, which existing clients of this route already receive.
            response = tokenizer.decode(outputs[0], skip_special_tokens=True)

            predictions.append({
                "content": response
            })

        # Get deployment details from environment variables
        deployed_model_id = os.getenv("AIP_DEPLOYED_MODEL_ID", "")
        model_version = os.getenv("AIP_MODEL_VERSION", "")
        model_name = os.getenv("AIP_MODEL_NAME", "")

        return PredictResponse(
            predictions=predictions,
            deployed_model_id=deployed_model_id,
            model_version_id=model_version,
            model_resource_name=model_name
        )

    except Exception as e:
        logger.error(f"Error during prediction: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))

if __name__ == "__main__":
    import uvicorn

    # Announce the effective configuration before the server starts
    for startup_message in (
        f"Starting server on port {AIP_HTTP_PORT}",
        f"Health check endpoint: {AIP_HEALTH_ROUTE}",
        f"Prediction endpoint: {AIP_PREDICT_ROUTE}",
    ):
        logger.info(startup_message)

    # Serve the app on all interfaces at the configured port
    server_options = {
        "host": "0.0.0.0",
        "port": AIP_HTTP_PORT,
        "log_level": "info",
    }
    uvicorn.run("main:app", **server_options)

Overwriting main.py


In [9]:
%%writefile requirements.txt
# Web serving stack used by main.py
fastapi>=0.68.0
uvicorn>=0.15.0
pydantic>=1.8.0
python-multipart>=0.0.5
typing-extensions>=4.0.0
# Model runtime.
# NOTE(review): torch>=2.4.0 is newer than the torch 2.1.0 in the Dockerfile base image,
# so pip installs a second torch build on top of it — confirm this is intended.
torch>=2.4.0
xformers>=0.0.27.post2
unsloth==2025.1.1
transformers==4.47.1
accelerate>=0.34.1
# NOTE(review): bitsandbytes is the only unpinned package — consider pinning it.
bitsandbytes
triton>=3.0.0
sentencepiece>=0.2.0
# Client used by main.py to download model files from GCS
google-cloud-storage>=2.0.0

Overwriting requirements.txt


In [13]:
%%writefile Dockerfile
# Serving image for the FastAPI prediction service in main.py.
FROM pytorch/pytorch:2.1.0-cuda12.1-cudnn8-runtime
WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \
    git \
    wget \
    build-essential \
    gcc \
    g++ \
    curl \
    gnupg \
    ca-certificates \
    && rm -rf /var/lib/apt/lists/*

# Install Google Cloud SDK (provides gsutil, which main.py uses for non-gs:// model paths)
RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \
    echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \
    apt-get update -y && \
    apt-get install google-cloud-sdk -y && \
    rm -rf /var/lib/apt/lists/*

# Set environment variable for C compiler
ENV CC=/usr/bin/gcc

# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY main.py .

# Default environment variables
ENV AIP_HTTP_PORT=8080
ENV AIP_HEALTH_ROUTE=/health
ENV AIP_PREDICT_ROUTE=/predict

# Model configuration environment variables.
# DTYPE is intentionally left unset: an ENV value is always a string, so
# "DTYPE=None" would reach main.py as the text "None" rather than as "no dtype".
# Pass -e DTYPE=<value> at run time only when an explicit dtype is required.
ENV MAX_SEQ_LENGTH=8192
ENV LOAD_IN_4BIT=True
# NOTE(review): main.py hard-codes /tmp/model and does not read LOCAL_DOWNLOAD_PATH.
ENV LOCAL_DOWNLOAD_PATH=/tmp/model

# Create model directory
RUN mkdir -p /tmp/model

# Expose the port
EXPOSE ${AIP_HTTP_PORT}

# Start the application
CMD ["python", "main.py"]

Overwriting Dockerfile


In [14]:
! docker build -t vertex-prediction-peft .

Sending build context to Docker daemon  20.99kB
Step 1/18 : FROM pytorch/pytorch:2.1.0-cuda12.1-cudnn8-runtime
 ---> 1bfcece93725
Step 2/18 : WORKDIR /app
 ---> Using cache
 ---> cad19167b04d
Step 3/18 : RUN apt-get update && apt-get install -y     git     wget     build-essential     gcc     g++     curl     gnupg     ca-certificates     && rm -rf /var/lib/apt/lists/*
 ---> Running in 5554ea5059ba
Get:1 http://security.ubuntu.com/ubuntu focal-security InRelease [128 kB]
Get:2 http://archive.ubuntu.com/ubuntu focal InRelease [265 kB]
Get:3 http://security.ubuntu.com/ubuntu focal-security/multiverse amd64 Packages [30.9 kB]
Get:4 http://security.ubuntu.com/ubuntu focal-security/universe amd64 Packages [1297 kB]
Get:5 http://archive.ubuntu.com/ubuntu focal-updates InRelease [128 kB]
Get:6 http://archive.ubuntu.com/ubuntu focal-backports InRelease [128 kB]
Get:7 http://archive.ubuntu.com/ubuntu focal/restricted amd64 Packages [33.4 kB]
Get:8 http://archive.ubuntu.com/ubuntu focal/main amd

# Docker 이미지를 이용한 테스트 수행

터미널을 하나 띄우고, 아래와 같이, Docker Image를 실행합니다.

    docker run --gpus "device=1" -p 8085:8080 \
      -e MAX_SEQ_LENGTH=8192 \
      -e LOAD_IN_4BIT=True \
      -e AIP_STORAGE_URI="gs://vertexai-unsloth-yourname/fine_tuned_gemma_model" \
      vertex-prediction-peft

아래와 같은, 로그가 발생하는지 확인하세요. 

    /app/main.py:111: DeprecationWarning: 
            on_event is deprecated, use lifespan event handlers instead.
    
            Read more about it in the
            [FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
            
      @app.on_event("startup")
    /app/main.py:139: DeprecationWarning: 
            on_event is deprecated, use lifespan event handlers instead.
    
            Read more about it in the
            [FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
            
      @app.on_event("shutdown")
    INFO:__main__:Starting server on port 8080
    INFO:__main__:Health check endpoint: /health
    INFO:__main__:Prediction endpoint: /predict
    INFO:     Started server process [1]
    INFO:     Waiting for application startup.
    INFO:main:Starting model initialization...
    INFO:main:Downloading model from gs://vertexai-unsloth-yourname/fine_tuned_gemma_model to /tmp/model
    INFO:main:Downloaded fine_tuned_gemma_model/README.md to /tmp/model/README.md
    INFO:main:Downloaded fine_tuned_gemma_model/adapter_config.json to /tmp/model/adapter_config.json
    INFO:main:Downloaded fine_tuned_gemma_model/adapter_model.safetensors to /tmp/model/adapter_model.safetensors
    INFO:main:Downloaded fine_tuned_gemma_model/special_tokens_map.json to /tmp/model/special_tokens_map.json
    INFO:main:Downloaded fine_tuned_gemma_model/tokenizer.json to /tmp/model/tokenizer.json
    INFO:main:Downloaded fine_tuned_gemma_model/tokenizer.model to /tmp/model/tokenizer.model
    INFO:main:Downloaded fine_tuned_gemma_model/tokenizer_config.json to /tmp/model/tokenizer_config.json
    INFO:main:Downloaded fine_tuned_gemma_model/training_args.bin to /tmp/model/training_args.bin
    INFO:main:Model download completed successfully
    INFO:main:Loading model and tokenizer...
    `config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
    Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
    `config.hidden_activation` if you want to override this behaviour.
    See https://github.com/huggingface/transformers/pull/29402 for more details.
    Unsloth 2025.1.1 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.
    INFO:main:Model and tokenizer loaded successfully
    INFO:     Application startup complete.
    INFO:     Uvicorn running on http://0.0.0.0:8080 (Press CTRL+C to quit)
    🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
    🦥 Unsloth Zoo will now patch everything to make training faster!
    ==((====))==  Unsloth 2025.1.1: Fast Gemma patching. Transformers: 4.47.1.
       \\   /|    GPU: NVIDIA L4. Max memory: 21.951 GB. Platform: Linux.
    O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
    \        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
     "-____-"     Free Apache license: http://github.com/unslothai/unsloth
    Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!

이후에 또 다른 터미널을 하나 띄우고, 아래와 같이, CURL을 이용하여, Health Check를 수행합니다. 

    curl -X GET http://localhost:8085/health

결과가 아래와 같이 나오는지 확인해 보세요. 

    {"status":"healthy"}

실제로 Prediction이 정상적으로 되는지 확인해 보도록 하겠습니다. 

    curl -X POST http://localhost:8085/predict \
      -H "Content-Type: application/json" \
      -d '{
        "instances": [
          {
            "prompt": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nCreate a way to encrypt a message using a key in Python.\n\n### Input:\nmessage = \"Hello world!\"\nkey = \"secret\"\n\n### Response:"
          }
        ],
        "parameters": {
          "temperature": 0.2,
          "max_output_tokens": 512
        }
      }'

결과가 아래와 같이 나오는지 확인해 보세요.

    {"predictions":[{"content":"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nCreate a way to encrypt a message using a key in Python.\n\n### Input:\nmessage = \"Hello world!\"\nkey = \"secret\"\n\n### Response:\ndef encrypt(message, key):\n  encrypted_message = \"\"\n  for i in range(len(message)):\n    encrypted_message += chr(ord(message[i]) + ord(key[i % len(key)]))\n  return encrypted_message"}],"deployed_model_id":"","model_version_id":"","model_resource_name":""}    


이제 해당 Container 이미지를 완성하였습니다. 
해당 이미지를 이용하여 Deploy 대상 모델을 만들어서 Upload 합니다. 

## Upload container image & artifact

Container를 docker repository에 먼저 upload하여야 합니다. 

In [18]:
# 만약, artifact registry 서비스가 비활성화 되어 있다면, enable을 수행한다. 필요하면 아래 내용에 uncomment
# gcloud services enable artifactregistry.googleapis.com

# 만약, docker repository가 없다면 생성한다.
# Docker repository 생성
# ! gcloud artifacts repositories create osllm-repo \
#     --repository-format=docker \
#     --location=us-central1 \
#     --description="Repository for prediction models"

# 있는지 확인하고 싶다면, 
# ! gcloud artifacts repositories list


Create request issued for: [osllm-repo]
Waiting for operation [projects/turnkey-charter-358922/locations/us-central1/op
erations/12da9324-f2c3-423f-b576-dcaf36e7a4ef] to complete...done.             
Created repository [osllm-repo].


In [21]:
# Settings shared by the image tag/push cell and the Vertex AI cells below
import os

REGION = "us-central1"
# `!cmd` capture returns the command output as a list of lines; the first line is the project ID
PROJECT_ID = !gcloud config get-value project
PROJECT_ID = PROJECT_ID[0]
REPOSITORY = "osllm-repo"
IMAGE = "vertex-prediction-peft"
TAG = "latest"



In [22]:
! docker images

REPOSITORY                                                                                                                TAG                             IMAGE ID       CREATED          SIZE
asia-northeast3-docker.pkg.dev/turnkey-charter-358922/osllm/vertex-prediction-peft                                        latest                          69739fa6b41c   27 minutes ago   14.5GB
vertex-prediction-peft                                                                                                    latest                          69739fa6b41c   27 minutes ago   14.5GB
<none>                                                                                                                    <none>                          d73327f1ad57   16 hours ago     13.7GB
asia-northeast3-docker.pkg.dev/turnkey-charter-358922/osllm/vertex-prediction-peft                                        <none>                          31e24081d213   18 hours ago     13.7GB
<none>                               

In [24]:
# Tag the local image with its Artifact Registry path
!docker tag {IMAGE}:{TAG} {REGION}-docker.pkg.dev/{PROJECT_ID}/{REPOSITORY}/{IMAGE}:{TAG}

# Configure Docker credentials for the regional Artifact Registry host
!gcloud auth configure-docker --quiet {REGION}-docker.pkg.dev

# Push the image
!docker push {REGION}-docker.pkg.dev/{PROJECT_ID}/{REPOSITORY}/{IMAGE}:{TAG}

# Print the final image URI
CONTAINER_IMAGE_URI = f"{REGION}-docker.pkg.dev/{PROJECT_ID}/{REPOSITORY}/{IMAGE}:{TAG}"
print(f"\nContainer Image URI: {CONTAINER_IMAGE_URI}")


{
  "credHelpers": {
    "gcr.io": "gcloud",
    "us.gcr.io": "gcloud",
    "eu.gcr.io": "gcloud",
    "asia.gcr.io": "gcloud",
    "staging-k8s.gcr.io": "gcloud",
    "marketplace.gcr.io": "gcloud",
    "asia-northeast3-docker.pkg.dev": "gcloud"
  }
}
Adding credentials for: us-central1-docker.pkg.dev
Docker configuration file updated.
The push refers to repository [us-central1-docker.pkg.dev/turnkey-charter-358922/osllm-repo/vertex-prediction-peft]

[1Bb8601705: Preparing 
[1Bbda190d5: Preparing 
[1B9e7b987f: Preparing 
[1Bcfffd8c2: Preparing 
[1Bc0557cd1: Preparing 
[1B99f91062: Preparing 
[1B7ad60040: Preparing 
[1B8b3fbde8: Preparing 
[1Bbf18a086: Preparing 
[1B146fed80: Preparing 
[1B772bca01: Preparing 
latest: digest: sha256:f4fccc8fd982eb28d94358cec6e1a8765bbd1f19323bfbd745688ba73e3e538d size: 2835

Container Image URI: us-central1-docker.pkg.dev/turnkey-charter-358922/osllm-repo/vertex-prediction-peft:latest


## Vertex AI에, 만든 이미지와, Model Artifact를 이용하여 등록

이 코드는 Vertex AI에 커스텀 컨테이너를 사용하여 모델을 업로드하는 과정을 보여줍니다. 주로 다음과 같은 목적을 가집니다.

1.  **Vertex AI 초기화**: 프로젝트 ID와 리전을 설정하여 Vertex AI 환경을 초기화합니다.
2.  **모델 아티팩트 위치 지정**: GCS (Google Cloud Storage)에 저장된 모델 아티팩트의 위치를 지정합니다. 이 예제에서는 `MODEL_ARTIFACT_URI` 변수를 통해 위치를 설정합니다.
3.  **컨테이너 이미지 URI 설정**: 모델을 배포할 때 사용할 컨테이너 이미지의 URI를 설정합니다. `CONTAINER_IMAGE_URI` 변수를 통해 설정합니다.
4.  **모델 등록**: `aiplatform.Model.upload()` 함수를 사용하여 모델을 Vertex AI에 등록합니다. 이 때 모델의 이름, 아티팩트 위치, 컨테이너 이미지, 헬스 체크 및 예측 엔드포인트, 컨테이너 포트, 환경 변수 등을 설정합니다.
5.  **환경 변수 설정**: 모델 서버에 필요한 환경 변수를 설정합니다. 이 예제에서는 `MAX_SEQ_LENGTH`, `LOAD_IN_4BIT`, `HTTP_PORT` 등의 환경 변수를 설정합니다.
6.  **모델 업로드 확인**: 모델이 성공적으로 업로드되면 모델의 리소스 이름을 출력합니다.

**주요 변수:**

*   `PROJECT_ID`: Google Cloud 프로젝트 ID
*   `REGION`: Vertex AI 리전
*   `CONTAINER_IMAGE_URI`: 모델을 실행할 컨테이너 이미지 URI
*   `MODEL_ARTIFACT_URI`: GCS에 저장된 모델 아티팩트 위치
*   `MODEL_DISPLAY_NAME`: Vertex AI에 표시될 모델 이름
*   `serving_container_*` 인자: 모델 서빙 컨테이너 관련 설정 (헬스 체크 경로, 예측 경로, 포트, 환경 변수)

이 코드를 통해 모델을 Vertex AI에 등록하며, 이어지는 셀에서 엔드포인트에 배포하면 추론을 수행할 수 있습니다.

In [25]:
from google.cloud import aiplatform

# Initialize the Vertex AI SDK for this project and region
aiplatform.init(
    project=PROJECT_ID,
    location=REGION
)

# Container image URI (same value as built in the push cell) and model settings
CONTAINER_IMAGE_URI = f"{REGION}-docker.pkg.dev/{PROJECT_ID}/{REPOSITORY}/{IMAGE}:{TAG}"
MODEL_DISPLAY_NAME = f"{IMAGE}-model"
MODEL_ARTIFACT_URI = "gs://vertexai-unsloth-yourname/fine_tuned_gemma_model"


# Register the model: artifacts in GCS plus the custom serving container
# NOTE(review): main.py reads AIP_HTTP_PORT, not HTTP_PORT, so the HTTP_PORT entry
# below appears to be unused by the container — confirm before relying on it.
model = aiplatform.Model.upload(
    display_name=MODEL_DISPLAY_NAME,
    artifact_uri=MODEL_ARTIFACT_URI,       # location of the model artifacts in GCS
    serving_container_image_uri=CONTAINER_IMAGE_URI,
    serving_container_predict_route="/predict",
    serving_container_health_route="/health",
    serving_container_ports=[8080],
    serving_container_environment_variables={
        "MAX_SEQ_LENGTH": "8192",
        "LOAD_IN_4BIT": "True",
        "HTTP_PORT": "8080"
    },
)

print(f"Model uploaded. Resource name: {model.resource_name}")

Creating Model
Create Model backing LRO: projects/547505032058/locations/us-central1/models/3838277444821843968/operations/7813048160036585472
Model created. Resource name: projects/547505032058/locations/us-central1/models/3838277444821843968@1
To use this Model in another session:
model = aiplatform.Model('projects/547505032058/locations/us-central1/models/3838277444821843968@1')
Model uploaded. Resource name: projects/547505032058/locations/us-central1/models/3838277444821843968


In [26]:
# Deployment settings; every key is passed to endpoint.deploy() as a keyword argument
deployment_params = dict(
    machine_type="g2-standard-4",    # G2-series machine type
    accelerator_type="NVIDIA_L4",    # L4 GPU
    accelerator_count=1,
    min_replica_count=1,
    max_replica_count=1,
    traffic_split={"0": 100},
)

# Create the endpoint
endpoint_display_name = f"{MODEL_DISPLAY_NAME}-endpoint"
endpoint = aiplatform.Endpoint.create(display_name=endpoint_display_name)

# Deploy the uploaded model to the endpoint
endpoint.deploy(model=model, **deployment_params)

print(f"Model deployed to endpoint: {endpoint.resource_name}")



Creating Endpoint
Create Endpoint backing LRO: projects/547505032058/locations/us-central1/endpoints/6122424890596261888/operations/1983138442405478400
Endpoint created. Resource name: projects/547505032058/locations/us-central1/endpoints/6122424890596261888
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/547505032058/locations/us-central1/endpoints/6122424890596261888')
Deploying Model projects/547505032058/locations/us-central1/models/3838277444821843968 to Endpoint : projects/547505032058/locations/us-central1/endpoints/6122424890596261888
Deploy Endpoint model backing LRO: projects/547505032058/locations/us-central1/endpoints/6122424890596261888/operations/5736888731818786816
Endpoint model deployed. Resource name: projects/547505032058/locations/us-central1/endpoints/6122424890596261888
Model deployed to endpoint: projects/547505032058/locations/us-central1/endpoints/6122424890596261888


AttributeError: 'Endpoint' object has no attribute 'uri'

In [27]:
from google.cloud import aiplatform
from typing import Dict, Union
import json

def predict_text(
    project_id: str,
    endpoint_id: str,
    location: str,
    text: str,
    temperature: float = 0.1,
    max_tokens: int = 256,
    ) -> Dict[str, Union[str, float]]:
    """
    Run text generation through a deployed Vertex AI endpoint.

    Args:
        project_id (str): GCP project ID or number
        endpoint_id (str): Endpoint ID (the trailing numeric string of the resource name)
        location (str): Region (e.g. 'us-central1')
        text (str): Prompt text, sent as the instance's "prompt" field
        temperature (float): Sampling temperature, sent as parameters["temperature"]
        max_tokens (int): Maximum number of tokens to generate, sent as
            parameters["max_output_tokens"]

    Returns:
        Dict: The first prediction from the response (the serving container in
        this notebook returns {"content": ...}).
    """

    # Resolve the endpoint by its full resource name
    endpoint = aiplatform.Endpoint(
        endpoint_name=f"projects/{project_id}/locations/{location}/endpoints/{endpoint_id}"
    )

    # The serving container reads only "prompt" from each instance
    instances = [{"prompt": text}]

    # Generation settings must be sent as request-level parameters, under the key
    # names the container looks up; placed inside the instance they are ignored.
    parameters = {
        "temperature": temperature,
        "max_output_tokens": max_tokens,
    }

    # Run the prediction
    response = endpoint.predict(instances=instances, parameters=parameters)

    return response.predictions[0]


In [28]:
# Settings
# NOTE(review): this overwrites the PROJECT_ID set earlier in the notebook with the
# numeric project shown in the deploy output — confirm this is intended.
PROJECT_ID = "547505032058"  # project number taken from the output above
ENDPOINT_ID = "6122424890596261888"  # endpoint ID taken from the output above
LOCATION = "us-central1"

# Test prompt
test_prompt = """다음 이메일에 대한 답장을 작성해주세요:

제목: 제품 배송 지연 관련 문의
내용: 안녕하세요. 지난주 주문한 상품이 아직도 배송이 시작되지 않았네요. 
언제쯤 받아볼 수 있을까요?"""

try:
    # Run the prediction
    result = predict_text(
        project_id=PROJECT_ID,
        endpoint_id=ENDPOINT_ID,
        location=LOCATION,
        text=test_prompt,
        temperature=0.3,
        max_tokens=512
    )
    
    # Print the result
    print("생성된 답변:")
    print(result)
    
except Exception as e:
    # Any failure is reported as a printed message instead of a traceback
    print(f"에러 발생: {str(e)}")

생성된 답변:
{'content': '다음 이메일에 대한 답장을 작성해주세요:\n\n제목: 제품 배송 지연 관련 문의\n내용: 안녕하세요. 지난주 주문한 상품이 아직도 배송이 시작되지 않았네요. \n언제쯤 받아볼 수 있을까요?'}
