# [Sample] Model management / Test automation as part of MLOps

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms

from google.cloud import aiplatform

In [2]:
# Hyperparameters
num_epochs = 10          # number of full passes over the training set
num_batch = 100          # mini-batch size used by both data loaders
learning_rate = 0.001    # step size passed to the Adam optimizer
image_size = 28*28       # length of one flattened input image

# Seed PyTorch's RNG so weight initialisation and batch shuffling start from the
# same state on every Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

# Use the GPU (CUDA) when one is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
# Timestamp Function
from datetime import datetime
def get_timestamp() -> str:
    """Return the current local time formatted as a 14-digit ``YYYYMMDDHHMMSS`` string."""
    stamp_format = "%Y%m%d%H%M%S"
    current_time = datetime.now()
    return current_time.strftime(stamp_format)

## Create Dataset for Train/Eval

In [4]:
# Preprocessing applied to every sample: a single ToTensor step wrapped in Compose
transform = transforms.Compose([transforms.ToTensor()])

In [5]:
# Training split (downloaded into ./data when it is not there yet)
train_dataset = datasets.MNIST(
    './data',
    train=True,
    download=True,
    transform=transform,
)
# Evaluation split (read from the same ./data directory)
test_dataset = datasets.MNIST(
    './data',
    train=False,
    transform=transform,
)

# Data loaders
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=num_batch,
    shuffle=True,   # present the training samples in a new order every epoch
)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=num_batch,
    shuffle=False,  # evaluation does not need shuffling; a fixed order keeps batches reproducible
)

  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


## Training

In [6]:
# Network definition
class Net(nn.Module):
    """Fully connected classifier with one 100-unit sigmoid hidden layer.

    ``forward`` returns log-probabilities (``log_softmax`` over dim 1).
    """

    def __init__(self, input_size, output_size):
        super().__init__()

        hidden_units = 100
        self.fc1 = nn.Linear(input_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, output_size)

    def forward(self, x):
        hidden = torch.sigmoid(self.fc1(x))
        scores = self.fc2(hidden)
        return F.log_softmax(scores, dim=1)

# Instantiate the network (10 output classes) on the selected device
model = Net(image_size, 10).to(device)

# Loss function: Net.forward already returns log_softmax output, so the matching
# criterion is NLLLoss. CrossEntropyLoss expects raw logits and would apply
# log_softmax a second time (same value, redundant work).
criterion = nn.NLLLoss()

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [7]:
# Train
model.train()  # switch the module to training mode

for epoch in range(num_epochs):
    loss_sum = 0.0  # running sum of the per-batch losses, kept as a plain Python float

    for inputs, labels in train_dataloader:

        # Move the batch to the selected device
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Reset the gradients accumulated by the previous step
        optimizer.zero_grad()

        # Forward pass: flatten each image into a one-dimensional vector first
        inputs = inputs.view(-1, image_size)
        outputs = model(inputs)

        # Loss
        loss = criterion(outputs, labels)
        # Accumulate the scalar value only; adding the tensor itself would chain
        # every batch's autograd history onto loss_sum for the whole epoch.
        loss_sum += loss.item()

        # Backpropagate the gradients
        loss.backward()

        # Update the weights
        optimizer.step()

    # Display learning status (mean of the per-batch losses for this epoch)
    print(f"Epoch: {epoch+1}/{num_epochs}, Loss: {loss_sum / len(train_dataloader)}")

    # Save the model weights after every epoch (the file is overwritten each time)
    torch.save(model.state_dict(), 'model_weights.pth')


Epoch: 1/10, Loss: 0.6680443318684895
Epoch: 2/10, Loss: 0.26977516174316407
Epoch: 3/10, Loss: 0.21305072784423829
Epoch: 4/10, Loss: 0.1789398956298828
Epoch: 5/10, Loss: 0.15450818379720052
Epoch: 6/10, Loss: 0.1344214630126953
Epoch: 7/10, Loss: 0.11916908264160156
Epoch: 8/10, Loss: 0.10646553039550781
Epoch: 9/10, Loss: 0.09521687825520833
Epoch: 10/10, Loss: 0.08577755610148112


## Evaluation

In [8]:
# Evaluate the trained model on the test loader
model.eval()

correct = 0
loss_sum = 0

# Gradients are not needed while scoring
with torch.no_grad():
    for inputs, labels in test_dataloader:

        # Move the batch to the selected device and flatten each image
        inputs = inputs.to(device)
        labels = labels.to(device)
        inputs = inputs.view(-1, image_size)

        # Forward pass
        outputs = model(inputs)

        # Accumulate the batch loss
        batch_loss = criterion(outputs, labels)
        loss_sum += batch_loss

        # Count predictions (arg-max over dim 1) that match the labels
        pred = outputs.argmax(1)
        hits = pred.eq(labels.view_as(pred))
        correct += hits.sum().item()

print(f"Loss: {loss_sum.item() / len(test_dataloader)}, Accuracy: {100*correct/len(test_dataset)}% ({correct}/{len(test_dataset)})")

Loss: 0.10480978012084961, Accuracy: 96.71% (9671/10000)


## Save Model as Pickle

In [9]:
# Get Model timestamp
TIMESTAMP = get_timestamp()
print(f"TIMESTAMP = {TIMESTAMP}")
!mkdir -p models/$TIMESTAMP

TIMESTAMP = 20230317091445


In [10]:
import cloudpickle

# Serialize the whole model object into this build's timestamped directory
MODEL_PATH = 'models/'+TIMESTAMP+'/model.pkl'
with open(MODEL_PATH, 'wb') as model_file:
    cloudpickle.dump(model, model_file)

## Upload the Model

### Set Google Cloud environment variables

In [11]:
### EDIT HERE ###
# Google Cloud project, storage bucket and region
PROJECT_ID = 'mlops-pipeline-demo'
BUCKET_NAME = 'keihoshino-mlops-test-bucket'
LOCATION = 'us-central1'

# Model Registry entry: display name, serving container image and version note
MODEL_NAME = "upload-test-model-khoshino"
SERVING_CONTAINER_IMAGE_URI = "us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-0:latest"
VERSION_DESCRIPTION = "\nThis is my first model.\n"
#################

### Upload the model to Storage

In [12]:
!gcloud config set project $PROJECT_ID

Updated property [core/project].


In [13]:
# Create the bucket if it does not exist yet
import subprocess
import sys

bucket_uri = "gs://" + BUCKET_NAME

# A zero exit status from `gsutil ls` is taken to mean the bucket already exists
s = subprocess.run(["gsutil", "ls", bucket_uri])
if s.returncode == 0:
    print('Bucket Already Exist', file=sys.stdout)
else:
    # check=True raises CalledProcessError when `gsutil mb` fails, so
    # 'Bucket Created' is only printed after the bucket was really created.
    subprocess.run(["gsutil", "mb", bucket_uri], check=True)
    print('Bucket Created')

gs://keihoshino-mlops-test-bucket/models/
Bucket Already Exist


In [14]:
ARTIFACT_URI = f'gs://{BUCKET_NAME}/models/{TIMESTAMP}'
!gsutil cp $MODEL_PATH $ARTIFACT_URI/model.pkl

Copying file://models/20230317091445/model.pkl [Content-Type=application/octet-stream]...
/ [1 files][313.5 KiB/313.5 KiB]                                                
Operation completed over 1 objects/313.5 KiB.                                    


In [15]:
# Upload the model to Model Registry
aiplatform.init(project=PROJECT_ID, location=LOCATION)
# The registry entry gets its own name: rebinding `model` would replace the
# trained PyTorch network, so re-running the pickling cell afterwards would
# serialize the wrong object.
registered_model = aiplatform.Model.upload(
    display_name=MODEL_NAME,
    artifact_uri=ARTIFACT_URI,
    serving_container_image_uri=SERVING_CONTAINER_IMAGE_URI,
    version_description=VERSION_DESCRIPTION,
)
print(registered_model)

Creating Model
Create Model backing LRO: projects/485539447870/locations/us-central1/models/4364676333200998400/operations/2357328534537502720
Model created. Resource name: projects/485539447870/locations/us-central1/models/4364676333200998400@1
To use this Model in another session:
model = aiplatform.Model('projects/485539447870/locations/us-central1/models/4364676333200998400@1')
<google.cloud.aiplatform.models.Model object at 0x7f3dbc07b650> 
resource name: projects/485539447870/locations/us-central1/models/4364676333200998400


### Get Model ID

In [16]:
!gcloud ai models list --project=$PROJECT_ID --region=$LOCATION |grep $MODEL_NAME

Using endpoint [https://us-central1-aiplatform.googleapis.com/]
4364676333200998400  upload-test-model-khoshino


## Upload New version

In [21]:
MODEL_ID = '4364676333200998400'  # Get in previous step

# Timestamp of the new model version
TIMESTAMP = get_timestamp()
print(TIMESTAMP)

# Local file path, Cloud Storage folder and description for the new version
NEW_MODEL_PATH = 'models/'+TIMESTAMP+'/model.pkl'
ARTIFACT_URI = f'gs://{BUCKET_NAME}/models/{TIMESTAMP}'
VERSION_DESCRIPTION = "\nThis is New model. Tuned hyper parameters as .......\n"

20230317092101


In [22]:
# 学習省略のため、既存のモデルをコピーしてタイムスタンプだけ変更
!mkdir -p models/$TIMESTAMP
!cp -ip $MODEL_PATH $NEW_MODEL_PATH

In [23]:
!gsutil cp $NEW_MODEL_PATH $ARTIFACT_URI/model.pkl

Copying file://models/20230317092101/model.pkl [Content-Type=application/octet-stream]...
/ [1 files][313.5 KiB/313.5 KiB]                                                
Operation completed over 1 objects/313.5 KiB.                                    


In [26]:
# Upload the artifact as a new version of the existing registry entry (parent_model).
# The result gets its own name so that `model` is not rebound to a registry object.
new_version_model = aiplatform.Model.upload(
    parent_model=MODEL_ID,
    artifact_uri=ARTIFACT_URI,
    serving_container_image_uri=SERVING_CONTAINER_IMAGE_URI,
    version_description=VERSION_DESCRIPTION,
)

Creating Model
Create Model backing LRO: projects/485539447870/locations/us-central1/models/4364676333200998400/operations/7610777499865186304
Model created. Resource name: projects/485539447870/locations/us-central1/models/4364676333200998400@4
To use this Model in another session:
model = aiplatform.Model('projects/485539447870/locations/us-central1/models/4364676333200998400@4')
