In [48]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
# import xgboost as xgb

from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score

import bentoml

In [49]:
# Homework

# Note: sometimes your answer might not match one of the options exactly. That's fine. Select the option that's closest to your solution.
# The goal of this homework is to familiarize you with BentoML and how to build and test an ML production service.

# Background
# You are a new recruit at ACME corp. Your manager is emailing you about your first assignment.
# Email from your manager

# Good morning recruit! It's good to have you here! I have an assignment for you. 
# I have a data scientist that's built a credit risk model in a jupyter notebook. 
# I need you to run the notebook and save the model with BentoML and see how big the model is. 
# If it's greater than a certain size, I'm going to have to request additional resources from our infra team. 
# Please let me know how big it is.
# Thanks,
# Mr McManager

# Question 1

# Install BentoML
# What's the version of BentoML you installed?
# Use --version to find out

# --> installed bentoml in hw7 subfolder: pip install bentoml

# also in terminal then ran
# > bentoml --version
# --> bentoml, version 1.0.7

In [50]:
# Question 2

# Run the notebook from module 6 and 
# save the credit risk model with BentoML



In [51]:
# final credit risk model from module 6 found here
# https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/course-zoomcamp/06-trees/notebook.ipynb


In [52]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-06-trees/CreditScoring.csv'
!wget $data

df = pd.read_csv(data)
df.columns = df.columns.str.lower()

--2022-10-22 16:13:30--  https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-06-trees/CreditScoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 182489 (178K) [text/plain]
Saving to: ‘CreditScoring.csv.4’


2022-10-22 16:13:30 (70.1 MB/s) - ‘CreditScoring.csv.4’ saved [182489/182489]



In [53]:
status_values = {
    1: 'ok',
    2: 'default',
    0: 'unk'
}

df.status = df.status.map(status_values)

home_values = {
    1: 'rent',
    2: 'owner',
    3: 'private',
    4: 'ignore',
    5: 'parents',
    6: 'other',
    0: 'unk'
}

df.home = df.home.map(home_values)

marital_values = {
    1: 'single',
    2: 'married',
    3: 'widow',
    4: 'separated',
    5: 'divorced',
    0: 'unk'
}

df.marital = df.marital.map(marital_values)

records_values = {
    1: 'no',
    2: 'yes',
    0: 'unk'
}

df.records = df.records.map(records_values)

job_values = {
    1: 'fixed',
    2: 'partime',
    3: 'freelance',
    4: 'others',
    0: 'unk'
}

df.job = df.job.map(job_values)

for c in ['income', 'assets', 'debt']:
    df[c] = df[c].replace(to_replace=99999999, value=np.nan)
    
df = df[df.status != 'unk'].reset_index(drop=True)

In [54]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=11)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=11)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = (df_train.status == 'default').astype('int').values
y_val = (df_val.status == 'default').astype('int').values
y_test = (df_test.status == 'default').astype('int').values

del df_train['status']
del df_val['status']
del df_test['status']



In [55]:
train_dicts = df_train.fillna(0).to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val.fillna(0).to_dict(orient='records')
X_val = dv.transform(val_dicts)

features = dv.get_feature_names()
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)




In [56]:
df_full_train = df_full_train.reset_index(drop=True)
y_full_train = (df_full_train.status == 'default').astype(int).values
del df_full_train['status']

In [57]:
dicts_full_train = df_full_train.to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X_full_train = dv.fit_transform(dicts_full_train)

dicts_test = df_test.to_dict(orient='records')
X_test = dv.transform(dicts_test)

In [58]:
#max_depth = 10
#min_samples_leaf = 3

#rf = RandomForestClassifier(n_estimators=200,
#                            max_depth=max_depth,
#                            min_samples_leaf=min_samples_leaf,
#                            random_state=1)
#rf.fit(X_train, y_train)

In [59]:
dfulltrain = xgb.DMatrix(X_full_train, label=y_full_train,
                    feature_names=dv.get_feature_names())

dtest = xgb.DMatrix(X_test, feature_names=dv.get_feature_names())

xgb_params = {
    'eta': 0.1, 
    'max_depth': 3,
    'min_child_weight': 1,

    'objective': 'binary:logistic',
    'eval_metric': 'auc',

    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

model = xgb.train(xgb_params, dfulltrain, num_boost_round=175)

In [60]:
saved_model = bentoml.xgboost.save_model("credit_risk_model", model,
                                        custom_objects = {
                                            "dictVectorizer" : dv
                                        })
print(f"Model saved: {saved_model}")

Model saved: Model(tag="credit_risk_model:qxxzgucseszs5r7u")


In [None]:
# How big approximately is the saved BentoML model?

# 924kb
# 724kb
# --> 114kb <-- closest to 197KiB
# 8kb

In [None]:
# Another email from your manager
# Great job recruit! Looks like I won't be having to go back to the procurement team. 
# Thanks for the information.
# However, I just got word from one of the teams that's using one of our ML services 
# and they're saying our service is "broken" and their trying to blame our model. 
# I looked at the data their sending and it's completely bogus. 
# I don't want them to send bad data to us and blame us for our models. Could you write a pydantic schema for the data that they should be sending? That way next time it will tell them it's their data that's bad and not our model.
# Thanks,
# Mr McManager

In [None]:
# Question 3

# Say you have the following data that you're sending to your service:

# {
#   "name": "Tim",
#   "age": 37,
#   "country": "US",
#   "rating": 3.14
# }
# What would the pydantic class look like? You can name the class UserProfile.

# Answer:
# class UserProfile(BaseModel):
#     name: str
#     age: int
#     country: str
#     rating: float

In [None]:
# Email from your CEO
# Good morning! I hear you're the one to go to if I need something done well! 
# We've got a new model that a big client needs deployed ASAP. I need you to build a service with it and test it against the old model and make sure that it performs better, otherwise we're going to lose this client. All our hopes are with you!
# Thanks,
# CEO of Acme Corp

In [None]:
# Question 4
# We've prepared a model for you that you can import using:
# curl -O https://s3.us-west-2.amazonaws.com/bentoml.com/mlzoomcamp/coolmodel.bentomodel
# bentoml models import coolmodel.bentomodel
# What version of scikit-learn was this model trained with?
# additional step (in bash): bentoml models get mlzoomcamp_homework:qtzdz3slg6mwwdu5
# which will return

# name: mlzoomcamp_homework                                                                                           
# version: qtzdz3slg6mwwdu5                                                                                           
# module: bentoml.sklearn                                                                                             
# labels: {}                                                                                                          
# options: {}                                                                                                         
# metadata: {}                                                                                                        
# context:                                                                                                            
#   framework_name: sklearn                                                                                           
#   framework_versions:                                                                                               
#     scikit-learn: 1.1.1                                                                                             
#   bentoml_version: 1.0.7                                                                                            
#   python_version: 3.9.12                                                                                            
# signatures:                                                                                                         
#   predict:                                                                                                          
#     batchable: false                                                                                                
# api_version: v1                                                                                                     
# creation_time: '2022-10-13T20:42:14.411084+00:00'

# --> 1.1.1 <--
# 1.1.2
# 1.1.3
# 1.1.4
# 1.1.5


In [None]:
# Question 5
# Create a bento out of this scikit-learn model. This will require installing scikit-learn like this:
# pip install scikit-learn
# Hint: The and output type for this endpoint should be NumpyNdarray()
# Send this array to the bento:
# [[6.4,3.5,4.5,1.2]]
# You can use curl or the Swagger UI. What value does it return?
# 0
# --> 1 <--
# 2
# 3

In [1]:
# Question 6
# Ensure to serve your bento with --production for this question
# Install locust using:
# pip install locust
# Use the following locust file: locustfile.py
locustfile = 'https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/course-zoomcamp/cohorts/2022/07-bento-production/locustfile.py'
!wget $locustfile
# Ensure that it is pointed at your bento's endpoint (In case you didn't name your endpoint "classify")

--2022-10-23 13:45:19--  https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/course-zoomcamp/cohorts/2022/07-bento-production/locustfile.py
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘locustfile.py’

locustfile.py           [ <=>                ] 168.59K  --.-KB/s    in 0.004s  

2022-10-23 13:45:19 (41.7 MB/s) - ‘locustfile.py’ saved [172638]



In [None]:
# Configure 100 users with ramp time of 10 users per second. Click "Start Swarming" and ensure that it is working
# Now download a second model with this command:
# curl -O https://s3.us-west-2.amazonaws.com/bentoml.com/mlzoomcamp/coolmodel2.bentomodel
# Or you can download with this link as well: 
# https://s3.us-west-2.amazonaws.com/bentoml.com/mlzoomcamp/coolmodel2.bentomodel
# Now import the model:
# bentoml models import coolmodel2.bentomodel
# Update your bento's runner tag and test with both models. 
# Which model allows more traffic (more throughput) as you ramp up the traffic? 
# Remember to turn off and turn on your bento service between changing the model tag. 
# Use Ctl-C to close the service. Then call bentoml serve

In [None]:
# with async and for n around e.g. 200-500, on my cloud machine 2nd model performs about 10% better in terms of RPS

In [None]:
# Test out the first model and the second model, which one performance better at higher volumes?
# The first model
# --> The second model <--

In [None]:
# Email from marketing
# Hello ML person! I hope this email finds you well. 
# I've heard there's this cool new ML model called Stable Diffusion. 
# I hear if you give it a description of a picture it will generate an image. 
# We need a new company logo and I want it to be fierce but also cool, think you could help out?
# Thanks,
# Mike Marketer

In [None]:
# Question 7 (optional)
# Go to this Bento deployment of Stable Diffusion: http://54.176.205.174/ (or deploy it yourself)
# Use the txt2image endpoint and update the prompt to: "A cartoon dragon with sunglasses". Don't change the seed, it should be 0 by default
# What is the resulting image?
# 1
# 2
# 3
# 4