In [1]:
!pip install xgboost

Collecting xgboost
  Using cached xgboost-1.6.2-py3-none-win_amd64.whl (125.4 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.6.2


In [4]:
!pip install pandas scikit-learn

Collecting pandas
  Using cached pandas-1.5.1-cp310-cp310-win_amd64.whl (10.4 MB)
Installing collected packages: pandas
Successfully installed pandas-1.5.1


In [5]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb

### Data preparation

In [6]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-06-trees/CreditScoring.csv'
df = pd.read_csv(data)

In [7]:
df.head()

Unnamed: 0,Status,Seniority,Home,Time,Age,Marital,Records,Job,Expenses,Income,Assets,Debt,Amount,Price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


In [8]:
df.columns = df.columns.str.lower()

status_values = {
    1: 'ok',
    2: 'default',
    0: 'unk'
}

df.status = df.status.map(status_values)

home_values = {
    1: 'rent',
    2: 'owner',
    3: 'private',
    4: 'ignore',
    5: 'parents',
    6: 'other',
    0: 'unk'
}

df.home = df.home.map(home_values)

marital_values = {
    1: 'single',
    2: 'married',
    3: 'widow',
    4: 'separated',
    5: 'divorced',
    0: 'unk'
}

df.marital = df.marital.map(marital_values)

records_values = {
    1: 'no',
    2: 'yes',
    0: 'unk'
}

df.records = df.records.map(records_values)

job_values = {
    1: 'fixed',
    2: 'partime',
    3: 'freelance',
    4: 'others',
    0: 'unk'
}

df.job = df.job.map(job_values)

for c in ['income', 'assets', 'debt']:
    df[c] = df[c].replace(to_replace=99999999, value=np.nan)

df = df[df.status != 'unk'].reset_index(drop=True)

In [9]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=11)

df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = (df_train.status == 'default').astype('int').values
y_test = (df_test.status == 'default').astype('int').values

del df_train['status']
del df_test['status']

In [10]:
dv = DictVectorizer(sparse=False)

train_dicts = df_train.fillna(0).to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

test_dicts = df_test.fillna(0).to_dict(orient='records')
X_test = dv.transform(test_dicts)

### Random forest

In [11]:
rf = RandomForestClassifier(n_estimators=200,
                            max_depth=10,
                            min_samples_leaf=3,
                            random_state=1)
rf.fit(X_train, y_train)

### Question 1


* Install BentoML
* What's the version of BentoML you installed?
* Use --version to find out

In [12]:
!pip install bentoml



In [13]:
!bentoml --version

bentoml, version 1.0.0


### Question 2

Run the notebook which contains random forest model from module 6 i.e previous module and save the model with BentoML. To make it easier for you we have prepared this notebook.

How big approximately is the saved BentoML model? Size can slightly vary depending on your local development environment. Choose the size closest to your model.

* 924kb
* 724kb
* 114kb
* 8kb


In [14]:
import bentoml

In [16]:
bentoml.sklearn.save_model("random_forest_model", rf)

Model(tag="random_forest_model:zyglkmkqek6avugg", path="C:\Users\Hoe\bentoml\models\random_forest_model\zyglkmkqek6avugg\")

In [22]:
!dir C:\Users\Hoe\bentoml\models\random_forest_model\zyglkmkqek6avugg

 Volume in drive C is OS
 Volume Serial Number is 9AE6-88EA

 Directory of C:\Users\Hoe\bentoml\models\random_forest_model\zyglkmkqek6avugg

20/10/2022  10:56 AM    <DIR>          .
20/10/2022  10:56 AM    <DIR>          ..
20/10/2022  10:56 AM               353 model.yaml
20/10/2022  10:56 AM         5,792,512 saved_model.pkl
               2 File(s)      5,792,865 bytes
               2 Dir(s)  149,069,045,760 bytes free


### Question 3

Say you have the following data that you're sending to your service:

{
  "name": "Tim",
  "age": 37,
  "country": "US",
  "rating": 3.14
}

What would the pydantic class look like? You can name the class UserProfile.

In [25]:
!pip install pydantic

Collecting pydantic
  Downloading pydantic-1.10.2-cp310-cp310-win_amd64.whl (2.1 MB)
     ---------------------------------------- 2.1/2.1 MB 9.5 MB/s eta 0:00:00
Installing collected packages: pydantic
Successfully installed pydantic-1.10.2


In [27]:
from pydantic import BaseModel

class UserProfile(BaseModel):
    name: str
    age: int
    country: str
    rating: float

### Question 4

We've prepared a model for you that you can import using:

* curl -O https://s3.us-west-2.amazonaws.com/bentoml.com/mlzoomcamp/coolmodel.bentomodel
* bentoml models import coolmodel.bentomodel

In [28]:
!curl -O https://s3.us-west-2.amazonaws.com/bentoml.com/mlzoomcamp/coolmodel.bentomodel

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
100  1724  100  1724    0     0    707      0  0:00:02  0:00:02 --:--:--   707
100  1724  100  1724    0     0    707      0  0:00:02  0:00:02 --:--:--   707


In [29]:
!bentoml models import coolmodel.bentomodel

Model(tag="mlzoomcamp_homework:qtzdz3slg6mwwdu5") imported


In [31]:
!bentoml models list

 Tag                            Module           Size      Creation Time       
 random_forest_model:zyglkmkq���  bentoml.sklearn  5.52 MiB  2022-10-20 10:56:33 
 mlzoomcamp_homework:qtzdz3sl���  bentoml.sklearn  5.79 KiB  2022-10-14 04:42:14 


In [32]:
!bentoml models get mlzoomcamp_homework:qtzdz3sl

name: mlzoomcamp_homework                                                      
version: qtzdz3slg6mwwdu5                                                      
module: bentoml.sklearn                                                        
labels: {}                                                                     
options: {}                                                                    
metadata: {}                                                                   
context:                                                                       
  framework_name: sklearn                                                      
  framework_versions:                                                          
    scikit-learn: 1.1.1                                                        
  bentoml_version: 1.0.7                                                       
  python_version: 3.9.12                                                       
signatures:                             

### Question 5

Create a bento out of this scikit-learn model. This will require installing scikit-learn like this:

- pip install scikit-learn

Hint: The output type for this endpoint should be NumpyNdarray()

Send this array to the bento:

[[6.4,3.5,4.5,1.2]]

You can use curl or the Swagger UI. What value does it return?

- 0
- 1
- 2
- 3


In [None]:
# bentofile.yaml
# bentoml build
# bentoml containerize mlzoomcamp_model:latest
# docker run -it --rm -p 3000:3000 mlzoomcamp_model:2jmj5jsqhoripugg serve --production".

### Question 6

Ensure to serve your bento with --production for this question

Install locust using:

- pip install locust

Use the following locust file: locustfile.py

Ensure that it is pointed at your bento's endpoint (In case you didn't name your endpoint "classify")

Configure 100 users with ramp time of 10 users per second. Click "Start Swarming" and ensure that it is working.

Now download a second model with this command:

- curl -O https://s3.us-west-2.amazonaws.com/bentoml.com/mlzoomcamp/coolmodel2.bentomodel

Or you can download with this link as well: https://s3.us-west-2.amazonaws.com/bentoml.com/mlzoomcamp/coolmodel2.bentomodel

Now import the model:

bentoml models import coolmodel2.bentomodel

Update your bento's runner tag and test with both models. Which model allows more traffic (more throughput) as you ramp up the traffic?

Hint 1: Remember to turn off and turn on your bento service between changing the model tag. Use Ctl-C to close the service in between trials.

Hint 2: Increase the number of concurrent users to see which one has higher throughput

Which model has better performance at higher volumes?

    The first model
    The second model


In [35]:
!pip3 install locust

Collecting locust
  Using cached locust-2.12.2-py3-none-any.whl (823 kB)
Collecting gevent>=20.12.1
  Downloading gevent-22.10.1-cp310-cp310-win_amd64.whl (1.5 MB)
     ---------------------------------------- 1.5/1.5 MB 23.6 MB/s eta 0:00:00
Collecting Werkzeug>=2.0.0
  Downloading Werkzeug-2.2.2-py3-none-any.whl (232 kB)
     ------------------------------------- 232.7/232.7 kB 13.9 MB/s eta 0:00:00
Collecting ConfigArgParse>=1.0
  Using cached ConfigArgParse-1.5.3-py3-none-any.whl (20 kB)
Collecting msgpack>=0.6.2
  Downloading msgpack-1.0.4-cp310-cp310-win_amd64.whl (61 kB)
     ---------------------------------------- 61.3/61.3 kB ? eta 0:00:00
Collecting Flask-BasicAuth>=0.2.0
  Using cached Flask-BasicAuth-0.2.0.tar.gz (16 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting Flask-Cors>=3.0.10
  Using cached Flask_Cors-3.0.10-py2.py3-none-any.whl (14 kB)
Collecting roundrobin>=0.0.2
  Using cached roundrobin-0.0.4.t

In [39]:
!curl -O https://s3.us-west-2.amazonaws.com/bentoml.com/mlzoomcamp/coolmodel2.bentomodel

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  1728  100  1728    0     0   1717      0  0:00:01  0:00:01 --:--:--  1717
100  1728  100  1728    0     0   1715      0  0:00:01  0:00:01 --:--:--  1715


In [None]:
#Model 1

#bentoml models import coolmodel.bentomodel
#mlzoomcamp_homework:qtzdz3slg6mwwdu5

#bentoml models import coolmodel2.bentomodel
#mlzoomcamp_homework:jsi67fslz6txydu5

### XGBoost

Note:

We removed feature names

It was 

```python
features = dv.get_feature_names_out()
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
```

Now it's

```python
dtrain = xgb.DMatrix(X_train, label=y_train)
```

In [7]:
dtrain = xgb.DMatrix(X_train, label=y_train)

In [8]:
xgb_params = {
    'eta': 0.1, 
    'max_depth': 3,
    'min_child_weight': 1,

    'objective': 'binary:logistic',
    'eval_metric': 'auc',

    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

model = xgb.train(xgb_params, dtrain, num_boost_round=175)

### BentoML

In [9]:
import bentoml

In [10]:
bentoml.xgboost.save_model(
    'credit_risk_model',
    model,
    custom_objects={
        'dictVectorizer': dv
    })

Model(tag="credit_risk_model:f4iee2cn5whmqtih", path="/home/alexey/bentoml/models/credit_risk_model/f4iee2cn5whmqtih/")

Test

In [11]:
import json

In [12]:
request = df_test.iloc[0].to_dict()
print(json.dumps(request, indent=2))

{
  "seniority": 3,
  "home": "owner",
  "time": 36,
  "age": 26,
  "marital": "single",
  "records": "no",
  "job": "freelance",
  "expenses": 35,
  "income": 0.0,
  "assets": 60000.0,
  "debt": 3000.0,
  "amount": 800,
  "price": 1000
}
