1. Load necessary packages and modules

In [1]:
# Azure Machine Learning SDK core
from azureml.core import Workspace
from azureml.core.model import Model

# Scikit-learn and others
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pickle

2. Train and save a model
Model training here is provided only as an example; it assumes you don't already have a trained model.

Before running the code below for your own model:

* Load in your trained model, or copy the model-saving step (the `pickle.dump` call at the end of the code below) into the notebook where your trained model is located.
* Save your trained model as a .pkl file called model.pkl, as shown below.

In [2]:
import pandas as pd

# Location of the pre-processed salaries CSV (hosted on GitHub).
SALARIES_CSV_URL = (
    'https://github.com/hzha579/NZMSA-2022-Phase-2/raw/main/Resources/datasets/preprocessed_salaries.csv'
)

# Read the CSV into a DataFrame, then preview the first five rows.
df_preprocessed_salaries = pd.read_csv(SALARIES_CSV_URL)
df_preprocessed_salaries.head(5)

Unnamed: 0,work_year,experience_level,job_title,salary_in_usd
0,3,3,84,85847
1,3,2,66,30000
2,3,2,66,25500
3,3,3,47,175000
4,3,3,47,120000


In [3]:
# Show the frame's structure: row count, column names, non-null counts and dtypes.
df_preprocessed_salaries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3755 entries, 0 to 3754
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   work_year         3755 non-null   int64
 1   experience_level  3755 non-null   int64
 2   job_title         3755 non-null   int64
 3   salary_in_usd     3755 non-null   int64
dtypes: int64(4)
memory usage: 117.5 KB


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df_preprocessed_salaries.describe()

Unnamed: 0,work_year,experience_level,job_title,salary_in_usd
count,3755.0,3755.0,3755.0,3755.0
mean,2.373635,2.469241,39.661784,137570.38988
std,0.691448,0.906261,18.536695,63055.625278
min,0.0,0.0,0.0,5132.0
25%,2.0,2.0,27.0,95000.0
50%,2.0,3.0,33.0,135000.0
75%,3.0,3.0,47.0,175000.0
max,3.0,3.0,92.0,450000.0


In [5]:
from sklearn.ensemble import GradientBoostingRegressor

# Features are the three input columns; the target is the salary in USD.
X = df_preprocessed_salaries[['work_year', 'experience_level', 'job_title']]
y = df_preprocessed_salaries['salary_in_usd']

# Hold out 20% of the rows as a test set (the test set is not used in this example).
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the regressor on the training split only.
model = GradientBoostingRegressor(random_state=42)
model.fit(X_train, y_train)

# Save the trained model as model.pkl. The context manager closes the file,
# so the pickle is completely written to disk before anything else reads it.
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

3. Load and connect to workspace

In [6]:
# Connect to the Azure ML workspace described by config.json.
# The path is relative, so the file is expected next to this notebook / in the
# current working directory — TODO confirm where config.json lives for your setup.
ws = Workspace.from_config(path="config.json")

4. Register model onto Azure

In [7]:
# Upload the pickled model file and register it in the workspace under the
# given name.
# NOTE: this rebinds `model` — from here on it refers to the object returned by
# Model.register, not to the trained scikit-learn estimator.
model = Model.register(
    workspace=ws,
    model_path="model.pkl",
    model_name="salaries-GradientBoostingRegressor",
)

Registering model salaries-GradientBoostingRegressor


5. Create entry script

In [1]:
# List every package (with version) installed in the current conda environment.
# NOTE(review): presumably used to pick matching package versions for the
# entry script's environment — confirm.
!conda list

# packages in environment at /anaconda:
#
# Name                    Version                   Build  Channel
_libgcc_mutex             0.1                        main  
_openmp_mutex             5.1                       1_gnu  
applicationinsights       0.11.10                  pypi_0    pypi
boltons                   23.0.0          py310h06a4308_0  
brotlipy                  0.7.0           py310h7f8727e_1002  
bzip2                     1.0.8                h7b6447c_0  
ca-certificates           2023.5.7             hbcca054_0    conda-forge
certifi                   2023.5.7           pyhd8ed1ab_0    conda-forge
cffi                      1.15.1          py310h5eee18b_3  
charset-normalizer        2.0.4              pyhd3eb1b0_0  
conda                     23.5.0          py310hff52083_1    conda-forge
conda-content-trust       0.1.3           py310h06a4308_0  
conda-package-handling    2.1.0           py310h06a4308_0  
conda-package-streaming   0.8.0           py310