In [1]:
import requests

import boto3
import mlflow
from mlflow import pyfunc as ml_pyfunc
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

In [2]:
leads_dataset = pd.read_csv('/Users/kevinpeng/Downloads/archive/leads.csv')
leads_dataset.columns = map(str.lower, leads_dataset.columns)

In [3]:
leads_categorical_columns = ['lead origin',
                             'lead source',
                             'last activity',
                             'specialization',
                             'what is your current occupation',
                             'what matters most to you in choosing a course',
                             'city',
                             'last notable activity']

leads_numeric_columns = ['totalvisits',
                         'total time spent on website',
                         'page views per visit']

leads_response_columns = ['converted']

In [4]:
leads_x = leads_dataset.drop(leads_response_columns, axis=1)
leads_y = leads_dataset[leads_response_columns]

leads_x_train, leads_x_test, leads_y_train, leads_y_test = train_test_split(leads_x,
                                                                            leads_y,
                                                                            train_size=0.7,
                                                                            test_size=0.3,
                                                                            random_state=5050)

In [5]:
scaler = StandardScaler()
scaler = scaler.fit(leads_x_train[leads_numeric_columns])

In [6]:
def pre_process_leads_data(df,
                           numeric_columns,
                           categorical_columns,
                           fitted_scaler,
                           train_df_columns = None):

    ## create new df with selected columns
    df.columns = map(str.lower, df.columns)
    _df = df[list(set(numeric_columns + categorical_columns))].copy()
    
    ## scale the numeric columns with the pre-built scaler
    _df[numeric_columns] = fitted_scaler.transform(_df[numeric_columns])
         
    # First, make categorical text lowercase
    _df[categorical_columns] = _df[categorical_columns].apply(lambda x: x.str.lower())
    # Next, create one-hot-encoded variables, add to dataframe, drop old columns
    _df_dummies = pd.get_dummies(_df[categorical_columns], drop_first=True)
    _df = pd.concat([_df, _df_dummies], axis=1)
    _df.drop(categorical_columns, axis=1, inplace = True)

    if train_df_columns:
        _df = _df.reindex(columns=train_df_columns, fill_value=0)
    
    _df[numeric_columns] = _df[numeric_columns].fillna(_df[numeric_columns].mean())
    
    return _df

In [7]:
leads_x_train_clean = pre_process_leads_data(df = leads_x_train,
                                            numeric_columns = leads_numeric_columns,
                                            categorical_columns = leads_categorical_columns,
                                            fitted_scaler = scaler)

leads_x_test_clean = pre_process_leads_data(df = leads_x_test,
                                           numeric_columns = leads_numeric_columns,
                                           categorical_columns = leads_categorical_columns,
                                           fitted_scaler = scaler,
                                           train_df_columns = leads_x_train_clean.columns.tolist())
leads_y_test

Unnamed: 0,converted
5597,1
3518,0
1228,0
1059,0
7222,1
...,...
6043,0
919,0
4512,1
1635,0


In [8]:
num_estimators = 100
min_samples = 4

rf = RandomForestClassifier(n_estimators=num_estimators,
                            min_samples_split=min_samples)
rf.fit(leads_x_train_clean, leads_y_train.values.ravel())

In [9]:
leads_y_test_predicted = rf.predict(leads_x_test_clean)

accuracy = metrics.accuracy_score(leads_y_test, leads_y_test_predicted)
auc_score = metrics.roc_auc_score(leads_y_test, leads_y_test_predicted)

print(accuracy)
print(auc_score)

0.8243145743145743
0.8076734235813986


In [11]:
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("LeadScoringProcessedNew2")

<Experiment: artifact_location='./mlruns/956479994013149774', creation_time=None, experiment_id='956479994013149774', last_update_time=None, lifecycle_stage='active', name='LeadScoringProcessedNew2', tags={}>

In [12]:
class leadsModel(mlflow.pyfunc.PythonModel):
   
    ## defining objects needed for leadsModel prediction. 
    def __init__(self,
                 train_df_columns,
                 model,
                 leads_categorical_columns,
                 leads_numeric_columns,
                 fitted_scaler,
                 pre_process_leads_data):
        
        ## Setting up all needed objects
        self.train_df_columns = train_df_columns
        self.model = model
        self.leads_categorical_columns = leads_categorical_columns
        self.leads_numeric_columns = leads_numeric_columns
        self.fitted_scaler = fitted_scaler
        self.pre_process_leads_data = pre_process_leads_data
    
    ## define function with processing and feeding data into prediction at the end
    def predict(self,context,model_input):
        
        # make sure all inputted columns are lowercase
        model_input.columns = map(str.lower, model_input.columns)
        
        # run inputted dataset through our processing function
        # note: we are excluding the response columns here since not needed for deploy
        model_input_processed = self.pre_process_leads_data(
                                   df = model_input,
                                   numeric_columns = self.leads_numeric_columns,
                                   categorical_columns = self.leads_categorical_columns,
                                   fitted_scaler = self.fitted_scaler,
                                   train_df_columns = self.train_df_columns)       
        
        # finally input the cleaned/adjusted dataset into our model for prediction
        return self.model.predict(model_input_processed)

In [13]:
mlflow_conda_env = {
 'name': 'mlflow_env_1',
 'channels': ['defaults'],
 'dependencies': ['python=3.11.4', {'pip': ['mlflow==2.6.0','scikit-learn==1.3.0','cloudpickle==2.2.1']}]
}

In [14]:
m = leadsModel(train_df_columns = leads_x_train_clean.columns.tolist(),
                                  model = rf,
                                  leads_categorical_columns = leads_categorical_columns,
                                  leads_numeric_columns = leads_numeric_columns,
                                  fitted_scaler = scaler,
                                  pre_process_leads_data = pre_process_leads_data)
model_input = leads_x.head(1)
model_output = m.predict(None,model_input)
print(model_output)

[0]


In [15]:
with mlflow.start_run(run_name="Leads Model with Processing v1") as run:
    # log the parameters that we defined for the model training
    mlflow.log_param("num_estimators", num_estimators)
    mlflow.log_param("min_samples", min_samples)
    
    # log the performance metrics that we calculated earlier
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("auc_score", auc_score)
    
    # log model with all objects referenced in the leadsModel class
    mlflow.pyfunc.log_model(
        artifact_path = "leads_pyfunc",
        python_model = leadsModel(train_df_columns = leads_x_train_clean.columns.tolist(),
                                  model = rf,
                                  leads_categorical_columns = leads_categorical_columns,
                                  leads_numeric_columns = leads_numeric_columns,
                                  fitted_scaler = scaler,
                                  pre_process_leads_data = pre_process_leads_data
                                 ),
        conda_env = mlflow_conda_env
    )
    
    # save run_id and experiment_id for deployment
    run_id = run.info.run_uuid
    experiment_id = run.info.experiment_id
    
    # end the mlflow run!
    mlflow.end_run()



In [17]:
run_id

'826da2e2e8554bd98225f660d42c107d'

In [21]:
from mlflow.deployments import get_deploy_client

# we pull the run and experiment id's from above to create this mlflow location
model_uri = "mlruns/956479994013149774/826da2e2e8554bd98225f660d42c107d/artifacts/leads_pyfunc"

# The region is chosen, pick whats close to you or your systems!
region = "us-east-1"
# The aws account id can be found in the console
aws_account_id = "166772677246"
# We use these inputs to automatically reference the sagemaker docker container
image_url = aws_account_id \
            + ".dkr.ecr." \
            + region \
            + ".amazonaws.com/mlflow-pyfunc:2.6.0"

# now we specify the role that we setup for sagemaker in the previous step
sagemaker_arn = "arn:aws:iam::166772677246:role/sagemaker"


# finally, we pick a name for our endpoint within sagemaker
endpoint_name = "leads" 

config = dict(
    execution_role_arn=sagemaker_arn,
    image_url=image_url,
    region_name=region,
    instance_type="ml.t2.medium",
    timeout_seconds=300,
)

client = get_deploy_client("sagemaker")
client.create_deployment(
    "my-deployment-2",
    model_uri=model_uri,
    flavor="python_function",
    config=config,
)

2023/09/06 23:38:36 INFO mlflow.sagemaker: Using the python_function flavor for deployment!
2023/09/06 23:38:36 INFO mlflow.sagemaker: No model data bucket specified, using the default bucket
2023/09/06 23:38:37 INFO mlflow.sagemaker: Default bucket `mlflow-sagemaker-us-east-1-166772677246` already exists. Skipping creation.
2023/09/06 23:38:50 INFO mlflow.sagemaker: tag response: {'ResponseMetadata': {'RequestId': 'EPMWZ08ZB6NC3TYJ', 'HostId': 'A58GXtHRFbTlJ0odmhAmCswinIVnK9Wt82PbxRXTEvsbkQXSSd0bE2TG/x1GElw3sD/FPD5lHNE=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'A58GXtHRFbTlJ0odmhAmCswinIVnK9Wt82PbxRXTEvsbkQXSSd0bE2TG/x1GElw3sD/FPD5lHNE=', 'x-amz-request-id': 'EPMWZ08ZB6NC3TYJ', 'date': 'Thu, 07 Sep 2023 06:38:51 GMT', 'server': 'AmazonS3', 'content-length': '0'}, 'RetryAttempts': 0}}
2023/09/06 23:38:50 INFO mlflow.sagemaker: Creating new endpoint with name: my-deployment-2 ...
2023/09/06 23:38:51 INFO mlflow.sagemaker: Created model with arn: arn:aws:sagemaker:us-east-1

MlflowException: The deployment operation failed with the following error message: "Timed out after waiting 304.97485733032227 seconds for the operation to complete. This operation may still be in progress. Please check the AWS console for more information."

In [118]:
mlflow.pyfunc.get_model_dependencies(model_uri)

2023/09/06 14:34:30 INFO mlflow.pyfunc: To install the dependencies that were used to train the model, run the following command: '%pip install -r /Users/kevinpeng/Desktop/mlruns/956479994013149772/d14f947beff54015bcf17d42229b2d39/artifacts/leads_pyfunc/requirements.txt'.


'/Users/kevinpeng/Desktop/mlruns/956479994013149772/d14f947beff54015bcf17d42229b2d39/artifacts/leads_pyfunc/requirements.txt'