In this notebook, we will go through common classification models provided by scikit-learn and AWS SageMaker, including:

- Logistic Regression
- Support Vector Machine (linear and kernel)
- Decision Tree
- Random Forest
- XGBoost (SageMaker)

We still use the credit approval data (crx.data) in this example.

<h4>Import and Preprocess Data</h4>

We first import the data and use the preprocessing pipeline that was built in module 4.

In [1]:
import pandas as pd

# crx.data has no header row, so pandas assigns integer column labels (0..15)
crx = pd.read_csv('crx.data', header=None)

# .shape is an attribute holding the data size as (rows, columns)
print(crx.shape)

# head() shows the first few rows; as the last expression it becomes the cell output
crx.head()

(690, 16)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+


In [2]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified 75/25 split: the proportion of each label value (column 15) is kept
# the same in the training and testing sets, and random_state makes it repeatable.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=10)

# split() yields *positional* row numbers, so rows are selected with .iloc.
# (.loc would only give the same rows while crx keeps its default 0..n-1 index.)
train_index, test_index = next(split.split(crx, crx[15]))
strat_train_set = crx.iloc[train_index]
strat_test_set = crx.iloc[test_index]

#in general, we call input data X, and label data Y, so we first create trainX and trainY
# columns 0..14 are the inputs (label slicing with .loc is inclusive); column 15 is the label
trainX = strat_train_set.loc[:, :14]
trainY = strat_train_set.loc[:, 15].values
trainY = (trainY == '+') * 1          # '+' becomes 1, every other value becomes 0

testX = strat_test_set.loc[:, :14]
testY = strat_test_set.loc[:, 15].values
testY = (testY == '+') * 1

trainX.shape, trainY.shape, testX.shape, testY.shape

((517, 15), (517,), (173, 15), (173,))

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np

# Numeric inputs: every column with a numeric dtype (not only exactly int64/float64).
num_cols = trainX.select_dtypes(include='number').columns

# Numeric branch: fill gaps with the column median, then standardize.
num_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('standardize', StandardScaler())
])

# Categorical inputs: all remaining columns, in their original order. Taking the
# complement (instead of testing dtype == object) keeps string columns in the pipeline
# even when pandas stores them with a dedicated string dtype.
cat_cols = trainX.columns[~trainX.columns.isin(num_cols)]

# Categorical branch: label missing values as their own 'missing' category, then one-hot
# encode. handle_unknown='ignore' encodes a category that was absent from the training
# split as all zeros instead of raising an error when the test split is transformed.
cat_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encode', OneHotEncoder(handle_unknown='ignore'))
])

# Route each group of columns through its own branch and stack the results side by side
# (numeric features first, then the one-hot columns).
full_pipeline = ColumnTransformer([
    ('numeric', num_pipeline, num_cols),
    ('class', cat_pipeline, cat_cols)
])

In [4]:
# Fit the preprocessing on the training inputs only; the test inputs reuse the fitted steps.
trainX_prc = full_pipeline.fit_transform(trainX)
# ColumnTransformer returns a scipy sparse matrix only when the stacked result is sparse
# enough (density below its sparse_threshold) and a plain ndarray otherwise, so densify
# only when there is something to densify instead of calling .todense() unconditionally.
if hasattr(trainX_prc, 'toarray'):
    trainX_prc = trainX_prc.toarray()
trainX_prc = np.asarray(trainX_prc)

# Training file for the SageMaker job: label in the first column, no header, no index.
traindata = np.concatenate([trainY.reshape(-1,1),trainX_prc],axis=1)
pd.DataFrame(traindata).to_csv('crx_train.csv', index=False, header=False)


testX_prc = full_pipeline.transform(testX)
if hasattr(testX_prc, 'toarray'):
    testX_prc = testX_prc.toarray()
testX_prc = np.asarray(testX_prc)
# separate copy of the processed test inputs
testdata = np.array(testX_prc)

<h3>Using Scikit-Learn Algorithms</h3>

<h4>Logistic Regression</h4>

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import f1_score

# build the model with default settings and train it in one step (fit returns the estimator)
logistic = LogisticRegression().fit(trainX_prc, trainY)

# score() gives the mean accuracy: first on the training data, then on the held-out test data
train_accuracy_lg = logistic.score(trainX_prc, trainY)
test_accuracy_lg = logistic.score(testX_prc, testY)

print('Training Accuracy:', train_accuracy_lg)
print('Testing Accuracy:', test_accuracy_lg)

Training Accuracy: 0.8878143133462283
Testing Accuracy: 0.8497109826589595


<h4>Linear Support Vector Machine</h4>

In [6]:
from sklearn.svm import LinearSVC

# linear SVM with C=0.01 and at most 5000 solver iterations; fit returns the estimator
lsvc = LinearSVC(max_iter=5000, C=0.01).fit(trainX_prc, trainY)

# mean accuracy on the training data and on the held-out test data
train_accuracy_lsvm = lsvc.score(trainX_prc, trainY)
test_accuracy_lsvm = lsvc.score(testX_prc, testY)

print('Training Accuracy:', train_accuracy_lsvm)
print('Testing Accuracy:', test_accuracy_lsvm)

Training Accuracy: 0.8762088974854932
Testing Accuracy: 0.8497109826589595


<h4>Kernel Support Vector Machine</h4>

In [7]:
from sklearn.svm import SVC

# RBF-kernel SVM (gamma=0.1, C=0.1) capped at 5000 solver iterations; fit returns the estimator
svc = SVC(max_iter=5000, kernel='rbf', gamma=0.1, C=0.1).fit(trainX_prc, trainY)

# mean accuracy on the training data and on the held-out test data
train_accuracy_svm = svc.score(trainX_prc, trainY)
test_accuracy_svm = svc.score(testX_prc, testY)

print('Training Accuracy:', train_accuracy_svm)
print('Testing Accuracy:', test_accuracy_svm)

Training Accuracy: 0.8858800773694391
Testing Accuracy: 0.8439306358381503


<h4>Decision Tree</h4>

In [8]:
from sklearn.tree import DecisionTreeClassifier

# Tree construction involves randomness (features are shuffled when searching for splits),
# so fix random_state to make the reported accuracies repeatable from run to run.
# The seed matches the one used for the train/test split.
dt = DecisionTreeClassifier(random_state=10)
dt.fit(trainX_prc, trainY)

#get training accuracy
train_accuracy_dt = dt.score(trainX_prc, trainY)

#get testing accuracy
test_accuracy_dt = dt.score(testX_prc, testY)

print('Training Accuracy:', train_accuracy_dt)
print('Testing Accuracy:', test_accuracy_dt)

Training Accuracy: 1.0
Testing Accuracy: 0.791907514450867


<h4>Random Forest</h4>

In [9]:
from sklearn.ensemble import RandomForestClassifier

# A random forest draws bootstrap samples and random feature subsets, so without a fixed
# random_state every run reports different accuracies. The seed matches the one used for
# the train/test split.
rf = RandomForestClassifier(random_state=10)
rf.fit(trainX_prc, trainY)

#get training accuracy
train_accuracy_rf = rf.score(trainX_prc, trainY)

#get testing accuracy
test_accuracy_rf = rf.score(testX_prc, testY)

print('Training Accuracy:', train_accuracy_rf)
print('Testing Accuracy:', test_accuracy_rf)

Training Accuracy: 1.0
Testing Accuracy: 0.8497109826589595


<h3>AWS Algorithm - XGBoost</h3>

<h4>Prepare the SageMaker Environment</h4>

In [11]:
# import libraries
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Image
from IPython.display import display
from time import gmtime, strftime
from sagemaker.predictor import csv_serializer

# Execution role that SageMaker will use on this notebook's behalf
role = get_execution_role()
# Key prefix under which the training data and model output are stored in the bucket
prefix = 'sagemaker/DEMO-xgboost-dm'
# Region of the current boto3 session
my_region = boto3.session.Session().region_name

# Look up the image URI of the XGBoost container for this region.
# Nothing is built here; the call only returns the URI string.
xgboost_container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=my_region,
    version="latest",
)

print(
    "Success - the MySageMakerInstance is in the "
    + my_region
    + " region. You will use the "
    + xgboost_container
    + " container for your SageMaker endpoint."
)

Success - the MySageMakerInstance is in the us-east-1 region. You will use the 811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest container for your SageMaker endpoint.


In [12]:
bucket_name = 'lle13bucket1' # <--- CHANGE THIS VARIABLE TO A UNIQUE NAME FOR YOUR BUCKET
s3 = boto3.resource('s3')

# us-east-1 buckets are created without a location constraint;
# every other region passes its own name as the LocationConstraint.
create_bucket_args = {'Bucket': bucket_name}
if my_region != 'us-east-1':
    create_bucket_args['CreateBucketConfiguration'] = {'LocationConstraint': my_region}

# Best effort: report any failure instead of stopping the notebook.
try:
    s3.create_bucket(**create_bucket_args)
    print('S3 bucket created successfully')
except Exception as e:
    print('S3 error: ',e)

S3 bucket created successfully


<h4>Prepare the Training Data</h4>

In [13]:
# S3 object keys always use '/' as the separator, so build the key explicitly rather than
# with os.path.join, which inserts the separator of whatever OS runs the notebook.
train_key = '{}/train/crx_train.csv'.format(prefix)
boto3.Session().resource('s3').Bucket(bucket_name).Object(train_key).upload_file('crx_train.csv')

# Point the training job at the uploaded folder and declare its content as CSV.
s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')

<h4>Create and Train a New XGBoost Model</h4>

In [14]:
sess = sagemaker.Session()

# Estimator for the XGBoost container: one ml.m4.xlarge training instance,
# with the model artifacts written under <prefix>/output in the bucket.
xgb = sagemaker.estimator.Estimator(
    xgboost_container,
    role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path='s3://{}/{}/output'.format(bucket_name, prefix),
    sagemaker_session=sess,
)

# Binary classification (logistic objective) trained for 100 boosting rounds.
xgb.set_hyperparameters(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    silent=0,
    objective='binary:logistic',
    num_round=100,
)

In [15]:
# Launch the training job; the 'train' channel reads the CSV uploaded to S3.
xgb.fit(inputs={'train': s3_input_train})

2022-02-14 04:59:36 Starting - Starting the training job...ProfilerReport-1644814775: InProgress
...
2022-02-14 05:00:25 Starting - Preparing the instances for training......
2022-02-14 05:01:29 Downloading - Downloading input data......
2022-02-14 05:02:30 Training - Training image download completed. Training in progress..[34mArguments: train[0m
[34m[2022-02-14:05:02:33:INFO] Running standalone xgboost training.[0m
[34m[2022-02-14:05:02:33:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2022-02-14:05:02:33:INFO] File size need to be processed in the node: 0.15mb. Available memory size in the node: 8494.33mb[0m
[34m[2022-02-14:05:02:33:INFO] Determined delimiter of CSV input is ','[0m
[34m[05:02:33] S3DistributionType set as FullyReplicated[0m
[34m[05:02:33] 517x51 matrix with 26367 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[05:02:33] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 8 extra nodes, 

<h4>Deploy the Model</h4>

In [72]:
# Host the trained model on a single ml.m4.xlarge instance and keep the returned predictor.
xgb_predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)

----------!

<h4>Prepare the Testing Data</h4>

In [82]:
testX = strat_test_set.loc[:, :14]
# Keep the label as a one-column DataFrame of 0/1 ('+' becomes 1); the evaluation cell
# reads it through testY.values. Selecting [15] names the label column directly instead
# of slicing up to a column label (16) that does not exist.
testY = strat_test_set.loc[:, [15]]
testY = (testY=='+')*1
testX_prc = full_pipeline.transform(testX)
# The transformer may return either a scipy sparse matrix or a plain ndarray depending on
# how sparse the result is, so densify only when needed rather than always calling .todense().
testdata = testX_prc.toarray() if hasattr(testX_prc, 'toarray') else np.asarray(testX_prc)

<h4>Test the Trained XGBoost Model</h4>

In [84]:
from sagemaker.serializers import CSVSerializer

xgb_predictor.serializer = CSVSerializer() # set the serializer type
# the endpoint replies with bytes; decode them into one comma-separated string of scores
predictions = xgb_predictor.predict(testdata).decode('utf-8') # predict!
# Parse the whole payload. Blindly dropping the first character (predictions[1:]) corrupts
# the first score unless that character happens to be a redundant leading '0'
# (for example '1.2e-05' would be read as '.2e-05'). Stripping surrounding whitespace and
# any enclosing brackets handles both a bare and a bracketed list of numbers.
predictions_array = np.fromstring(predictions.strip().strip('[]'), sep=',')
print(predictions_array.shape)

(173,)


In [89]:
# Confusion matrix: rows are the observed labels, columns the predictions rounded to 0/1.
cm = pd.crosstab(index=testY.values.flatten(), columns=np.round(predictions_array), rownames=['Observed'], colnames=['Predicted'])
# Force a full 2x2 table so the positional lookups below still work when the model
# predicts (or the test set contains) only one of the two classes.
cm = cm.reindex(index=[0, 1], columns=[0, 1], fill_value=0)
tn = cm.iloc[0,0]; fn = cm.iloc[1,0]; tp = cm.iloc[1,1]; fp = cm.iloc[0,1]
# overall accuracy in percent
p = (tp+tn)/(tp+tn+fp+fn)*100
print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "Not Approved", "Approved"))
print("Observed")
# Percentages are taken per predicted column (share of each prediction that was right/wrong).
# Row labels name the credit-approval outcome, matching the column headings.
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("Not Approved", tn/(tn+fn)*100,tn, fp/(tp+fp)*100, fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("Approved", fn/(tn+fn)*100,fn, tp/(tp+fp)*100, tp))


Overall Classification Rate: 86.1%

Predicted      Not Approved   Approved
Observed
No Purchase    91% (80)    19% (16)
Purchase        9% (8)     81% (69) 



<h4>Cleaning up</h4>

In [90]:
# Tear down the hosted endpoint together with its endpoint configuration.
xgb_predictor.delete_endpoint(delete_endpoint_config=True)
# Delete every object stored in the demo bucket (the bucket itself is not removed here).
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
# As the last expression of the cell, the deletion response is shown as the cell output.
bucket_to_delete.objects.all().delete()

[{'ResponseMetadata': {'RequestId': 'V4G0Q3C0K45F1WGY',
   'HostId': 'NIb8lEMTny9ZFbLqPvc8OGhYd2aZLMriZq1UxVkv3L1BgEFvR+a75aQe00+PK+Buo8yby414NoA=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'NIb8lEMTny9ZFbLqPvc8OGhYd2aZLMriZq1UxVkv3L1BgEFvR+a75aQe00+PK+Buo8yby414NoA=',
    'x-amz-request-id': 'V4G0Q3C0K45F1WGY',
    'date': 'Sun, 13 Feb 2022 17:59:57 GMT',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3',
    'connection': 'close'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'sagemaker/DEMO-xgboost-dm/output/xgboost-2022-02-13-17-09-46-006/profiler-output/system/training_job_end.ts'},
   {'Key': 'sagemaker/DEMO-xgboost-dm/output/xgboost-2022-02-13-17-09-46-006/profiler-output/system/incremental/2022021317/1644772260.algo-1.json'},
   {'Key': 'sagemaker/DEMO-xgboost-dm/output/xgboost-2022-02-13-17-09-46-006/rule-output/ProfilerReport-1644772186/profiler-output/profiler-reports/OverallFrameworkMetrics.json'},
   {'Ke