In [50]:
import sagemaker
import boto3
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.session import s3_input,Session
import os
import numpy as np
from sklearn.model_selection import train_test_split

In [51]:
# Bucket used for the training data and model artifacts, plus the region
# that the active boto3 session is configured for.
bucket_name = "loaneligibilityapplicationv3"
my_region = boto3.Session().region_name
print(my_region)

ap-south-1


In [52]:
# Create the project bucket in the session's region.
# Previously the bucket was only created when the region was "ap-south-1",
# yet the success message was printed for every region.
s3 = boto3.resource('s3')
try:
    if my_region == "us-east-1":
        # us-east-1 is the S3 default region and rejects an explicit
        # LocationConstraint, so the configuration is omitted there.
        s3.create_bucket(Bucket=bucket_name)
    else:
        location = {'LocationConstraint': my_region}
        s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration=location)
    print("S3 bucket is created")
except Exception as e:
    # Best effort: report the failure (for example when the bucket already
    # exists on a re-run) and let the notebook continue.
    print('S3 error: ',e)

S3 bucket is created


In [53]:
# Key prefix under which everything for this model is stored, and the S3
# location where the training job writes its output.
prefix = "xgboost"
output_template = "s3://{}/{}/output"
output_path = output_template.format(bucket_name, prefix)
print(output_path)

s3://loaneligibilityapplicationv3/xgboost/output


In [54]:
import pandas as pd

# Load the preprocessed loan training data from the local Data/ folder.
try:
    df = pd.read_csv('Data/P_Loan_Train.csv')
except Exception as e:
    print("Data load error: ",e)
    # Re-raise: every later cell needs `df`, so continuing would only
    # surface the problem as a confusing NameError further down.
    raise

In [55]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,0
1,2,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2,1
2,3,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2,1
3,4,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,2,1
4,5,1,1,2,0,1,5417,4196.0,267.0,360.0,1.0,2,1


In [56]:
# List the column labels of the loaded frame.
df.columns

Index(['Unnamed: 0', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [57]:
# Remove the leftover 'Unnamed: 0' column carried over from the CSV file.
# Rebinding with errors='ignore' keeps the cell safe to re-run: the in-place
# drop raised KeyError once the column was already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [58]:
# Preview the first five rows after dropping the extra column.
df.head(n=5)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,0
1,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2,1
2,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2,1
3,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,2,1
4,1,1,2,0,1,5417,4196.0,267.0,360.0,1.0,2,1


In [59]:
# SageMaker needs the target to be the first attribute.
# Pull Loan_Status out of the frame and concatenate it in front of the
# remaining columns.
loan_status = df.pop('Loan_Status')
df = pd.concat([loan_status, df], axis=1)

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible.
train_df, validation_df = train_test_split(
    df,
    test_size=0.2,
    random_state=27,
)

In [60]:
# Preview the first five rows of the training split (target column first).
train_df.head(n=5)


Unnamed: 0,Loan_Status,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
464,1,1,1,0,0,1,16120,0.0,260.0,360.0,1.0,2
457,0,1,1,1,0,0,4283,3000.0,172.0,84.0,1.0,0
359,0,1,1,0,0,0,8334,0.0,160.0,360.0,1.0,1
86,1,1,0,0,0,0,5316,0.0,136.0,360.0,1.0,2
371,1,1,1,2,0,0,6700,1750.0,230.0,300.0,1.0,1


In [61]:
# Show (rows, columns) of the training split.
print(train_df.shape)

(384, 12)


In [62]:
# Show (rows, columns) of the validation split.
validation_df.shape

(96, 12)

In [63]:
# Preview the first five rows of the validation split.
validation_df.head(n=5)

Unnamed: 0,Loan_Status,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
249,1,1,1,2,0,0,3717,0.0,120.0,360.0,1.0,1
453,1,1,1,2,0,0,6540,0.0,205.0,360.0,1.0,1
195,0,1,1,0,1,0,1668,3890.0,201.0,360.0,0.0,1
383,1,1,0,0,1,0,3691,0.0,110.0,360.0,1.0,0
223,1,1,1,2,0,0,2301,985.799988,78.0,180.0,1.0,2


In [64]:
# Write the training split to a local CSV with no header row and no index
# column.
train_df.to_csv(
    "train.csv",
    index=False,
    header=False,
)

In [65]:
# Write the validation split to a local CSV with no header row and no index
# column.
validation_df.to_csv(
    "validation.csv",
    index=False,
    header=False,
)

In [66]:
# Upload the local training CSV to s3://<bucket>/<prefix>/train/train.csv.
# S3 object keys always use '/', so the key is formatted explicitly instead
# of using os.path.join, which would emit backslashes on Windows.
train_key = '{}/train/train.csv'.format(prefix)
boto3.Session().resource('s3').Bucket(bucket_name).Object(train_key).upload_file('train.csv')

In [67]:
# Training channel: points the job at the uploaded train/ prefix as CSV.
train_s3_uri = 's3://{}/{}/train'.format(bucket_name, prefix)
s3_input_train = sagemaker.TrainingInput(
    s3_data=train_s3_uri,
    content_type='csv',
)

In [68]:
# Upload the local validation CSV to
# s3://<bucket>/<prefix>/validation/validation.csv.
# S3 object keys always use '/', so the key is formatted explicitly instead
# of using os.path.join, which would emit backslashes on Windows.
validation_key = '{}/validation/validation.csv'.format(prefix)
boto3.Session().resource('s3').Bucket(bucket_name).Object(validation_key).upload_file('validation.csv')

In [69]:
# Validation channel: points the job at the uploaded validation/ prefix as
# CSV.
validation_s3_uri = 's3://{}/{}/validation'.format(bucket_name, prefix)
s3_input_validation = sagemaker.TrainingInput(
    s3_data=validation_s3_uri,
    content_type='csv',
)

In [70]:
# Look up the URI of the built-in XGBoost 1.3-1 container image for the
# session's region. `get_image_uri` is deprecated in sagemaker>=2 (it emits a
# rename warning); `sagemaker.image_uris.retrieve` is its replacement.
container = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=boto3.Session().region_name,
    version='1.3-1',
)

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [71]:
# Hyperparameters handed to the XGBoost training container.
# Values are kept exactly as given: strings for all but num_round.
hyperparameters = dict(
    max_depth="5",
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.7",
    objective="binary:logistic",
    num_round=50,
)

In [72]:
# Build the estimator that runs the XGBoost container as a training job.
# It uses the notebook's execution role, a single ml.m4.xlarge instance,
# spot capacity, and writes its artifacts to `output_path`.
execution_role = sagemaker.get_execution_role()
estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=execution_role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    volume_size=5,
    use_spot_instances=True,
    max_run=300,
    max_wait=600,
    output_path=output_path,
    hyperparameters=hyperparameters,
)

In [73]:
# Launch the training job with the train and validation channels.
training_channels = {
    'train': s3_input_train,
    'validation': s3_input_validation,
}
estimator.fit(training_channels)

2021-10-03 12:39:47 Starting - Starting the training job...
2021-10-03 12:40:12 Starting - Launching requested ML instancesProfilerReport-1633264787: InProgress
......
2021-10-03 12:41:13 Starting - Preparing the instances for training............
2021-10-03 12:43:13 Downloading - Downloading input data...
2021-10-03 12:43:33 Training - Downloading the training image..[34m[2021-10-03 12:43:53.970 ip-10-0-181-58.ap-south-1.compute.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2021-10-03:12:43:54:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2021-10-03:12:43:54:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2021-10-03:12:43:54:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-10-03:12:43:54:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2021-10-03:12:43:54:INFO] Determined delimiter of CSV input is ','[0m
[34m[2021-10-03:12:

In [86]:
# Deploy the trained model behind a real-time endpoint on a single
# ml.m4.xlarge instance.
xgb_deployment = estimator.deploy(
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
)

------!

In [87]:
# Show the name of the deployed endpoint. `endpoint` was renamed to
# `endpoint_name` in sagemaker>=2 and the old attribute emits a warning.
xgb_deployment.endpoint_name

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


'sagemaker-xgboost-2021-10-07-04-36-25-426'

In [93]:
from sagemaker.serializers import CSVSerializer

# Send prediction requests to the endpoint as CSV.
xgb_deployment.serializer = CSVSerializer()

In [94]:
# Show the name of the deployed endpoint. `endpoint` was renamed to
# `endpoint_name` in sagemaker>=2 and the old attribute emits a warning.
xgb_deployment.endpoint_name

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


'sagemaker-xgboost-2021-10-07-04-36-25-426'

In [97]:
# NOTE(review): csv_serializer appears unused in this cell (the predictor's
# serializer is already set to CSVSerializer() earlier); kept in case a later
# cell relies on the name.
from sagemaker.predictor import csv_serializer

# Feature matrix for the validation split: every column except the target.
test_data_array = validation_df.drop(['Loan_Status'], axis=1).values
# The endpoint answers with bytes holding comma-separated scores.
predictions = xgb_deployment.predict(test_data_array).decode('utf-8')
# Parse the whole response. The previous `predictions[1:]` dropped the first
# character, which only worked because the first score happened to start
# with "0" (".86" still parses as 0.86); a first score such as "1.2e-05"
# would have been silently corrupted.
predictions_array = np.fromstring(predictions, sep=',')
print(predictions_array.shape)

(96,)


In [98]:
# Display the raw comma-separated prediction string returned by the endpoint.
predictions

'0.8611176013946533,0.8811799883842468,0.2203277051448822,0.7132605314254761,0.8641104698181152,0.8052459955215454,0.8785168528556824,0.9103271961212158,0.882922887802124,0.7269582748413086,0.6999669671058655,0.7666990756988525,0.266672283411026,0.7275015711784363,0.2967309057712555,0.9122282266616821,0.7292051315307617,0.6444319486618042,0.8242905735969543,0.9122282266616821,0.7779185175895691,0.7344366908073425,0.7376565337181091,0.8578895330429077,0.758962869644165,0.8590726256370544,0.18846845626831055,0.8611176013946533,0.7450196146965027,0.8136629462242126,0.7808809876441956,0.5645716190338135,0.5116123557090759,0.1856885701417923,0.8194150924682617,0.7292051315307617,0.8221940398216248,0.1440105140209198,0.8221940398216248,0.822127640247345,0.758962869644165,0.6927008032798767,0.24735845625400543,0.19506345689296722,0.8969494700431824,0.8641104698181152,0.8915001153945923,0.8194150924682617,0.8262336254119873,0.7808809876441956,0.2296258807182312,0.17943911254405975,0.1337167024

In [99]:
def _pct(part, whole):
    """Return part/whole as a percentage, or NaN when whole is zero."""
    return part / whole * 100 if whole else float('nan')


# Confusion matrix: observed Loan_Status (rows) against the predicted score
# rounded to the nearest class (columns).
cm = pd.crosstab(
    index=validation_df['Loan_Status'],
    columns=np.round(predictions_array),
    rownames=['Observed'],
    colnames=['Predicted'],
)
# Force a full 2x2 table. If the model predicts only one class, crosstab
# yields a single column and the positional lookups below would raise
# IndexError. Assumes Loan_Status is encoded as 0/1, as the previews show.
cm = cm.reindex(index=[0, 1], columns=[0.0, 1.0], fill_value=0)

tn = cm.iloc[0, 0]  # observed 0, predicted 0
fn = cm.iloc[1, 0]  # observed 1, predicted 0
tp = cm.iloc[1, 1]  # observed 1, predicted 1
fp = cm.iloc[0, 1]  # observed 0, predicted 1
p = _pct(tp + tn, tp + tn + fp + fn)

# Percentages are normalised per predicted column, so each column sums to 100%.
print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "Not Approved", "Approved"))
print("Observed")
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("Not Approved", _pct(tn, tn + fn), tn, _pct(fp, tp + fp), fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("Approved", _pct(fn, tn + fn), fn, _pct(tp, tp + fp), tp))


Overall Classification Rate: 77.1%

Predicted      Not Approved   Approved
Observed
Not Approved   84% (16)    25% (19)
Approved        16% (3)     75% (58) 

