# The input files were generated in Databricks and uploaded to the S3 bucket

In [None]:
!pip install boto3

In [1]:
import boto3

# Default boto3 session: its details are taken directly from the AWS
# configuration of the EC2 instance running the notebook server.
session = boto3.Session()
# Display the session object (its repr shows the configured region).
session

Session(region_name='eu-central-1')

In [2]:
# Open the S3 bucket and verify that it can actually be reached.
s3 = boto3.resource('s3')

bucket_name = 'sjf-project'

bucket = s3.Bucket(bucket_name)

# s3.Bucket() only builds a local handle and sends no request, so on its own it
# cannot tell whether the bucket exists or is readable. head_bucket() issues a
# real request and raises botocore.exceptions.ClientError if the bucket is
# missing or access is denied, so the message below is only printed on success.
s3.meta.client.head_bucket(Bucket=bucket_name)

# Message text is left unchanged so it still matches the recorded cell output.
response = f'Bucket \'{bucket_name}\' is accessable.'
print(response)

Bucket 'sjf-project' is accessable.


In [5]:
# Use a previously-built, AWS XGBoost model for training

#!pip install sagemaker

import sagemaker

from sagemaker.amazon.amazon_estimator import get_image_uri
# Look up the image URI of the XGBoost 1.0-1 container for the region that the
# default boto3 session is configured for.
container = get_image_uri(
    region_name=boto3.Session().region_name,
    repo_name='xgboost',
    repo_version='1.0-1',
)

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


In [6]:
# Describe the S3 locations of the training and validation CSV files as
# SageMaker input objects.

from sagemaker.session import s3_input

# Common prefix of every object in the project bucket.
s3_path = f"s3://{bucket_name}/"

s3_input_train = s3_input(s3_data=s3_path + "train_sample.csv", content_type="text/csv")
s3_input_valid = s3_input(s3_data=s3_path + "valid_sample.csv", content_type="text/csv")
# A pointer to the test set is currently disabled:
# s3_input_test = s3_input(s3_data=s3_path + "test.csv", content_type="text/csv")

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [7]:
# Create a SageMaker session on top of the boto3 session opened at the start of
# the notebook (approach taken from the live-coding session).

sagemaker_session = sagemaker.Session(boto_session=session)

# IAM role ARN handed to the estimator; it was copied from the "flutz" notebook
# listed in the SageMaker console.
# NOTE(review): the account ID and role name are hard-coded here — consider
# reading the ARN from configuration or an environment variable instead.
sm_execution_role = 'arn:aws:iam::898627427171:role/service-role/AmazonSageMaker-ExecutionRole-20201106T104926'


In [8]:
# Create an XGBoost Estimator

# S3 prefix handed to the estimator as the destination for its output.
output_path = s3_path + "estimator"

# Reuse the image URI resolved by get_image_uri() in the earlier cell
# (`container`) instead of hard-coding the registry account and image tag a
# second time. For eu-central-1 and version 1.0-1 this is expected to be
#   492215442770.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-xgboost:1.0-1-cpu-py3
# per https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html
xgboost = sagemaker.estimator.Estimator(
    image_name=container,
    role=sm_execution_role,
    train_instance_count=1,
    train_instance_type="ml.m5.large",
    output_path=output_path,
    sagemaker_session=sagemaker_session,
    base_job_name="sjf-xgboost",  # prefix of the generated training-job name
)


Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


In [9]:
# Select your specific hyperparameters (optional)
# Reference: https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost_hyperparameters.html

xgb_hyperparameters = {
    # Step-size shrinkage; default 0.3, valid range 0 - 1.
    "eta": 0.3,
    # Required, integer: the number of rounds to run the training.
    "num_round": 30,
    # Learning objective, e.g. reg:logistic, multi:softmax, reg:squarederror (default).
    "objective": 'reg:squarederror',
}

xgboost.set_hyperparameters(**xgb_hyperparameters)


In [10]:
# Fit the model

# Pass both channels: "train" is the data the model is fitted on, and
# "validation" lets the XGBoost container report its evaluation metric on
# held-out data during training. s3_input_valid was created earlier in the
# notebook but was previously never handed to the training job.
# No early stopping is configured in the hyperparameters, so the validation
# channel is only used for reporting.
xgboost.fit({"train": s3_input_train, "validation": s3_input_valid})


2020-11-17 13:24:20 Starting - Starting the training job...
2020-11-17 13:24:23 Starting - Launching requested ML instances......
2020-11-17 13:25:48 Starting - Preparing the instances for training......
2020-11-17 13:26:33 Downloading - Downloading input data...
2020-11-17 13:27:03 Training - Downloading the training image..[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34m[13:27:29] 1037x32 matrix with 33184 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34mINFO:ro

In [11]:
# Deploy the trained model to an endpoint so it can serve predictions.
# Instance types considered: 'ml.m4.xlarge' | 'ml.t2.medium'.
# NOTE(review): no cell visible in this notebook deletes the endpoint again —
# consider calling xgb_predictor.delete_endpoint() once the analysis is done.
xgb_predictor = xgboost.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.medium',
)
    

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


-----------------!

In [12]:
# Show the name of the endpoint the predictor is bound to.
xgb_predictor.endpoint

'sjf-xgboost-2020-11-17-13-24-20-660'

In [13]:
# Configure the predictor's serializer and deserializer so that it exchanges
# CSV with the endpoint: outgoing data goes through csv_serializer, responses
# go through csv_deserializer, and content_type is set to "text/csv".

from sagemaker.predictor import csv_serializer, csv_deserializer

xgb_predictor.serializer = csv_serializer
xgb_predictor.deserializer = csv_deserializer
xgb_predictor.content_type = "text/csv"

In [14]:
# Display the endpoint name once more after configuring the (de)serializers;
# the recorded output is the same name as before.
xgb_predictor.endpoint

'sjf-xgboost-2020-11-17-13-24-20-660'

# Predictions

In [15]:
# Show the S3 prefix the dataset files are read from.
s3_path

's3://sjf-project/'

In [22]:
# Load the validation dataset from S3 into a DataFrame.

import pandas as pd

# The file is read with header=None, so the columns are labelled 0, 1, 2, ...
validate_df = pd.read_csv(s3_path + "valid_sample.csv", header=None)
display(validate_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23,24,25,26,27,28,29,30,31,32
0,2.6,10340,0,1964,81,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4.0,34329,0,1959,79,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,4.4,30964,0,1982,115,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5.0,60969,0,1983,125,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5.2,27540,0,1980,114,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
208,8.4,10422,0,1975,146,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
209,8.4,12970,0,1954,124,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
210,8.8,24004,0,1971,122,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
211,8.8,641119,0,1966,161,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [23]:
# Inspect the first row without column 0 (column 0 is used as the ground truth
# further below, the remaining columns are what gets sent to the endpoint).
validate_df.iloc[0, 1:]

1     10340.0
2         0.0
3      1964.0
4        81.0
5         0.0
6         0.0
7         1.0
8         0.0
9         0.0
10        1.0
11        0.0
12        0.0
13        0.0
14        1.0
15        0.0
16        0.0
17        0.0
18        0.0
19        0.0
20        0.0
21        0.0
22        0.0
23        0.0
24        0.0
25        0.0
26        0.0
27        0.0
28        0.0
29        0.0
30        0.0
31        0.0
32        0.0
Name: 0, dtype: float64

In [28]:
# Send all rows to the endpoint, leaving out column 0 (the ground-truth value).
feature_matrix = validate_df.iloc[:, 1:].values
prediction = xgb_predictor.predict(feature_matrix)

# Number of values in the first row of the deserialized response.
len(prediction[0])

213

In [48]:
# Ground truth: the column labelled 0 of the validation frame, as a NumPy array.
y_true = validate_df.loc[:, 0].values
display(y_true)

array([2.6, 4. , 4.4, 5. , 5.2, 5.4, 5.5, 5.7, 5.9, 5.9, 5.9, 5.9, 5.9,
       5.9, 6. , 6.1, 6.1, 6.3, 6.3, 6.3, 6.3, 6.3, 6.4, 6.4, 6.5, 6.5,
       6.5, 6.5, 6.5, 6.6, 6.6, 6.7, 6.7, 6.7, 6.7, 6.8, 6.8, 6.8, 6.8,
       6.8, 6.8, 6.8, 6.9, 6.9, 6.9, 6.9, 6.9, 6.9, 6.9, 7. , 7. , 7. ,
       7. , 7. , 7.1, 7.1, 7.1, 7.1, 7.2, 7.2, 7.2, 7.2, 7.2, 7.2, 7.2,
       7.2, 7.2, 7.2, 7.2, 7.2, 7.3, 7.3, 7.3, 7.3, 7.3, 7.3, 7.3, 7.3,
       7.3, 7.3, 7.3, 7.3, 7.3, 7.4, 7.4, 7.4, 7.4, 7.4, 7.4, 7.4, 7.4,
       7.4, 7.4, 7.4, 7.4, 7.4, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5,
       7.5, 7.5, 7.5, 7.6, 7.6, 7.6, 7.6, 7.6, 7.6, 7.6, 7.6, 7.6, 7.6,
       7.6, 7.6, 7.6, 7.6, 7.6, 7.7, 7.7, 7.7, 7.7, 7.7, 7.7, 7.7, 7.7,
       7.7, 7.7, 7.7, 7.7, 7.7, 7.8, 7.8, 7.8, 7.8, 7.8, 7.8, 7.8, 7.8,
       7.8, 7.8, 7.8, 7.8, 7.8, 7.9, 7.9, 7.9, 7.9, 7.9, 7.9, 7.9, 7.9,
       7.9, 7.9, 7.9, 8. , 8. , 8. , 8. , 8. , 8. , 8. , 8. , 8. , 8. ,
       8. , 8. , 8. , 8. , 8. , 8. , 8. , 8. , 8. , 8.1, 8.1, 8.

In [49]:
# Convert each entry of the first response row to a Python float.
y_predicted = [float(value) for value in prediction[0]]
display(y_predicted)

[7.069362163543701,
 7.003239154815674,
 7.150574207305908,
 6.756973743438721,
 6.842678070068359,
 5.847489356994629,
 6.527228832244873,
 6.770012378692627,
 6.280209064483643,
 5.800378322601318,
 7.265439033508301,
 6.091572284698486,
 6.379053115844727,
 7.249086856842041,
 5.806080341339111,
 6.837289810180664,
 5.522181034088135,
 6.317848205566406,
 6.785048007965088,
 6.856626510620117,
 6.476921558380127,
 6.968724250793457,
 6.213685989379883,
 6.704216003417969,
 6.693097114562988,
 6.998255729675293,
 6.7994608879089355,
 6.581653118133545,
 6.307485580444336,
 7.3730878829956055,
 7.52493143081665,
 6.877690315246582,
 7.196727752685547,
 7.053366184234619,
 6.938945770263672,
 7.862077236175537,
 6.420969009399414,
 7.008478164672852,
 7.150251865386963,
 6.541346073150635,
 7.392608642578125,
 6.571640968322754,
 7.16414213180542,
 6.974667549133301,
 6.567122459411621,
 7.274566650390625,
 6.852047920227051,
 7.31311559677124,
 6.569287300109863,
 7.412439346313477,
 

In [53]:
from sklearn.metrics import r2_score

# R^2 score of the endpoint's predictions against the ground-truth column.
r2 = r2_score(y_true, y_predicted)
print(r2)

0.4110718400507807


In [54]:
# NOTE(review): `pred` is not referenced by any visible cell; it is kept only
# in case a later cell relies on it.
pred = []

# Put predictions and ground truth side by side, one row per validation sample.
pred_df = pd.DataFrame({'prediction': y_predicted, 'ground_truth': y_true})

# Show the last ten rows of the comparison.
display(pred_df.tail(10))

Unnamed: 0,prediction,ground_truth
203,8.068649,8.2
204,8.166977,8.2
205,8.193645,8.3
206,8.091828,8.3
207,8.003078,8.3
208,7.458429,8.4
209,7.867491,8.4
210,7.688304,8.8
211,8.52274,8.8
212,7.064924,9.0
