# The files have been generated in Databricks and provided on the S3 bucket

In [1]:
#!pip install boto3 # not required for Jupyter Notebooks on AWS infrastructure

In [2]:
import boto3

# The session details are read directly from the AWS configuration of the
# EC2 instance that runs the notebook server.
session = boto3.Session()
session

Session(region_name='eu-central-1')

In [3]:
# Open the S3 bucket that holds the project files
s3 = boto3.resource('s3')

bucket_name = 'sjf-project'

bucket = s3.Bucket(bucket_name)

# s3.Bucket() only builds a local handle and sends no request, so ask S3
# explicitly whether the bucket exists and is reachable with the current
# credentials. head_bucket raises botocore.exceptions.ClientError otherwise,
# which stops the cell before the success message is printed.
s3.meta.client.head_bucket(Bucket=bucket_name)

response = f'Bucket \'{bucket_name}\' is accessable.'
print(response)

Bucket 'sjf-project' is accessable.


In [4]:
#!pip install sagemaker
import sagemaker

# Create a SageMaker session on top of the boto3 session opened above
# (taken from the live coding).
sagemaker_session = sagemaker.Session(boto_session=session)

# IAM role ARN, taken from the flutz notebook listed in the SageMaker console.
# It is handed to the estimator as its role further down.
sm_execution_role = (
    'arn:aws:iam::898627427171:role/service-role/AmazonSageMaker-ExecutionRole-20201106T104926'
)



## Select cells for either XGBoost or Linear Learner

In [5]:
# S3 locations: the bucket root and the "estimator" prefix used as output path
s3_path = "s3://" + bucket_name + "/"
output_path = f"{s3_path}estimator"

In [6]:
# Look up the previously-built AWS XGBoost image used for training

from sagemaker.amazon.amazon_estimator import get_image_uri

# Reuse the boto3 session created at the top of the notebook instead of opening
# a second one, so the region is always the one the SageMaker session is built on.
# NOTE(review): `container` is not referenced by the estimator cells visible
# below; they build the image URI from a hard-coded registry path instead.
container = get_image_uri(region_name=session.region_name,
                          repo_name='xgboost',
                          repo_version='1.0-1')

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


In [7]:
# # Use the predefined Linear learner model


# ecr_path = "664544806723.dkr.ecr.eu-central-1.amazonaws.com"

# linear = sagemaker.estimator.Estimator(f"{ecr_path}/linear-learner:latest",
#                               sm_execution_role,
#                               train_instance_count=1,
#                               train_instance_type="ml.m5.xlarge",
#                               output_path=output_path,
#                               sagemaker_session=sagemaker.Session(),
#                               base_job_name="sjf-linear")

In [8]:

# Registry path taken from
# https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html
# The XGBoost ECR path is: 492215442770.dkr.ecr.eu-central-1.amazonaws.com
ecr_path = '492215442770.dkr.ecr.eu-central-1.amazonaws.com'

# Instance types noted for train_instance_type:
# 'ml.m4.xlarge' | 'ml.t2.medium' || "ml.m5.large" -> "ml.m5.xlarge"
# Instance name   vCPU   ECU    RAM      Instance GB   Linux/UNIX usage
# t2.medium       2      var.   4 GiB    EBS only      0.0536 USD per hour
# m4.xlarge       4      13     16 GiB   EBS only      0.24 USD per hour
# m5.large        2      10     8 GiB    EBS only      0.115 USD per hour
# m5.xlarge       4      16     16 GiB   EBS only      0.23 USD per hour
xgboost = sagemaker.estimator.Estimator(
    image_name=f"{ecr_path}/sagemaker-xgboost:1.0-1-cpu-py3",
    role=sm_execution_role,
    train_instance_count=1,
    train_instance_type="ml.m5.xlarge",
    output_path=output_path,
    sagemaker_session=sagemaker_session,
    base_job_name="sjf-xgboost",
)


Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


In [9]:
# Create pointers to the train and validation datasets on S3

from sagemaker.session import s3_input

Train = 'train_data.csv'
Valid = 'valid_data.csv'

# Both files sit directly under the bucket root and are declared as CSV.
s3_input_train = s3_input(s3_data=s3_path + Train, content_type="text/csv")
s3_input_valid = s3_input(s3_data=s3_path + Valid, content_type="text/csv")
# Alternative inputs (sample files and test set), currently disabled:
#s3_input_train = s3_input(s3_data=s3_path + "train_sample.csv", content_type="text/csv")
#s3_input_valid = s3_input(s3_data=s3_path + "valid_sample.csv", content_type="text/csv")
#s3_input_test = s3_input(s3_data=s3_path + "test.csv", content_type="text/csv")

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


## Select cells for XGBoost or Linear Learner

In [10]:
# Create an XGBoost Estimator
# NOTE(review): this cell repeats the output path from In [5] and the estimator
# definition from In [8]; running it rebinds `xgboost` to a fresh estimator.

output_path = f"{s3_path}estimator"

# Registry path taken from
# https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html
# The XGBoost ECR path is: 492215442770.dkr.ecr.eu-central-1.amazonaws.com
ecr_path = '492215442770.dkr.ecr.eu-central-1.amazonaws.com'

# Instance types noted for train_instance_type:
# 'ml.m4.xlarge' | 'ml.t2.medium' || "ml.m5.large" -> "ml.m5.xlarge"
# Instance name   vCPU   ECU    RAM      Instance GB   Linux/UNIX usage
# t2.medium       2      var.   4 GiB    EBS only      0.0536 USD per hour
# m4.xlarge       4      13     16 GiB   EBS only      0.24 USD per hour
# m5.large        2      10     8 GiB    EBS only      0.115 USD per hour
# m5.xlarge       4      16     16 GiB   EBS only      0.23 USD per hour
xgboost = sagemaker.estimator.Estimator(
    image_name=f"{ecr_path}/sagemaker-xgboost:1.0-1-cpu-py3",
    role=sm_execution_role,
    train_instance_count=1,
    train_instance_type="ml.m5.xlarge",
    output_path=output_path,
    sagemaker_session=sagemaker_session,
    base_job_name="sjf-xgboost",
)


Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


In [11]:
# Select your specific hyperparameters (optional).
# Reference: https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost_hyperparameters.html
xgboost.set_hyperparameters(
    objective='reg:squarederror',  # [default]; other examples: reg:linear, reg:logistic, multi:softmax
    num_round=120,                 # required | valid values: int | number of rounds to run the training
    eta=0.3,                       # default 0.3, range 0 - 1
)


In [12]:
# # Select the your specific hyperparameters (Optional) - from Live Coding
# linear.set_hyperparameters(
#     predictor_type="regressor",
#     normalize_data=False,
#     mini_batch_size=50
# )

In [13]:
# Fit the model

# s3_input_valid is created above but was never handed to the training job.
# Passing it as the "validation" channel lets the XGBoost container evaluate
# the model on the validation data while it trains.
xgboost.fit({"train": s3_input_train, "validation": s3_input_valid})
#linear.fit({"train": s3_input_train})


2020-11-18 13:52:57 Starting - Starting the training job...
2020-11-18 13:52:58 Starting - Launching requested ML instances......
2020-11-18 13:54:20 Starting - Preparing the instances for training......
2020-11-18 13:55:20 Downloading - Downloading input data
2020-11-18 13:55:20 Training - Downloading the training image...
2020-11-18 13:55:44 Training - Training image download completed. Training in progress.[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value reg:linear to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34m[13:55:46] 10347x408 matrix with 4221576 entries loaded 

In [15]:
# Deploy the model to an endpoint so it can serve predictions.
# (Linear learner variant: Predictor = linear.deploy(...) with the same arguments.)
Predictor = xgboost.deploy(initial_instance_count=1,
                           instance_type='ml.t2.medium')
    

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.
Using already existing model: sjf-xgboost-2020-11-18-13-52-57-046


---------------!

In [16]:
# Show the endpoint name the predictor is bound to
Predictor.endpoint

'sjf-xgboost-2020-11-18-13-52-57-046'

In [17]:
# Configure the predictor to exchange CSV with the endpoint:
# the content type plus the matching serializer and deserializer.

from sagemaker.predictor import csv_serializer, csv_deserializer

Predictor.content_type = "text/csv"
Predictor.serializer = csv_serializer
Predictor.deserializer = csv_deserializer

In [18]:
# Show the endpoint name again, now that serializer and deserializer are configured
Predictor.endpoint

'sjf-xgboost-2020-11-18-13-52-57-046'

# Predictions

In [19]:
# Show the S3 base path the validation file is read from below
s3_path

's3://sjf-project/'

In [20]:
# Load the validation dataset from S3 into a dataframe.
# header=None: the first line of the file is read as data, not as column names.

import pandas as pd

validate_df = pd.read_csv(s3_path + Valid, header=None)
display(validate_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,399,400,401,402,403,404,405,406,407,408
0,2.9,5015,0,2010,94,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,3.4,8176,0,2012,95,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4.4,69208,0,2009,92,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4.8,24182,0,2004,102,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5.1,5112,0,2016,88,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1835,8.0,54123,0,1954,108,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1836,8.1,21999,0,2003,107,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1837,8.1,146694,0,1988,124,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1838,8.7,1554836,0,1999,136,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# First row without column 0 (column 0 is used as the ground truth further down)
validate_df.iloc[0, 1:]

1      5015.0
2         0.0
3      2010.0
4        94.0
5         0.0
        ...  
404       0.0
405       0.0
406       0.0
407       0.0
408       0.0
Name: 0, Length: 408, dtype: float64

In [22]:
# Send all rows to the endpoint in one request, leaving out column 0,
# then show how many predictions came back.
prediction = Predictor.predict(validate_df.iloc[:, 1:].values)
len(prediction[0])

1840

In [23]:
# Ground truth: the values of column 0 of the validation dataframe
y_true = validate_df.loc[:, 0].values
display(y_true)

array([2.9, 3.4, 4.4, ..., 8.1, 8.7, 8.8])

In [36]:
# Convert every returned prediction to a float.
# Disabled alternative that rounds the prediction values to one decimal place:
#y_predicted = [round(float(pred), 1) for pred in prediction[0]]
y_predicted = [float(pred) for pred in prediction[0]]
display(y_predicted)

[5.768695831298828,
 5.4769697189331055,
 5.893097400665283,
 5.716073989868164,
 5.178792476654053,
 5.960776329040527,
 4.802578449249268,
 5.282993793487549,
 5.3817362785339355,
 5.807811737060547,
 5.756734371185303,
 7.371767997741699,
 5.7194037437438965,
 6.395280838012695,
 6.1217360496521,
 6.006982326507568,
 6.993508338928223,
 6.771149158477783,
 5.3792266845703125,
 5.471765995025635,
 6.681862831115723,
 6.964267253875732,
 6.4682393074035645,
 7.22316837310791,
 5.916210174560547,
 6.428529262542725,
 5.818057060241699,
 6.56323766708374,
 6.71758508682251,
 6.401329517364502,
 6.092157363891602,
 6.549886703491211,
 6.287076473236084,
 6.968883037567139,
 6.0098419189453125,
 6.694821834564209,
 7.088303089141846,
 6.05542516708374,
 6.993236064910889,
 5.985969543457031,
 7.180700778961182,
 6.12484884262085,
 6.672567844390869,
 7.071694850921631,
 6.911713600158691,
 7.5306806564331055,
 7.315674781799316,
 7.359110355377197,
 6.868598461151123,
 7.532516002655029,


In [37]:
# Coefficient of determination (R^2) of the predictions against the ground truth
from sklearn.metrics import r2_score
print(r2_score(y_true=y_true, y_pred=y_predicted))

0.4457430034753619


In [38]:
# Explained variance score of the predictions against the ground truth
from sklearn.metrics import explained_variance_score
print(explained_variance_score(y_true=y_true, y_pred=y_predicted))

0.44764147171747637


In [39]:
# Mean squared error of the predictions against the ground truth
from sklearn.metrics import mean_squared_error
print(mean_squared_error(y_true=y_true, y_pred=y_predicted))

0.5861626001483868


In [40]:
# Mean absolute error of the predictions against the ground truth
from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(y_true=y_true, y_pred=y_predicted))

0.5480244908643805


In [41]:
# Put predictions and ground truth side by side and show the last ten rows.
# Column order is 'prediction' first, then 'ground_truth'.
pred_df = pd.DataFrame({'prediction': y_predicted, 'ground_truth': y_true})
display(pred_df.tail(10))

Unnamed: 0,prediction,ground_truth
1830,7.195535,7.8
1831,7.56843,7.8
1832,7.562454,7.9
1833,7.319224,7.9
1834,7.848004,8.0
1835,8.115566,8.0
1836,8.042473,8.1
1837,7.82285,8.1
1838,8.127211,8.7
1839,8.613341,8.8
