### XGBoost Model - Predict best puppy based on house area
#### Note: Run this code in SageMaker Notebook, not locally

In [None]:
# Install Necessary Libraries inside the env
import sys
!{sys.executable} -m pip install sagemaker pandas xgboost --upgrade

#### How we would train the model locally to predict the best puppy based on a person's house area
#### X = house area in ft2
#### y = puppy type (0 = Beagle, 1 = German Shepherd)

In [None]:
import os
import pandas as pd

from xgboost import XGBClassifier

# Toy data: X is the house area in ft2, y is the puppy type
# (0 = Beagle, 1 = German Shepherd).
df_train = pd.DataFrame({'X':[0,100,200,400,450,  550,600,800,1600],
                         'y':[0,0,  0,  0,  0,    1,  1,  1,  1]})

df_test = pd.DataFrame({'X':[10,90,240,459,120,  650,700,1800,1300],
                        'y':[0,0,  0,  0,  0,    1,  1,  1,  1]})

# fit() is documented to take a 2-D feature matrix of shape
# (n_samples, n_features). Reshape the single feature column explicitly
# instead of relying on how a 1-D Series happens to be coerced.
# A plain array carries no column names, so the fitted model does not
# require named columns at predict time.
X_train = df_train['X'].to_numpy().reshape(-1, 1)
y_train = df_train['y']

ml_model = XGBClassifier()
ml_model.fit(X_train, y_train)

In [None]:
# Which puppy does the local model recommend for a 300 ft2 home and for a 600 ft2 home?
pick_300 = ml_model.predict([[300]])[0]
pick_600 = ml_model.predict([[600]])[0]
print(pick_300, pick_600)

### XGBoost training in SageMaker

In [None]:
import sagemaker, boto3
from sagemaker import get_execution_role

from sagemaker.inputs import TrainingInput
from sagemaker.serializers import CSVSerializer


# Execution role handed to the SageMaker estimator further down.
role = get_execution_role()

# SageMaker session, its default S3 bucket, and the key prefix under which
# this notebook stores its files.
sm_sess = sagemaker.Session()
bucket = sm_sess.default_bucket()
file_path = "puppy_match"

### Store Datasets

#### SageMaker Expectations for training & validation datasets
#### 1. Target variable (y) to be in the first column
#### 2. Remove column names (i.e. headers)
#### 3. Remove index
#### 4. Stored in an S3 bucket

In [None]:
# Write headerless, index-free CSVs with the target (y) as the first column.
# The hold-out frame `df_test` serves as the validation set.
df_train[['y','X']].to_csv('train.csv', header=False, index=False)
df_test[['y','X']].to_csv('val.csv', header=False, index=False)

In [None]:
# Upload both CSV files to s3://<bucket>/<file_path>/<file name>.
# S3 keys always use "/" as the separator, so build them explicitly rather
# than with os.path.join, whose separator depends on the operating system.
s3_bucket = boto3.Session().resource("s3").Bucket(bucket)
for csv_name in ("train.csv", "val.csv"):
    s3_bucket.Object("{}/{}".format(file_path, csv_name)).upload_file(csv_name)

# Look up the XGBoost container image URI for the session's region.
# NOTE(review): the "latest" tag is believed to resolve to a legacy XGBoost
# image; consider pinning an explicit version -- confirm against AWS docs.
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=sm_sess.boto_region_name,
    version="latest",
)


# Point each channel at the exact uploaded object. A bare ".../train" or
# ".../val" URI is treated as a key prefix and would also pick up any other
# object whose key starts with it (e.g. "train_old.csv").
s3_input_train = TrainingInput(
    s3_data="s3://{}/{}/train.csv".format(bucket, file_path), content_type="csv"
)

s3_input_validation = TrainingInput(
    s3_data="s3://{}/{}/val.csv".format(bucket, file_path), content_type="csv"
)

### Training

In [None]:
# S3 location passed to the estimator as its output path.
model_output_uri = "s3://{}/{}/output".format(bucket, file_path)

# Estimator for a single-instance training job using the container resolved earlier.
xgb = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path=model_output_uri,
    sagemaker_session=sm_sess,
)

xgb_hyperparameters = {
    # NOTE: "binary:logistic" only predicts probabilities, so a two-class
    # softmax objective is used to get class labels back.
    "objective": "multi:softmax",
    "num_class": 2,
    "num_round": 100,
}
xgb.set_hyperparameters(**xgb_hyperparameters)

# Channel names mapped to their S3 inputs.
training_channels = {"train": s3_input_train, "validation": s3_input_validation}
xgb.fit(training_channels)

### Deployment

In [None]:
# Deploy the trained model to a single-instance endpoint; request payloads
# are serialized as CSV.
xgb_predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    serializer=CSVSerializer(),
)

### Prediction

In [None]:
# Ask the deployed endpoint for a recommendation for a 750 ft2 house.
endpoint_response = xgb_predictor.predict([750])
endpoint_response

### Cleanup to stop incurring Costs!

#### 1. Delete the deployed endpoint by running
`xgb_predictor.delete_endpoint()`

#### 2. Stop the SageMaker Notebook