In [1]:
import sagemaker
import boto3
import os 
 
bucket = sagemaker.Session().default_bucket()                     
prefix = 'sagemaker/xgboost-whitewinequality'

# Role when working on a notebook instance
role = sagemaker.get_execution_role()

ModuleNotFoundError: No module named 'sagemaker'

In [5]:
#this will get the database from UCI directly into your notebook. I picked a dataset of white wine characteristics
!wget -N https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv

--2020-12-02 11:18:09--  https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 304 Not Modified
File ‘winequality-white.csv’ not modified on server. Omitting download.



In [6]:
!head winequality-white.csv

"fixed acidity";"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"
7;0.27;0.36;20.7;0.045;45;170;1.001;3;0.45;8.8;6
6.3;0.3;0.34;1.6;0.049;14;132;0.994;3.3;0.49;9.5;6
8.1;0.28;0.4;6.9;0.05;30;97;0.9951;3.26;0.44;10.1;6
7.2;0.23;0.32;8.5;0.058;47;186;0.9956;3.19;0.4;9.9;6
7.2;0.23;0.32;8.5;0.058;47;186;0.9956;3.19;0.4;9.9;6
8.1;0.28;0.4;6.9;0.05;30;97;0.9951;3.26;0.44;10.1;6
6.2;0.32;0.16;7;0.045;30;136;0.9949;3.18;0.47;9.6;6
7;0.27;0.36;20.7;0.045;45;170;1.001;3;0.45;8.8;6
6.3;0.3;0.34;1.6;0.049;14;132;0.994;3.3;0.49;9.5;6


In [7]:
import numpy as np  # For matrix operations and numerical processing
import pandas as pd # For munging tabular data

In [8]:
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
data = pd.read_csv('winequality-white.csv', sep=';')
pd.set_option('display.max_columns', 500)     # Make sure we can see all of the columns
pd.set_option('display.max_rows', 50)         # Keep the output on one page
data[:5]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [9]:
data.shape # (number of lines, number of columns)

(4898, 12)

In [10]:
label = 'quality'
#setting up the main characteristic that my model will work to predict via linear regression. I want to know an estimation of white wine quality

In [11]:
# Change the order of the columns and write the file without headers
cols = data.columns.tolist()
colIdx = data.columns.get_loc(label)
cols = cols[colIdx:colIdx+1] + cols[0:colIdx] + cols[colIdx+1:]
modified_data = data[cols]
modified_data[:5]

Unnamed: 0,quality,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,6,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
1,6,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
2,6,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1
3,6,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9
4,6,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9


In [12]:
# Split the file into train and test (80% train and 20% test)
from sklearn.model_selection import train_test_split
train, test= train_test_split(modified_data, test_size=0.2)
# Save the train file, please change preicison in fmt as needed
np.savetxt("train.csv", train, delimiter=",", fmt='%1.3f')
# Save the test file, please change preicison in fmt as needed
np.savetxt("test.csv", test, delimiter=",", fmt='%1.3f')

In [13]:
 !ls -l *.csv

-rw-r--r--  1 itsencrypted  staff   74153 Dec  2 11:20 test.csv
-rw-r--r--  1 itsencrypted  staff  296355 Dec  2 11:20 train.csv
-rw-r--r--  1 itsencrypted  staff  264426 Oct 16  2009 winequality-white.csv


In [14]:
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'test/test.csv')).upload_file('test.csv')

NameError: name 'boto3' is not defined

In [15]:
s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/train'.format(bucket, prefix), content_type='csv')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/test/'.format(bucket, prefix), content_type='csv')
s3_data = {'train': s3_input_train, 'validation': s3_input_validation}

NameError: name 'sagemaker' is not defined

In [16]:
from sagemaker.amazon.amazon_estimator import get_image_uri
    
sess = sagemaker.Session()

region = boto3.Session().region_name    
container = get_image_uri(region, 'xgboost', '0.90-2')

xgb = sagemaker.estimator.Estimator(container,
                                    role, 
                                    train_instance_count=1, 
                                    train_instance_type='ml.m4.2xlarge',
                                    input_mode="File",
                                    output_path='s3://{}/{}/output'.format(bucket, prefix),
                                    sagemaker_session=sess)

ModuleNotFoundError: No module named 'sagemaker'

In [17]:
xgb.set_hyperparameters(objective='reg:linear', 
                        num_round=100,
                        booster='gbtree',
                        eta=0.3,
                        max_depth=6,
                        min_child_weight=1,
                        scale_pos_weight=1,
                        eval_metric=rmse)

NameError: name 'xgb' is not defined

In [18]:
from sagemaker.tuner import IntegerParameter, ContinuousParameter

hyperparameter_ranges = {'eta': ContinuousParameter(0, 1),
                        'min_child_weight': ContinuousParameter(1, 10),
                        'alpha': ContinuousParameter(0, 2),
                        'max_depth': IntegerParameter(2, 8)
                        }

ModuleNotFoundError: No module named 'sagemaker'

In [19]:
objective_metric_name = 'validation:auc'
objective_type = 'Maximize'

In [20]:
from sagemaker.tuner import HyperparameterTuner

tuner = HyperparameterTuner(xgb,
                            objective_metric_name,
                            hyperparameter_ranges,
                            objective_type=objective_type,
                            max_jobs=10,
                            max_parallel_jobs=1)

ModuleNotFoundError: No module named 'sagemaker'

In [21]:
tuner.fit({'train': s3_input_train, 'validation': s3_input_validation})

NameError: name 'tuner' is not defined