# Introduction to Data Formats and S3

In [1]:
import pandas as pd
import numpy as np

import boto3
import sagemaker.amazon.common as smac

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/codespace/.config/sagemaker/config.yaml


In [2]:
# Seed NumPy's global random generator so the np.random draws in the
# following cells produce the same sample data on every run.
np.random.seed(5)

In [3]:
# NOTE: Specify your bucket
# This name is passed as the `bucket` argument to write_to_s3() and
# download_from_s3() later in the notebook.
s3_bucket_name = 'jaelin-ml-sagemaker'

## Sample Dataset
### Three features (x1, x2, x3) and a target variable y

In [4]:
# Number of samples (rows) to generate
n = 10

# The draws below consume the seeded global NumPy random stream in order,
# so reordering these lines changes the generated values.
x1 = np.random.random_sample(n)       # n floating point numbers in [0, 1)
x2 = np.random.randint(100,200,n)     # n integers in [100, 200) (upper bound is exclusive)
x3 = np.random.random_sample(n) * 10  # n floating point numbers in [0, 10)
y = np.random.randint(0,2,n)          # Response variable: n integers, each 0 or 1

In [5]:
# Display the generated x3 feature values
x3

array([2.04154748, 1.19095357, 8.77903071, 5.2367529 , 4.92135999,
       7.318711  , 0.14580751, 0.93363034, 8.26554249, 8.33492742])

In [6]:
# Display the generated target values
y

array([0, 0, 1, 1, 1, 1, 0, 0, 0, 1])

In [7]:
# Combine the three feature arrays and the target into a single DataFrame,
# one column per array (column order: x1, x2, x3, y).
df = pd.DataFrame(dict(x1=x1, x2=x2, x3=x3, y=y))

In [8]:
# Display the full sample DataFrame (n = 10 rows)
df

Unnamed: 0,x1,x2,x3,y
0,0.221993,153,2.041547,0
1,0.870732,180,1.190954,0
2,0.206719,127,8.779031,1
3,0.918611,144,5.236753,1
4,0.488411,177,4.92136,1
5,0.611744,175,7.318711,1
6,0.765908,165,0.145808,0
7,0.518418,147,0.93363,0
8,0.296801,130,8.265542,0
9,0.187721,184,8.334927,1


In [9]:
# Write to SageMaker Notebook Instance:
# saves the DataFrame as a local CSV file in the current working directory.
# index=False leaves the DataFrame's row index out of the file.
df.to_csv('demo_file.csv',index=False)

In [11]:
# Writing to and reading from S3 is just as easy.
# Files are referred to as objects in S3.
# The file name is referred to as the key (key name) in S3.
# With the default S3 Standard storage class, objects are automatically stored
# redundantly across at least 3 different Availability Zones
# in the region where the bucket was created.

# http://boto3.readthedocs.io/en/latest/guide/s3.html
def write_to_s3(filename, bucket, key):
    """Upload a local file to S3.

    Args:
        filename: Path of the local file to upload.
        bucket: Name of the destination S3 bucket.
        key: Key (object name) the file is stored under in the bucket.

    Returns:
        The result of the S3 object's ``upload_fileobj`` call.
    """
    # Open in binary mode so the bytes are streamed exactly as stored on disk.
    with open(filename, 'rb') as source:
        s3 = boto3.Session().resource('s3')
        target = s3.Bucket(bucket).Object(key)
        return target.upload_fileobj(source)

In [12]:
# http://boto3.readthedocs.io/en/latest/guide/s3.html
def download_from_s3(filename, bucket, key):
    """Download an S3 object into a local file.

    Args:
        filename: Path of the local file to write; it is opened with mode
            'wb', so an existing file at this path is overwritten.
        bucket: Name of the S3 bucket holding the object.
        key: Key (object name) of the object to download.

    Returns:
        The result of the S3 object's ``download_fileobj`` call.
    """
    # Open in binary mode so the object's bytes are written unchanged.
    with open(filename, 'wb') as sink:
        s3 = boto3.Session().resource('s3')
        source = s3.Bucket(bucket).Object(key)
        return source.download_fileobj(sink)

In [13]:
# Upload the local CSV to the bucket under the key data_format/demo_file.csv
write_to_s3('demo_file.csv', s3_bucket_name, 'data_format/demo_file.csv')

In [None]:
# Download the same object back from S3 into a separate local file
# (demo_file_from_s3.csv), leaving the original demo_file.csv untouched.
download_from_s3('demo_file_from_s3.csv',s3_bucket_name,'data_format/demo_file.csv')

## RecordIO Format

We will use the SageMaker SDK's `write_numpy_to_dense_tensor()` function to create RecordIO files.


Supported tensor data types: Int32, Float32, Float64  

Reference:
https://github.com/aws/sagemaker-python-sdk/blob/master/src/sagemaker/amazon/common.py

In [None]:
# Preview the first rows of the DataFrame that the feature array X and
# target y are extracted from in the following cells.
df.head()

In [None]:
# X must be an array: take the three feature columns as a 2-D NumPy array.
# NOTE(review): x2 is integer while x1/x3 are floats, so to_numpy() presumably
# returns a single floating-point dtype for the whole array — confirm with X.dtype.
X = df[['x1','x2','x3']].to_numpy()

In [None]:
# Display the feature array
X

In [None]:
# Check the type of X (expected: a NumPy ndarray, as returned by to_numpy())
type(X)

In [None]:
# Response/Target variable needs to be a vector
# y must be a vector 
# Selecting with a list of columns (df[['y']]) keeps the result two-dimensional;
# it is flattened to one dimension with ravel() in a later cell.
y = df[['y']].to_numpy()

In [None]:
# At this point y is an array of dimensions 10x1
y.shape

In [None]:
# Display y while it is still a 10x1 column array
y

In [None]:
# Flatten the 10x1 array to a one-dimensional array of 10 elements
y = y.ravel()

In [None]:
# Display y after flattening
y

In [None]:
def write_recordio_file(filename, x, y=None):
    """Write features (and optional labels) to a file via the SageMaker SDK.

    Opens `filename` for binary writing and passes the open file, `x` and `y`
    to ``smac.write_numpy_to_dense_tensor``.

    Args:
        filename: Path of the file to create (overwritten if it exists).
        x: Feature data handed to ``write_numpy_to_dense_tensor``.
        y: Optional label data handed to ``write_numpy_to_dense_tensor``;
            defaults to None.
    """
    with open(filename, 'wb') as sink:
        smac.write_numpy_to_dense_tensor(sink, x, y)

In [None]:
def read_recordio_file(filename, recordsToPrint=10):
    """Print the leading records of a file readable by ``smac.read_records``.

    Args:
        filename: Path of the file to read (opened in binary mode).
        recordsToPrint: Maximum number of records to print; defaults to 10.
    """
    with open(filename, 'rb') as stream:
        for index, rec in enumerate(smac.read_records(stream)):
            # Stop once the requested number of records has been printed.
            if index >= recordsToPrint:
                break
            print("record: {}".format(index))
            print(rec)

In [None]:
# Write the feature array X and label vector y to a local RecordIO file
write_recordio_file('demo_file.recordio',X,y)

In [None]:
# Show the first three DataFrame rows for comparison with the three records
# printed from the RecordIO file in the next cell.
df.head(3)

In [None]:
# Read the RecordIO file back and print its first 3 records
read_recordio_file('demo_file.recordio',3)

In [None]:
# Upload the local RecordIO file to the bucket under the key data_format/demo_file.recordio
write_to_s3('demo_file.recordio', s3_bucket_name, 'data_format/demo_file.recordio')

In [None]:
# Download the RecordIO object back from S3 into a separate local file
# (demo_file_from_s3.recordio).
download_from_s3('demo_file_from_s3.recordio',s3_bucket_name,'data_format/demo_file.recordio')