## Data Formats

In [1]:
import pandas as pd
import numpy as np
import boto3 # Python SDK library for Amazon to download files (1.11.10 as 2/5/2020)
import sagemaker.amazon.common as smac # SageMaker Common Core Library

In [2]:
np.random.seed(5) # fix NumPy's global seed so the random sample data below is reproducible

In [3]:
# Name of the S3 bucket that the upload/download cells in this notebook use.
# Note: replace this with your own project bucket.
s3_bucket_name = 's3-1-ml-sagemaker'

### Sample DataSet
#### Show with three features and a target variable

In [4]:
n = 10 # sample size
x1 = np.random.random_sample(n) # n floats drawn uniformly from [0, 1)
x2 = np.random.randint(100,200,n) # n integers from 100 to 199 (the upper bound 200 is exclusive)
x3 = np.random.random_sample(n) * 10 # same draw as x1 but scaled by 10, i.e. floats in [0, 10)
y = np.random.randint(0,2,n) # binary response variable: each value is 0 or 1

In [5]:
y # display the generated 0/1 target values

array([0, 0, 1, 1, 1, 1, 0, 0, 0, 1])

### Adding the variables to a pandas DataFrame (df)

In [6]:
# Collect the three features and the target into one DataFrame,
# one column per variable.
df = pd.DataFrame(
    data={'x1': x1, 'x2': x2, 'x3': x3, 'y': y},
)

In [7]:
df # display the whole frame (it only has 10 rows)

Unnamed: 0,x1,x2,x3,y
0,0.221993,153,2.041547,0
1,0.870732,180,1.190954,0
2,0.206719,127,8.779031,1
3,0.918611,144,5.236753,1
4,0.488411,177,4.92136,1
5,0.611744,175,7.318711,1
6,0.765908,165,0.145808,0
7,0.518418,147,0.93363,0
8,0.296801,130,8.265542,0
9,0.187721,184,8.334927,1


In [8]:
# Save the frame as a local CSV file; index=False leaves the row index out of the file.
df.to_csv(path_or_buf='demo_file.csv', index=False)

### Writing and Reading from S3 bucket
* Files are referred to as objects in S3
* File names are referred to as key names
* Objects stored in S3 (Standard storage class) are replicated across at least three AWS Availability Zones

### Using Boto3 to send and receive files in S3

In [9]:
def write_to_s3(filename, bucket, key):
    """Upload the local file `filename` to S3 as object `key` in `bucket`.

    Args:
        filename: Path of the local file to upload.
        bucket: Name of the destination S3 bucket.
        key: Key name (the object's "file name") to store the data under.

    Returns:
        Whatever `upload_fileobj` returns.
    """
    # Open in binary mode: the file is handed to boto3 as a raw byte stream.
    with open(filename, 'rb') as source:
        s3 = boto3.Session().resource('s3')
        target = s3.Bucket(bucket).Object(key)
        return target.upload_fileobj(source)

In [10]:
def download_from_s3(filename, bucket, key):
    """Download S3 object `key` from `bucket` into the local file `filename`.

    The local file is no longer opened with mode 'wb' before the request is
    made. Doing so truncated any existing file at `filename` up front, so a
    failed download (missing key, denied access, network error) left an empty
    file behind. `Object.download_file` lets boto3 manage the local file
    instead (its transfer manager writes to a temporary file and renames it
    once the download completes).

    Args:
        filename: Path of the local file to write.
        bucket: Name of the source S3 bucket.
        key: Key name of the object to download.

    Returns:
        Whatever `download_file` returns.
    """
    s3_object = boto3.Session().resource('s3').Bucket(bucket).Object(key)
    return s3_object.download_file(filename)

In [11]:
# Upload the local CSV to the project bucket under the dataFormats/ prefix.
write_to_s3(
    filename='demo_file.csv',
    bucket=s3_bucket_name,
    key='dataFormats/demo_file.csv',
)

In [12]:
# Fetch the same object back under a different local name, leaving the original CSV untouched.
download_from_s3(
    filename='demo_file_from_s3.csv',
    bucket=s3_bucket_name,
    key='dataFormats/demo_file.csv',
)

## Record IO Format

This section uses the SageMaker SDK *write_numpy_to_dense_tensor()* function to create RecordIO files.
RecordIO is a format used by frameworks such as Apache MXNet, Apache Mesos, and Amazon SageMaker; it divides data into 'chunks', referred to as records.
[More Information from mxnet documentation](https://mxnet.apache.org/api/architecture/note_data_loading#data-format)

Data Type Allowed:
* Int32
* Float32
* Float64

Reference: [Amazon SageMaker Common.py](https://github.com/aws/sagemaker-python-sdk/blob/master/src/sagemaker/amazon/common.py)

### Format the x1, x2, x3 variables

In [13]:
df.head() # show the first 5 rows (head's default)

Unnamed: 0,x1,x2,x3,y
0,0.221993,153,2.041547,0
1,0.870732,180,1.190954,0
2,0.206719,127,8.779031,1
3,0.918611,144,5.236753,1
4,0.488411,177,4.92136,1


In [14]:
# The RecordIO writer used later takes a NumPy array, so extract the three
# feature columns from the DataFrame as a 2-D ndarray.
X = df.loc[:, ['x1', 'x2', 'x3']].to_numpy()

In [15]:
X # display the feature array; NumPy prints these floats in scientific notation

array([[2.21993171e-01, 1.53000000e+02, 2.04154748e+00],
       [8.70732306e-01, 1.80000000e+02, 1.19095357e+00],
       [2.06719155e-01, 1.27000000e+02, 8.77903071e+00],
       [9.18610908e-01, 1.44000000e+02, 5.23675290e+00],
       [4.88411189e-01, 1.77000000e+02, 4.92135999e+00],
       [6.11743863e-01, 1.75000000e+02, 7.31871100e+00],
       [7.65907856e-01, 1.65000000e+02, 1.45807511e-01],
       [5.18417988e-01, 1.47000000e+02, 9.33630336e-01],
       [2.96800502e-01, 1.30000000e+02, 8.26554249e+00],
       [1.87721229e-01, 1.84000000e+02, 8.33492742e+00]])

In [16]:
type(X) # confirm that X is a NumPy ndarray rather than a DataFrame

numpy.ndarray

### Format y into a vector

In [17]:
# Selecting with a list of column names keeps the result two-dimensional,
# so y comes back as a single-column 2-D array here.
y = df.loc[:, ['y']].to_numpy()

#### At this stage, y is a 10x1 array

In [18]:
y.shape # y is still two-dimensional: one row per sample, a single column

(10, 1)

In [19]:
# Collapse the single-column array into a one-dimensional vector of 10 labels.
y = np.ravel(y)

In [20]:
y # y is now a flat, one-dimensional array

array([0, 0, 1, 1, 1, 1, 0, 0, 0, 1])

#### RecordIO functions from numpy to tensor

In [21]:
def write_recordio_file(filename, x, y=None):
    """Write a NumPy feature array (and optional labels) to a RecordIO file.

    Args:
        filename: Path of the local file to create.
        x: NumPy array of features passed to the SageMaker writer.
        y: Optional labels passed to the SageMaker writer; defaults to None.
    """
    # The writer is handed a file object opened in binary mode.
    with open(filename, 'wb') as recordio_out:
        smac.write_numpy_to_dense_tensor(recordio_out, x, labels=y)

In [22]:
def read_recordio_file(filename, recordsToPrint=10):
    """Print the leading records of a RecordIO file.

    Args:
        filename: Path of the local RecordIO file to read.
        recordsToPrint: Maximum number of records to print; defaults to 10.
    """
    with open(filename, 'rb') as recordio_in:
        # Parse the file with the SageMaker common library's record reader.
        records = smac.read_records(recordio_in)
        for position, record in enumerate(records):
            if position >= recordsToPrint:
                break
            # Header line first, then the record itself on the following line(s).
            print("record: {}".format(position), record, sep="\n")

In [23]:
# Serialize the feature array X and the label vector y into a local RecordIO file.
write_recordio_file('demoFile.recordio', x=X, y=y)

In [24]:
df.head(3) # first three rows, for comparison with the three records printed in the next cell

Unnamed: 0,x1,x2,x3,y
0,0.221993,153,2.041547,0
1,0.870732,180,1.190954,0
2,0.206719,127,8.779031,1


In [25]:
# Print only the first three records of the file written above.
read_recordio_file('demoFile.recordio', recordsToPrint=3)

record: 0
features {
  key: "values"
  value {
    float64_tensor {
      values: 0.22199317108973948
      values: 153.0
      values: 2.0415474783059215
    }
  }
}
label {
  key: "values"
  value {
    int32_tensor {
      values: 0
    }
  }
}

record: 1
features {
  key: "values"
  value {
    float64_tensor {
      values: 0.8707323061773764
      values: 180.0
      values: 1.1909535747826039
    }
  }
}
label {
  key: "values"
  value {
    int32_tensor {
      values: 0
    }
  }
}

record: 2
features {
  key: "values"
  value {
    float64_tensor {
      values: 0.20671915533942642
      values: 127.0
      values: 8.779030712603621
    }
  }
}
label {
  key: "values"
  value {
    int32_tensor {
      values: 1
    }
  }
}



Each record is displayed in protobuf text format, which resembles JSON.

In [26]:
# Upload the RecordIO file to the project bucket under the dataFormats/ prefix.
write_to_s3(
    filename='demoFile.recordio',
    bucket=s3_bucket_name,
    key='dataFormats/demoFile.recordio',
)

In [27]:
# Download under a different local name (as the CSV example does) so the round trip
# does not overwrite the local file that was just uploaded.
download_from_s3('demoFile_from_s3.recordio', s3_bucket_name, 'dataFormats/demoFile.recordio')