# Introduction to Data Formats and S3

In [1]:
import pandas as pd
import numpy as np

import boto3 # Amazon Web Services Library SDK for Python
import sagemaker.amazon.common as smac # SageMaker Core Library

In [2]:
np.random.seed(seed=5)  # fix NumPy's global RNG so every re-run draws the same sample values

In [3]:
# NOTE: set this to the name of the S3 bucket you want to use.
s3_bucket_name = "s3-1-ml-sagemaker"

## Sample Dataset (Random Numbers)

### Three features x1, x2, x3 and target variable y

In [4]:
# Draw n synthetic rows. The draw order (x1, x2, x3, y) is kept fixed because
# every call consumes the shared global RNG state seeded earlier.
n = 10
x1 = np.random.random_sample(size=n)                # floats in [0, 1)
x2 = np.random.randint(low=100, high=200, size=n)   # integers 100..199 (high is exclusive)
x3 = 10 * np.random.random_sample(size=n)           # floats in [0, 10)
y = np.random.randint(low=0, high=2, size=n)        # binary target: 0 or 1

In [6]:
y # bare expression: the notebook echoes the 0/1 target array as this cell's output

array([0, 0, 1, 1, 1, 1, 0, 0, 0, 1])

### pandas DataFrame (df) of the variables (remember, capital letters matter)

In [8]:
# Combine the three feature arrays and the target into one table.
# Column names are the lowercase keyword names below and are case-sensitive.
df = pd.DataFrame(dict(x1=x1, x2=x2, x3=x3, y=y))

In [9]:
df # bare expression: the notebook renders the full DataFrame as this cell's output

Unnamed: 0,x1,x2,x3,y
0,0.221993,153,2.041547,0
1,0.870732,180,1.190954,0
2,0.206719,127,8.779031,1
3,0.918611,144,5.236753,1
4,0.488411,177,4.92136,1
5,0.611744,175,7.318711,1
6,0.765908,165,0.145808,0
7,0.518418,147,0.93363,0
8,0.296801,130,8.265542,0
9,0.187721,184,8.334927,1


In [10]:
### Write to SageMaker Notebook

In [11]:
# Save the table as a CSV file in the current working directory;
# index=False leaves the row index out of the file.
df.to_csv(path_or_buf='demoDataFormats.csv', index=False)

In [12]:
### Write / Read from S3 Bucket

In [15]:
''' Remember: files are stored in S3 as objects, and file names are the object keys. Objects stored in an S3
bucket are replicated across at least three Availability Zones in the bucket's region.
'''

" Remember: files are stored in S3 as objects, and file names are the object keys. Objects stored in an S3\nbucket are replicated across at least three Availability Zones in the bucket's region.\n"

In [None]:
### Boto3 Framing / Download a file from S3

In [16]:
'''This JupyterLab used Boto3 1.11.9
https://boto3.amazonaws.com/v1/documentation/api/latest/index.html
'''

' This JupyterLab used Boto3 1.11.9'

In [None]:
def download_from_s3(filename, bucket, key):
    """Download one S3 object and write its bytes to a local file.

    Args:
        filename: Local path to write to. The file is opened with mode 'wb',
            so an existing file at this path is overwritten.
        bucket: Name of the S3 bucket that holds the object.
        key: Key (object name) of the object inside the bucket.

    Returns:
        Whatever ``Object.download_fileobj`` returns (``None`` according to
        the boto3 documentation).
    """
    # Resolve the object handle first, so a failure while building the
    # session/resource does not leave an empty local file behind.
    s3_object = boto3.Session().resource('s3').Bucket(bucket).Object(key)
    with open(filename, 'wb') as f:  # binary mode: S3 hands back raw bytes
        # download_fileobj streams the object INTO f. The previous code called
        # upload_fileobj here, which pushed the freshly truncated (empty) file
        # up to S3 instead of fetching the object.
        return s3_object.download_fileobj(f)