# Imports

In [1]:
import boto3
import math
import pandas as pd
from io import StringIO

# Constants

In [2]:
# S3 location of the source data. BUCKET is also the destination bucket for
# the uploaded splits.
BUCKET = "amazon-sagemaker-ai-by-example"
FILE_KEY = "chicago_taxi_train.csv"

# Shared S3 client used by the upload cell.
S3 = boto3.client("s3")

# For each split: the local CSV path that is written first, and the object key
# it is uploaded to inside BUCKET.
TRAIN_LOCAL = "../data/train.csv"
TRAIN_REMOTE = "data/train/train.csv"

VALIDATION_LOCAL = "../data/validation.csv"
VALIDATION_REMOTE = "data/validation/validation.csv"

TEST_LOCAL = "../data/test.csv"
TEST_REMOTE = "data/test/test.csv"

# Load

In [3]:
def read_csv_from_s3(bucket_name, file_key):
    """Download a CSV object from S3 and parse it into a DataFrame.

    A new S3 client is created on every call. The whole object body is read
    into memory and decoded as UTF-8 before being handed to pandas.

    Args:
        bucket_name: Name of the S3 bucket holding the object.
        file_key: Key of the CSV object inside the bucket.

    Returns:
        pandas.DataFrame parsed from the object's contents.

    Raises:
        Exception: Any error raised while creating the client, fetching,
            decoding, or parsing is printed and then re-raised unchanged.
    """
    try:
        s3_object = boto3.client("s3").get_object(Bucket=bucket_name, Key=file_key)
        raw_bytes = s3_object["Body"].read()
        return pd.read_csv(StringIO(raw_bytes.decode("utf-8")))
    except Exception as error:
        # Surface the failure in the notebook output, but keep the original
        # exception (and traceback) for the caller.
        print(f"Error reading CSV file from S3: {str(error)}")
        raise


# Fetch the source CSV (object FILE_KEY in BUCKET) into a DataFrame.
working_df = read_csv_from_s3(bucket_name=BUCKET, file_key=FILE_KEY)

# Organize

In [4]:
# Keep only the three columns of interest, express the trip duration in
# minutes instead of seconds, and store every remaining column as float32.
# NOTE: this cell rebinds `working_df`, so running it a second time without
# re-running the load cell fails because TRIP_SECONDS is no longer present.
working_df = (
    working_df.loc[:, ["FARE", "TRIP_MILES", "TRIP_SECONDS"]]
    .assign(TRIP_MINUTES=lambda frame: frame["TRIP_SECONDS"] / 60.0)
    .drop(columns="TRIP_SECONDS")
    .astype({"FARE": "float32", "TRIP_MILES": "float32", "TRIP_MINUTES": "float32"})
)
working_df.head()

Unnamed: 0,FARE,TRIP_MILES,TRIP_MINUTES
0,31.99,2.57,39.016666
1,9.75,1.18,17.9
2,10.25,1.29,19.549999
3,23.75,3.7,56.0
4,10.0,1.15,17.4


# Split

In [5]:
# Split the rows, in their existing order, into train / validation / test.
# The first 80% of the rows form train+validation, and the first 75% of those
# form train, i.e. roughly 60% / 20% / 20% of the data overall.
length_all = len(working_df)
length_train_validation = math.trunc(length_all * 0.8)
length_train = math.trunc(length_train_validation * 0.75)

# Boundaries are half-open [start, end) positions because iloc slices already
# exclude their end position. Using `length - 1` as the end would silently
# drop the last row of both the train and the validation split.
end_train = length_train
start_validation = length_train
end_validation = length_train_validation
start_test = length_train_validation

train_df = working_df.iloc[:end_train]
validation_df = working_df.iloc[start_validation:end_validation]
test_df = working_df.iloc[start_test:]

# Every row must land in exactly one split.
assert len(train_df) + len(validation_df) + len(test_df) == length_all, "split lost or duplicated rows"

print(f"train percent of data: {math.trunc((len(train_df) / length_all) * 100)}")
print(f"validation percent of data: {math.trunc((len(validation_df) / length_all) * 100)}")
print(f"test percent of data: {math.trunc((len(test_df) / length_all) * 100)}")

train percent of data: 59
validation percent of data: 19
test percent of data: 20


# Upload

In [6]:
# Each split is written to a local CSV without header or index, then uploaded
# to BUCKET under its remote key.
# NOTE(review): only FARE and TRIP_MILES are exported; TRIP_MINUTES is left
# out. Presumably intentional for the downstream consumer; confirm.
export_columns = ["FARE", "TRIP_MILES"]
exports = [
    (train_df, TRAIN_LOCAL, TRAIN_REMOTE),
    (validation_df, VALIDATION_LOCAL, VALIDATION_REMOTE),
    (test_df, TEST_LOCAL, TEST_REMOTE),
]

# Write every local file first...
for split_df, local_path, remote_key in exports:
    split_df.to_csv(local_path, index=False, header=False, columns=export_columns)

# ...then upload them in the same order.
for split_df, local_path, remote_key in exports:
    S3.upload_file(local_path, BUCKET, remote_key)