In [121]:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf
import time
import boto3

# Build the GlueContext on top of the already-running (or newly created) SparkContext.
# NOTE(review): later cells call `spark.sql(...)`, but `spark` is never assigned in the
# visible notebook -- presumably the notebook kernel injects it; confirm before running
# this code outside that environment.
glueContext = GlueContext(SparkContext.getOrCreate())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# 1. Connect to Glue database/table

In [69]:
# Read the `taxidata_csv` table of the Glue Data Catalog database into a DynamicFrame.
taxi_data_DyF = glueContext.create_dynamic_frame.from_catalog(
    database="bbb-glue-crawler-taxi-db",
    table_name="taxidata_csv",
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# 2. Do basic feature engineering

In [89]:
# Expose the catalog data to Spark SQL under the view name "taxi".
taxi_data_DyF.toDF().createOrReplaceTempView("taxi")

# Pattern used to parse the raw pickup/drop-off strings (12-hour clock plus AM/PM marker).
_TS_IN_PATTERN = "MM/dd/yyyy hh:mm:ss aa"


def _ts_part(column, out_pattern, alias):
    """Return a SQL expression that extracts one integer date/time field.

    column      -- name of the raw timestamp string column
    out_pattern -- date-format pattern of the field to extract (e.g. 'yyyy')
    alias       -- name of the resulting integer column
    """
    return (
        "CAST(from_unixtime(unix_timestamp({col}, '{fmt_in}'), '{fmt_out}') AS INT) as {alias}"
    ).format(col=column, fmt_in=_TS_IN_PATTERN, fmt_out=out_pattern, alias=alias)


# (output pattern, column suffix) for every calendar field that is extracted.
# The hour uses 'HH' (0-23). The previous 'hh' is the 12-hour field (1-12), which
# collapsed e.g. 3 AM and 3 PM into the same pickup_hour/dropoff_hour value.
_TS_FIELDS = [("yyyy", "year"), ("MM", "month"), ("dd", "day"), ("HH", "hour"), ("mm", "minute")]

# pickup_year ... pickup_minute followed by dropoff_year ... dropoff_minute.
_time_features = ", ".join(
    _ts_part(source_column, out_pattern, "{}_{}".format(prefix, suffix))
    for prefix, source_column in (("pickup", "lpep_pickup_datetime"),
                                  ("dropoff", "lpep_dropoff_datetime"))
    for out_pattern, suffix in _TS_FIELDS
)

# NOTE(review): tripdurr divides the duration in seconds by 360, i.e. it is expressed
# in units of six minutes rather than hours (3600) or minutes (60). Left unchanged
# here because downstream consumers may depend on the scale -- confirm the intended unit.
features_DF = spark.sql("""
    SELECT PULocationID, DOLocationID, passenger_count, trip_distance, RatecodeID,
           total_amount, payment_type, trip_type, fare_amount,
           ROUND(CAST(tip_amount/fare_amount AS DOUBLE), 4) as tip_percent,
           {time_features},
           ROUND(CAST((unix_timestamp(lpep_dropoff_datetime, '{ts}')
                       - unix_timestamp(lpep_pickup_datetime, '{ts}'))/360 AS DOUBLE), 4) as tripdurr,
           tip_amount
    FROM taxi
    WHERE fare_amount > 2.50
""".format(time_features=_time_features, ts=_TS_IN_PATTERN)).na.drop()  # drop rows containing any NULL

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [90]:
# Re-point the "taxi" view at the engineered features so the query below can use them.
features_DF.createOrReplaceTempView("taxi")

# Keep January-March 2017 trips with a tip below 100% of the fare and add avg_speed
# (trip_distance divided by tripdurr).
# A trip with tripdurr = 0 makes the division evaluate to NULL under Spark SQL's default
# (non-ANSI) arithmetic, so the trailing na.drop() removes those rows, matching the
# NULL handling of the preceding feature query.
features_DF = spark.sql("""
    SELECT *,
           ROUND(CAST(trip_distance/tripdurr AS DOUBLE), 4) as avg_speed
    FROM taxi
    WHERE pickup_month in (1, 2, 3) AND pickup_year=2017 AND tip_percent<1
""").na.drop()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [91]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

def check_airport_id(id):
    """Return 1 if the TLC rate code denotes an airport trip, otherwise 0.

    In the NYC TLC trip-record data dictionary, RatecodeID 1 is the standard
    rate, 2 is JFK and 3 is Newark. The previous check (1 or 2) therefore
    flagged every standard-rate trip as an airport trip and missed Newark.

    id -- rate code value; None or any other value yields 0.
    """
    # Tuple membership also copes with None (a NULL rate code) without raising.
    return int(id in (2, 3))
    
# Wrap the Python predicate as a Spark UDF that produces an integer column.
check_airport_id_udf = udf(check_airport_id, returnType=IntegerType())

# Apply it to the rate-code column and store the result as "is_airport".
rate_code_column = features_DF['RateCodeID']
features_DF = features_DF.withColumn("is_airport", check_airport_id_udf(rate_code_column))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [92]:
# Display the current column names (and their order) of the feature frame.
features_DF.columns

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

['PULocationID', 'DOLocationID', 'passenger_count', 'trip_distance', 'RatecodeID', 'total_amount', 'payment_type', 'trip_type', 'fare_amount', 'tip_percent', 'pickup_year', 'pickup_month', 'pickup_day', 'pickup_hour', 'pickup_minute', 'dropoff_year', 'dropoff_month', 'dropoff_day', 'dropoff_hour', 'dropoff_minute', 'tripdurr', 'tip_amount', 'avg_speed', 'is_airport']

In [93]:
# Refresh the "taxi" view so it reflects the frame that now carries is_airport.
features_DF.createOrReplaceTempView("taxi")

# Explicit projection: same columns as before, with tip_amount moved to the end.
ordered_columns = [
    "PULocationID", "DOLocationID", "passenger_count", "trip_distance", "RatecodeID",
    "total_amount", "payment_type", "trip_type", "fare_amount", "tip_percent",
    "pickup_year", "pickup_month", "pickup_day", "pickup_hour", "pickup_minute",
    "dropoff_year", "dropoff_month", "dropoff_day", "dropoff_hour", "dropoff_minute",
    "tripdurr", "avg_speed", "is_airport", "tip_amount",
]
features_DF = spark.sql("SELECT {} FROM taxi".format(", ".join(ordered_columns)))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [94]:
# Register the reordered feature frame; the split queries read from this view.
features_DF.createOrReplaceTempView("taxi")

# Training split: January 2017 pickups, without the year/month columns.
train_query = "SELECT * FROM taxi WHERE pickup_month in (1) and pickup_year=2017"
train_DF = spark.sql(train_query).drop(
    "pickup_year", "pickup_month", "dropoff_year", "dropoff_month"
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [95]:
# Validation split: February 2017 pickups, without the year/month columns.
validation_query = "SELECT * FROM taxi WHERE pickup_month in (2) and pickup_year=2017"
validation_DF = spark.sql(validation_query).drop(
    "pickup_year", "pickup_month", "dropoff_year", "dropoff_month"
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [96]:
# Test split: March 2017 pickups, without the year/month columns.
test_query = "SELECT * FROM taxi WHERE pickup_month in (3) and pickup_year=2017"
test_DF = spark.sql(test_query).drop(
    "pickup_year", "pickup_month", "dropoff_year", "dropoff_month"
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# 3. Save to S3

In [124]:
# S3 bucket that receives the exported datasets.
bucket = 'aws-emr-resources-507786327009-us-east-1'

# Key prefix that embeds the current UTC time (second resolution), e.g. taxidata_v20170301120000.
run_timestamp = time.strftime("%Y%m%d%H%M%S", time.gmtime())
bucket_prefix = 'taxidata_v' + run_timestamp

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [99]:
# Write the training split as CSV from a single partition.
train_output_path = 's3://{}/{}/train/train.csv'.format(bucket, bucket_prefix)
train_DF.repartition(1).write.csv(train_output_path)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [100]:
# Write the validation split as CSV from a single partition.
validation_output_path = 's3://{}/{}/validation/validation.csv'.format(bucket, bucket_prefix)
validation_DF.repartition(1).write.csv(validation_output_path)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [None]:
# Write the test split as CSV from a single partition.
test_output_path = 's3://{}/{}/test/test.csv'.format(bucket, bucket_prefix)
test_DF.repartition(1).write.csv(test_output_path)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# 4. Update the bucket prefix in DynamoDB

In [126]:
# Handle to the DynamoDB table that stores the training-data location (us-east-1).
dynamodb = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb.Table('taxi_training_data_location')
#response = table.get_item(Key={'bucketid': 'validation'})

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [118]:
# Store the new S3 key prefix on the item whose bucketid is 'validation'.
# The result was previously bound to the misspelled name `esponse`.
response = table.update_item(
    Key={'bucketid': 'validation'},
    # The attribute is addressed through the #p placeholder because PREFIX is on
    # DynamoDB's reserved-word list and reserved words cannot appear bare in an
    # expression.
    UpdateExpression="SET #p = :var1",
    ExpressionAttributeNames={'#p': 'prefix'},
    ExpressionAttributeValues={':var1': bucket_prefix},
    # Return only the attributes changed by this update.
    ReturnValues="UPDATED_NEW",
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…