In [10]:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

# Obtain the SparkContext (reusing an existing one if present) and wrap it in
# a GlueContext, which the cells below use for catalog reads and S3 writes.
spark_context = SparkContext.getOrCreate()
glueContext = GlueContext(spark_context)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [16]:
# Load the source table from the Glue Data Catalog as a DynamicFrame.
users = glueContext.create_dynamic_frame.from_catalog(
    database="userdata",
    table_name="streaming_data_test_123dasdf3",
)

# Sanity checks: row count, schema, and the first rows of the data.
print("Count: {}".format(users.count()))
users.printSchema()
users.toDF().show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Count: 429
root
|-- first_name: string
|-- last_name: string
|-- age: int
|-- gender: string
|-- latitude: string
|-- longitude: string
|-- partition_0: string
|-- partition_1: string
|-- partition_2: string
|-- partition_3: string

+----------+---------+---+------+--------+---------+-----------+-----------+-----------+-----------+
|first_name|last_name|age|gender|latitude|longitude|partition_0|partition_1|partition_2|partition_3|
+----------+---------+---+------+--------+---------+-----------+-----------+-----------+-----------+
|      Jake|  Bradley| 71|  male| 17.9325| -55.2647|       2021|         01|         10|         06|
|     Hymke|       Hu| 59|female| 47.1300| 157.8048|       2021|         01|         10|         06|
|      Jack|     Wong| 26|  male|-67.9421|-118.7996|       2021|         01|         10|         06|
|     Dylan|   Arnold| 33|  male| 23.8569|-158.2738|       2021|         01|         10|         06|
|     یاسمن|     گلشن| 60|female| 32.5081|  40.4050|       2

In [17]:
# Remove the four partition_* columns from the frame; the name `users` is
# rebound to the trimmed frame because later cells read it.
partition_columns = [
    'partition_0',
    'partition_1',
    'partition_2',
    'partition_3',
]
users = users.drop_fields(partition_columns)
users.toDF().show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+---------+---+------+--------+---------+
|first_name|last_name|age|gender|latitude|longitude|
+----------+---------+---+------+--------+---------+
|      Jake|  Bradley| 71|  male| 17.9325| -55.2647|
|     Hymke|       Hu| 59|female| 47.1300| 157.8048|
|      Jack|     Wong| 26|  male|-67.9421|-118.7996|
|     Dylan|   Arnold| 33|  male| 23.8569|-158.2738|
|     یاسمن|     گلشن| 60|female| 32.5081|  40.4050|
|     Jenny|   Leroux| 24|female|-49.2292| 155.5982|
|    Ceyhun|  Çamdalı| 65|  male| 73.6430|-130.8003|
|      Milo|   Bonnet| 42|  male| -3.3526| -45.2023|
|     Tyler|     Ross| 29|  male| 15.5969|-105.8160|
|  Fernando|   Murray| 61|  male|-83.4109|-129.3067|
|     Kuzey|Karabulut| 39|  male| 77.5563| -82.6551|
|      Ülkü| Aşıkoğlu| 39|female| 75.8411| 107.0488|
|    Andrea|  Fuentes| 69|female|-18.4573|  46.6898|
|     Grace|    Adams| 24|female|-81.2294| 122.2057|
|      Olea| Birkedal| 41|female| 54.4272|  91.2478|
|   Yasemin|    Öymen| 70|female| 73.1539|  31

In [18]:
from pyspark.ml.feature import StringIndexer

# Convert the DynamicFrame to a Spark DataFrame once and reuse it for both
# fit() and transform(), instead of running the conversion twice.
users_df = users.toDF()

# Encode the string "gender" column as a numeric "genderIndex" column.
# NOTE(review): with StringIndexer's default ordering the indices follow label
# frequency (most frequent label -> 0.0), so the label-to-index mapping depends
# on the data being indexed — confirm this is acceptable for downstream use.
indexer = StringIndexer(inputCol="gender", outputCol="genderIndex")
indexed = indexer.fit(users_df).transform(users_df)
indexed.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+---------+---+------+--------+---------+-----------+
|first_name|last_name|age|gender|latitude|longitude|genderIndex|
+----------+---------+---+------+--------+---------+-----------+
|      Jake|  Bradley| 71|  male| 17.9325| -55.2647|        0.0|
|     Hymke|       Hu| 59|female| 47.1300| 157.8048|        1.0|
|      Jack|     Wong| 26|  male|-67.9421|-118.7996|        0.0|
|     Dylan|   Arnold| 33|  male| 23.8569|-158.2738|        0.0|
|     یاسمن|     گلشن| 60|female| 32.5081|  40.4050|        1.0|
|     Jenny|   Leroux| 24|female|-49.2292| 155.5982|        1.0|
|    Ceyhun|  Çamdalı| 65|  male| 73.6430|-130.8003|        0.0|
|      Milo|   Bonnet| 42|  male| -3.3526| -45.2023|        0.0|
|     Tyler|     Ross| 29|  male| 15.5969|-105.8160|        0.0|
|  Fernando|   Murray| 61|  male|-83.4109|-129.3067|        0.0|
|     Kuzey|Karabulut| 39|  male| 77.5563| -82.6551|        0.0|
|      Ülkü| Aşıkoğlu| 39|female| 75.8411| 107.0488|        1.0|
|    Andrea|  Fuentes| 69

In [19]:
from awsglue.dynamicframe import DynamicFrame

# Wrap the indexed Spark DataFrame back into a DynamicFrame named 'newDf'.
indexed_frame = DynamicFrame.fromDF(indexed, glueContext, 'newDf')

# Replace the original string column with its numeric index: drop "gender",
# then rename "genderIndex" so it takes over the "gender" name.
without_gender = indexed_frame.drop_fields(["gender"])
newDf = without_gender.rename_field("genderIndex", "gender")
newDf.toDF().show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+---------+---+--------+---------+------+
|first_name|last_name|age|latitude|longitude|gender|
+----------+---------+---+--------+---------+------+
|      Jake|  Bradley| 71| 17.9325| -55.2647|   0.0|
|     Hymke|       Hu| 59| 47.1300| 157.8048|   1.0|
|      Jack|     Wong| 26|-67.9421|-118.7996|   0.0|
|     Dylan|   Arnold| 33| 23.8569|-158.2738|   0.0|
|     یاسمن|     گلشن| 60| 32.5081|  40.4050|   1.0|
|     Jenny|   Leroux| 24|-49.2292| 155.5982|   1.0|
|    Ceyhun|  Çamdalı| 65| 73.6430|-130.8003|   0.0|
|      Milo|   Bonnet| 42| -3.3526| -45.2023|   0.0|
|     Tyler|     Ross| 29| 15.5969|-105.8160|   0.0|
|  Fernando|   Murray| 61|-83.4109|-129.3067|   0.0|
|     Kuzey|Karabulut| 39| 77.5563| -82.6551|   0.0|
|      Ülkü| Aşıkoğlu| 39| 75.8411| 107.0488|   1.0|
|    Andrea|  Fuentes| 69|-18.4573|  46.6898|   1.0|
|     Grace|    Adams| 24|-81.2294| 122.2057|   1.0|
|      Olea| Birkedal| 41| 54.4272|  91.2478|   1.0|
|   Yasemin|    Öymen| 70| 73.1539|  31.4908| 

In [20]:
# Write the transformed frame to S3 in CSV format. The call is kept as the
# cell's last expression so the object it returns is still displayed.
sink_options = {"path": "s3://streaming-data-test-123dasdf3/processed-data"}
glueContext.write_dynamic_frame.from_options(
    frame=newDf,
    connection_type="s3",
    connection_options=sink_options,
    format="csv",
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

<awsglue.dynamicframe.DynamicFrame object at 0x7f100ade5f28>