In [None]:
import boto3

bucket = "sdl-immersion-day-644711630487"
# The trailing slash restricts the listing to keys under raw/ only; a bare
# "raw" prefix would also match sibling keys such as "raw_backup/...".
prefix = "raw/"
path = f"s3://{bucket}/{prefix}"

s3 = boto3.resource('s3')
my_bucket = s3.Bucket(bucket)

# Build one s3:// URI per data object under the prefix.
# Keys ending in "/" are zero-byte "folder" placeholders (e.g. created from the
# S3 console). Passing such a URI to Spark next to the individual files would
# make it read the whole directory a second time, so they are skipped.
list_files = [
    f"s3://{bucket}/{obj.key}"
    for obj in my_bucket.objects.filter(Prefix=prefix)
    if not obj.key.endswith("/")
]
print("Total files: ", len(list_files))

In [None]:
# Load every S3 object listed in `list_files` as JSON into one Spark DataFrame.
# No schema is passed here, so column names and types are left to Spark to infer.
# NOTE(review): `spark` is not defined in this notebook; presumably the kernel
# provides the SparkSession — confirm.
df = spark.read.json(list_files)

#### Pretty print a table

In [None]:
sample_data = df.sample(15 / df.count())
var1 = sample_data.collect()
%table var1

In [None]:
# Print the schema of the loaded DataFrame in tree form so the available
# columns can be checked before they are queried in the cells below.
df.printSchema()

In [None]:
# Convert the Spark DataFrame to a pandas DataFrame and print pandas' summary
# of it via `.info()`.
# NOTE(review): toPandas() collects every row onto the driver — presumably fine
# for this demo dataset, but confirm the data size before running on larger input.
df.toPandas().info()

In [None]:
from pyspark.sql.functions import col, count, explode

# For the "Chips" product only, count the rows that have a color value,
# grouped by product and color, and print the result.
# The count column is exposed under the alias "ones".
(
    df.select("color", "product")
    .filter(col("product") == "Chips")
    .groupBy("product", "color")
    .agg(count("color").alias("ones"))
    .show()
)

In [None]:
# Filter the data down to the BlackFriday campaign and preview ten rows,
# keeping only the product-related columns.
(
    df.where(df["campaign"] == "BlackFriday")
    .select("productName", "product", "department", "price", "campaign")
    .limit(10)
    .show()
)

#### Save to S3

In [None]:
# Export the full DataFrame as CSV under the results/csv/ prefix of the bucket,
# replacing whatever a previous run left there.
path = f"s3://{bucket}/results/csv/"
(
    df.coalesce(1)  # collapse to a single partition so one part file is written
    .write
    .csv(path, mode="overwrite")
)
print(f"Write to {path} complete")

### Spark SQL

In [None]:
# Query text: number of non-null `product` values per (department, product),
# restricted to the 'Outdoors' department.
sql_consult = """
    SELECT count(product) as total, product, department
    FROM products
    WHERE department == 'Outdoors'
    GROUP BY department, product
"""

# Expose the DataFrame to Spark SQL under the view name used in the query.
df.createOrReplaceTempView("products")

# Run the query and keep the result for the table and plot cells below.
sqlDF = spark.sql(sql_consult)
sqlDF.show()

#### Create new database: spark_demo_database

In [None]:
# Create the demo database only if it is missing, so re-running this cell is safe.
spark.sql("create database if not exists spark_demo_database")
# List all databases to confirm that spark_demo_database is present.
spark.sql("show databases").show()

#### Create new table 

Use the spark_demo_database and the sqlDF DataFrame created earlier to create a new table: outdoors_products. The table data is also saved to S3.
Note: you will need to update IAM to have write permissions to S3://644711630487-us-east-1-athena-results-bucket-8usz9um3wp

In [None]:
# Persist the aggregated Outdoors result as a table in spark_demo_database,
# with its data files stored as CSV in S3.
spark.sql("use spark_demo_database")

# Give the table its own S3 location. Reusing `path` from the CSV export cell
# would make this overwrite-mode write replace the exported CSV of the full
# dataset with the table's files.
table_path = f"s3://{bucket}/results/tables/outdoors_products/"
sqlDF.write.mode("overwrite").format("csv").option("path", table_path).saveAsTable("outdoors_products")
print(f"Create new table at {table_path} complete")

#### Show the new table in the spark_demo_database
Note: you should be able to go to Glue console and see the new database and table.

In [None]:
# List the tables visible in the current database; the earlier
# `use spark_demo_database` statement selects which database that is.
spark.sql("show tables").show()

In [None]:
# Read the new table data back to verify the write.
# The table created by the saveAsTable cell is `outdoors_products`; this
# notebook never creates a table named `select_taxi_table`.
spark.sql("select * from outdoors_products").show()

### Plot

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

plt.clf()
df=sqlDF.toPandas()
df.sort_values('total',inplace=True)
plt.barh(df['product'], df['total'])
%matplot plt