# Exercise 3 - Data Lake on S3

In [1]:
from pyspark.sql import SparkSession
import os
import configparser

# Make sure that your AWS credentials are loaded as env vars

In [7]:
# Load the AWS credentials from the local config file and export them as
# environment variables of this process.
config = configparser.ConfigParser()

# Normally this file should be in ~/.aws/credentials
# read_file() on an explicitly opened file raises FileNotFoundError when
# dl.cfg is missing; ConfigParser.read() would silently skip the file and the
# lookups below would then fail with an opaque KeyError: 'AWS'.
with open('dl.cfg') as cfg_file:
    config.read_file(cfg_file)

os.environ["AWS_ACCESS_KEY_ID"] = config['AWS']['AWS_ACCESS_KEY_ID']
os.environ["AWS_SECRET_ACCESS_KEY"] = config['AWS']['AWS_SECRET_ACCESS_KEY']
# NOTE(review): the Kafka streaming package requested here looks unrelated to
# this S3 exercise -- confirm whether it is actually needed.
os.environ['PYSPARK_SUBMIT_ARGS'] = "--packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.0.2 pyspark-shell"

# Create spark session with hadoop-aws package

In [8]:
# Build (or reuse) the Spark session, requesting the hadoop-aws package
# through the spark.jars.packages setting.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .getOrCreate()
)

# Load data from S3

In [27]:
# Alternative input (log data):
#df = spark.read.json("data/log-data/*/*/*.json")

# Read every song JSON file matched by the glob into one DataFrame.
# `inferSchema` and `header` are CSV reader options, not parameters of
# DataFrameReader.json, so they are not passed here.
df = spark.read.json("data/song-data/*/*/*/*.json")

In [28]:
# print schema
#df.printSchema()
#df.show(5)
# Number of records
#df.count()

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)

+------------------+---------------+--------------------+----------------+--------------------+----------+---------+------------------+--------------------+----+
|         artist_id|artist_latitude|     artist_location|artist_longitude|         artist_name|  duration|num_songs|           song_id|               title|year|
+------------------+---------------+--------------------+----------------+--------------------+----------+---------+------------------+--------------------+----+
|ARSUVLW12454A4C8B8|       35.83073|           Tennessee|       -85.97874|Royal Philharmoni..

904

In [31]:
# Project the artist attributes out of the song data and preview five rows.
artist_columns = [
    "artist_id",
    "artist_name",
    "artist_longitude",
    "artist_latitude",
    "artist_location",
]
artists_table = df.select(*artist_columns)
artists_table.show(n=5)

+------------------+--------------------+----------------+---------------+--------------------+
|         artist_id|         artist_name|artist_longitude|artist_latitude|     artist_location|
+------------------+--------------------+----------------+---------------+--------------------+
|ARSUVLW12454A4C8B8|Royal Philharmoni...|       -85.97874|       35.83073|           Tennessee|
|ARA04401187B991E6E|JOSEF LOCKE & ORC...|        -7.31923|       54.99241|Londonderry, Nort...|
|ARXQC081187FB4AD42|William Shatner_ ...|        -2.23001|       54.31407|                  UK|
|ARWUNH81187FB4A3E0|         Trick Daddy|            null|           null|     Miami , Florida|
|ARNU0OM1187FB3F14A|Larry Groce/Disne...|        -96.7954|       32.77815|          Dallas, TX|
+------------------+--------------------+----------------+---------------+--------------------+
only showing top 5 rows



In [30]:
# Persist the artists table as Parquet. Overwrite mode keeps this cell
# re-runnable: with the default save mode the write fails as soon as
# "artists.parquet" already exists from a previous run.
artists_table.write.mode("overwrite").parquet("artists.parquet")

NameError: name 'artists_table' is not defined

# Infer schema, fix header and separator

In [None]:
# Read the semicolon-separated payment CSV from S3, using the first line as
# the header and letting Spark infer the column types.
df = (
    spark.read
    .option("sep", ";")
    .option("inferSchema", True)
    .option("header", True)
    .csv("s3a://udacity-dend/pagila/payment/payment.csv")
)

In [None]:
# Inspect the column types of the loaded frame, then preview five rows.
df.printSchema()
df.show(n=5)

# Fix the data yourself 

In [None]:
import pyspark.sql.functions as F

# Replace payment_date with its timestamp-parsed value. The target column
# name must match the existing column exactly; a misspelled name would add a
# new column and leave payment_date itself unconverted.
dfPayment = df.withColumn("payment_date", F.to_timestamp("payment_date"))
dfPayment.show(5)

# Extract the month

In [None]:
# Add a "month" column computed from payment_date with F.month.
payment_month = F.month(F.col("payment_date"))
dfPayment = dfPayment.withColumn("month", payment_month)

# Compute aggregate revenue per month

In [None]:
# Expose the frame to Spark SQL under the view name "payment".
dfPayment.createOrReplaceTempView("payment")

# Sum the payment amounts per month and list the highest revenue first.
monthly_revenue = spark.sql("""
SELECT month, sum(amount) as revenue
FROM payment
GROUP BY month
ORDER BY revenue desc
""")
monthly_revenue.show()

# Fix the schema

In [None]:
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType, DateType

# Explicit schema for the payment CSV, used instead of schema inference.
# The last field must be named "payment_date": the SQL query over the typed
# frame refers to the column by exactly that name.
paymentSchema = StructType([
    StructField("payment_id", IntegerType()),
    StructField("customer_id", IntegerType()),
    StructField("staff_id", IntegerType()),
    StructField("rental_id", IntegerType()),
    StructField("amount", DoubleType()),
    StructField("payment_date", DateType()),
])

In [None]:
# Re-read the payment CSV with the explicit schema instead of inference.
# The reader is reached via `spark.read` (then `.csv(...)`), and the keyword
# argument is lowercase `schema`.
dfPaymentWithSchema = spark.read.csv(
    "s3a://udacity-dend/pagila/payment/payment.csv",
    sep=";",
    schema=paymentSchema,
    header=True,
)

In [None]:
# Inspect the frame that was read with the explicit schema. Both calls target
# dfPaymentWithSchema so the preview matches the schema printed above it.
dfPaymentWithSchema.printSchema()
dfPaymentWithSchema.show(5)

In [None]:
# Point the "payment" view at the explicitly typed frame; any view already
# registered under that name is replaced.
dfPaymentWithSchema.createOrReplaceTempView("payment")

# Revenue per month, with the month taken from payment_date inside the query,
# highest revenue first.
revenue_by_month = spark.sql("""
SELECT month(payment_date) as m, sum(amount) as revenue
FROM payment
GROUP BY m
ORDER BY revenue desc
""")
revenue_by_month.show()