In [64]:
import pandas as pd
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, ShortType, DoubleType
from pyspark.sql.functions import col, asc

In [24]:
# Stop any SparkSession left over from an earlier run of this notebook before
# the next cell builds one with getOrCreate().
try:
    spark.stop()
except NameError:
    # `spark` is not defined yet (fresh kernel), so there is nothing to stop.
    # Only NameError is caught: a real failure inside stop() now surfaces
    # instead of being reported as "not initialized", and KeyboardInterrupt
    # is no longer swallowed.
    print('spark not initialized')

In [25]:
# Connect to the standalone Spark master and get (or reuse) the session used
# by every cell below.
spark = (
    SparkSession.builder
    .master('spark://master:7077')
    .appName("Sparkify Local ETL")
    .getOrCreate()
)

# The session returned here is not assigned to a name, so later cells keep
# using `spark`; as the last expression of the cell it is only displayed.
spark.newSession()

In [55]:
# Layout of the song dataset on disk:
#
# /opt/data/song_data/
# └── A
#     ├── A
#     │   ├── A
#     │   │   ├── TRAAAAK128F9318786.json
#     │   │   ├── TRAAAAV128F421A322.json
#     │   │   ├── TRAAABD128F429CF47.json
#     │   │   ├── TRAAACN128F9355673.json

# Explicit schema for the song JSON files: (column name, Spark type).
# Every column is declared nullable.
song_fields = [
    ("artist_id", StringType()),
    ("artist_name", StringType()),
    ("artist_location", StringType()),
    ("artist_latitude", DoubleType()),
    ("artist_longitude", DoubleType()),
    ("song_id", StringType()),
    ("duration", DoubleType()),
    ("title", StringType()),
    ("year", ShortType()),
]
song_schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in song_fields]
)

# Only the A/A/A subset is read here. To read every song file, switch the glob
# to '/opt/data/song_data/*/*/*/*.json'.
song_path = '/opt/data/song_data/A/A/A/*.json'
song_df = spark.read.json(song_path, schema=song_schema).cache()
song_df.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- song_id: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- title: string (nullable = true)
 |-- year: short (nullable = true)



In [56]:
# Report how many song rows were loaded.
song_count = song_df.count()
print('song record count:', song_count)

song record count: 24


In [87]:
# Preview three song rows as a pandas DataFrame for notebook display.
song_preview = song_df.limit(3)
song_preview.toPandas()

Unnamed: 0,artist_id,artist_name,artist_location,artist_latitude,artist_longitude,song_id,duration,title,year
0,ARTC1LV1187B9A4858,The Bonzo Dog Band,"Goldsmith's College, Lewisham, Lo",51.4536,-0.01802,SOAFBCP12A8C13CC7D,301.40036,King Of Scurf (2007 Digital Remaster),1972
1,ARA23XO1187B9AF18F,The Smithereens,"Carteret, New Jersey",40.57885,-74.21956,SOKTJDS12AF72A25E5,192.522,Drown In My Own Tears (24-Bit Digitally Remast...,0
2,ARSVTNL1187B992A91,Jonathan King,"London, England",51.50632,-0.12714,SOEKAZG12AB018837E,129.85424,I'll Slap Your Face (Entertainment USA Theme),2001


In [68]:
# Layout of the event-log dataset on disk:
#
# /opt/data/log_data/
# └── 2018
#     └── 11
#         ├── 2018-11-01-events.json
#

# Read every log file (schema is inferred), then keep only the rows whose
# page is 'NextSong'.
raw_event_df = spark.read.json('/opt/data/log_data/*/*/*.json')
event_df = raw_event_df.filter(col('page') == 'NextSong').cache()
event_df.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [69]:
# Report how many NextSong event rows remain after filtering.
event_count = event_df.count()
print('event (log) record count:', event_count)

event (log) record count: 6820


In [84]:
# Preview three event rows as a pandas DataFrame for notebook display.
event_preview = event_df.limit(3)
event_preview.toPandas()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,Harmonia,Logged In,Ryan,M,0,Smith,655.77751,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,Sehr kosmisch,200,1542241826796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
1,The Prodigy,Logged In,Ryan,M,1,Smith,260.07465,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,The Big Gundown,200,1542242481796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
2,Train,Logged In,Ryan,M,2,Smith,205.45261,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,Marry Me,200,1542242741796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26


## Start Transform

In [94]:
# User dimension: one row per userId, taken from the NextSong events.
user_columns = ['userId', 'lastName', 'firstName', 'gender']
user_df = (
    event_df
    .select(*user_columns)
    .dropDuplicates(['userId'])
    .cache()
)


In [95]:
# Report how many distinct users ended up in the user dimension.
user_count = user_df.count()
print('user dimension record count: ', user_count)

user dimension record count:  96


In [96]:
# Preview three user rows as a pandas DataFrame for notebook display.
user_preview = user_df.limit(3)
user_preview.toPandas()

Unnamed: 0,userId,lastName,firstName,gender
0,51,Burke,Maia,F
1,7,Jordan,Adelyn,F
2,15,Koch,Lily,F


In [104]:
# Artist dimension: one row per artist_id, taken from the song data.
artist_columns = [
    'artist_id',
    'artist_name',
    'artist_location',
    'artist_latitude',
    'artist_longitude',
]
artist_df = (
    song_df
    .select(*artist_columns)
    .dropDuplicates(['artist_id'])
    .cache()
)

In [105]:
# Report how many distinct artists ended up in the artist dimension.
artist_count = artist_df.count()
print('artist dimension record count:', artist_count)

artist dimension record count: 24


In [106]:
# Preview three artist rows as a pandas DataFrame for notebook display.
artist_preview = artist_df.limit(3)
artist_preview.toPandas()

Unnamed: 0,artist_id,artist_name,artist_location,artist_latitude,artist_longitude
0,AR9Q9YC1187FB5609B,Quest_ Pup_ Kevo,New Jersey,,
1,ARA23XO1187B9AF18F,The Smithereens,"Carteret, New Jersey",40.57885,-74.21956
2,ARZKCQM1257509D107,Dataphiles,,,
