In [1]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format
from pyspark.sql import functions as F
import zipfile

In [2]:
# Create (or reuse) the Spark session for this notebook. The hadoop-aws package
# is added to spark.jars.packages -- presumably for S3 access; nothing in this
# section reads from S3, so confirm it is still needed.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .getOrCreate()
)

## Reading Songs Data

In [3]:
# Load every song metadata JSON file found three directory levels below song_data.
df_full_songs = spark.read.format("json").load('./data/song_data/*/*/*/*.json')

In [4]:
# Preview up to five raw song records, converted to pandas for notebook display.
df_full_songs.limit(5).toPandas()

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
0,ARDR4AC1187FB371A1,,,,Montserrat Caballé;Placido Domingo;Vicente Sar...,511.16363,1,SOBAYLL12A8C138AF9,Sono andati? Fingevo di dormire,0
1,AREBBGV1187FB523D2,,"Houston, TX",,Mike Jones (Featuring CJ_ Mello & Lil' Bran),173.66159,1,SOOLYAZ12A6701F4A6,Laws Patrolling (Album Version),0
2,ARMAC4T1187FB3FA4C,40.82624,"Morris Plains, NJ",-74.47995,The Dillinger Escape Plan,207.77751,1,SOBBUGU12A8C13E95D,Setting Fire to Sleeping Giants,2004
3,ARPBNLO1187FB3D52F,40.71455,"New York, NY",-74.00712,Tiny Tim,43.36281,1,SOAOIBZ12AB01815BE,I Hold Your Hand In Mine [Live At Royal Albert...,2000
4,ARNF6401187FB57032,40.79086,"New York, NY [Manhattan]",-73.96644,Sophie B. Hawkins,305.162,1,SONWXQJ12A8C134D94,The Ballad Of Sleeping Beauty,1994


In [5]:
# Show the column names and types Spark inferred from the song JSON files.
df_full_songs.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)



In [8]:
def create_select_exprs(json_cols: list, schema_cols: list) -> list:
    """Build ``"<source> as <alias>"`` strings for ``DataFrame.selectExpr``.

    Args:
        json_cols: Source column names or SQL expressions, one per output column.
        schema_cols: Alias for each entry of ``json_cols``, in the same order.

    Returns:
        A list of ``"<source> as <alias>"`` strings, pairing the inputs by position.

    Raises:
        ValueError: If the two inputs differ in length. ``zip`` would otherwise
            silently drop the unmatched trailing entries and the resulting
            DataFrame would be missing columns without any error.
    """
    json_cols = list(json_cols)
    schema_cols = list(schema_cols)
    if len(json_cols) != len(schema_cols):
        raise ValueError(
            "json_cols and schema_cols must have the same length "
            "(got {} and {})".format(len(json_cols), len(schema_cols))
        )
    return ["{} as {}".format(source, alias) for source, alias in zip(json_cols, schema_cols)]

In [9]:
# Songs dimension: the song-level columns from the raw song data, de-duplicated.
songs = (
    df_full_songs
    .selectExpr(create_select_exprs(
        ['song_id', 'title', 'artist_id', 'year', 'duration'],
        ['song_id', 'title', 'artist_id', 'year', 'duration'],
    ))
    .distinct()
)

In [10]:
# Preview up to ten rows of the songs table.
songs.limit(10).toPandas()

Unnamed: 0,song_id,title,artist_id,year,duration
0,SOGOSOV12AF72A285E,¿Dónde va Chichi?,ARGUVEV1187B98BA17,1997,313.12934
1,SOTTDKS12AB018D69B,It Wont Be Christmas,ARMBR4Y1187B9990EB,0,241.47546
2,SOBBUGU12A8C13E95D,Setting Fire to Sleeping Giants,ARMAC4T1187FB3FA4C,2004,207.77751
3,SOIAZJW12AB01853F1,Pink World,AR8ZCNI1187B9A069B,1984,269.81832
4,SONYPOM12A8C13B2D7,I Think My Wife Is Running Around On Me (Taco ...,ARDNS031187B9924F0,2005,186.48771
5,SOYMRWW12A6D4FAB14,The Moon And I (Ordinary Day Album Version),ARKFYS91187B98E58F,0,267.7024
6,SOAOIBZ12AB01815BE,I Hold Your Hand In Mine [Live At Royal Albert...,ARPBNLO1187FB3D52F,2000,43.36281
7,SOBCOSW12A8C13D398,Rumba De Barcelona,AR7SMBG1187B9B9066,0,218.38322
8,SOWTBJW12AC468AC6E,Broken-Down Merry-Go-Round,ARQGYP71187FB44566,0,151.84934
9,SOQHXMF12AB0182363,Young Boy Blues,ARGSJW91187B9B1D6B,0,218.77506


In [61]:
# Artists dimension: artist columns from the raw song data, de-duplicated.
# Blank or null locations become 'N/A'; null coordinates become 0.0.
artists = (
    df_full_songs
    .selectExpr(create_select_exprs(
        [
            "artist_id",
            "artist_name",
            "coalesce(nullif(artist_location, ''), 'N/A')",
            "coalesce(artist_latitude, 0.0)",
            "coalesce(artist_longitude, 0.0)",
        ],
        ['artist_id', 'name', 'location', 'latitude', 'longitude'],
    ))
    .distinct()
)
artists.limit(10).toPandas()

Unnamed: 0,artist_id,name,location,latitude,longitude
0,ARPBNLO1187FB3D52F,Tiny Tim,"New York, NY",40.71455,-74.00712
1,ARBGXIG122988F409D,Steel Rain,California - SF,37.77916,-122.42005
2,AREVWGE1187B9B890A,Bitter End,Noci (BA),-13.442,-41.9952
3,ARH4Z031187B9A71F2,Faye Adams,"Newark, NJ",40.73197,-74.17418
4,ARKFYS91187B98E58F,Jeff And Sheri Easter,,0.0,0.0
5,ARD7TVE1187B99BFB1,Casual,California - LA,0.0,0.0
6,ARHHO3O1187B989413,Bob Azzam,,0.0,0.0
7,AREBBGV1187FB523D2,Mike Jones (Featuring CJ_ Mello & Lil' Bran),"Houston, TX",0.0,0.0
8,ARULZCI1241B9C8611,Luna Orbit Project,,0.0,0.0
9,ARGSJW91187B9B1D6B,JennyAnyKind,North Carolina,35.21962,-80.01955


## Reading Logs Data

In [14]:
# Load all user-activity log JSON files from the log-data directory.
df_full = spark.read.format("json").load("./data/log-data/*.json")

In [15]:
# Preview up to five raw log events before any filtering.
df_full.limit(5).toPandas()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,Harmonia,Logged In,Ryan,M,0,Smith,655.77751,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,Sehr kosmisch,200,1542241826796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
1,The Prodigy,Logged In,Ryan,M,1,Smith,260.07465,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,The Big Gundown,200,1542242481796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
2,Train,Logged In,Ryan,M,2,Smith,205.45261,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,Marry Me,200,1542242741796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
3,,Logged In,Wyatt,M,0,Scott,,free,"Eureka-Arcata-Fortuna, CA",GET,Home,1540872000000.0,563,,200,1542247071796,Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7....,9
4,,Logged In,Austin,M,0,Rosales,,free,"New York-Newark-Jersey City, NY-NJ-PA",GET,Home,1541060000000.0,521,,200,1542252577796,Mozilla/5.0 (Windows NT 6.1; rv:31.0) Gecko/20...,12


In [16]:
# Keep only the events whose page is 'NextSong'; every later cell works on this subset.
df_full = df_full.filter(df_full.page == 'NextSong')

In [17]:
# Preview up to five log events after the NextSong filter.
df_full.limit(5).toPandas()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,Harmonia,Logged In,Ryan,M,0,Smith,655.77751,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,Sehr kosmisch,200,1542241826796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
1,The Prodigy,Logged In,Ryan,M,1,Smith,260.07465,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,The Big Gundown,200,1542242481796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
2,Train,Logged In,Ryan,M,2,Smith,205.45261,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,Marry Me,200,1542242741796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
3,Sony Wonder,Logged In,Samuel,M,0,Gonzalez,218.06975,free,"Houston-The Woodlands-Sugar Land, TX",PUT,NextSong,1540493000000.0,597,Blackbird,200,1542253449796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",61
4,Van Halen,Logged In,Tegan,F,2,Levine,289.38404,paid,"Portland-South Portland, ME",PUT,NextSong,1540794000000.0,602,Best Of Both Worlds (Remastered Album Version),200,1542260935796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",80


In [18]:
# Positional mapping from log field names (df_cols) to users-table column names (schema_cols).
df_cols, schema_cols = (
    ['userId', 'firstName', 'lastName', 'gender', 'level'],
    ['user_id', 'first_name', 'last_name', 'gender', 'level'],
)

In [20]:
# Users dimension: renamed user columns from the log events, de-duplicated.
# `level` is part of the distinct key, so the same user_id can appear once per level.
users = (
    df_full
    .selectExpr(create_select_exprs(df_cols, schema_cols))
    .distinct()
)

In [21]:
# Number of distinct (user_id, first_name, last_name, gender, level) rows.
users.count()

104

In [27]:
import datetime

In [28]:
def get_timestamp(ts_col):
    """Convert an epoch-milliseconds column to a 'yyyy-MM-dd HH:mm:ss' string column.

    Uses Spark's built-in ``from_unixtime`` instead of a Python UDF, so rows are
    not serialized through a Python worker and a null ``ts`` yields null rather
    than raising inside the UDF.

    Args:
        ts_col: A Column (or column name) holding milliseconds since the Unix epoch.

    Returns:
        A string Column formatted as 'yyyy-MM-dd HH:mm:ss'.

    Note:
        The string is rendered in the Spark session time zone, whereas
        ``datetime.fromtimestamp`` used the Python worker's local time zone.
        These normally coincide -- confirm if the cluster overrides either one.
    """
    if isinstance(ts_col, str):
        ts_col = F.col(ts_col)
    # from_unixtime expects whole seconds; floor discards the millisecond part,
    # matching the seconds-resolution output of the previous strftime format.
    return F.from_unixtime(F.floor(ts_col / 1000))

In [29]:
# Add a formatted date_time column derived from the epoch-millisecond `ts` field.
df_full = df_full.withColumn(
    "date_time",
    get_timestamp(df_full.ts),
)

In [30]:
# Preview up to five log events with the new date_time column.
df_full.limit(5).toPandas()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId,date_time
0,Harmonia,Logged In,Ryan,M,0,Smith,655.77751,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,Sehr kosmisch,200,1542241826796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26,2018-11-15 00:30:26
1,The Prodigy,Logged In,Ryan,M,1,Smith,260.07465,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,The Big Gundown,200,1542242481796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26,2018-11-15 00:41:21
2,Train,Logged In,Ryan,M,2,Smith,205.45261,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,Marry Me,200,1542242741796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26,2018-11-15 00:45:41
3,Sony Wonder,Logged In,Samuel,M,0,Gonzalez,218.06975,free,"Houston-The Woodlands-Sugar Land, TX",PUT,NextSong,1540493000000.0,597,Blackbird,200,1542253449796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",61,2018-11-15 03:44:09
4,Van Halen,Logged In,Tegan,F,2,Levine,289.38404,paid,"Portland-South Portland, ME",PUT,NextSong,1540794000000.0,602,Best Of Both Worlds (Remastered Album Version),200,1542260935796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",80,2018-11-15 05:48:55


In [53]:
def create_extract_select_exprs(json_cols: list, schema_cols: list, extract_col: str) -> list:
    """Build ``"<func>(<extract_col>) as <alias>"`` strings for ``DataFrame.selectExpr``.

    Args:
        json_cols: SQL function names (e.g. ``'hour'``) to apply to ``extract_col``.
        schema_cols: Alias for each function's result, in the same order.
        extract_col: Name of the column (or expression) every function is applied to.

    Returns:
        A list of ``"<func>(<extract_col>) as <alias>"`` strings, pairing
        ``json_cols`` and ``schema_cols`` by position.

    Raises:
        ValueError: If ``json_cols`` and ``schema_cols`` differ in length.
            ``zip`` would otherwise silently drop the unmatched trailing entries.
    """
    json_cols = list(json_cols)
    schema_cols = list(schema_cols)
    if len(json_cols) != len(schema_cols):
        raise ValueError(
            "json_cols and schema_cols must have the same length "
            "(got {} and {})".format(len(json_cols), len(schema_cols))
        )
    return [
        "{}({}) as {}".format(func, extract_col, alias)
        for func, alias in zip(json_cols, schema_cols)
    ]

In [56]:
# Select expressions for the time table: the timestamp itself plus one
# extracted calendar field per (SQL function, output column) pair.
time_exprs = ['date_time as start_time'] + create_extract_select_exprs(
    ['hour', 'dayofmonth', 'weekofyear', 'month', 'year', 'dayofweek'],
    ['hour', 'day', 'week', 'month', 'year', 'weekday'],
    'date_time',
)

In [57]:
# Time dimension: one row per distinct date_time value with its extracted calendar parts.
time_df = df_full.selectExpr(*time_exprs).distinct()

In [58]:
# Preview up to five rows of the time table.
time_df.limit(5).toPandas()

Unnamed: 0,start_time,hour,day,week,month,year,weekday
0,2018-11-15 12:38:03,12,15,46,11,2018,5
1,2018-11-15 22:00:58,22,15,46,11,2018,5
2,2018-11-21 19:00:45,19,21,47,11,2018,4
3,2018-11-21 20:22:17,20,21,47,11,2018,4
4,2018-11-21 22:26:57,22,21,47,11,2018,4


In [67]:
# Give the columns shared by songs, artists and the log events distinct names,
# so the songplays join can refer to each of them unambiguously.
songs = songs.withColumnRenamed("artist_id", "songs_artist_id")

artists = (
    artists
    .withColumnRenamed("artist_id", "artists_artist_id")
    .withColumnRenamed("location", "artist_location")
)

In [69]:
# Songplays fact table: one row per distinct play event, enriched with song and
# artist ids where the log's song title / artist name match the song data.
songplays_table = (
    df_full
    .select(
        df_full["date_time"].alias("start_time"),
        df_full["userId"].alias("user_id"),
        "level",
        "song",
        "artist",
        df_full["sessionId"].alias("session_id"),
        "location",
        df_full["userAgent"].alias("user_agent"),
    )
    # NOTE(review): songs are matched on title alone and artists on name alone,
    # independently of each other. A title shared by several songs, or a name
    # shared by several artist rows, will multiply rows and can pair a song_id
    # with an unrelated artist_id -- confirm this is intended.
    .join(songs, df_full["song"] == songs["title"], 'left_outer')
    .join(artists, df_full["artist"] == artists["name"], 'left_outer')
    .selectExpr(
        "start_time",
        "user_id",
        "level",
        "song_id",
        # Prefer the id found via artist name; fall back to the matched song's artist.
        "coalesce(artists_artist_id, songs_artist_id) as artist_id",
        "session_id",
        "location",
        "user_agent",
        "year(start_time) as year",
        "month(start_time) as month",
    )
    .dropDuplicates()
    # Surrogate key: values are unique and increasing, but not consecutive.
    .withColumn('songplay_id', F.monotonically_increasing_id())
)

In [70]:
# Preview up to five rows of the songplays table.
songplays_table.limit(5).toPandas()

Unnamed: 0,start_time,user_id,level,song_id,artist_id,session_id,location,user_agent,year,month,songplay_id
0,2018-11-15 11:29:21,16,paid,,,575,"Birmingham-Hoover, AL","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",2018,11,0
1,2018-11-15 17:29:48,97,paid,,,605,"Lansing-East Lansing, MI","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",2018,11,1
2,2018-11-15 19:11:24,42,paid,,,404,"New York-Newark-Jersey City, NY-NJ-PA","""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebK...",2018,11,2
3,2018-11-15 19:30:17,44,paid,,,619,"Waterloo-Cedar Falls, IA",Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; r...,2018,11,3
4,2018-11-15 19:35:33,97,paid,,,605,"Lansing-East Lansing, MI","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",2018,11,4
