In [391]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, Row, ArrayType
from pyspark.sql import SQLContext
from pyspark.sql.functions import from_json, explode, col, udf
from pyspark.sql import functions as F
import requests
import os
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials, SpotifyOAuth
from dotenv import load_dotenv
from datetime import datetime
import datetime
import json


In [4]:
# Build (or reuse) a local SparkSession for this notebook.
spark = (
    SparkSession.builder
    .master('local')
    .appName('SportifyTracks')
    .getOrCreate()
)
# SQLContext's first positional argument is a SparkContext, not a SparkSession.
# Pass the session's own context and bind the wrapper to this session explicitly
# instead of relying on SparkSession happening to expose the same private attributes.
sql_context = SQLContext(spark.sparkContext, sparkSession=spark)



In [268]:
# Stop the SparkSession held in `spark`.
# NOTE(review): extract() further down reads through `spark`; on a top-to-bottom
# run this presumably breaks that read — confirm whether this cell is still needed.
spark.stop()

In [269]:
# Create a SparkContext with master "local" and application name "spotify-etl".
sc = SparkContext(master="local", appName="spotify-etl")

In [None]:
# Bare expression: shows the SparkContext as this cell's output.
sc

In [389]:
def convert_date(date):
    """Return the date portion of a ``<date>T<time>`` timestamp string.

    Args:
        date: Timestamp string such as ``'2023-04-01T17:05:12.345Z'``, or ``None``.

    Returns:
        The text before the first ``'T'`` (the whole string when it contains
        no ``'T'``), or ``None`` when ``date`` is ``None``.
    """
    # This function is wrapped as a Spark UDF, and Spark hands SQL NULL to a
    # Python UDF as None; propagate it rather than failing on None.split().
    if date is None:
        return None
    # Only the first 'T' matters, so stop splitting after it.
    return date.split('T', 1)[0]

In [344]:
def extract(path='./tracks.json'):
    """Read a JSON document into a Spark DataFrame.

    Args:
        path: Location of the JSON file. Defaults to ``'./tracks.json'``,
            the path that used to be hard-coded here, so existing
            ``extract()`` calls behave as before.

    Returns:
        The DataFrame returned by ``spark.read.json(path, multiLine=True)``.
    """
    # multiLine=True lets a single JSON record span several lines of the file.
    return spark.read.json(path, multiLine=True)


def transform(df):
    """Flatten the nested ``items`` array into track / artist / played_at columns.

    ``items`` is exploded into one row per element, and each element's
    ``track.artists`` array is exploded again, so the result carries one row
    per (item, artist) pair. The helper and envelope columns listed in the
    final ``drop`` are removed before returning.
    """
    # One row per element of `items`, plus that element's track name.
    per_item = df.withColumn('col', explode('items'))
    per_item = per_item.withColumn('track_name', col('col.track.name'))

    # One row per artist of each item, plus the artist name and play timestamp.
    per_artist = per_item.withColumn('col2', explode('col.track.artists'))
    per_artist = per_artist.withColumn('artist', col('col2.name'))
    per_artist = per_artist.withColumn('played_at', col('col.played_at'))

    # Remove the response envelope fields and the temporary explode columns.
    unwanted = ('cursors', 'href', 'limit', 'next', 'items', 'col', 'col2')
    return per_artist.drop(*unwanted)

In [343]:
# Load the raw JSON through extract() and print the schema Spark inferred for it.
df = extract()
df.printSchema()

root
 |-- cursors: struct (nullable = true)
 |    |-- after: string (nullable = true)
 |    |-- before: string (nullable = true)
 |-- href: string (nullable = true)
 |-- items: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- context: struct (nullable = true)
 |    |    |    |-- external_urls: struct (nullable = true)
 |    |    |    |    |-- spotify: string (nullable = true)
 |    |    |    |-- href: string (nullable = true)
 |    |    |    |-- type: string (nullable = true)
 |    |    |    |-- uri: string (nullable = true)
 |    |    |-- played_at: string (nullable = true)
 |    |    |-- track: struct (nullable = true)
 |    |    |    |-- album: struct (nullable = true)
 |    |    |    |    |-- album_group: string (nullable = true)
 |    |    |    |    |-- album_type: string (nullable = true)
 |    |    |    |    |-- artists: array (nullable = true)
 |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |   

In [396]:
# Build the flattened frame in this cell so it does not depend on a
# `df_transformed` left in the kernel by a cell that is not visible in this notebook.
df_transformed = transform(df)

# Wrap convert_date directly; the extra lambda layer added nothing.
convert_date_udf = udf(convert_date, StringType())
# NOTE(review): 'convertGender' looks like a copy-paste leftover for a date helper.
# Kept unchanged in case SQL elsewhere calls the UDF by this name — confirm and rename.
sql_context.udf.register('convertGender', convert_date_udf)

# One row per track name, ordered by play timestamp, with played_at reduced to its date part.
# NOTE(review): dropDuplicates keeps one row per track_name without a defined choice,
# so which artist / played_at survives is presumably arbitrary — confirm this is intended.
(
    df_transformed
    .dropDuplicates(['track_name'])
    .sort('played_at')
    .select(
        'track_name',
        'artist',
        convert_date_udf('played_at').alias('played_at'),
    )
    .show(truncate=False)
)

+---------------------------------------------------------------+-----------------------+----------+
|track_name                                                     |artist                 |played_at |
+---------------------------------------------------------------+-----------------------+----------+
|Diles                                                          |Bad Bunny              |2023-04-01|
|Un Polvo (feat. Bad Bunny, Arcángel, Ñengo Flow & De La Ghetto)|Maluma                 |2023-04-01|
|Sacala                                                         |Daddy Yankee           |2023-04-01|
|El Telefono                                                    |Héctor "El Father"     |2023-04-01|
|The Bitter End                                                 |Placebo                |2023-04-04|
|A Prophecy                                                     |Asking Alexandria      |2023-04-04|
|You Only Live Once                                             |Suicide Silence        |20