# Ejercicio 2 – Spark SQL: funciones

In [1]:
# Import findspark, which is used right below to prepare the environment
import findspark

# Configure the environment. This call is deliberately kept ahead of the
# pyspark imports that follow: findspark.init() locates the Spark installation
# and makes the pyspark package importable from this kernel.
findspark.init()

# Import the Spark components required for the session creation
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Import the Spark SQL column functions under the short alias F, which the
# query cells use for aggregations, ordering and window ranking
from pyspark.sql import functions as F

# Build the session configuration in one chained expression: the application
# is named 'mds-session' and the master is 'local[*]'.
conf = SparkConf().setAppName('mds-session').setMaster('local[*]')

# Reuse an already-running session if there is one, otherwise create it.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

22/04/04 21:01:55 WARN Utils: Your hostname, mdsuser resolves to a loopback address: 127.0.1.1; using 192.168.242.129 instead (on interface ens33)
22/04/04 21:01:55 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/04 21:01:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/04/04 21:02:00 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
# Load the albums: a tab-separated file without a header row, with column
# types inferred by Spark, then name the columns.
album_df = (
    spark.read
    .csv('./data/albums.tsv', sep='\t', header=False, inferSchema=True)
    .toDF('id', 'title')
)
album_df.printSchema()

                                                                                

root
 |-- id: integer (nullable = true)
 |-- title: string (nullable = true)



In [3]:
# Load the artists: a tab-separated file without a header row, with column
# types inferred by Spark, then name the columns.
artist_df = (
    spark.read
    .csv('./data/artists.tsv', sep='\t', header=False, inferSchema=True)
    .toDF('id', 'name', 'hotness', 'familiarity', 'location')
)
artist_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- hotness: double (nullable = true)
 |-- familiarity: string (nullable = true)
 |-- location: string (nullable = true)



In [4]:
# Column names for the songs file, in the order the columns appear in the TSV.
song_columns = [
    'id', 'title', 'year', 'hotness', 'id_artist', 'id_album',
    'duration', 'end_of_fade_on', 'start_of_fade_out', 'tempo',
    'time_signature', 'key', 'loudness', 'mode', 'style',
]

# Load the songs: a tab-separated file without a header row, with column
# types inferred by Spark, then apply the names above.
song_df = (
    spark.read
    .csv('./data/songs.tsv', sep='\t', header=False, inferSchema=True)
    .toDF(*song_columns)
)
song_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- hotness: string (nullable = true)
 |-- id_artist: string (nullable = true)
 |-- id_album: integer (nullable = true)
 |-- duration: double (nullable = true)
 |-- end_of_fade_on: double (nullable = true)
 |-- start_of_fade_out: double (nullable = true)
 |-- tempo: double (nullable = true)
 |-- time_signature: integer (nullable = true)
 |-- key: integer (nullable = true)
 |-- loudness: double (nullable = true)
 |-- mode: integer (nullable = true)
 |-- style: string (nullable = true)



### 1. ¿Cuál es el estilo más lento (tempo) en media?

In [5]:
# Average tempo per style, sorted from slowest to fastest; only the first
# (slowest) row is displayed.
(
    song_df
    .groupBy('style')
    .agg(F.avg('tempo').alias('tempo'))
    .orderBy(F.col('tempo').asc())
    .show(1)
)

[Stage 6:>                                                          (0 + 1) / 1]

+--------+------+
|   style| tempo|
+--------+------+
|rebetika|47.447|
+--------+------+
only showing top 1 row



                                                                                

### 2. ¿Cuáles son los 5 artistas, ubicados en UK (cualquier territorio de UK), con mayor número de canciones en escala menor (mode = 1)?

In [6]:
# Artist location is free text, so UK artists are detected by substring match.
location = artist_df.location

# Substrings that identify any UK territory.
in_uk = (location.like('%UK%') |
         location.like('%United Kingdom%') |
         location.like('%England%') |
         location.like('%Wales%') |
         location.like('%Scotland%') |
         location.like('%Northern Ireland%'))

# Places outside the UK whose names contain one of the substrings above:
# "New South Wales" (Australia) matches '%Wales%' and "New England" (USA)
# matches '%England%'. They must not be counted as UK locations.
not_uk_lookalike = ~(location.like('%New South Wales%') |
                     location.like('%New England%'))

filter_UK = in_uk & not_uk_lookalike

# Count, per artist name, the songs with mode == 1 by UK artists, most songs
# first. Six rows are displayed instead of five because in the recorded output
# the fifth and sixth artists are tied.
(
    song_df
    .join(artist_df, song_df.id_artist == artist_df.id)
    .where(filter_UK & (song_df.mode == 1))
    .groupBy('name')
    .agg(F.count('name').alias('count'))
    .orderBy(F.desc('count'))
    .show(6)
)


+------------------+-----+
|              name|count|
+------------------+-----+
|        SNOWPATROL|   11|
|      Phil Collins|    9|
|         Radiohead|    7|
|The Rolling Stones|    7|
|       Bad Company|    6|
|              Seal|    6|
+------------------+-----+
only showing top 6 rows



### 3. Desde 1970 hasta hoy, ¿las canciones son más rápidas (tempo), altas (loudness) y cortas (duration) en media? Ordena los resultados por año ascendente.

In [7]:
# Yearly averages of tempo, loudness and duration for songs from 1970 onwards,
# listed in ascending year order. Each average keeps its source column's name.
(
    song_df
    .filter(F.col('year') >= 1970)
    .groupBy('year')
    .agg(*[F.avg(metric).alias(metric) for metric in ('tempo', 'loudness', 'duration')])
    .orderBy('year')
    .show()
)

+----+------------------+-------------------+------------------+
|year|             tempo|           loudness|          duration|
+----+------------------+-------------------+------------------+
|1970|121.34628571428576| -11.92847619047619|231.42578619047612|
|1971|136.16195999999997|-12.153000000000002|259.55428919999997|
|1972|129.17204166666667|-11.719291666666665|238.54539749999995|
|1973|        116.356125|-11.711541666666664|294.16444416666667|
|1974|125.08609090909091|-10.670681818181817|239.49134636363632|
|1975|125.41183333333332|-11.249541666666666| 277.4406354166666|
|1976|137.26139999999998|           -11.6584|210.99404933333338|
|1977|139.33685714285716|-11.820114285714288|255.30692799999997|
|1978|         134.38385|           -10.1125|247.85456749999997|
|1979|137.51694444444445|-11.879083333333332| 226.0566886111111|
|1980|126.89337499999999|-11.098531250000002|     210.438730625|
|1981|127.96074999999999| -11.57044444444444|211.69224499999987|
|1982|         125.14522|

### 4. ¿Cuál es el estilo que menos abusa de los efectos de fade in y fade out (mayor número de segundos desde el inicio al final del fade in más desde el inicio del fade out al final de la canción)?

In [8]:
# Seconds spent fading per song: the fade-in part (start of the song up to
# end_of_fade_on) plus the fade-out part (start_of_fade_out up to the end).
fade_seconds = F.col('end_of_fade_on') + (F.col('duration') - F.col('start_of_fade_out'))

# Average fade time per style, smallest first; the five lowest are displayed.
(
    song_df
    .groupBy('style')
    .agg(F.avg(fade_seconds).alias('fade'))
    .orderBy(F.col('fade').asc())
    .show(5)
)

+------------------+--------------------+
|             style|                fade|
+------------------+--------------------+
|    power violence|-3.29999999998165...|
|christian hardcore|-1.95000000005052...|
|east coast hip hop|1.199999999812462...|
|    argentine rock|5.099999999913507E-4|
|    melodic trance|               0.103|
+------------------+--------------------+
only showing top 5 rows



Con Spark Queries habíamos visto que eran posibles valores negativos.

### 5. ¿Cuál es la canción menos popular (hotness) de los 5 artistas más populares (hotness)?

In [9]:
from pyspark.sql import window as W

# Ids of the 5 artists with the highest hotness.
popular_artists = artist_df.orderBy(F.desc('hotness')).select('id').limit(5)

# Song hotness was inferred as a string column (see the songs schema printed
# above), so ordering on it directly compares text, and any non-numeric
# placeholder takes part in the ranking as if it were a value. Cast it to
# double so the ordering is numeric; text that cannot be parsed becomes null.
songs_numeric = song_df.withColumn('hotness', F.col('hotness').cast('double'))

# Rank each artist's songs from least to most popular. Nulls are ordered last
# so a song without a usable hotness cannot take rank 1 while the artist has
# at least one rated song.
rank_partition = F.rank().over(
    W.Window.partitionBy('id_artist').orderBy(F.col('hotness').asc_nulls_last())
)

# Keep only the songs of the top artists and attach the per-artist rank.
song_rank = (
    songs_numeric
    .join(popular_artists, popular_artists.id == songs_numeric.id_artist)
    .withColumn('rank', rank_partition)
)

# Rank 1 is the least popular song of each artist. rank() keeps ties, so an
# artist may contribute more than one row.
song_rank.where(F.col('rank') == 1).select('title', 'hotness').show()


+--------------------+-----------+
|               title|    hotness|
+--------------------+-----------+
|             Da Funk|  0.8622545|
|Speed Of Sound (L...|0.454042766|
|Skit #2 (Kanye We...|  0.7801197|
| The Way I Loved You|0.853828893|
|               Magic|0.508602172|
+--------------------+-----------+



In [10]:
# Stop the Spark session created in the first cell now that all queries have run
spark.stop()