# Sistema de Recomendação de Músicas em Tempo Real

## Ferramentas utilizadas:

- Api Spotify
- Machine Leraning
- PySpark
- Spark Streaming
- Apache Kafka

## Arquivo Consumer

In [67]:
# Versão da linguagem Python
from platform import python_version
print(python_version())

3.9.13


In [68]:
# Imports
import os
import time
import random
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import Normalizer, StandardScaler

In [69]:
# Endereço do servidor kafka
SERVER = 'localhost:9092'

In [70]:
# Nome do tópico
TOPIC = 'dsaminiprojeto7'

In [71]:
# Buscando os conectores (através de uma tupla) na minha pasta jars
spark_jars =  ("{},{},{},{},{}".format(os.getcwd() + "/jars/spark-sql-kafka-0-10_2.12-3.2.1.jar",  
                                       os.getcwd() + "/jars/kafka-clients-2.1.1.jar", 
                                       os.getcwd() + "/jars/spark-streaming-kafka-0-10-assembly_2.12-3.3.2.jar", 
                                       os.getcwd() + "/jars/commons-pool2-2.8.0.jar",  
                                       os.getcwd() + "/jars/spark-token-provider-kafka-0-10_2.12-3.1.2.jar"))

In [72]:
# Inicializa sessão Spark
spark = SparkSession \
        .builder \
        .config("spark.jars", spark_jars) \
        .appName("Projeto-Spotify") \
        .getOrCreate()

In [73]:
spark.sparkContext.setLogLevel("ERROR")

In [74]:
# Usamos o Spark Streaming para leitura do streaming de dados do Kafka e salvamos em um dataframe
df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", SERVER) \
        .option("subscribe", TOPIC) \
        .option("startingOffsets", "latest") \
        .load()

In [75]:
# Só quero que apareça mensagens de erro
df1 = df.selectExpr("CAST(value AS STRING)", "timestamp") 

In [76]:
# Usamos o Spark Straming para leitura do streaming de dados do kafka e salvamos em um dataframe
df = spark \
        .readStream \
        .format('kafka') \
        .option('kafka.bootstrap.servers', SERVER) \
        .option('subscribe', TOPIC) \
        .option('startingOffsets', 'latset') \
        .load()

In [77]:
# Para aparecer apenas mensagens de erro
spark.sparkContext.setLogLevel("ERROR")

In [78]:
# Usamos o Spark Streaming para leitura do streaming de dados do Kafka e salvamos em um dataframe
df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", SERVER) \
        .option("subscribe", TOPIC) \
        .option("startingOffsets", "latest") \
        .load()

In [79]:
# Selecionamos a coluna timestamp como string e salvamos em um novo dataframe
df1 = df.selectExpr('CAST(value AS STRING)', 'timestamp')

In [80]:
# Definimos o schema com o nome de cada coluna e o tipo de dado
def_schema = "order_id INT, id STRING, name STRING, popularity INT, duration_ms DOUBLE, " \
             + "artists STRING, id_artists STRING, release_date STRING, " \
             + "danceability DOUBLE,energy DOUBLE, key INT, loudness DOUBLE, " \
             + "mode INT,speechiness DOUBLE," \
             + "acousticness DOUBLE, instrumentalness DOUBLE, liveness DOUBLE, " \
             + "valence DOUBLE, tempo DOUBLE, time_signature DOUBLE"

In [81]:
# Selecionamos o streaming de dados de acordo com o schema e salvamos um novo dataframe
df2 = df1.select(from_csv(col('value'), def_schema).alias('song'), 'timestamp')

In [82]:
# Criamos uma view (tabela temporária) na memória do Spark e visualizamos o schema
# Com o formato de tabela fica muito mais fácil de manipular os dados
df3 = df2.select('song.*', 'timestamp')
df3.createOrReplaceTempView('df3_View');
df3.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- popularity: integer (nullable = true)
 |-- duration_ms: double (nullable = true)
 |-- artists: string (nullable = true)
 |-- id_artists: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- danceability: double (nullable = true)
 |-- energy: double (nullable = true)
 |-- key: integer (nullable = true)
 |-- loudness: double (nullable = true)
 |-- mode: integer (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- liveness: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- tempo: double (nullable = true)
 |-- time_signature: double (nullable = true)
 |-- timestamp: timestamp (nullable = true)



In [83]:
# Como os dados já estão estuturados posso aplicar SQL, através do spark,sql
# Selecionamos os dados com as músicas do stream
musicas_stream = spark.sql('SELECT * FROM df3_View')

In [84]:
# Não podemos visualizar ainda, pois temos que gerar o stram do Spark Streaming
# musicas_stream.show()

In [85]:
# Criamos o stram de dados do Spark Streaming
musicas_stream_spark = musicas_stream \
        .writeStream \
        .trigger(processingTime = '5 seconds') \
        .outputMode("append") \
        .option("truncate", "false") \
        .format("memory") \
        .queryName("tabela_spark") \
        .start()

musicas_stream_spark.awaitTermination(1)

IllegalArgumentException: Cannot start query with name tabela_spark as a query with that name is already active in this SparkSession

In [86]:
# Selecionamos as músicas da tabela de stram do Spark
spark_songs = spark.sql('SELECT * FROM tabela_spark')

In [87]:
# Visualizando o stream em tempo real com tabela do Spark
spark_songs.show(5)

+--------+--------------------+--------------------+----------+-----------+--------------------+----------+------------+------------+------+----+--------+----+-----------+------------+----------------+--------+-------+-----+--------------+--------------------+
|order_id|                  id|                name|popularity|duration_ms|             artists|id_artists|release_date|danceability|energy| key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|tempo|time_signature|           timestamp|
+--------+--------------------+--------------------+----------+-----------+--------------------+----------+------------+------------+------+----+--------+----+-----------+------------+----------------+--------+-------+-----+--------------+--------------------+
|     285|5pOumPAUcd7bgJi47...|       Nobody's Ting|         4|   213676.0|                NDAI|2018-05-04|        0.75|       0.525|   2.0|null|     1.0|null|      0.388|         0.0|           0.099|   0.528|143.912

In [88]:
# Visualizando apenas algumas colunas
spark_songs.select('order_id', 'id', 'name', 'popularity', 'duration_ms', 'artists').show(5)

+--------+--------------------+--------------------+----------+-----------+--------------------+
|order_id|                  id|                name|popularity|duration_ms|             artists|
+--------+--------------------+--------------------+----------+-----------+--------------------+
|     285|5pOumPAUcd7bgJi47...|       Nobody's Ting|         4|   213676.0|                NDAI|
|     286|4YY8nv8tzU7AKLyRx...|                 You|         4|   131152.0|         GylesBartle|
|     287|7fAJw7CNYOOMbYr3S...|           Amor Fati|         0|   214884.0|ChampagneSuperchi...|
|     288|4SzpxKCWRcKNNce1J...|Georgie Wants a G...|         9|   183533.0|            Blackaby|
|     289|40Ub2OAfCIxJa7OaO...|           So Simple|         6|   216173.0|      LucyandtheRats|
+--------+--------------------+--------------------+----------+-----------+--------------------+
only showing top 5 rows



In [89]:
# Contagem de músicas extraídas em tempo real (no outro jupyter notebook)
spark_songs.count()

1104

Importante aqui esperar um intervalo de tempo na continuação do projeto para que o streaming de dados possa ser coletado, 

#### Vamos agora trabalhar na extração de dados do Spotify.

In [90]:
# Instalação do pacote python para criar a conexão com o Spotify
# https://spotipy.readthedocs.io/en/2.22.1/
!pip install -q spotipy

In [96]:
# Imports
import os
import ujson
import spotipy
import spotipy.util
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [97]:
# Colocando as chaves da API do Spotify
os.environ['SPOTIPY_CLIENT_ID'] = 'e7badbd497da423692af1d504a6bee62'
os.environ['SPOTIPY_CLIENT_SECRET'] = '7f0136b895fb4614b55c50f7e0ccd920'
os.environ['SPOTIPY_REDIRECT_URI'] = 'http://localhost:7777/callback'

In [98]:
# Escopo de extração das preferências do usuário
# https://developer.spotify.com/documentation/web-api/concepts/scopes
scope = 'user-library-read'

In [99]:
# Username no Spotify
username = 'ingoreichertjr@gmail.com'

In [100]:
# Criação do token de acesso
token = spotipy.util.prompt_for_user_token(username, scope)

In [101]:
# Cria o objeto de autenticação
spotipy_obj = spotipy.Spotify(auth = token)

In [102]:
# Extrai até 50 músicas da lista de favoritos do usuário
saved_tracks = spotipy_obj.current_user_saved_tracks(limit = 50)

In [103]:
# Número de músicas extraídas
n_tracks = saved_tracks['total']
print('total de Tracks: %d ' % n_tracks)

total de Tracks: 55 


In [104]:
# Função para extrair os atributos da lista de músicas do usuário
def select_features(track_response):
    return {
        'id': str(track_response['track']['id']),
        'name': str(track_response['track']['name']),
        'artists': [artist['name'] for artist in track_response['track']['artists']],
        'popularity': track_response['track']['popularity']
    }

In [105]:
# Aplica a função
tracks = [select_features(track) for track in saved_tracks['items']]

In [106]:
# Extrai os atributos das músicas preferidas do usuário
while saved_tracks['next']:
    saved_tracks = spotipy_obj.next(saved_tracks)
    tracks.extend([select_features(track) for track in saved_tracks['items']])

In [107]:
# Criamos o dataframe do pandas
df_tracks = pd.DataFrame(tracks)
pd.set_option('display.max_rows', len(tracks))
df_tracks['artists'] = df_tracks['artists'].apply(lambda artists: artists[0])

In [108]:
df_tracks.head(10)

Unnamed: 0,id,name,artists,popularity
0,4zgyHc0bK2ca1U9X2iajg3,A Força do Silêncio (Ao Vivo),Pouca Vogal,31
1,3OtbktRUMTzYrGTrdKaJUn,A Montanha (Ao Vivo),Pouca Vogal,29
2,6RWbbxUbtbadvUtg4tz1ab,O Vôo do Besouro (Ao Vivo),Pouca Vogal,26
3,5eMgoUmfSjCdiUaP0fNgDD,Pouca Vogal (Ao Vivo),Pouca Vogal,24
4,7rL1ExqyusHQJjXZ2ZSDxQ,O Amanhã Colorido (Ao Vivo),Pouca Vogal,37
5,5E0sfiyI5PyHK3x54XuYyA,Ao Fim de Tudo (Ao Vivo),Pouca Vogal,27
6,3bhBlHJKJklCeQQ2QRHZMR,Tententender (Ao Vivo),Pouca Vogal,27
7,21Y2IgF2u7IpXSjmOMYO90,Refrão de Bolero (Ao Vivo),Pouca Vogal,44
8,4EYhfZnsQRHzQ3FUC425qO,Toda Forma De Poder + Banco (Ao Vivo),Pouca Vogal,28
9,73jOMwbzFrXVjCAqnbShYx,Pinhal (Ao Vivo),Pouca Vogal,33


Nosso sistema de recomendação vai aprender a partir das características de áudio das músicas, por isso vamos extraí-los a seguir:

In [109]:
# Dicionário para os atributos de áudio
audio_features = {}

In [110]:
# Extrai os atributos de áudio
# O pŕoprio objeto do spotify já oferece isso a partir do método audio_features
for idd in df_tracks['id'].tolist():
    audio_features[idd] = spotipy_obj.audio_features(idd)[0]

In [111]:
audio_features

{'4zgyHc0bK2ca1U9X2iajg3': {'danceability': 0.55,
  'energy': 0.618,
  'key': 2,
  'loudness': -7.509,
  'mode': 0,
  'speechiness': 0.0311,
  'acousticness': 0.624,
  'instrumentalness': 0,
  'liveness': 0.737,
  'valence': 0.684,
  'tempo': 138.01,
  'type': 'audio_features',
  'id': '4zgyHc0bK2ca1U9X2iajg3',
  'uri': 'spotify:track:4zgyHc0bK2ca1U9X2iajg3',
  'track_href': 'https://api.spotify.com/v1/tracks/4zgyHc0bK2ca1U9X2iajg3',
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/4zgyHc0bK2ca1U9X2iajg3',
  'duration_ms': 267773,
  'time_signature': 4},
 '3OtbktRUMTzYrGTrdKaJUn': {'danceability': 0.556,
  'energy': 0.366,
  'key': 7,
  'loudness': -12.399,
  'mode': 1,
  'speechiness': 0.0286,
  'acousticness': 0.882,
  'instrumentalness': 0.00966,
  'liveness': 0.726,
  'valence': 0.46,
  'tempo': 137.822,
  'type': 'audio_features',
  'id': '3OtbktRUMTzYrGTrdKaJUn',
  'uri': 'spotify:track:3OtbktRUMTzYrGTrdKaJUn',
  'track_href': 'https://api.spotify.com/v1/tracks/3Otbkt

[Stage 266:>                                                        (0 + 1) / 1]                                                                                

In [112]:
# Adicionamos os atributos de áudio ao dataframe
df_tracks['acousticness'] = df_tracks['id'].apply(lambda idd: audio_features[idd]['acousticness'])
df_tracks['speechiness'] = df_tracks['id'].apply(lambda idd: audio_features[idd]['speechiness'])
df_tracks['key'] = df_tracks['id'].apply(lambda idd: audio_features[idd]['key'])
df_tracks['liveness'] = df_tracks['id'].apply(lambda idd: audio_features[idd]['liveness'])
df_tracks['instrumentalness'] = df_tracks['id'].apply(lambda idd: audio_features[idd]['instrumentalness'])
df_tracks['energy'] = df_tracks['id'].apply(lambda idd: audio_features[idd]['energy'])
df_tracks['tempo'] = df_tracks['id'].apply(lambda idd: audio_features[idd]['tempo'])
df_tracks['loudness'] = df_tracks['id'].apply(lambda idd: audio_features[idd]['loudness'])
df_tracks['danceability'] = df_tracks['id'].apply(lambda idd: audio_features[idd]['danceability'])
df_tracks['valence'] = df_tracks['id'].apply(lambda idd: audio_features[idd]['valence'])

In [113]:
df_tracks.head()

Unnamed: 0,id,name,artists,popularity,acousticness,speechiness,key,liveness,instrumentalness,energy,tempo,loudness,danceability,valence
0,4zgyHc0bK2ca1U9X2iajg3,A Força do Silêncio (Ao Vivo),Pouca Vogal,31,0.624,0.0311,2,0.737,0.0,0.618,138.01,-7.509,0.55,0.684
1,3OtbktRUMTzYrGTrdKaJUn,A Montanha (Ao Vivo),Pouca Vogal,29,0.882,0.0286,7,0.726,0.00966,0.366,137.822,-12.399,0.556,0.46
2,6RWbbxUbtbadvUtg4tz1ab,O Vôo do Besouro (Ao Vivo),Pouca Vogal,26,0.823,0.0306,4,0.969,5e-05,0.357,132.033,-12.167,0.599,0.224
3,5eMgoUmfSjCdiUaP0fNgDD,Pouca Vogal (Ao Vivo),Pouca Vogal,24,0.719,0.0462,9,0.979,0.0312,0.633,103.239,-8.023,0.381,0.35
4,7rL1ExqyusHQJjXZ2ZSDxQ,O Amanhã Colorido (Ao Vivo),Pouca Vogal,37,0.188,0.0366,2,0.983,0.0,0.609,122.056,-7.791,0.505,0.572


In [221]:
# Selecionamos uma música randomicamente do nosso dataframe
musica_randomica = random.randint(0,len(df_tracks)-1)
df_musica_randomica = df_tracks.head(musica_randomica)[-1:]
df_musica_randomica

Unnamed: 0,id,name,artists,popularity,acousticness,speechiness,key,liveness,instrumentalness,energy,tempo,loudness,danceability,valence
20,3g5FrnRdbmDQyWNiDIprts,All Star - Ao Vivo,Nando Reis,63,0.889,0.0492,9,0.714,3e-06,0.391,138.094,-10.4,0.512,0.277


In [222]:
# Músicas do straming do Spark
# Do streaming que ta vindo do kafka
spark_songs.show(5)

+--------------------+--------------------+----------+--------------------+------------+------+----+--------+-----------+------------+----------------+--------+-------+-----+
|                  id|                name|popularity|             artists|danceability|energy| key|loudness|speechiness|acousticness|instrumentalness|liveness|valence|tempo|
+--------------------+--------------------+----------+--------------------+------------+------+----+--------+-----------+------------+----------------+--------+-------+-----+
|5pOumPAUcd7bgJi47...|       Nobody's Ting|         4|                NDAI|       0.525|   2.0|null|     1.0|      0.388|         0.0|           0.099|   0.528|143.912|  4.0|
|4YY8nv8tzU7AKLyRx...|                 You|         4|         GylesBartle|       0.331|   0.0|null|     1.0|      0.268|       0.327|           0.363|   0.663| 72.083|  4.0|
|7fAJw7CNYOOMbYr3S...|           Amor Fati|         0|ChampagneSuperchi...|       0.841|   2.0|null|     0.0|      0.123|    

In [223]:
# Ainda na lista de músicas que está vindo do kafka vou excluir algumas colunas que não precisaremos
spark_songs = spark_songs.drop('order_id',
                               'mode',
                               'release_date',
                               'id_artists',
                               'time_signature',
                               'duration_ms',
                               'timestamp')

In [224]:
# Cria o dataframe com a música escolhida randomicamente
df_sp = spark.createDataFrame(df_musica_randomica)

In [225]:
# Concatena músicas do streaming do Spark com a música do Spotify
df = spark_songs.union(df_sp)

In [226]:
df.show(5)

+--------------------+--------------------+----------+--------------------+------------+------+----+--------+-----------+------------+----------------+--------+-------+-----+
|                  id|                name|popularity|             artists|danceability|energy| key|loudness|speechiness|acousticness|instrumentalness|liveness|valence|tempo|
+--------------------+--------------------+----------+--------------------+------------+------+----+--------+-----------+------------+----------------+--------+-------+-----+
|5pOumPAUcd7bgJi47...|       Nobody's Ting|         4|                NDAI|       0.525|   2.0|null|     1.0|      0.388|         0.0|           0.099|   0.528|143.912|  4.0|
|4YY8nv8tzU7AKLyRx...|                 You|         4|         GylesBartle|       0.331|   0.0|null|     1.0|      0.268|       0.327|           0.363|   0.663| 72.083|  4.0|
|7fAJw7CNYOOMbYr3S...|           Amor Fati|         0|ChampagneSuperchi...|       0.841|   2.0|null|     0.0|      0.123|    

### Pré-Processamento dos Dados

In [227]:
# Preparamos o VectorAssembler
vetor = VectorAssembler(inputCols = ['danceability',
                                     'energy',
                                     'loudness',
                                     'speechiness',
                                     'acousticness',
                                     'instrumentalness',
                                     'liveness',
                                     'valence',
                                     'tempo'],
                       outputCol = 'song_features')

In [228]:
# Descartamos valores inválidos
assembled = vetor.setHandleInvalid('skip').transform(df)

In [229]:
# Preparamos o padronizador
std = StandardScaler(inputCol = 'song_features', outputCol = 'standardized')

In [230]:
# Treinamos o padronizador
scale = std.fit(assembled)

In [231]:
# Dataframe com dados padronizados
df = scale.transform(assembled)

In [232]:
df.show(5)

+--------------------+--------------------+----------+--------------------+------------+------+----+--------+-----------+------------+----------------+--------+-------+-----+--------------------+--------------------+
|                  id|                name|popularity|             artists|danceability|energy| key|loudness|speechiness|acousticness|instrumentalness|liveness|valence|tempo|       song_features|        standardized|
+--------------------+--------------------+----------+--------------------+------------+------+----+--------+-----------+------------+----------------+--------+-------+-----+--------------------+--------------------+
|5pOumPAUcd7bgJi47...|       Nobody's Ting|         4|                NDAI|       0.525|   2.0|null|     1.0|      0.388|         0.0|           0.099|   0.528|143.912|  4.0|[0.525,2.0,1.0,0....|[2.64468078997192...|
|4YY8nv8tzU7AKLyRx...|                 You|         4|         GylesBartle|       0.331|   0.0|null|     1.0|      0.268|       0.32

### Machine Learning com Aprendizado Não Supervisionado

In [233]:
# Cria o objeto do modelo
objeto_KMeans = KMeans(featuresCol = 'standardized', k = 3)

In [234]:
# Treina o modelo
modelo_KMeans = objeto_KMeans.fit(df)

In [235]:
# Previsões do modelo
df_output = modelo_KMeans.transform(df)

In [236]:
df_output.show(10)

+--------------------+--------------------+----------+--------------------+------------+------+----+--------+-----------+------------+----------------+--------+-------+-----+--------------------+--------------------+----------+
|                  id|                name|popularity|             artists|danceability|energy| key|loudness|speechiness|acousticness|instrumentalness|liveness|valence|tempo|       song_features|        standardized|prediction|
+--------------------+--------------------+----------+--------------------+------------+------+----+--------+-----------+------------+----------------+--------+-------+-----+--------------------+--------------------+----------+
|5pOumPAUcd7bgJi47...|       Nobody's Ting|         4|                NDAI|       0.525|   2.0|null|     1.0|      0.388|         0.0|           0.099|   0.528|143.912|  4.0|[0.525,2.0,1.0,0....|[2.64468078997192...|         0|
|4YY8nv8tzU7AKLyRx...|                 You|         4|         GylesBartle|       0.331|

### Sistema de Recomendação

In [256]:
# Classe
class RecoSystem():
    
    # Método construtor
    def __init__(self, data):
        self.data_ = data
    
    # Método de recomendação
    def Recomm(self, nome_musica, amount = 1):
        
        # Lista para as distâncias
        distancias = []
        
        # Seleciona a música
        song = self.data_[(self.data_.name.str.lower() == nome_musica.lower())].head(1).values[0]
        res_dt = self.data_[self.data_.name.str.lower() != nome_musica.lower()]
        
        # Loop para o cálculo das distâncias
        for i_song in tqdm(res_dt.values):
            
            # Inicializa a distância
            distancia = 0
            
            # Loop para calcular a distância
            for col in np.arange(len(res_dt.columns)):
                if not col in [0,1,2,14]:
                    distancia = distancia + np.absolute(float(song[col]) - float(i_song[col]))
            
            # Adiciona na lista de distâncias
            distancias.append(distancia)
        
        res_dt['distance'] = distancias
        res_dt = res_dt.sort_values('distance')
        
        columns = ['id','name', 
                   'artists', 
                   'acousticness', 
                   'liveness', 
                   'instrumentalness', 
                   'energy', 
                   'danceability', 
                   'valence']
        
        return res_dt[columns][:amount]

In [257]:
# Selecionando as colunas do nosso dataframe de previsões
datalabel = df_output.select('id',
                             'name',
                             'artists',
                             'danceability',
                             'energy',
                             'key',
                             'loudness',
                             'speechiness',
                             'acousticness',
                             'instrumentalness',
                             'liveness',
                             'valence',
                             'tempo',
                             'prediction')

In [258]:
# Dataset final
df_final = datalabel.toPandas()
df_final.drop(df_final[df_final['artists'] == '0'].index, inplace = True)
df_final.drop_duplicates(inplace = True)
df_final.drop(df_final[df_final['danceability'] == 0.0000].index, inplace = True)
df_final.drop(df_final[df_final['liveness'] == 0.000].index, inplace = True)
df_final.drop(df_final[df_final['instrumentalness'] == 0.000000].index, inplace = True)
df_final.drop(df_final[df_final['energy'] == 0.0000].index, inplace = True)
df_final.drop(df_final[df_final['danceability'] == 0.000].index, inplace = True)
df_final.drop(df_final[df_final['valence'] == 0.000].index, inplace = True)

In [259]:
df_final.shape

(2346, 14)

In [260]:
df_final.sample(5)

Unnamed: 0,id,name,artists,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,prediction
141,6mAOLBah2eqPYDrCu16AOG,Bratt,HALEY,0.724,1.0,,0.0,0.0479,0.0295,0.274,0.095,128.003,4.0,0
1545,27x8TJNYboxPv6KheHY1AS,Easy Enough,LeTrouble,0.502,5.0,,1.0,0.593,0.577,0.0975,0.483,132.813,4.0,2
1521,6mY56gQjKYzpwyK2PNRJst,One Chick,Sondai,0.633,9.0,,0.0,0.306,0.0,0.353,0.932,136.086,4.0,0
999,6YPzUKk6TQPkwZAxrxCxge,Feel Something,Sgrow,0.68,6.0,,1.0,0.0849,0.236,0.117,0.487,97.732,4.0,0
504,6xrwKRPvVAKBLF365OA8MV,Retrograde,QNABushTea,0.656,10.0,,0.0,0.331,1.1e-05,0.104,0.306,167.7,4.0,0


In [261]:
# Cria o objeto 
reco_obj = RecoSystem(df_final)

In [262]:
musica = df_musica_randomica['name'].tolist()[0]

In [263]:
print(musica)

All Star - Ao Vivo


In [264]:
# Executa a recomendação
recomendacao = reco_obj.Recomm(musica)

100%|████████████████████████████████████| 2345/2345 [00:00<00:00, 42969.17it/s]


In [265]:
# Extrai a música randômica da lista de favoritos do Spotify:
y = df_musica_randomica[['id', 'name',
                         'artists',
                         'acousticness',
                         'liveness',
                         'instrumentalness',
                         'energy',
                         'danceability',
                         'valence']]

In [266]:
# Concatena a recomendação com a música randômica da lista de favoritos do Spotify
recomendacao = pd.concat([recomendacao, y])

In [267]:
# Salva a recomendação em disco
recomendacao.to_csv('recomendacao.csv')

In [268]:
# Carrega o arquivo do disco
df_reco = (spark.read.format('csv').options(header = 'true').load('recomendacao.csv'))

In [269]:
# Recomendação de música
df_reco.show()

+---+--------------------+--------------------+----------+------------+--------+----------------+------+------------+-------+
|_c0|                  id|                name|   artists|acousticness|liveness|instrumentalness|energy|danceability|valence|
+---+--------------------+--------------------+----------+------------+--------+----------------+------+------------+-------+
|320|4TwFQpzVR47KLXwej...|Portal (feat. Riz...|    252501|       0.887|   0.187|           0.688| 0.291|       0.323|  0.124|
| 20|3g5FrnRdbmDQyWNiD...|  All Star - Ao Vivo|Nando Reis|       0.889|   0.714|        2.63e-06| 0.391|       0.512|  0.277|
+---+--------------------+--------------------+----------+------------+--------+----------------+------+------------+-------+

