In [2]:
import pymongo
from pymongo.server_api import ServerApi
from pymongo.mongo_client import MongoClient
import spotipy
from spotipy.oauth2 import SpotifyOAuth

In [3]:
#mac command to start/stop mongodb
#brew services start mongodb-community@8.0
#brew services stop mongodb-community@8.0

In [None]:
import os
from urllib.parse import quote_plus

# Credentials come from the environment so they are never stored in the
# notebook (or its git history). Required: MONGODB_USER, MONGODB_PASSWORD.
# Optional: MONGODB_HOST (defaults to the project's Atlas cluster).
_missing = [name for name in ("MONGODB_USER", "MONGODB_PASSWORD") if name not in os.environ]
if _missing:
    raise RuntimeError(f"Set these environment variables before running: {', '.join(_missing)}")

# User name and password must be percent-encoded inside a MongoDB URI.
uri = "mongodb+srv://{user}:{password}@{host}/?retryWrites=true&w=majority&appName=Cluster0".format(
    user=quote_plus(os.environ["MONGODB_USER"]),
    password=quote_plus(os.environ["MONGODB_PASSWORD"]),
    host=os.environ.get("MONGODB_HOST", "cluster0.uvnk2.mongodb.net"),
)

# Create a new client and connect to the server
client = pymongo.MongoClient(uri, server_api=ServerApi('1'))

# Ping once so connection problems are reported here rather than on first use.
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

#Create MongoDB Database
db = client['Spotify']

Pinged your deployment. You successfully connected to MongoDB!


In [5]:
import os

# The OAuth (authorization-code) flow is needed because the notebook reads the
# user's private playlists and library.
# Application credentials are read from the environment so they are never
# stored in the notebook.
client_id = os.environ["SPOTIPY_CLIENT_ID"]
client_secret = os.environ["SPOTIPY_CLIENT_SECRET"]
redirect_uri = os.environ.get("SPOTIPY_REDIRECT_URI", "http://localhost:8888/callback")

scope = "playlist-read-private user-library-read"

# open_browser=False: the authorization URL is printed instead of opening a browser.
sp = spotipy.Spotify(
    auth_manager=SpotifyOAuth(
        client_id=client_id,
        client_secret=client_secret,
        redirect_uri=redirect_uri,
        scope=scope,
        open_browser=False,
    )
)


In [6]:
# Name of the playlist whose tracks seed the recommender.
PLAYLIST_NAME = "==="

# current_user_playlists() returns one page at a time, so follow the paging
# links until the whole library has been searched. As before, the last
# matching playlist wins.
p_id = None
playlists = sp.current_user_playlists()
page = playlists
while page:
    for playlist in page['items']:
        if playlist['name'] == PLAYLIST_NAME:
            print(playlist['id'])
            p_id = playlist['id']
    page = sp.next(page) if page['next'] else None

# Fail here with a clear message instead of a NameError in a later cell.
if p_id is None:
    raise LookupError(f"No playlist named {PLAYLIST_NAME!r} found for the current user")

3ehU2363fTghRKipKg6dYB


In [7]:
# playlist_items() is paginated; keep following `next` so playlists longer
# than one page are loaded completely.
results = sp.playlist_items(p_id)
tracks = list(results['items'])
page = results
while page['next']:
    page = sp.next(page)
    tracks.extend(page['items'])

In [8]:
#DB Design

songs = [] #for content-based filtering, collection columns: song_id, album_id, artist1,2,..., genre1,2,..., release_date, duration_ms
song_details = [] #to store song details, collection columns: song_id, song_name, album_id, album_name
singers = [] #to store singer details, collection columns: id, name
artists = {} #artist id -> name; also used to add each singer only once
artist_genres = {} #artist id -> genres; cache so each artist is fetched from the API once

for item in tracks:
    track = item['track']
    # Skip playlist entries without a usable track id; they cannot be keyed
    # or looked up later.
    if not track or not track.get('id'):
        continue

    album = track['album']
    song_id = track['id']
    album_id = album['id']

    collection = {'song_id': song_id, 'album_id': album_id} #entries for database collection

    song_details.append({'song_id': song_id, 'song_name' : track['name'], 'album_id': album_id, 'album_name':  album['name']})

    genres = set()

    # NOTE(review): this iterates the album's artists, not track['artists'] — confirm that is intended.
    for position, artist in enumerate(album['artists'], start=1):
        artist_id = artist['id']
        collection[f'artist{position}'] = artist_id
        if artist_id not in artists:
            artists[artist_id] = artist['name']
            singers.append({'id': artist_id, 'name': artist['name']}) #append to collection

        if artist_id not in artist_genres:
            artist_genres[artist_id] = sp.artist(artist_id)['genres']
        genres.update(artist_genres[artist_id])

    # Sort so genre1, genre2, ... are assigned the same way on every run
    # (set iteration order is not stable across interpreter sessions).
    for idx, genre in enumerate(sorted(genres), start=1):
        collection[f'genre{idx}'] = genre

    collection['release_date'] = album['release_date']
    collection['duration_ms'] = track['duration_ms']

    songs.append(collection)

In [9]:
# Handles to the three MongoDB collections used by the notebook:
# feature documents, human-readable song details, and artist names.
song_col, song_details_col, singers_col = (
    db[collection_name] for collection_name in ('songs', 'song_details', 'artists')
)

In [10]:
#Run once to seed the collections (uncomment for the first run only, then comment out again)
#x = song_col.insert_many(songs)
#y = song_details_col.insert_many(song_details)
#z = singers_col.insert_many(singers)

In [11]:
# Sanity check: look up one known song by name and print every matching document.
query = dict(song_name='bad guy')
doc = song_details_col.find(query)
for match in doc:
    print(match)

{'_id': ObjectId('672cc11a9da6c0405cbeba62'), 'song_id': '2Fxmhks0bxGSBdJ92vM42m', 'song_name': 'bad guy', 'album_id': '0S0KGZnfBGSIssfF54WSJh', 'album_name': 'WHEN WE ALL FALL ASLEEP, WHERE DO WE GO?'}


In [12]:
import os
from urllib.parse import quote_plus

import findspark
findspark.init()

from pyspark.sql import SparkSession


# Stop any session left over from a previous run so the new configuration is applied.
spark = SparkSession.getActiveSession()
if spark is not None:
    spark.stop()


def _mongo_spark_uri(namespace):
    """Build a connector URI for ``namespace`` (``database.collection``).

    Credentials are read from MONGODB_USER / MONGODB_PASSWORD (percent-encoded
    for use in a URI); the host comes from MONGODB_HOST and defaults to the
    project's Atlas cluster. The namespace goes in the URI path, *before* the
    query string, which is where the connector expects it.
    """
    user = quote_plus(os.environ["MONGODB_USER"])
    password = quote_plus(os.environ["MONGODB_PASSWORD"])
    host = os.environ.get("MONGODB_HOST", "cluster0.uvnk2.mongodb.net")
    return (
        f"mongodb+srv://{user}:{password}@{host}/{namespace}"
        "?retryWrites=true&w=majority&appName=Cluster0"
    )


spark = SparkSession.builder \
    .appName('Spotify Recommender') \
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1") \
    .config("spark.mongodb.input.uri", _mongo_spark_uri("Spotify.songs")) \
    .config("spark.mongodb.output.uri", _mongo_spark_uri("Spotify.recommendations")) \
    .getOrCreate()


In [13]:
# Load the Spotify.songs collection through the MongoDB Spark connector.
songs_df = (
    spark.read
    .format("mongo")
    .options(database="Spotify", collection="songs")
    .load()
)
songs_df.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+--------------------+--------------------+--------------+-------------+--------------+-----------+------+------+------------+--------------------+
|                 _id|            album_id|             artist1|             artist2|             artist3|duration_ms|              genre1|              genre2|        genre3|       genre4|        genre5|     genre6|genre7|genre8|release_date|             song_id|
+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+--------------------+--------------------+--------------+-------------+--------------+-----------+------+------+------------+--------------------+
|{672cc1199da6c040...|0S0KGZnfBGSIssfF5...|6qqNVTkY8uBg9cP3J...|                NULL|                NULL|     194087|                 pop|             art pop|          NULL|         NULL|          NULL| 

In [14]:
from pyspark.sql.functions import col

# Remove MongoDB's _id column; it is not used as a feature.
songs_df = songs_df.drop(*["_id"])
songs_df.show()

+--------------------+--------------------+--------------------+--------------------+-----------+--------------------+--------------------+--------------+-------------+--------------+-----------+------+------+------------+--------------------+
|            album_id|             artist1|             artist2|             artist3|duration_ms|              genre1|              genre2|        genre3|       genre4|        genre5|     genre6|genre7|genre8|release_date|             song_id|
+--------------------+--------------------+--------------------+--------------------+-----------+--------------------+--------------------+--------------+-------------+--------------+-----------+------+------+------------+--------------------+
|0S0KGZnfBGSIssfF5...|6qqNVTkY8uBg9cP3J...|                NULL|                NULL|     194087|                 pop|             art pop|          NULL|         NULL|          NULL|       NULL|  NULL|  NULL|  2019-03-29|2Fxmhks0bxGSBdJ92...|
|33pt9HBdGlAbRGBHQ...|53

In [15]:
#Encoding and handling missing Values for artists
import re

from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import col, array, explode, when

# The number of artistN columns depends on the data (schema is inferred from
# the documents), so discover them instead of hardcoding artist1..artist3.
artist_cols = sorted(
    (name for name in songs_df.columns if re.fullmatch(r"artist\d+", name)),
    key=lambda name: int(name[len("artist"):]),
)

# One row per distinct, non-null artist id appearing in any artist slot.
artist_df = songs_df.select(explode(array(*artist_cols)).alias("artist"))
artist_df = artist_df.filter(col("artist").isNotNull()).distinct()

# A single indexer over all slots so the same artist gets the same index
# regardless of which artistN column it appears in.
indexer = StringIndexer(inputCol="artist", outputCol="artist_index")
artist_indexed = indexer.fit(artist_df).transform(artist_df)

artist_indexed.show()

+--------------------+------------+
|              artist|artist_index|
+--------------------+------------+
|4r63FhuTkUYltbVAg...|        54.0|
|3jK9MiCrA42lLAdMG...|        46.0|
|3TVXtAsR1Inumwj47...|        42.0|
|2qxJFvFYMEDqd7ui6...|        35.0|
|26VFTg2z8YR0cCuwL...|        29.0|
|1Xyo4u8uXC1ZmMpat...|        20.0|
|0du5cEVh5yTK9QJze...|        12.0|
|1vCWHaC5f2uS3yhpw...|        26.0|
|4rTv3Ejc7hKMtmoBO...|        55.0|
|2wUjUUtkb5lvLKcGK...|        37.0|
|0Y5tJX1MQlPlqiwlO...|        11.0|
|14CHVeJGrR5xgUGQF...|        16.0|
|10exVja0key0uqUkk...|        14.0|
|757aE44tKEUQEqRuT...|        83.0|
|3YQKmKGau1PzlVlkL...|        43.0|
|56OjNTX2bkrdGcB0s...|        58.0|
|6LuN9FCkKOj5Pcnpo...|        71.0|
|5pUo3fmmHT8bhCyHE...|        66.0|
|1dPSMH55yhvjYIwqC...|        22.0|
|57nPqD7z62gDdq37U...|        60.0|
+--------------------+------------+
only showing top 20 rows



In [16]:
def join_index(songs_df, artist_indexed, artist_col):
  """Left-join the artist index lookup onto ``songs_df`` for one artist column.

  The lookup's ``artist`` column is renamed to ``artist_col`` (the join key)
  and its ``artist_index`` column to ``<artist_col>_index``. Rows of
  ``songs_df`` without a match are kept because the join is a left join.
  """
  index_col = f"{artist_col}_index"
  lookup = artist_indexed.withColumnRenamed("artist", artist_col)
  lookup = lookup.withColumnRenamed("artist_index", index_col)
  return songs_df.join(lookup, on=artist_col, how="left")

import re

# Discover the artistN columns from the schema (their number depends on the
# data) and process them in numeric order.
artist_cols = sorted(
    (name for name in songs_df.columns if re.fullmatch(r"artist\d+", name)),
    key=lambda name: int(name[len("artist"):]),
)

# Attach <artistN>_index next to every artistN column.
for artist_col in artist_cols:
    songs_df = join_index(songs_df, artist_indexed, artist_col)

# -1 marks an empty artist slot (no match in the lookup).
songs_df = songs_df.fillna({f"{artist_col}_index": -1 for artist_col in artist_cols})

# Replace the raw id columns with their numeric indexes, keeping the original names.
songs_df = songs_df.drop(*artist_cols)
for artist_col in artist_cols:
    songs_df = songs_df.withColumnRenamed(f"{artist_col}_index", artist_col)
songs_df.show()

+--------------------+-----------+--------------------+--------------------+--------------+-------------+--------------+-----------+------+------+------------+--------------------+-------+-------+-------+
|            album_id|duration_ms|              genre1|              genre2|        genre3|       genre4|        genre5|     genre6|genre7|genre8|release_date|             song_id|artist1|artist2|artist3|
+--------------------+-----------+--------------------+--------------------+--------------+-------------+--------------+-----------+------+------+------------+--------------------+-------+-------+-------+
|0h2knr6qpiAq0tV5r...|     176658|           pop dance|                 pop|           edm|    dance pop|          NULL|       NULL|  NULL|  NULL|  2014-01-01|0ct6r3EGTcMLPtrXH...|   26.0|   -1.0|   -1.0|
|52u4anZbHd6UInnmH...|     196652|         melodic rap|                trap|           rap|         NULL|          NULL|       NULL|  NULL|  NULL|  2019-12-06|0nbXyq5TXYPCO7pr3...|

In [17]:
#Encoding genres
import re

# The number of genreN columns depends on the data (the song with the most
# genres decides it), so discover them instead of hardcoding genre1..genre8.
genre_cols = sorted(
    (name for name in songs_df.columns if re.fullmatch(r"genre\d+", name)),
    key=lambda name: int(name[len("genre"):]),
)

# One row per distinct, non-null genre appearing in any genre slot.
genres_df = songs_df.select(explode(array(*genre_cols)).alias("genres"))
genres_df = genres_df.filter(col("genres").isNotNull()).distinct()

# A single indexer over all slots so a genre maps to the same index in every column.
indexer = StringIndexer(inputCol="genres", outputCol="genres_index")
genre_indexed = indexer.fit(genres_df).transform(genres_df)

genre_indexed.show()

+-------------------+------------+
|             genres|genres_index|
+-------------------+------------+
| australian hip hop|         2.0|
|           boy band|         7.0|
|         electropop|        28.0|
|    electronic trap|        27.0|
|    seattle hip hop|        77.0|
|  conscious hip hop|        18.0|
|      chicago indie|        14.0|
|                r&b|        72.0|
|     tropical house|        85.0|
|                pop|        63.0|
|           pop rock|        67.0|
|            brostep|         9.0|
|    oakland hip hop|        59.0|
|        celtic rock|        13.0|
|           folk-pop|        32.0|
|              k-pop|        43.0|
|      miami hip hop|        52.0|
|     pittsburgh rap|        62.0|
|        melodic rap|        49.0|
|underground hip hop|        89.0|
+-------------------+------------+
only showing top 20 rows



In [18]:
def join_genres(songs_df, genre_indexed, genre_col):
  """Left-join the genre index lookup onto ``songs_df`` for one genre column.

  The lookup's ``genres`` column is renamed to ``genre_col`` (the join key)
  and its ``genres_index`` column to ``<genre_col>_index``. Rows of
  ``songs_df`` without a match are kept because the join is a left join.
  """
  index_col = f"{genre_col}_index"
  lookup = genre_indexed.withColumnRenamed("genres", genre_col)
  lookup = lookup.withColumnRenamed("genres_index", index_col)
  return songs_df.join(lookup, on=genre_col, how="left")

import re

# Discover the genreN columns from the schema (their number depends on the
# data) and process them in numeric order.
genre_cols = sorted(
    (name for name in songs_df.columns if re.fullmatch(r"genre\d+", name)),
    key=lambda name: int(name[len("genre"):]),
)

# Attach <genreN>_index next to every genreN column.
for genre_col in genre_cols:
    songs_df = join_genres(songs_df, genre_indexed, genre_col)

# -1 marks an empty genre slot. genre1 is filled too: a song whose artists
# have no genres would otherwise keep a NULL here, which turns into NaN and
# poisons every cosine similarity involving that song.
songs_df = songs_df.fillna({f"{genre_col}_index": -1 for genre_col in genre_cols})

# Replace the raw genre strings with their numeric indexes, keeping the original names.
songs_df = songs_df.drop(*genre_cols)
for genre_col in genre_cols:
    songs_df = songs_df.withColumnRenamed(f"{genre_col}_index", genre_col)
songs_df.show()

+--------------------+-----------+------------+--------------------+-------+-------+-------+------+------+------+------+------+------+------+------+
|            album_id|duration_ms|release_date|             song_id|artist1|artist2|artist3|genre1|genre2|genre3|genre4|genre5|genre6|genre7|genre8|
+--------------------+-----------+------------+--------------------+-------+-------+-------+------+------+------+------+------+------+------+------+
|0h2knr6qpiAq0tV5r...|     176658|  2014-01-01|0ct6r3EGTcMLPtrXH...|   26.0|   -1.0|   -1.0|  64.0|  63.0|  25.0|  19.0|  -1.0|  -1.0|  -1.0|  -1.0|
|52u4anZbHd6UInnmH...|     196652|  2019-12-06|0nbXyq5TXYPCO7pr3...|   83.0|   -1.0|   -1.0|  49.0|  84.0|  73.0|  -1.0|  -1.0|  -1.0|  -1.0|  -1.0|
|78EicdHZr5XBWD7ll...|     214289|  2018-08-17|2dpaYNEQHiRxtZbfN...|   68.0|   85.0|   -1.0|  70.0|  63.0|  25.0|   9.0|  51.0|  53.0|  -1.0|  -1.0|
|0S0KGZnfBGSIssfF5...|     194087|  2019-03-29|2Fxmhks0bxGSBdJ92...|   81.0|   -1.0|   -1.0|  63.0|   1.0|

In [21]:
# Fit one StringIndexer per id column on the current frame.
album_indexer = StringIndexer(inputCol="album_id", outputCol="album_indexed")
song_indexer = StringIndexer(inputCol="song_id", outputCol="song_indexed")
album_fit = album_indexer.fit(songs_df)
songs_fit = song_indexer.fit(songs_df)

# Spotify song id -> numeric index (position of the label in the fitted model);
# kept so indexes can be translated back to ids later.
song_mapping = dict(zip(songs_fit.labels, range(len(songs_fit.labels))))

# Apply both indexers, then swap the string ids for their numeric versions
# under the original column names.
songs_df = (
    songs_fit.transform(album_fit.transform(songs_df))
    .drop('album_id', 'song_id')
    .withColumnRenamed('album_indexed', 'album_id')
    .withColumnRenamed('song_indexed', 'song_id')
)
songs_df.show()

+-----------+------------+-------+-------+-------+------+------+------+------+------+------+------+------+--------+-------+
|duration_ms|release_date|artist1|artist2|artist3|genre1|genre2|genre3|genre4|genre5|genre6|genre7|genre8|album_id|song_id|
+-----------+------------+-------+-------+-------+------+------+------+------+------+------+------+------+--------+-------+
|     141805|  2021-07-09|   36.0|   24.0|   -1.0|   2.0|  12.0|  63.0|  -1.0|  -1.0|  -1.0|  -1.0|  -1.0|    61.0|   67.0|
|     132780|  2021-09-10|   25.0|   -1.0|   -1.0|   2.0|  -1.0|  -1.0|  -1.0|  -1.0|  -1.0|  -1.0|  -1.0|    83.0|   73.0|
|     244960|  2016-07-29|   70.0|   29.0|   -1.0|  28.0|  38.0|  63.0|  31.0|  -1.0|  -1.0|  -1.0|  -1.0|    20.0|   89.0|
|     258342|  2012-10-09|   77.0|   39.0|   52.0|  77.0|  66.0|  -1.0|  -1.0|  -1.0|  -1.0|  -1.0|  -1.0|    88.0|   50.0|
|     220780|  2017-02-16|   28.0|    5.0|   -1.0|  85.0|  64.0|  63.0|  68.0|  25.0|  -1.0|  -1.0|  -1.0|    11.0|   52.0|
|     24

In [22]:
from pyspark.sql.functions import col, when, to_timestamp, length, lit, concat, year, month, dayofmonth

# release_date arrives as a string with year ("yyyy"), month ("yyyy-MM") or
# day ("yyyy-MM-dd") precision. Pad the shorter forms to a full date so all
# three parse; anything else becomes NULL.
release_date = col('release_date')
songs_df = songs_df.withColumn(
    "releasedate",
    when(length(release_date) == 4,
         to_timestamp(concat(release_date, lit('-01-01')), 'yyyy-MM-dd'))
    .when(length(release_date) == 7,
         to_timestamp(concat(release_date, lit('-01')), 'yyyy-MM-dd'))
    .when(length(release_date) == 10,
         to_timestamp(release_date, "yyyy-MM-dd"))
    .otherwise(None)
)

songs_df = songs_df.drop('release_date')

# Split the timestamp into separate numeric features.
songs_df = songs_df.withColumn('release_year', year('releasedate'))
songs_df = songs_df.withColumn('release_month', month('releasedate'))
songs_df = songs_df.withColumn('release_day', dayofmonth('releasedate'))

songs_df = songs_df.drop('releasedate')

songs_df.show()

+-----------+-------+-------+-------+------+------+------+------+------+------+------+------+--------+-------+------------+-------------+-----------+
|duration_ms|artist1|artist2|artist3|genre1|genre2|genre3|genre4|genre5|genre6|genre7|genre8|album_id|song_id|release_year|release_month|release_day|
+-----------+-------+-------+-------+------+------+------+------+------+------+------+------+--------+-------+------------+-------------+-----------+
|     144935|   54.0|   -1.0|   -1.0|  84.0|  66.0|  35.0|  73.0|  57.0|  -1.0|  -1.0|  -1.0|    29.0|    9.0|        2019|            9|         27|
|     198973|   42.0|   -1.0|   -1.0|  66.0|  35.0|  12.0|  73.0|  11.0|  -1.0|  -1.0|  -1.0|    22.0|   72.0|        2018|            6|         29|
|     173986|   42.0|   -1.0|   -1.0|  66.0|  35.0|  12.0|  73.0|  11.0|  -1.0|  -1.0|  -1.0|    54.0|   22.0|        2016|            5|          6|
|     210090|   35.0|   37.0|   -1.0|  33.0|  17.0|  64.0|  10.0|  63.0|  12.0|  25.0|  -1.0|    26.

In [23]:
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.sql.types import DoubleType

# Cast the duration to double, wrap it in a one-element vector column, and
# scale it with StandardScaler (constructed with its default settings).
songs_df = songs_df.withColumn('duration_ms', col('duration_ms').cast(DoubleType()))

assembler = VectorAssembler(inputCols=['duration_ms'], outputCol='duration_ms_vector')
scaler = StandardScaler(inputCol='duration_ms_vector', outputCol='duration_ms_scaled')

songs_df = assembler.transform(songs_df)
songs_df = (
    scaler.fit(songs_df)
    .transform(songs_df)
    .drop('duration_ms', 'duration_ms_vector')  # keep only the scaled column
)
songs_df.show()

+-------+-------+-------+------+------+------+------+------+------+------+------+--------+-------+------------+-------------+-----------+--------------------+
|artist1|artist2|artist3|genre1|genre2|genre3|genre4|genre5|genre6|genre7|genre8|album_id|song_id|release_year|release_month|release_day|  duration_ms_scaled|
+-------+-------+-------+------+------+------+------+------+------+------+------+--------+-------+------------+-------------+-----------+--------------------+
|   36.0|   24.0|   -1.0|   2.0|  12.0|  63.0|  -1.0|  -1.0|  -1.0|  -1.0|  -1.0|    61.0|   67.0|        2021|            7|          9|[3.4865527406934027]|
|   25.0|   -1.0|   -1.0|   2.0|  -1.0|  -1.0|  -1.0|  -1.0|  -1.0|  -1.0|  -1.0|    83.0|   73.0|        2021|            9|         10| [3.264655498108459]|
|   70.0|   29.0|   -1.0|  28.0|  38.0|  63.0|  31.0|  -1.0|  -1.0|  -1.0|  -1.0|    20.0|   89.0|        2016|            7|         29| [6.022819783225246]|
|   77.0|   39.0|   52.0|  77.0|  66.0|  -1.0|

In [24]:
from pyspark.ml.functions import vector_to_array

# Replace the one-element vector with its single value so the column is a plain number.
songs_df = songs_df.withColumn(
    "duration_ms_scaled",
    vector_to_array(col("duration_ms_scaled")).getItem(0),
)
songs_df.show()

+-------+-------+-------+------+------+------+------+------+------+------+------+--------+-------+------------+-------------+-----------+------------------+
|artist1|artist2|artist3|genre1|genre2|genre3|genre4|genre5|genre6|genre7|genre8|album_id|song_id|release_year|release_month|release_day|duration_ms_scaled|
+-------+-------+-------+------+------+------+------+------+------+------+------+--------+-------+------------+-------------+-----------+------------------+
|   36.0|   24.0|   -1.0|   2.0|  12.0|  63.0|  -1.0|  -1.0|  -1.0|  -1.0|  -1.0|    61.0|   67.0|        2021|            7|          9|3.4865527406934027|
|   25.0|   -1.0|   -1.0|   2.0|  -1.0|  -1.0|  -1.0|  -1.0|  -1.0|  -1.0|  -1.0|    83.0|   73.0|        2021|            9|         10| 3.264655498108459|
|   70.0|   29.0|   -1.0|  28.0|  38.0|  63.0|  31.0|  -1.0|  -1.0|  -1.0|  -1.0|    20.0|   89.0|        2016|            7|         29| 6.022819783225246|
|   77.0|   39.0|   52.0|  77.0|  66.0|  -1.0|  -1.0|  -1.

In [25]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine

In [26]:
# Collect the Spark DataFrame into a pandas DataFrame for the similarity computation below.
data = songs_df.toPandas()

In [27]:
# Preview the first rows of the collected feature table.
data.head()

Unnamed: 0,artist1,artist2,artist3,genre1,genre2,genre3,genre4,genre5,genre6,genre7,genre8,album_id,song_id,release_year,release_month,release_day,duration_ms_scaled
0,36.0,24.0,-1.0,2.0,12.0,63.0,-1.0,-1.0,-1.0,-1.0,-1.0,61.0,67.0,2021,7,9,3.486553
1,25.0,-1.0,-1.0,2.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,83.0,73.0,2021,9,10,3.264655
2,70.0,29.0,-1.0,28.0,38.0,63.0,31.0,-1.0,-1.0,-1.0,-1.0,20.0,89.0,2016,7,29,6.02282
3,77.0,39.0,52.0,77.0,66.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,88.0,50.0,2012,10,9,6.351842
4,28.0,5.0,-1.0,85.0,64.0,63.0,68.0,25.0,-1.0,-1.0,-1.0,11.0,52.0,2017,2,16,5.428307


In [28]:
# Separate the row labels (numeric song indexes) from the feature matrix.
features = data.drop(columns='song_id').to_numpy()
song_ids = data['song_id'].to_numpy()

In [29]:
# Pairwise cosine similarity for all songs in one matrix product:
# normalise every feature row to unit length, then the dot product of two
# rows is their cosine similarity. This replaces n*n Python-level calls to
# scipy's cosine() with a single vectorised operation.
_feature_matrix = np.asarray(features, dtype=float)
_row_norms = np.linalg.norm(_feature_matrix, axis=1, keepdims=True)
_unit_rows = _feature_matrix / _row_norms  # an all-zero row yields NaN
# Clip rounding noise so values stay within the valid cosine range.
similarity_matrix = np.clip(_unit_rows @ _unit_rows.T, -1.0, 1.0)
similarity_df = pd.DataFrame(similarity_matrix, index=song_ids, columns=song_ids)

In [38]:
def top_recommendations(song_id, n=5):
  """Return the index labels of the ``n`` songs most similar to ``song_id``.

  Reads the row for ``song_id`` from the module-level ``similarity_df``,
  orders it from most to least similar, and leaves out ``song_id`` itself.
  """
  ranked = similarity_df.loc[song_id].sort_values(ascending=False)
  is_other_song = ranked.index != song_id
  return ranked[is_other_song].index[:n]

In [39]:
# Tabular form of song_mapping: one row per song with its Spotify id and numeric index.
song_map = pd.DataFrame({
    'song_id': list(song_mapping.keys()),
    'song_index': list(song_mapping.values()),
})

In [50]:
# Numeric song index (a value from song_map['song_index'], not a Spotify id)
# of the song to get recommendations for.
rec_idx = 72
recommendations = top_recommendations(rec_idx)

In [51]:
# Numeric index -> Spotify id, built once instead of filtering song_map for
# every recommended song.
spotify_id_by_index = dict(zip(song_map['song_index'], song_map['song_id']))

# Requested song: translate its index back to a Spotify id and print its details.
rec_id = song_map[song_map['song_index'] == rec_idx]['song_id']
for details in song_details_col.find({'song_id': rec_id.values[0]}, {'_id': 0}):
  print(f"Requested Song: {details['song_name']}, Album: {details['album_name']}")

# Recommended songs, most similar first. The recommendation labels come from
# a float column, so cast to int before the lookup. The Spotify id is held in
# `spotify_id` rather than `id`, which would shadow the builtin for the rest
# of the notebook.
for rec_index in recommendations:
  spotify_id = spotify_id_by_index[int(rec_index)]
  for details in song_details_col.find({'song_id': spotify_id}, {'_id': 0}):
    print(f"Song: {details['song_name']}, Album: {details['album_name']}")

Requested Song: God's Plan, Album: Scorpion
Song: One Dance, Album: Views
Song: HUMBLE., Album: DAMN.
Song: Ride, Album: Blurryface
Song: VIBEZ, Album: KIRK
Song: It Ain't Me (with Selena Gomez), Album: It Ain't Me (with Selena Gomez)
