Prototype of the helper functions (song search and cluster-based recommendations) for the Streamlit app.

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import pickle
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
import os
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import joblib

# Deserialize the fitted scaler and the KMeans model a single time, at start-up.
# NOTE(review): joblib.load unpickles the file, so these artifacts must come
# from a trusted source.
spotify_scaler = joblib.load('../scaler/spotify_scaler.pkl')
kmeans_model = joblib.load('../models/kmeans_model.pkl')

# Spotify API credentials are read from the environment (None when unset).
SPOTIPY_CLIENT_ID = os.environ.get('SPOTIPY_CLIENT_ID')
SPOTIPY_CLIENT_SECRET = os.environ.get('SPOTIPY_CLIENT_SECRET')

# Shared Spotify client, authenticated with the client-credentials flow.
spotify = spotipy.Spotify(
    client_credentials_manager=SpotifyClientCredentials(
        client_id=SPOTIPY_CLIENT_ID,
        client_secret=SPOTIPY_CLIENT_SECRET,
    )
)

The user enters a song name, and the Spotify API returns several matching tracks for the user to choose from.

In [2]:
def search_song(song_name, popularity_range, year_range, hot_songs_only,
                limit=15, hot_popularity_threshold=50):
    """
    Search Spotify for tracks matching a song name and return them as a DataFrame.

    Parameters
    ----------
    song_name : str
        Free-text query passed to the Spotify search endpoint.
    popularity_range : tuple of (int, int) or None
        Inclusive (min, max) popularity bounds; a falsy value disables the filter.
    year_range : tuple of (int, int) or None
        Inclusive (min, max) release-year bounds; a falsy value disables the
        filter. When the filter is active, tracks whose release year cannot be
        parsed are dropped.
    hot_songs_only : bool
        When true, drop tracks whose popularity is below
        ``hot_popularity_threshold``.
    limit : int, optional
        Maximum number of tracks requested from Spotify (default 15).
    hot_popularity_threshold : int, optional
        Minimum popularity kept when ``hot_songs_only`` is set (default 50).

    Returns
    -------
    pandas.DataFrame or None
        ``None`` when the search returns no tracks at all. Otherwise one row per
        track that passed the filters (possibly zero rows), with the columns
        spotify_title, spotify_artist, album, release_date, year, popularity,
        duration_ms, explicit, album_cover and genres.
    """
    results = spotify.search(q=song_name, type='track', limit=limit)
    items = results['tracks']['items']

    if not items:
        return None

    # Genres live on the artist, not the track. Cache them per artist id so each
    # artist is fetched at most once per search.
    genres_by_artist = {}
    tracks_info = []

    for track in items:
        album = track['album']
        release_date = album.get('release_date')

        # release_date is 'YYYY', 'YYYY-MM' or 'YYYY-MM-DD'; keep only the year
        # and tolerate a missing/empty value instead of crashing.
        try:
            release_year = int((release_date or '').split('-')[0])
        except ValueError:
            release_year = None

        popularity = int(track['popularity'])

        # Apply the cheap local filters first so that rejected tracks never
        # trigger an artist lookup over the network.
        if popularity_range and not (popularity_range[0] <= popularity <= popularity_range[1]):
            continue
        if year_range and (
            release_year is None
            or not (year_range[0] <= release_year <= year_range[1])
        ):
            continue
        if hot_songs_only and popularity < hot_popularity_threshold:
            continue

        # Only the first listed artist is used for the name and the genres.
        primary_artist = track['artists'][0]
        artist_id = primary_artist['id']
        if artist_id not in genres_by_artist:
            genres_by_artist[artist_id] = spotify.artist(artist_id)['genres']

        images = album['images']
        tracks_info.append({
            'spotify_title': track['name'],
            'spotify_artist': primary_artist['name'],
            'album': album['name'],
            'release_date': release_date,
            'year': release_year,
            'popularity': popularity,
            'duration_ms': track['duration_ms'],
            'explicit': track['explicit'],
            'album_cover': images[0]['url'] if images else None,
            # Copy so rows sharing an artist do not share one list object.
            'genres': list(genres_by_artist[artist_id]),
        })

    return pd.DataFrame(tracks_info)

In [3]:
# Example search: any popularity, releases from 1900 to 2024, hot songs only.
track_info_df = search_song(
    "Ketchup",
    popularity_range=(0, 100),
    year_range=(1900, 2024),
    hot_songs_only=True,
)
track_info_df.head(10)


Unnamed: 0,spotify_title,spotify_artist,album,release_date,year,popularity,duration_ms,explicit,album_cover,genres
0,The Ketchup Song (Aserejé) - Spanglish Version,Las Ketchup,Hijas del Tomate,2002,2002,65,213973,False,https://i.scdn.co/image/ab67616d0000b27354dbaa...,[flamenco electronica]
1,The Ketchup Song (Aserejé) - Spanish Version,Las Ketchup,Aserejé (The Ketchup Song),2002,2002,60,213000,False,https://i.scdn.co/image/ab67616d0000b2737cbbd0...,[flamenco electronica]
2,Macarena,Los Del Rio,Hits Of The 90s,2002-05-20,2002,63,222626,False,https://i.scdn.co/image/ab67616d0000b273b32c68...,[tropical]
3,KETCHUP,2115,RODZINNY BIZNES,2022-12-16,2022,52,185973,True,https://i.scdn.co/image/ab67616d0000b273796d2a...,[polish hip hop]
4,King Kong,HBz,King Kong,2021-05-21,2021,55,156120,False,https://i.scdn.co/image/ab67616d0000b273f30f1a...,"[hypertechno, melbourne bounce international]"


The selected song is assigned to a cluster, and the 10 most popular songs from that cluster are recommended.

In [4]:
def get_recommendations(selected_track_info, n_recommendations=10,
                        features_path='../data/6_spotify_numerical_scaled.csv',
                        clustered_path='../data/8_spotify_million_tracks_clustered.csv'):
    """
    Recommend the most popular songs from the cluster the selected song falls into.

    Parameters
    ----------
    selected_track_info : mapping (dict or pandas.Series)
        Must provide the keys 'release_date' (a string starting with the year),
        'popularity', 'duration_ms', 'explicit', 'genres', 'spotify_title' and
        'spotify_artist'.
    n_recommendations : int, optional
        Number of songs to return (default 10).
    features_path : str, optional
        CSV whose header defines the genre feature columns. Only the header is
        read.
    clustered_path : str, optional
        CSV of tracks containing a 'kmeans_cluster' column plus the returned
        columns.

    Returns
    -------
    pandas.DataFrame
        Up to ``n_recommendations`` rows with the columns spotify_title,
        spotify_artist, popularity and album_cover, ordered by descending
        popularity.
    """
    # 1. Build a single-row frame with the columns used for scaling and modeling
    user_song_data = {
        'release_date': int(selected_track_info['release_date'].split('-')[0]),
        'popularity': selected_track_info['popularity'],
        'duration_ms': selected_track_info['duration_ms'],
        'explicit': selected_track_info['explicit'],
    }

    # Only the column names are needed, so read the header alone (nrows=0)
    # instead of parsing the whole file. The slice drops the first four columns
    # and the last one; the rest are treated as genre indicator columns.
    # NOTE(review): presumably the first four are the base features above and
    # the last is a non-feature column -- confirm against the CSV.
    genres_columns = pd.read_csv(features_path, nrows=0).columns[4:-1]

    track_genres = selected_track_info['genres']
    if track_genres is None:
        track_genres = []

    # One-hot encode: 1 when the song carries the genre, otherwise 0
    user_song_data.update(
        {genre: int(genre in track_genres) for genre in genres_columns}
    )

    user_song_df = pd.DataFrame([user_song_data])

    # 2. Scale the features (keeping the column names) and predict the cluster
    user_song_scaled_df = pd.DataFrame(
        spotify_scaler.transform(user_song_df),
        columns=user_song_df.columns,
    )
    cluster = kmeans_model.predict(user_song_scaled_df)[0]

    # 3. Load the clustered dataset and keep the songs in the same cluster
    clustered_df = pd.read_csv(clustered_path)
    cluster_songs = clustered_df[clustered_df['kmeans_cluster'] == cluster]

    # 4. Drop the input song itself: a row is excluded only when BOTH its title
    #    and its artist match the selected track.
    same_title = cluster_songs['spotify_title'] == selected_track_info['spotify_title']
    same_artist = cluster_songs['spotify_artist'] == selected_track_info['spotify_artist']
    recommendations = cluster_songs[~(same_title & same_artist)]
    recommendations = recommendations.nlargest(n_recommendations, 'popularity')

    return recommendations[['spotify_title', 'spotify_artist', 'popularity', 'album_cover']]

In [5]:
# Inspect a single search result (positional row 3) as a Series.
# NOTE(review): the recommendation cell below passes row 0, not this row.
track_info_df.iloc[3]

spotify_title                                               KETCHUP
spotify_artist                                                 2115
album                                               RODZINNY BIZNES
release_date                                             2022-12-16
year                                                           2022
popularity                                                       52
duration_ms                                                  185973
explicit                                                       True
album_cover       https://i.scdn.co/image/ab67616d0000b273796d2a...
genres                                             [polish hip hop]
Name: 3, dtype: object

In [6]:
# Get recommendations for the first search result (positional row 0)
recommendations = get_recommendations(track_info_df.iloc[0])
recommendations


Unnamed: 0,spotify_title,spotify_artist,popularity,album_cover
2048,Mr. Brightside,The Killers,85,https://i.scdn.co/image/ab67616d0000b273ccdddd...
6668,Breakin' Dishes,Rihanna,82,https://i.scdn.co/image/ab67616d0000b273f9f271...
1429,This Love,Maroon 5,81,https://i.scdn.co/image/ab67616d0000b27392f2d7...
382,Valerie (feat. Amy Winehouse) - Version Revisited,Mark Ronson,80,https://i.scdn.co/image/ab67616d0000b2736a5beb...
1582,Poker Face,Lady Gaga,80,https://i.scdn.co/image/ab67616d0000b273e69121...
2023,If I Ain't Got You,Alicia Keys,80,https://i.scdn.co/image/ab67616d0000b27356ff19...
2315,Driving Home for Christmas - 2019 Remaster,Chris Rea,80,https://i.scdn.co/image/ab67616d0000b2736d621c...
4061,Don't Stop The Music,Rihanna,80,https://i.scdn.co/image/ab67616d0000b273f9f271...
3643,Smooth Operator - Single Version,Sade,79,https://i.scdn.co/image/ab67616d0000b2735e25e0...
4146,Heart-Shaped Box,Nirvana,79,https://i.scdn.co/image/ab67616d0000b273aca059...
