attempt to define functions for streamlit app 

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import pickle
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
import os
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import joblib

# Load models and scaler only once
spotify_scaler = joblib.load('../scaler/spotify_scaler.pkl')
kmeans_model = joblib.load('../models/kmeans_model.pkl')


SPOTIPY_CLIENT_ID = os.getenv('SPOTIPY_CLIENT_ID')
SPOTIPY_CLIENT_SECRET = os.getenv('SPOTIPY_CLIENT_SECRET')

# Initialize Spotify client
spotify = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials(client_id=SPOTIPY_CLIENT_ID, client_secret=SPOTIPY_CLIENT_SECRET))

user input of song name and api fetches several similar songs for the user to select from

In [21]:
def search_song(song_name, popularity_range, year_range, hot_songs_only):
    """
    Search for a song and create a DataFrame with the relevant song details.
    """
    query = song_name
    results = spotify.search(q=query, type='track', limit=15)  # Limit to 50 for better result matching

    if not results['tracks']['items']:
        return None

    tracks_info = []
    
    for track in results['tracks']['items']:
        artist_id = track['artists'][0]['id']
        artist_info = spotify.artist(artist_id)
        genres = artist_info['genres']
        release_year = int(track['album']['release_date'].split('-')[0])  # Get the year as an integer

            
            # Store track data
        track_info = {
           
            'spotify_title': track['name'],
            'spotify_artist': track['artists'][0]['name'],
            'album': track['album']['name'],
            'release_date': track['album']['release_date'],
            'year': release_year,  # Add the year to the track_info

            'popularity': track['popularity'],
            'duration_ms': track['duration_ms'],
            'explicit': track['explicit'],
            'album_cover': track['album']['images'][0]['url'] if track['album']['images'] else None,
            'genres': genres
        }

        track_info['popularity'] = int(track_info['popularity'])

        # Filter based on popularity and year range
        if popularity_range and not (popularity_range[0] <= track_info['popularity'] <= popularity_range[1]):
            continue
        if year_range and not (year_range[0] <= track_info['year'] <= year_range[1]):
            continue
        if hot_songs_only and track_info['popularity'] < 50:
            continue  # Skip songs with low popularity if hot_songs_only is checked

        tracks_info.append(track_info)

    return pd.DataFrame(tracks_info)

In [22]:
track_info_df = search_song("Bohemian Rhapsody", (0, 100), (1900, 2024), True)
track_info_df.head(10)


Unnamed: 0,spotify_title,spotify_artist,album,release_date,year,popularity,duration_ms,explicit,album_cover,genres
0,Bohemian Rhapsody,Queen,Bohemian Rhapsody (The Original Soundtrack),2018-10-19,2018,71,354947,False,https://i.scdn.co/image/ab67616d0000b273e8b066...,"[classic rock, glam rock, rock]"
1,Bohemian Rhapsody - Remastered 2011,Queen,A Night At The Opera (2011 Remaster),1975-11-21,1975,78,354320,False,https://i.scdn.co/image/ab67616d0000b273e319ba...,"[classic rock, glam rock, rock]"
2,Bohemian Rhapsody,Angelina Jordan,Bohemian Rhapsody,2020-01-24,2020,56,148750,False,https://i.scdn.co/image/ab67616d0000b2738891c1...,[contemporary vocal jazz]
3,Bohemian Rhapsody - Remastered 2011,Queen,Greatest Hits (Remastered),1981-10-26,1981,52,355466,False,https://i.scdn.co/image/ab67616d0000b273bb19d0...,"[classic rock, glam rock, rock]"
4,I Want To Break Free,Queen,Bohemian Rhapsody (The Original Soundtrack),2018-10-19,2018,69,223080,False,https://i.scdn.co/image/ab67616d0000b273e8b066...,"[classic rock, glam rock, rock]"
5,Bohemian Rhapsody - Live Aid,Queen,Bohemian Rhapsody (The Original Soundtrack),2018-10-19,2018,52,147840,False,https://i.scdn.co/image/ab67616d0000b273e8b066...,"[classic rock, glam rock, rock]"
6,Paranoid Android,Radiohead,OK Computer,1997-05-28,1997,68,387213,False,https://i.scdn.co/image/ab67616d0000b273c8b444...,"[alternative rock, art rock, melancholia, oxfo..."


selected song will be assigned a cluster and the top 10 songs from its respective cluster will be recommended

In [23]:
def get_recommendations(selected_track_info, n_recommendations=10):
    """
    Main recommendation function that takes a song's details and provides song recommendations.
    """
    # 1. Use the selected track info directly (already passed as a dict)
    user_song_df = pd.DataFrame([selected_track_info])

    # 2. Scale the features and predict cluster
    scaled_features = spotify_scaler.transform(user_song_df[['popularity', 'duration_ms', 'explicit']])
    cluster = kmeans_model.predict(scaled_features)[0]

    # 3. Load clustered dataset and filter for the same cluster
    clustered_df = pd.read_csv('../data/8_spotify_million_tracks_clustered.csv')
    cluster_songs = clustered_df[clustered_df['cluster'] == cluster]

    # 4. Get recommendations excluding the input song
    recommendations = cluster_songs[
        (cluster_songs['spotify_title'] != selected_track_info['spotify_title']) |
        (cluster_songs['spotify_artist'] != selected_track_info['spotify_artist'])
    ]
    recommendations = recommendations.nlargest(n_recommendations, 'popularity')

    return recommendations[['spotify_title', 'spotify_artist', 'popularity', 'album_cover']]

In [24]:
track_info_df.iloc[3]

spotify_title                   Bohemian Rhapsody - Remastered 2011
spotify_artist                                                Queen
album                                    Greatest Hits (Remastered)
release_date                                             1981-10-26
year                                                           1981
popularity                                                       52
duration_ms                                                  355466
explicit                                                      False
album_cover       https://i.scdn.co/image/ab67616d0000b273bb19d0...
genres                              [classic rock, glam rock, rock]
Name: 3, dtype: object

In [25]:
recommendations = get_recommendations(track_info_df.iloc[3])
recommendations

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- abstract hip hop
- acoustic blues
- adult standards
- album rock
- alternative dance
- ...
