# **Import Libraries**

In [1]:
import os
import numpy as np
import pandas as pd

import seaborn as sns
import plotly.express as px 
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist

import warnings
warnings.filterwarnings("ignore")

# **Read Data**

In [3]:
data = pd.read_csv("data/AWS_data.csv")
orig = pd.read_csv("data/data.csv")
genre_data = pd.read_csv('data/data_by_genres.csv')
year_data = pd.read_csv('data/data_by_year.csv')

In [4]:
data.shape

(189340, 18)

In [5]:
orig.drop(columns='release_date', inplace=True)  # Drop 'release_date' from orig
orig.shape

(170653, 18)

In [6]:
# Step 1: Find IDs in 'orig' that are not in 'data'
ids_not_in_data = ~orig['id'].isin(data['id'])

# Step 2: Filter rows in 'orig' where ID is not in 'data'
rows_to_append = orig[ids_not_in_data]

# Step 3: Append these rows to 'data'
data = pd.concat([data, rows_to_append], ignore_index=True)

In [7]:
data.shape

(332594, 18)

In [8]:
data.columns

Index(['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'id', 'duration_ms', 'year', 'artists', 'explicit', 'name',
       'popularity'],
      dtype='object')

In [7]:
data.to_csv('full_dataset.csv', index=False)

# **Clustering Genres with K-Means**

Here, the simple K-means clustering algorithm is used to divide the genres in this dataset into ten clusters based on the numerical audio features of each genres.

In [8]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

cluster_pipeline = Pipeline([('scaler', StandardScaler()), ('kmeans', KMeans(n_clusters=10))])
X = genre_data.select_dtypes(np.number)
cluster_pipeline.fit(X)
genre_data['cluster'] = cluster_pipeline.predict(X)

# **Clustering Songs with K-Means**

In [9]:
song_cluster_pipeline = Pipeline([('scaler', StandardScaler()), 
                                  ('kmeans', KMeans(n_clusters=20, 
                                   verbose=False))
                                 ], verbose=False)

X = data.select_dtypes(np.number)
number_cols = list(X.columns)
song_cluster_pipeline.fit(X)
song_cluster_labels = song_cluster_pipeline.predict(X)
data['cluster_label'] = song_cluster_labels

# **Build Recommender System**

* Based on the analysis and visualizations, it’s clear that similar genres tend to have data points that are located close to each other while similar types of songs are also clustered together.
* This observation makes perfect sense. Similar genres will sound similar and will come from similar time periods while the same can be said for songs within those genres. We can use this idea to build a recommendation system by taking the data points of the songs a user has listened to and recommending songs corresponding to nearby data points.
* [Spotipy](https://spotipy.readthedocs.io/en/2.16.1/) is a Python client for the Spotify Web API that makes it easy for developers to fetch data and query Spotify’s catalog for songs. You have to install using `pip install spotipy`
* After installing Spotipy, you will need to create an app on the [Spotify Developer’s page](https://developer.spotify.com/) and save your Client ID and secret key.

In [10]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from collections import defaultdict

sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id="dca629dcc8b9492eb6f77404760b5728",
                                                           client_secret="356a267954824137922055ae5ca81dc4"))

def find_song(name, year):
    song_data = defaultdict()
    results = sp.search(q= 'track: {} year: {}'.format(name,year), limit=1)
    if results['tracks']['items'] == []:
        return None

    results = results['tracks']['items'][0]
    track_id = results['id']
    audio_features = sp.audio_features(track_id)[0]

    song_data['name'] = [name]
    song_data['year'] = [year]
    song_data['explicit'] = [int(results['explicit'])]
    song_data['duration_ms'] = [results['duration_ms']]
    song_data['popularity'] = [results['popularity']]

    for key, value in audio_features.items():
        song_data[key] = value

    return pd.DataFrame(song_data)

In [11]:
from collections import defaultdict
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
import difflib

number_cols = ['valence', 'year', 'acousticness', 'danceability', 'duration_ms', 'energy', 'explicit',
 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'popularity', 'speechiness', 'tempo']

def get_song_data(song, spotify_data):
    try:
        song_data = spotify_data[(spotify_data['name'] == song['name']) & (spotify_data['year'] == song['year'])].iloc[0]
        return song_data
    except IndexError:
        return find_song(song['name'], song['year'])

def get_mean_vector(song_list, spotify_data):
    song_vectors = []

    for song in song_list:
        song_data = get_song_data(song, spotify_data)
        if song_data is None:
            print(f'Warning: {song["name"]} does not exist in Spotify or in database')
            continue
        try:
            song_vector = np.array([float(song_data[col]) for col in number_cols if col in song_data and song_data[col] is not None])
            if song_vector.size == len(number_cols):  # Ensure vector is complete
                song_vectors.append(song_vector)
            else:
                print(f'Warning: Incomplete data for {song["name"]}')
        except Exception as e:
            print(f'Error processing song {song["name"]}: {str(e)}')
            continue

    if not song_vectors:
        return None  # Handle case where no valid songs are found

    song_matrix = np.array(song_vectors)
    return np.mean(song_matrix, axis=0)

def flatten_dict_list(dict_list):
    
    flattened_dict = defaultdict()
    for key in dict_list[0].keys():
        flattened_dict[key] = []
    
    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)
            
    return flattened_dict

def recommend_songs(song_list, spotify_data, n_songs=5):
    metadata_cols = ['name', 'year', 'artists']
    song_dict = flatten_dict_list(song_list)

    song_center = get_mean_vector(song_list, spotify_data)
    if song_center is None:
        print('No valid song data available for recommendations.')
        return []

    scaler = song_cluster_pipeline.named_steps['scaler']
    feature_order = scaler.feature_names_in_ if hasattr(scaler, 'feature_names_in_') else number_cols

    if song_center.shape[0] != len(feature_order):
        print(f"Error: Expected {len(feature_order)} features, got {song_center.shape[0]}")
        return []

    scaled_data = scaler.transform(spotify_data[feature_order])
    scaled_song_center = scaler.transform(song_center.reshape(1, -1))

    distances = cdist(scaled_song_center, scaled_data, 'cosine')
    index = list(np.argsort(distances)[:, :n_songs * 10][0])

    recommended_songs = spotify_data.iloc[index]
    recommended_songs = recommended_songs[~recommended_songs.apply(lambda x: any(
        (x['name'] == song['name'] and x['year'] == song['year'] and x['artists'] == song['artists']) for song in song_list), axis=1)]

    # Convert 'year' from float to int
    recommendations = recommended_songs[metadata_cols].head(n_songs).to_dict(orient='records')
    for song in recommendations:
        song['year'] = int(song['year'])  # Convert year to integer

    return recommendations

In [12]:
recommended_songs = recommend_songs([{'name': 'Come As You Are', 'year':1991},
                {'name': 'No Excuses', 'year': 1994},
                {'name': 'Lithium', 'year': 1992},
                {'name': 'Where Do Broken Hearts Go','year': 2014},
               ],  data)
print(recommended_songs)

[{'name': 'Forget About Us', 'year': 2001, 'artists': 'Tim McGraw'},
 {'name': 'Moving Target', 'year': 1986, 'artists': "['The Outfield']"},
 {'name': 'The Only Child', 'year': 1977, 'artists': "['Jackson Browne']"},
 {'name': 'The Dream Is Still Alive',
  'year': 1990,
  'artists': "['Wilson Phillips']"},
 {'name': 'Polynesian People', 'year': 2000, 'artists': 'Norm'}]

In [None]:
# Creating a DataFrame from the list of dictionaries
df = pd.DataFrame(recommended_songs)

# Saving the DataFrame to a CSV file
df.to_csv('recommended_songs.csv', index=False)

In [None]:
!aws s3 cp recommended_songs.csv s3://badmfinal/