<a href="https://colab.research.google.com/github/kenbuii/groovemeter/blob/main/DATASCI_FINAL_PROJECT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Final Project for DATASCI 112:
Authors: Kenneth Bui and Amaryllis Gao

In this project, we aim to predict the *danceability* of a song from a set of input features, and to build a small web app in which users enter their Spotify user ID and receive a brief danceability report.


In [None]:
# import packages
import os
import json
import requests
import pandas as pd

# Data Collection: authorizing Spotify API calls

In [None]:
import requests
from requests.auth import HTTPBasicAuth

# Credentials are read from the environment so they are never stored in the
# notebook. Set SPOTIFY_CLIENT_ID and SPOTIFY_CLIENT_SECRET before running.
client_id = os.environ['SPOTIFY_CLIENT_ID']
client_secret = os.environ['SPOTIFY_CLIENT_SECRET']

auth_url = 'https://accounts.spotify.com/api/token'
auth_data = {
    'grant_type': 'client_credentials'
}
auth_response = requests.post(auth_url, data=auth_data, auth=HTTPBasicAuth(client_id, client_secret))

# Fail here, with the HTTP error, instead of leaving `access_token` undefined
# and hitting a NameError in a later cell.
auth_response.raise_for_status()
access_token = auth_response.json()['access_token']


In [None]:
# Confirm that a token was issued without echoing the secret into the saved output.
bool(access_token)

True

# Making API calls


In [None]:
def get_data(url: str, access_token: str, verbose: bool = False):
    """Send an authorized GET request and return the decoded JSON body.

    Args:
        url: Fully formed endpoint URL, including any query string.
        access_token: OAuth access token. A bare token is sent as
            ``Bearer <token>``; a value that already starts with ``Bearer ``
            is sent unchanged.
        verbose: When True, print the decoded response body.

    Returns:
        The response body parsed with ``json.loads``.
    """
    # The other request helpers in this notebook send "Bearer <token>", so a
    # bare token is normalised to that form here.
    if access_token.lower().startswith('bearer '):
        authorization = access_token
    else:
        authorization = f'Bearer {access_token}'

    response = requests.get(url, headers={'Authorization': authorization})
    result = json.loads(response.text)

    if verbose:
        print('Response body:\n', result)

    return result

# Data Collection: Getting Tracks


In [None]:
def get_tracks(genres_list: list, steps: int, limit: int, offset: int, access_token: str):
    """Page through Spotify track search results for each genre.

    For every genre, ``steps`` consecutive pages of ``limit`` tracks are
    requested, starting at ``offset``.

    Args:
        genres_list: Genre names to search for.
        steps: Number of pages to request per genre.
        limit: Page size passed to the search endpoint.
        offset: Offset of the first page; each genre restarts from it.
        access_token: Token forwarded to ``get_data``.

    Returns:
        A DataFrame with the columns ``track_id``, ``track_name``,
        ``artist_name``, ``popularity`` and ``genre``; one row per track
        returned. The columns are present even when no rows were collected.
    """
    from urllib.parse import quote

    columns = ['track_id', 'track_name', 'artist_name', 'popularity', 'genre']
    # Rows are collected in a list and turned into a frame once:
    # DataFrame.append no longer exists in pandas 2.x.
    records = []

    for genre in genres_list:
        # Percent-encode the query so a genre such as "R&B" is not split into
        # an extra URL parameter at the "&".
        query = quote('genre:{}'.format(genre))

        for step in range(steps):
            page_offset = offset + step * limit
            url = 'https://api.spotify.com/v1/search?q={}&type=track&limit={}&offset={}'.format(query, limit, page_offset)
            search_item = get_data(url, access_token)

            # Iterate over what was actually returned; a page may hold fewer
            # than `limit` items.
            for item in search_item['tracks']['items']:
                records.append({
                    'track_id': item['id'],
                    'track_name': item['name'],
                    'artist_name': item['artists'][0]['name'],
                    'popularity': item['popularity'],
                    'genre': genre
                })

    return pd.DataFrame(records, columns=columns)


# Data Collection: Genre and Artists
Here, we scrape the Spotify API to find the top 10 artists for the following genres:

1.   Pop
2.   Indie
3.   R&B

We remove duplicates from each category, and extract the top 10 songs of each artist.

In [None]:
import requests

# Define a function to get the top artists for a genre
def get_top_artists_for_genre(access_token, genre, limit=10):
    """Search Spotify for artists tagged with ``genre``.

    Args:
        access_token: Bearer token for the Spotify Web API.
        genre: Genre name; it is sent quoted inside the ``genre:`` filter.
        limit: Maximum number of artists requested.

    Returns:
        The list of artist objects from the response, or an empty list when
        the response has no ``artists`` key (the payload is printed).
    """
    url = "https://api.spotify.com/v1/search"
    # Passing the query through `params` lets requests percent-encode it, so a
    # genre such as "R&B" is not split into a separate URL parameter.
    params = {"type": "artist", "limit": limit, "q": f'genre:"{genre}"'}
    headers = {
        "Authorization": f"Bearer {access_token}"
    }
    response = requests.get(url, params=params, headers=headers)
    response_json = response.json()
    if 'artists' not in response_json:
        print('Error fetching artists:', response_json)
        # An empty list keeps callers that iterate over the result working and
        # matches the later definition of this function in the notebook.
        return []
    return response_json['artists']['items']

# Define a function to get the top tracks for an artist
def get_top_tracks_for_artist(access_token, artist_id, limit=50):
    """Fetch an artist's top tracks for the US market.

    Args:
        access_token: Bearer token for the Spotify Web API.
        artist_id: Spotify artist ID.
        limit: Maximum number of tracks to return from the response.

    Returns:
        Up to ``limit`` track objects, or an empty list when the response has
        no ``tracks`` key (the payload is printed).
    """
    url = f"https://api.spotify.com/v1/artists/{artist_id}/top-tracks?market=US"  # Use 'market' instead of 'country'
    headers = {
        "Authorization": f"Bearer {access_token}"
    }
    response = requests.get(url, headers=headers)
    response_json = response.json()
    if 'tracks' not in response_json:
        print('Error fetching tracks:', response_json)
        # An empty list keeps callers that iterate over the result working and
        # matches the later definitions of this function in the notebook.
        return []
    # `limit` was previously accepted but ignored; apply it to the result.
    return response_json['tracks'][:limit]


# NOTE: Spotify access tokens expire, so do not paste one here. Call the
# functions above with the `access_token` produced by the authorization cell,
# and re-run that cell whenever a fresh token is needed.


# Getting the Top 10 Artists per Genre

In [None]:
import requests

def get_top_artists_for_genre(access_token, genre, limit=10):
    """Search Spotify for artists tagged with ``genre``.

    Args:
        access_token: Bearer token for the Spotify Web API.
        genre: Genre name; it is sent quoted inside the ``genre:`` filter.
        limit: Maximum number of artists requested.

    Returns:
        The list of artist objects from the response, or an empty list when
        the response has no ``artists`` key (the error is printed).
    """
    url = "https://api.spotify.com/v1/search"
    # Passing the query through `params` lets requests percent-encode it, so a
    # genre such as "R&B" is not split into a separate URL parameter.
    params = {"type": "artist", "limit": limit, "q": f'genre:"{genre}"'}
    headers = {"Authorization": f"Bearer {access_token}"}
    response = requests.get(url, params=params, headers=headers)
    response_json = response.json()
    if 'artists' not in response_json:
        print('Error fetching artists:', response_json.get('error', 'Unknown Error'))
        return []
    return response_json['artists']['items']

# Reuse the `access_token` issued by the authorization cell. A token pasted
# into this cell goes stale and produces 401 "access token expired" responses.
genres = ['pop', 'indie', 'R&B']
all_genres_top_artists = {}

# Get top artists from all genres
for genre in genres:
    print(f"Processing genre: {genre}")
    top_artists = get_top_artists_for_genre(access_token, genre)
    unique_artists = {}

    for artist in top_artists:
        artist_id = artist['id']
        artist_record = {
            'name': artist['name'],
            'popularity': artist['popularity'],
            # Stored because the lyrics cell later reads artist_info['genre'].
            'genre': genre,
        }
        already_recorded = artist_id in all_genres_top_artists

        # Keep an artist the first time it is seen, or replace an earlier
        # genre's entry when this result reports a higher popularity.
        if artist_id not in unique_artists and not already_recorded:
            unique_artists[artist_id] = artist_record
        elif already_recorded and artist_record['popularity'] > all_genres_top_artists[artist_id]['popularity']:
            unique_artists[artist_id] = artist_record

    # Sort the unique artists by popularity and take the top 10 for the current genre
    sorted_unique_artists = sorted(unique_artists.values(), key=lambda x: x['popularity'], reverse=True)[:10]
    all_genres_top_artists.update(unique_artists)  # Merge this genre's artists into the combined mapping

    # Output the top artists for the genre
    print(f"Top artists for {genre} genre:")
    for artist in sorted_unique_artists:
        print(f"- {artist['name']} (Popularity: {artist['popularity']})")
    print("\n")

# `all_genres_top_artists` now maps artist ID -> {'name', 'popularity', 'genre'}
# for every unique artist collected above.


Processing genre: pop
Error fetching artists: {'status': 401, 'message': 'The access token expired'}
Top artists for pop genre:


Processing genre: indie
Error fetching artists: {'status': 401, 'message': 'The access token expired'}
Top artists for indie genre:


Processing genre: R&B
Error fetching artists: {'status': 401, 'message': 'The access token expired'}
Top artists for R&B genre:




In [None]:
def search_artist_id(access_token, artist_name):
    """Look up the Spotify ID of the first artist matching ``artist_name``.

    Args:
        access_token: Bearer token for the Spotify Web API.
        artist_name: Free-text artist name to search for.

    Returns:
        The ``id`` of the first search result, or None when the request fails
        or no artist is found (a message is printed in both cases).
    """
    url = "https://api.spotify.com/v1/search"
    # `params` percent-encodes the name, so characters such as "&" or "#"
    # cannot alter the query string.
    params = {"q": artist_name, "type": "artist"}
    headers = {"Authorization": f"Bearer {access_token}"}
    response = requests.get(url, params=params, headers=headers)

    # An error response is not guaranteed to carry a JSON body.
    try:
        response_json = response.json()
    except ValueError:
        response_json = {}

    if response.status_code != 200:
        error = response_json.get('error', {}) if isinstance(response_json, dict) else {}
        message = error.get('message', 'Unknown Error') if isinstance(error, dict) else error
        print('Error searching for artist:', message)
        return None

    artists = response_json.get('artists', {}).get('items', [])
    if artists:
        # Assuming we want the first search result
        return artists[0]['id']
    else:
        print(f"No artist found for name: {artist_name}")
        return None

# Reuse the `access_token` issued by the authorization cell. A token pasted
# into this cell goes stale and every lookup then fails with
# "The access token expired".
artist_names = ['Taylor Swift', 'Drake', "The Weeknd", "Ariana Grande", "Sza", "Lana Del Rey", "Rihanna", "Ty Dolla $ign", "Olivia Rodrigo", "Lil Wayne"]

artist_ids = {}

for artist_name in artist_names:
    artist_id = search_artist_id(access_token, artist_name)
    # Names that could not be resolved are skipped.
    if artist_id:
        artist_ids[artist_name] = artist_id

# `artist_ids` maps artist name -> artist ID. The IDs alone are kept in
# `chicken`, which the track-collection cell reads as its list of artist IDs.
chicken = artist_ids.values()
chicken

Error searching for artist: The access token expired
Error searching for artist: The access token expired
Error searching for artist: The access token expired
Error searching for artist: The access token expired
Error searching for artist: The access token expired
Error searching for artist: The access token expired
Error searching for artist: The access token expired
Error searching for artist: The access token expired
Error searching for artist: The access token expired
Error searching for artist: The access token expired


dict_values([])

In [None]:
import requests

def get_top_tracks_for_artist(access_token, artist_id, market='US', limit=10):
    """Fetch an artist's top tracks for a market.

    Args:
        access_token: Bearer token for the Spotify Web API.
        artist_id: Spotify artist ID.
        market: Market code placed in the ``market`` query parameter.
        limit: Maximum number of tracks to return from the response.

    Returns:
        Up to ``limit`` track objects, or an empty list when the response has
        no ``tracks`` key (the error is printed).
    """
    url = f"https://api.spotify.com/v1/artists/{artist_id}/top-tracks?market={market}"
    headers = {"Authorization": f"Bearer {access_token}"}
    response = requests.get(url, headers=headers)
    response_json = response.json()
    if 'tracks' not in response_json:
        print('Error fetching tracks:', response_json.get('error', 'Unknown Error'))
        return []
    # `limit` was previously accepted but ignored; apply it to the result.
    return response_json['tracks'][:limit]

def get_audio_features_for_track(access_token, track_id):
    """Return the audio-features payload for one track.

    The decoded response is returned as-is unless it contains an ``error``
    key; in that case the error message is printed and an empty dict is
    returned.
    """
    endpoint = f"https://api.spotify.com/v1/audio-features/{track_id}"
    payload = requests.get(endpoint, headers={"Authorization": f"Bearer {access_token}"}).json()

    # Successful responses are passed straight through.
    if 'error' not in payload:
        return payload

    print('Error fetching audio features:', payload.get('error', {}).get('message', 'Unknown Error'))
    return {}

# Collect the top tracks and their audio features for every artist ID in
# `chicken`. The `access_token` issued by the authorization cell is reused
# here; a token pasted into this cell goes stale.
artist_ids = chicken
# This dictionary will store all the information
all_tracks_info = {}

for artist_id in artist_ids:
    top_tracks = get_top_tracks_for_artist(access_token, artist_id)
    all_tracks_info[artist_id] = []

    for track in top_tracks:
        # A separate name keeps the per-track dict distinct from `track_info`,
        # which is bound to the whole mapping at the end of this cell.
        track_record = {
            'name': track['name'],
            'popularity': track['popularity'],
            'id': track['id']
        }
        audio_features = get_audio_features_for_track(access_token, track['id'])

        # Add audio features to the track information
        track_record.update(audio_features)

        # Append the track information to the artist's list of tracks
        all_tracks_info[artist_id].append(track_record)

# `all_tracks_info` maps artist ID -> list of track dicts.
# `track_info` is kept as an alias because the next cell displays it.
track_info = all_tracks_info


In [None]:
# Inspect the artist-ID -> track-list mapping assigned to `track_info` in the previous cell.
track_info

{}

In [None]:
# Flatten the per-artist track lists into one list of track dicts.
# NOTE(review): the indie and R&B cells below read the same `all_tracks_info`,
# so this frame holds whatever the track-collection cell last produced —
# confirm the collection cell was run with the pop artists before this one.
tracks_list = [
    row
    for artist_rows in all_tracks_info.values()
    for row in artist_rows
]

# One row per track
pop_df = pd.DataFrame(tracks_list)

# Preview the first rows
pop_df.head()

In [None]:
# Flatten the per-artist track lists into one list of track dicts.
# NOTE(review): the pop and R&B cells read the same `all_tracks_info`, so this
# frame holds whatever the track-collection cell last produced — confirm the
# collection cell was re-run with the indie artists before this one.
tracks_list = [
    row
    for artist_rows in all_tracks_info.values()
    for row in artist_rows
]

# One row per track
indie_df = pd.DataFrame(tracks_list)

# Preview the first rows
indie_df.head()

In [None]:
# Flatten the per-artist track lists into one list of track dicts.
# NOTE(review): the pop and indie cells read the same `all_tracks_info`, so
# this frame holds whatever the track-collection cell last produced — confirm
# the collection cell was re-run with the R&B artists before this one.
tracks_list = [
    row
    for artist_rows in all_tracks_info.values()
    for row in artist_rows
]

# One row per track
rnb_df = pd.DataFrame(tracks_list)

# Preview the first rows
rnb_df.head()

In [None]:
#time to make a big boy df
# Load an existing CSV from the working directory; this raises
# FileNotFoundError when "final_music_df.csv" is not present.
big_boi = pd.read_csv("final_music_df.csv")

# Stack the three genre frames and the loaded CSV, then export the result.
frames = [pop_df, indie_df, rnb_df, big_boi]
# NOTE(review): `chicken` held the artist-ID collection in an earlier cell.
# Rebinding it to a DataFrame means re-running the track-collection cell after
# this one would iterate over this frame instead — consider a distinct name.
chicken = pd.concat(frames)
chicken.to_csv('biggie_boi.csv', index=False)

FileNotFoundError: [Errno 2] No such file or directory: 'final_music_df .csv'

# Get the other Spotify characteristics

In [None]:
import requests

def get_top_tracks_for_artist(access_token, artist_id, market='US', limit=10):
    """Fetch an artist's top tracks for a market.

    Args:
        access_token: Bearer token for the Spotify Web API.
        artist_id: Spotify artist ID.
        market: Market code placed in the ``market`` query parameter.
        limit: Maximum number of tracks to return from the response.

    Returns:
        Up to ``limit`` track objects, or an empty list when the response has
        no ``tracks`` key (the error is printed).
    """
    url = f"https://api.spotify.com/v1/artists/{artist_id}/top-tracks?market={market}"
    headers = {"Authorization": f"Bearer {access_token}"}
    response = requests.get(url, headers=headers)
    response_json = response.json()
    if 'tracks' not in response_json:
        print('Error fetching tracks:', response_json.get('error', 'Unknown Error'))
        return []
    # `limit` was previously accepted but ignored; apply it to the result.
    return response_json['tracks'][:limit]

def get_audio_features_for_track(access_token, track_id):
    """Return the audio-features payload for one track.

    The decoded response is returned as-is unless it contains an ``error``
    key; in that case the error message is printed and an empty dict is
    returned.
    """
    endpoint = f"https://api.spotify.com/v1/audio-features/{track_id}"
    payload = requests.get(endpoint, headers={"Authorization": f"Bearer {access_token}"}).json()

    # Successful responses are passed straight through.
    if 'error' not in payload:
        return payload

    print('Error fetching audio features:', payload.get('error', {}).get('message', 'Unknown Error'))
    return {}

# Example usage
# The `access_token` issued by the authorization cell is reused here; a token
# pasted into this cell goes stale.
artist_ids = ['ARTIST_ID_1', 'ARTIST_ID_2'] # Replace with actual artist IDs

# Print each artist's top tracks followed by that track's audio features.
for artist_id in artist_ids:
    top_tracks = get_top_tracks_for_artist(access_token, artist_id)
    for track in top_tracks:
        print(f"- {track['name']} (Popularity: {track['popularity']})")
        track_features = get_audio_features_for_track(access_token, track['id'])
        print(f"Audio Features: {track_features}\n")


In [None]:
# The Genius API key is read from the GENIUS_API_KEY environment variable so
# it does not have to be written into the notebook. The placeholder is kept as
# the fallback when the variable is unset.
GENIUS_API_KEY = os.environ.get('GENIUS_API_KEY', 'your_genius_api_key_here')

def get_song_id(song_name, artist_name):
    """Find the Genius song ID for a song by a given artist.

    Args:
        song_name: Song title used in the search query.
        artist_name: Artist name; appended to the query and used to filter
            the hits.

    Returns:
        The ``id`` of the first hit whose primary artist name equals
        ``artist_name`` (case-insensitive), or None when the request does not
        return status 200 or no hit matches.
    """
    # HTTPS keeps the bearer token off the wire in clear text.
    base_url = "https://api.genius.com"
    headers = {'Authorization': 'Bearer ' + GENIUS_API_KEY}
    search_url = base_url + "/search"
    # The query belongs in the URL query string (`params`), not in a GET
    # request body (`data`).
    params = {'q': song_name + ' ' + artist_name}
    response = requests.get(search_url, params=params, headers=headers)

    if response.status_code != 200:
        return None

    hits = response.json()['response']['hits']
    wanted_artist = artist_name.lower()

    # Hits are checked in order; an empty hit list falls through to None.
    for hit in hits:
        if hit['result']['primary_artist']['name'].lower() == wanted_artist:
            return hit['result']['id']

    return None

def get_lyrics_from_id(song_id):
    """Fetch the lyrics text for a Genius song ID.

    The song is looked up through the Genius API to obtain its page path; the
    page is then downloaded and parsed with BeautifulSoup.

    Args:
        song_id: Genius song ID.

    Returns:
        The text of the page's ``div`` with class ``lyrics``, or None when
        either request does not return status 200 or no such element exists.
    """
    # HTTPS keeps the bearer token off the wire in clear text.
    base_url = "https://api.genius.com"
    headers = {'Authorization': 'Bearer ' + GENIUS_API_KEY}
    song_url = base_url + f"/songs/{song_id}"
    response = requests.get(song_url, headers=headers)

    if response.status_code != 200:
        return None

    path = response.json()['response']['song']['path']

    # Constructing the URL to the song's lyrics page
    lyrics_url = "https://genius.com" + path
    page = requests.get(lyrics_url)

    # Do not try to parse an error page.
    if page.status_code != 200:
        return None

    # Imported here, as before, so bs4 is only needed when a page is parsed.
    from bs4 import BeautifulSoup
    html = BeautifulSoup(page.text, "html.parser")

    # Remove script tags
    for script_tag in html('script'):
        script_tag.extract()

    # NOTE(review): this selector is illustrative and depends on the structure
    # of the Genius lyrics page, which may have changed — verify against a
    # live page.
    lyrics_div = html.find("div", class_="lyrics")
    if lyrics_div:
        return lyrics_div.get_text()

    return None

def get_lyrics_for_song(song_name, artist_name):
    """Resolve a song to its Genius ID and return its lyrics.

    Returns None when ``get_song_id`` yields a falsy ID; otherwise returns
    whatever ``get_lyrics_from_id`` returns for that ID.
    """
    matched_id = get_song_id(song_name, artist_name)
    return get_lyrics_from_id(matched_id) if matched_id else None

# Example usage
# lyrics = get_lyrics_for_song("Someone Like You", "Adele")
# print(lyrics)


In [None]:
import pandas as pd

def get_lyrics_for_song(song_name, artist_name):
    """Return the lyrics for a song, or None when they cannot be found.

    This definition replaces the Genius-backed one from the previous cell, so
    it delegates to the same helpers instead of being an empty stub that
    always returns None.

    Args:
        song_name: Song title.
        artist_name: Artist name.

    Returns:
        The result of ``get_lyrics_from_id`` for the matched song, or None
        when ``get_song_id`` finds no match.
    """
    song_id = get_song_id(song_name, artist_name)
    if not song_id:
        return None
    return get_lyrics_from_id(song_id)

# Create an empty list to store song data
songs_data = []

# Loop through the artists collected in `all_genres_top_artists`
for artist_id, artist_info in all_genres_top_artists.items():
    # The genre loop that builds `all_genres_top_artists` stores 'name' and
    # 'popularity'; 'genre' and 'top_tracks' may be absent, so read them
    # defensively instead of raising KeyError.
    genre = artist_info.get('genre')
    artist_name = artist_info['name']

    top_tracks = artist_info.get('top_tracks')
    if top_tracks is None:
        # No cached tracks on the record: fetch them for this artist.
        top_tracks = get_top_tracks_for_artist(access_token, artist_id)

    for song in top_tracks:
        song_name = song['name']
        # Fetch lyrics for each song
        lyrics = get_lyrics_for_song(song_name, artist_name)
        # Append a dictionary with the song data to the list
        songs_data.append({
            'genre': genre,
            'artist': artist_name,
            'song name': song_name,
            'songlyrics': lyrics
        })

# Convert the list of song data into a pandas DataFrame; the columns are
# named explicitly so they exist even when no songs were collected.
songs_df = pd.DataFrame(songs_data, columns=['genre', 'artist', 'song name', 'songlyrics'])

# Show the first few rows using the notebook's rich display
songs_df.head()
