In [12]:
import csv
import pandas as pd
import hvplot.pandas
import matplotlib as plt
from pprint import pprint
import kagglehub
import os
import requests
import pickle
from dotenv import load_dotenv


# Download latest version
path = kagglehub.dataset_download("nelgiriyewithana/most-streamed-spotify-songs-2024")
# Load data if it is not already loaded
if not os.path.exists(os.path.join("data", "spotify.csv")):
    file_name = os.listdir(path)[0]
    os.rename(os.path.join(path, file_name), os.path.join("data", "spotify.csv"))

# Load to dataframe
spotify_df = pd.read_csv("data/spotify.csv", encoding='ISO-8859-1')

# Clean up column names
for i in spotify_df.columns:
    spotify_df.rename(columns={i: (i.lower()).replace(" ", "_")}, inplace=True)

# Save to pickle   
pickle_path = os.path.join("data", "spotify_df.pkl")
spotify_df.to_pickle(pickle_path)

In [None]:
# Load the .env file
load_dotenv()
lastfm_id = os.getenv("lastfm_id")
lastfm_secret = os.getenv("lastfm_secret")

lastfm_url = 'http://ws.audioscrobbler.com/2.0/'
parameters = {
    'method': 'track.getTopTags',
    'api_key': lastfm_id,
    'format': 'json',
    'autocorrect': 1
}

if os.path.exists(pickle_path):
    with open(pickle_path, 'wb') as file:
        spotify_df = pickle.load(file)
        if 'lastfm_tags' not in spotify_df.columns:
            spotify_df['lastfm_tags'] = None
        else: 
            for index, row in spotify_df[spotify_df['lastfm_tags'].apply(lambda x: x == [] or x == None)].iterrows():
                parameters['track'] = row['track']
                parameters['artist'] = row['artist']
                r = requests.get(lastfm_url, params=parameters).json()
                try:
                    tags = [tag['name'] for tag in r['toptags']['tag']]
                except KeyError:
                    print(f"Error with {row['track']} by {row['artist']}")
                    pprint(r)
                    tags = []
                spotify_df.at[index, 'lastfm_tags'] = tags
else:
    print("no pickle found: creating one now") 
    pickle.dump(spotify_df, file)

In [None]:
# Load the .env file
load_dotenv()

spotify_id = os.getenv("spotify_id")
spotify_secret = os.getenv("spotify_secret")


auth_url = "https://accounts.spotify.com/api/token"
auth_response  = requests.post(auth_url, {
    'grant_type': 'client_credentials',
    'client_id': spotify_id,
    'client_secret': spotify_secret,
})

#Convert response to JSON
auth_response_data = auth_response.json()

#Save the access token
access_token = auth_response_data['access_token']

#Need to pass access token into header to send properly formed GET request to API server
headers = {
    'Authorization': 'Bearer {token}'.format(token=access_token)
}

base_url = 'https://api.spotify.com/v1/'

if os.path.exists(path=pickle_path):
    with open(pickle_path, 'rb') as file:
        spotify_df = pickle.load(file)
        if 'spotify_id' not in spotify_df.columns:
            spotify_df['spotify_id'] = None
        for index, row in spotify_df.iterrows():
            if row['spotify_id'] == None:
                parameters = {
                'q': 'isrc:'+row['isrc'],
                'type': 'track',
                'limit' : 1
            }
                try:
                    r = requests.get(base_url + 'search', headers=headers, params=parameters).json()
                    spotify_id = r['tracks']['items'][0]['id']
                    print(f'processing row: {index} --- adding {spotify_id} as spotify_id' )
                    spotify_df.at[index, 'spotify_id'] = r['tracks']['items'][0]['id']
                except IndexError:
                    print(f"track not found by isrc: {row['isrc']}")
                    print(f'trying by track and artist: {row["track"]} by {row["artist"]}')
                    parameters = {
                        'q': f'track:{row["track"]}+artist:{row["artist"]}',
                        'type': 'track',
                        'limit' : 1
                    }
                    r = requests.get(base_url + 'search', headers=headers, params=parameters).json()
                    try:
                        spotify_id = r['tracks']['items'][0]['id']
                        print(f'processing row: {index} --- adding {spotify_id} as spotify_id' )
                        spotify_df.at[index, 'spotify_id'] = r['tracks']['items'][0]['id']
                    except IndexError:
                        print(f"track not found by track and artist: {row['track']} by {row['artist']}")
                        print(f"giving up")
                        spotify_df.at[index, 'spotify_id'] = None
else:
    print("no pickle found: creating one now") 
    pickle.dump(spotify_df, file)

processing row: 0 --- adding 5AJ9hqTS2wcFQCELCFRO7A as spotify_id
processing row: 1 --- adding 6AI3ezQ4o3HUoP6Dhudph3 as spotify_id
processing row: 2 --- adding 2GxrNKugF82CnoRFbQfzPf as spotify_id
processing row: 3 --- adding 7DSAEUvxU8FajXtRloy8M0 as spotify_id
processing row: 4 --- adding 2HYFX63wP3otVIvopRS99Z as spotify_id
processing row: 5 --- adding 4xhsWYTOGcal8zt0J161CU as spotify_id
processing row: 6 --- adding 3xkHsmpQCBMytMJNiDf3Ii as spotify_id
processing row: 7 --- adding 6XjDF6nds4DE2BBbagZol6 as spotify_id
processing row: 8 --- adding 6KD131fI0hLfrb7Uwa6jCw as spotify_id
processing row: 9 --- adding 7iabz12vAuVQYyekFIWJxD as spotify_id
processing row: 10 --- adding 5IZXB5IKAD2qlvTPJYDCFB as spotify_id
processing row: 11 --- adding 4yq8Oc51K2mgLfo9BjU3Rr as spotify_id
processing row: 12 --- adding 629DixmZGHc7ILtEntuiWE as spotify_id
processing row: 13 --- adding 2tudvzsrR56uom6smgOcSf as spotify_id
processing row: 14 --- adding 5JvEHdLTVmD6I9a3EMoOjL as spotify_id
proce

In [None]:
# Load the .env file
load_dotenv()

spotify_id = os.getenv("spotify_id")
spotify_secret = os.getenv("spotify_secret")


auth_url = "https://accounts.spotify.com/api/token"
auth_response  = requests.post(auth_url, {
    'grant_type': 'client_credentials',
    'client_id': spotify_id,
    'client_secret': spotify_secret,
})

#Convert response to JSON
auth_response_data = auth_response.json()

#Save the access token
access_token = auth_response_data['access_token']

#Need to pass access token into header to send properly formed GET request to API server
headers = {
    'Authorization': 'Bearer {token}'.format(token=access_token)
}

base_url = 'https://api.spotify.com/v1/'

pickle_path = os.path.join("data", "spotify_df.pkl")

if os.path.exists(pickle_path):
    with open(pickle_path, 'rb') as file:
        spotify_df = pickle.load(file)
        if 'spotify_features' not in spotify_df.columns:
            spotify_df['spotify_features'] = None
        for index, row in spotify_df.iterrows():
            if row['spotify_features'] == None:
                try:
                    r = requests.get(base_url + 'audio-features/' + row['spotify_id'], headers=headers).json()
                    spotify_df.at[index, 'spotify_features'] = r
                except (requests.exceptions.RequestException, KeyError, IndexError) as e:
                    print(f"Error with {row['spotify_id']}: ( {row['track']} by {row['artist']} ) \n {e}")
                    spotify_df.at[index, 'spotify_features'] = None
else:
    print("no pickle found: creating one now") 
    pickle.dump(spotify_df, file)

In [None]:
df = spotify_df[spotify_df['lastfm_tags'].apply(lambda x: x != [])]
df.reset_index(inplace=True)
df.rename(columns={'index': 'original_rank'}, inplace=True)

In [None]:
def tag_frequency(tag, df):
    """
    Calculate the frequency of a specific tag in the DataFrame.

    Parameters:
    tag (str): The tag to search for in the 'lastfm_tags' column.
    df (pd.DataFrame): The DataFrame containing the 'lastfm_tags' column.

    Returns:
    int: The number of occurrences of the specified tag in the DataFrame.
    
    Example usage:
    tag = 'Hip-Hop'
    frequency = tag_frequency(tag, spotify_df)
    print(f"The tag '{tag}' appears {frequency} times in the lastfm_tags column.")
    """
    return df['lastfm_tags'].apply(lambda x: tag in x).sum() # df['lastfm_tags'].apply(lambda x: tag in x) returns a boolean series with true for each row that contains the tag 

all_tags = spotify_df['lastfm_tags'].explode().unique()

tag_frequency_df = pd.DataFrame({'tag': all_tags, 'frequency': [tag_frequency(tag, df) for tag in all_tags]})

tag_frequency_df.sort_values('frequency', ascending=False, inplace=True)
tag_frequency_df

TypeError: argument of type 'NoneType' is not iterable