In [None]:
# Import libraries
import pandas as pd
import re
import matplotlib.pyplot as plt
import numpy as np
import datetime
import random 
import collections
import requests
import spotipy.oauth2 as oauth2
import time

from matplotlib.pyplot import figure
from matplotlib import cm

# Load the raw songs dataset (path is relative to the current working directory)
df = pd.read_csv(filepath_or_buffer='songs_dataset.csv')

In [None]:
# Preview the first five rows of the raw dataset
df.head(n=5)

In [None]:
# Tally how many tracks each genre has in the dataset, most frequent genre first
counter = collections.Counter()
counter.update(df["genre"].values)
counter.most_common()

In [None]:
# We will take 8 basic genres: Rock, Pop, Hip-Hop, Metal, Country, Jazz, Electronic, R&B
# We drop "Not Available" and "Other" since they don't add value to the prediction
# We also drop "Indie" and "Folk" to simplify the problem

In [None]:
# Data cleaning step: discard every row (track) in which any column is missing
df = df.dropna(axis="index", how="any")

In [None]:
# Recount the tracks per genre now that incomplete rows have been removed
counter = collections.Counter()
counter.update(df["genre"].values)
counter.most_common()

In [None]:
# Now we will take a random sample of (up to) 1000 tracks for each genre
# By randomising we avoid having the same artist repeated a lot, assuming that the more variety of artists the better the
# prediction
list_genres = ["Rock", "Pop", "Hip-Hop", "Metal", "Country", "Jazz", "Electronic", "R&B"]
SAMPLES_PER_GENRE = 1000

# Dedicated, seeded generator: the sample is reproducible on Restart & Run All
# and the global `random` state is left untouched.
sampler = random.Random(42)

# Collect one sampled frame per genre and concatenate once at the end
# (DataFrame.append was removed in pandas 2.0 and re-copied the frame on every call).
genre_samples = []
for genre in list_genres:
    # reset_index() (without drop) keeps the original row label in an "index" column,
    # so positions 0..len-1 can be used with iloc below.
    genre_df = df[df["genre"] == genre].reset_index()

    # Never ask for more rows than the genre has, otherwise random.sample raises ValueError.
    sample_size = min(SAMPLES_PER_GENRE, len(genre_df))

    # Positions start at 0 so that the first track of each genre can be selected too.
    random_index = sampler.sample(range(len(genre_df)), sample_size)
    genre_samples.append(genre_df.iloc[random_index])

reduced_df = pd.concat(genre_samples)

In [None]:
# Verify how many tracks were sampled for each genre
counter = collections.Counter()
counter.update(reduced_df["genre"].values)
counter.most_common()

In [None]:
# Each genre's sample was taken from a separately re-indexed frame, so row labels
# repeat across genres; renumber the combined frame 0..n-1 and discard the old labels.
reduced_df = reduced_df.reset_index(drop=True)

In [None]:
# Preview the first five rows of the sampled dataset
reduced_df.head(n=5)

In [None]:
# Next, we will get the features of each track using the Spotify API
# 1. Get ID of each track using track and artist names as input
# 2. Get features using the ID as input

def is_token_expired(token_info):
    """Tell whether an access token is expired or about to expire.

    Args:
        token_info: mapping with an 'expires_at' entry holding the expiry
            moment as a Unix timestamp in seconds.

    Returns:
        True when fewer than 60 seconds remain before 'expires_at'
        (including when that moment has already passed), False otherwise.
    """
    seconds_left = token_info['expires_at'] - int(time.time())
    return seconds_left < 60

# Get token
# Credentials are deliberately not written in the notebook: when called without
# arguments, SpotifyClientCredentials reads them from the SPOTIPY_CLIENT_ID and
# SPOTIPY_CLIENT_SECRET environment variables. Any key pair that was previously
# committed in this cell should be revoked in the Spotify developer dashboard.
creds = oauth2.SpotifyClientCredentials()
token = creds.get_access_token()

SPOTIFY_SEARCH_URL = "https://api.spotify.com/v1/search"
SPOTIFY_AUDIO_FEATURES_URL = "https://api.spotify.com/v1/audio-features/{}"

# Audio features copied from the API response, in the order they are stored in each row
AUDIO_FEATURE_KEYS = [
    "acousticness", "danceability", "duration_ms", "energy",
    "instrumentalness", "key", "liveness", "loudness",
    "mode", "speechiness", "tempo", "time_signature", "valence",
]

# One row per track found: [track, artist, lyrics, <audio features...>, genre]
spotify_track_details = []

# For each track
for index, row in reduced_df.iterrows():

    # If token is expired, generate another one
    if is_token_expired(creds.token_info):
        token = creds.get_access_token()
    auth_headers = {"Authorization": "Bearer {}".format(token)}

    track = row["song"]
    artist = row["artist"]

    # Request track information.
    # Both field filters belong inside the single `q` parameter, separated by a space;
    # "&artist:..." in the URL would be a separate query-string parameter instead.
    # Passing `params` lets requests URL-encode names containing '&', '#', spaces, etc.
    spotify_id_request = requests.get(
        SPOTIFY_SEARCH_URL,
        params={"q": "track:{} artist:{}".format(track, artist), "type": "track"},
        headers=auth_headers).json()

    # An error payload has no "tracks" entry; treat it like "no match" and skip the
    # track instead of aborting the whole run with a KeyError.
    found_items = (spotify_id_request.get("tracks") or {}).get("items")
    if not found_items:
        continue

    # Save track ID (first search hit)
    track_id = found_items[0]["id"]

    # Get audio features
    spotify_features_request = requests.get(
        SPOTIFY_AUDIO_FEATURES_URL.format(track_id),
        headers=auth_headers).json()

    # Skip tracks whose features could not be retrieved or are incomplete
    if "error" in spotify_features_request:
        continue
    if not all(key in spotify_features_request for key in AUDIO_FEATURE_KEYS):
        continue

    spotify_track_details.append(
        [track, artist, row["lyrics"]]
        + [spotify_features_request[key] for key in AUDIO_FEATURE_KEYS]
        + [row["genre"]])


In [None]:
# Turn the collected rows into a DataFrame; the column names follow the order in
# which the values of each row were stored
detailed_df = pd.DataFrame(
    data=spotify_track_details,
    columns=[
        "track", "artist", "lyrics",
        "acousticness", "danceability", "duration_ms", "energy",
        "instrumentalness", "key", "liveness", "loudness",
        "mode", "speechiness", "tempo", "time_signature", "valence",
        "genre",
    ],
)

In [None]:
# Preview the first five rows of the enriched dataset
detailed_df.head(n=5)

In [None]:
# Some tracks were not found on Spotify or had no audio features, so check
# how balanced the genres still are in the resulting dataset
counter = collections.Counter()
counter.update(detailed_df["genre"].values)
counter.most_common()

In [None]:
# Although the dataset is slightly imbalanced, the imbalance is not significant, so we proceed with it
# Save the dataset to disk without writing the row index
detailed_df.to_csv(path_or_buf="dataset.csv", index=False)

In [None]:
# Now we will proceed with the analysis and prediction in the other script

In [None]:
# Request track information for a single hand-picked track, to inspect the raw search response.
# The track and artist filters are sent together inside the `q` parameter, separated by a
# space; appending "&artist:..." to the URL would make it a separate query-string parameter.
# Using `params` lets requests URL-encode the spaces and punctuation in the query.
# NOTE(review): the track title is spelled "rapsody" here; left as is, confirm whether intended.
spotify_id_request = requests.get(
        "https://api.spotify.com/v1/search",
        params={
            "q": "track:{} artist:{}".format("bohemian rapsody - remastered", "queen"),
            "type": "track",
        },
        headers={"Authorization": "Bearer {}".format(token)}).json()

In [None]:
# Display the parsed JSON response of the search request above
spotify_id_request