In [1]:
# Import libraries
import pandas as pd
import re
import matplotlib.pyplot as plt
import numpy as np
import datetime
import random 
import collections
import requests
import spotipy.oauth2 as oauth2
import time

from matplotlib.pyplot import figure
from matplotlib import cm

# Import initial dataset
df = pd.read_csv('songs_dataset.csv')

In [2]:
df.head()

Unnamed: 0,index,song,year,artist,genre,lyrics
0,0,ego-remix,2009,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu..."
1,1,then-tell-me,2009,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see..."
2,2,honesty,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...
3,3,you-are-my-rock,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote..."
4,4,black-culture,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po..."


In [3]:
# We will take song, artist and genre for our analysis
# Count total number of tracks per genre in the dataset
counter = collections.Counter(df["genre"].values)
counter.most_common()

[('Rock', 131377),
 ('Pop', 49444),
 ('Hip-Hop', 33965),
 ('Not Available', 29814),
 ('Metal', 28408),
 ('Other', 23683),
 ('Country', 17286),
 ('Jazz', 17147),
 ('Electronic', 16205),
 ('R&B', 5935),
 ('Indie', 5732),
 ('Folk', 3241)]

In [4]:
# We will take 8 basic genres: Rock, Pop, Hip Hop, Metal, Country, Jazz, Electronic, R&B
# We drop "Not Available" and "Other" since they dont give value to the prediction
# Drop "Indie" and "Folk" to simplify the problem

In [5]:
# First we perform data cleaning. We will delete the tracks that have missing values
df = df.dropna()

In [6]:
# Count again
counter=collections.Counter(df["genre"].values)
counter.most_common()

[('Rock', 109235),
 ('Pop', 40466),
 ('Hip-Hop', 24850),
 ('Not Available', 23941),
 ('Metal', 23759),
 ('Country', 14387),
 ('Jazz', 7970),
 ('Electronic', 7966),
 ('Other', 5189),
 ('R&B', 3401),
 ('Indie', 3149),
 ('Folk', 2243)]

In [7]:
# Now we will take random 2000 tracks for each genre
# By randomising we avoid having the same artist repeated a lot, assuming that the more variety of artists the better the
# prediction
list_genres = ["Rock", "Pop", "Hip-Hop", "Metal", "Country", "Jazz", "Electronic", "R&B"]
reduced_df = pd.DataFrame()

for genre in list_genres:
    random_index = random.sample(range(1, len(df[df["genre"] == genre])), 2000)
    reduced_df = reduced_df.append(df[df["genre"] == genre].reset_index().iloc[random_index])

In [8]:
counter=collections.Counter(reduced_df["genre"].values)
counter.most_common()

[('Rock', 2000),
 ('Pop', 2000),
 ('Hip-Hop', 2000),
 ('Metal', 2000),
 ('Country', 2000),
 ('Jazz', 2000),
 ('Electronic', 2000),
 ('R&B', 2000)]

In [9]:
reduced_df = reduced_df.reset_index(drop=True)

In [10]:
reduced_df.head()

Unnamed: 0,level_0,index,song,year,artist,genre,lyrics
0,229979,229979,cold-hearted-man,2009,ac-dc,Rock,No one knew\nWhere he came from\nHe never knew...
1,155672,155672,midnight-clear-love-song,2015,chris-tomlin,Rock,It came upon the midnight clear\nThat glorious...
2,45259,45259,money-for-nothing,1985,dire-straits,Rock,(I want my MTV)\nNow look at them yo-yo's that...
3,327678,327678,beautiful-world,2008,carolina-liar,Rock,Here it comes in the morning\nI'm just trying ...
4,339043,339043,do-you-want-to-dance,2015,cliff-richard,Rock,"Well, do you want to dance and hold my hand?\n..."


In [24]:
# Next, we will find the features of each track using the Spotify API
# 1. Get ID of each track using track and artist names as input
# 2. Get features using the ID as input

def is_token_expired(token_info):
    now = int(time.time())
    return token_info['expires_at'] - now < 60

# Get token
creds = oauth2.SpotifyClientCredentials(client_id="YOUR_CLIENT_ID", client_secret="YOUR_CLIENT_SECRET")
token = creds.get_access_token()

spotify_track_details = []

# For each track
for index, row in reduced_df.iterrows():
    
    # If token is expired, generate another one
    if is_token_expired(creds.token_info) == True:
        token = creds.get_access_token()
            
    track = row["song"]
    artist = row["artist"]
    if row["genre"] == "Electronic":
        # Request track information
        spotify_id_request = requests.get(
            "https://api.spotify.com/v1/search?q=track:{}&artist:{}&type=track".format(track, artist),
            headers={"Authorization": "Bearer {}".format(token)}).json()

        if spotify_id_request["tracks"]["items"]:
            # Save track ID
            track_id = spotify_id_request["tracks"]["items"][0]["id"]

            # Get audio features
            spotify_features_request = requests.get(
                "https://api.spotify.com/v1/audio-features/{}".format(track_id),
                headers={"Authorization": "Bearer {}".format(token)}).json()

            if "error" not in spotify_features_request.keys():
                (spotify_track_details.append([track, artist, row["lyrics"], 
                        spotify_features_request["acousticness"], spotify_features_request["danceability"], 
                        spotify_features_request["duration_ms"], spotify_features_request["energy"], 
                        spotify_features_request["instrumentalness"], spotify_features_request["key"], 
                        spotify_features_request["liveness"], spotify_features_request["loudness"], 
                        spotify_features_request["mode"], spotify_features_request["speechiness"], 
                        spotify_features_request["tempo"], spotify_features_request["time_signature"], 
                        spotify_features_request["valence"], row["genre"]]))


In [27]:
detailed_df = pd.DataFrame(spotify_track_details, columns = ["track", "artist", "lyrics", "acousticness", "danceability", "duration_ms", "energy", "instrumentalness", "key", "liveness", "loudness", "mode", "speechiness", "tempo", "time_signature", "valence", "genre"])

In [28]:
detailed_df.head()

Unnamed: 0,track,artist,lyrics,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,genre
0,hexes,bassnectar,You surprised me with a kiss\nUnderneath the c...,0.344,0.865,155147,0.591,8e-06,1,0.246,-9.331,0,0.143,82.972,4,0.614,Electronic
1,affection,crystal-castles,Catch a moth hold it in my hand\nCrush it casu...,0.516,0.273,235226,0.565,2e-06,2,0.0855,-7.283,1,0.248,179.518,4,0.298,Electronic
2,funkhadafi,front-242,We who are oppressed love those who\nfight aga...,5.3e-05,0.558,210213,0.86,0.631,4,0.778,-12.459,0,0.0815,117.992,4,0.106,Electronic
3,lies,burns,You've been cheatin' and tellin' me lies\nYou'...,0.0956,0.708,201707,0.648,0.0,6,0.134,-5.626,1,0.0449,143.955,4,0.338,Electronic
4,da-funk-daftendirekt,daft-punk,"Paul Jonson, DJ Funk, DJ Skull, DJ Rush\nWaxma...",0.00994,0.676,396973,0.859,0.754,5,0.98,-9.602,0,0.0412,111.991,4,0.546,Electronic


In [29]:
# Check class imbalance for those tracks that didnt have features analysed or werent found in Spotify
counter=collections.Counter(detailed_df["genre"].values)
counter.most_common()

[('Electronic', 1793)]

In [95]:
df_processed = pd.DataFrame()
for genre in list_genres:
    df_processed = df_processed.append(detailed_df[detailed_df["genre"] == genre].iloc[0:1500])

In [97]:
df_processed = df_processed.reset_index().drop("index", axis=1)

In [98]:
counter=collections.Counter(df_processed["genre"].values)
counter.most_common()

[('Rock', 1500),
 ('Pop', 1500),
 ('Hip-Hop', 1500),
 ('Metal', 1500),
 ('Country', 1500),
 ('Jazz', 1500),
 ('Electronic', 1500)]

In [99]:
# Save dataset
df_processed.to_csv("dataset_reduced.csv", index=False)

In [None]:
# Now we will proceed with the analysis and prediction in the other script