In [0]:
# imports
!pip install fake_useragent -q
from bs4 import BeautifulSoup
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from fake_useragent import UserAgent

# Load and Parse Web Page

We want to pull all artists + songs off of a given r/music page.

Please note that the code below applies to the NEW Reddit HTML format.

In [0]:
def simple_get(url, timeout=10):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    raw content, otherwise return None.

    A Chrome User-Agent string supplied by fake_useragent is sent with the
    request.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Seconds to wait on the server before giving up; passed straight to
        `requests.get`. Defaults to 10 so a stalled server cannot hang the
        notebook indefinitely.

    Returns
    -------
    bytes or None
        `resp.content` when `is_good_response` accepts the response,
        otherwise None. None is also returned, after the error is logged,
        when the request raises a `RequestException` (timeouts included).
    """
    ua = UserAgent()
    header = {'User-Agent': str(ua.chrome)}
    try:
        # closing() makes sure the streamed connection is released on exit.
        with closing(get(url, stream=True, headers=header, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    Parameters
    ----------
    resp : response object
        Must expose `status_code` and a mapping-like `headers` attribute.

    Returns
    -------
    bool
        True only when the status code is 200 and the Content-Type header
        contains 'html' (case-insensitive). A missing or empty Content-Type
        header yields False instead of raising KeyError.
    """
    # .get() + `or ''` covers both an absent header and a None value.
    content_type = (resp.headers.get('Content-Type') or '').lower()
    return resp.status_code == 200 and 'html' in content_type


def log_error(e):
    """
    Report an error message.

    Currently the message is simply printed to standard output; swap the
    body for real logging if something more durable is needed.
    """
    message = str(e)
    print(message)

# get raw html response from web page
r_music_url = 'https://www.reddit.com/r/Music/' # can be altered for different web pages
raw_response = simple_get(r_music_url) # response content on success, None when simple_get rejects the response or the request fails

# prints the whole response body, so expect a very long output
print (raw_response)

b'<!DOCTYPE html><html lang="en"><head><script>\n          var __SUPPORTS_TIMING_API = typeof performance === \'object\' && !!performance.mark && !! performance.measure && !!performance.getEntriesByType;\n          function __perfMark(name) { __SUPPORTS_TIMING_API && performance.mark(name); };\n          var __firstLoaded = false;\n          function __markFirstPostVisible() {\n            if (__firstLoaded) { return; }\n            __firstLoaded = true;\n            __perfMark("first_post_title_image_loaded");\n          }\n        </script><script>\n          __perfMark(\'head_tag_start\');\n        </script><title>/r/Music</title><meta charSet="utf8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><style>\n  /* http://meyerweb.com/eric/tools/css/reset/\n    v2.0 | 20110126\n    License: none (public domain)\n  */\n\n  html, body, div, span, applet, object, iframe,\n  h1, h2, h3, h4, h5, h6, p, blockquote, pre,\n  a, abbr, acronym, address, big, button, cite, co

In [0]:
# parsing the html with BeautifulSoup
# (raw_response is whatever simple_get returned earlier; it is None when the fetch failed)
soup = BeautifulSoup(raw_response, 'html.parser')

In [0]:
# get song information from post names
def get_song_info(soup):
  """
  Extract (artist, song) pairs from post titles of the form
  'Artist - Song [genre]'.

  Parameters
  ----------
  soup : Beautiful Soup
    Should be a Beautiful Soup object of r/music web page; only its
    `select('h2')` results and their `.text` are used.

  Returns
  ---------
  song info as a list of tuples [(artist, song name)]

  Titles without the ' - ' separator (ads, discussion posts, ...) are
  skipped, because slicing on a failed `find` (-1) would otherwise produce
  mangled artist/song strings. A missing ' [' tag is tolerated: the song is
  then everything after the separator.
  """
  song_info = []
  for title in soup.select('h2'): # loop through post titles
    artist, separator, remainder = title.text.partition(' - ')
    if not separator:
      continue # not an 'Artist - Song' style title
    # drop a trailing ' [genre]' tag if one is present
    song = remainder.split(' [', 1)[0]
    artist, song = artist.strip(), song.strip()
    if artist and song:
      song_info.append((artist, song))

  return song_info

# Interacting with Spotify's API

We need to get the artist ID to interact with Spotify more easily.

First, let's install spotipy and enter our credentials.

We will be using the **spotify** object we create below to access Spotify's API.

In [0]:
!pip install spotipy -q
import spotipy
import math
import pandas as pd

In [0]:
# set spotify credentials
# Credentials are read from the environment rather than stored in the notebook:
# export SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET before starting the kernel.
# A missing variable raises KeyError here instead of failing later on the first API call.
# NOTE(review): the id/secret pair previously committed in this cell should be rotated.
import os
from spotipy.oauth2 import SpotifyClientCredentials
client_credentials_manager = SpotifyClientCredentials(client_id=os.environ['SPOTIPY_CLIENT_ID'],
                                                      client_secret=os.environ['SPOTIPY_CLIENT_SECRET'])
spotify = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

## Utility Functions for Getting Artist and Song Info

For a given Song and Artist, use get_song_features() to return a dictionary of features, including the song name and artist.

In [0]:
def get_artist_key(artist_name):
  """
  Search Spotify for an artist and return the key of the top hit.

  Parameters
  ----------
  artist_name : string
    Name of the artist to search for

  Returns
  ----------
  string
    The part of the first result's 'href' that follows 'artists/'

  The `[0]` lookup is not guarded, so an empty result list raises IndexError.
  """
  search_result = spotify.search(q='artist:' + artist_name, type='artist')
  top_href = search_result['artists']['items'][0]['href'] # get top artist
  prefix = 'artists/'
  return top_href[top_href.find(prefix) + len(prefix):]

# test
# print ('Test Artist Key')
# print (get_artist_key('KIDS SEE GHOSTS'))

def get_song_id(artist_name, song_name, supress_error=True):
  """
  Find the top Spotify track matching the given artist and song name.

  Parameters
  ----------
  artist_name : string
  song_name : string
  supress_error : bool
    When False, a failed lookup is reported through log_error

  Returns
  ----------
  The part of the track 'uri' after 'track:' as a string, or None when the
  search yields no tracks.
  """
  query = 'artist:' + artist_name + ' track:' + song_name
  try:
    uri = spotify.search(q=query, type='track')['tracks']['items'][0]['uri']
  except IndexError:
    # empty result list: nothing matched
    if not supress_error:
      log_error('Error searching for artist name: {0} and song name: {1}'.format(artist_name, song_name))
    return None

  marker = 'track:'
  return uri[uri.find(marker) + len(marker):]

# testing
# print ('\nSong ID Test')
# print ('Song ID:', get_song_id('KIDS SEE GHOSTS', 'Reborn')) # positive test
# print(get_song_id('bad artist', 'Reborn', supress_error=False)) # negative test


# combining prior get song id and features into one function
def get_song_features(artist_name, song_name):
  """
  Collect Spotify audio features and recommendations for one song.

  Parameters
  ----------
  artist_name : string
  song_name : string

  Returns
  ----------
  Dictionary of audio features for the matched track, extended with the
  keys 'song', 'artist' and 'recommendations'.
  """
  track_id = get_song_id(artist_name, song_name)

  # audio_features gives back a list; keep only its first entry
  song_features = spotify.audio_features(track_id)[0]
  song_features['song'] = song_name
  song_features['artist'] = artist_name
  # wrapped in an outer list so it can become a single dataframe cell
  recommended = get_recommendations(artist_name, song_name)
  song_features['recommendations'] = [recommended]
  return song_features

# print ('\nSong Features Test')
# features_test = get_song_features('KIDS SEE GHOSTS', 'Reborn')
# print(features_test)

def get_recommendations(artist_name, song_name, count=5):
  """
  Ask Spotify for tracks similar to the given artist/song pair.

  Parameters
  ----------
  artist_name : string
  song_name : string
  count : int
    Maximum number of recommendations to request (passed as `limit`)

  Returns
  ----------
  List of recommended track names; may hold fewer than `count` entries
  when Spotify returns fewer tracks.
  """
  recommendation_tracks = spotify.recommendations(
      seed_artists=[get_artist_key(artist_name)],
      seed_tracks=[get_song_id(artist_name, song_name)],
      limit=count)['tracks']
  # Walk the tracks actually returned: indexing with range(count) raised
  # IndexError whenever fewer than `count` recommendations came back.
  return [track['name'] for track in recommendation_tracks[:count]]

# Smoke test: request recommendations for a known artist/song pair and print the names.
print ('\nRecommendations Test')
rec = get_recommendations('KIDS SEE GHOSTS', 'Reborn')
print(rec)


Recommendations Test
['Mo Bamba', 'Bloodshed', 'Baby Blue (feat. Chance the Rapper)', 'Magnolia', 'Kids See Ghosts']


## Putting It All Together

### Creating Song Feature Information from URL

Given the work above, we can now go from Reddit URL to a Pandas Dataframe with song info.

In [0]:
def create_feature_info(url='https://www.reddit.com/r/Music/'):
  """
  Go from a Reddit URL to a DataFrame of Spotify song features.

  Parameters
  ----------
  url : string
    Page to scrape (assumed to be r/music)

  Returns
  ----------
  pandas DataFrame with one row per post title that could be matched to a
  Spotify track, re-indexed from 0. An empty DataFrame is returned when the
  page could not be fetched or no title matched a track.
  """
  # get list of artists and songs from URL (assumed to be r/music)
  raw_response = simple_get(url)
  if raw_response is None: # fetch failed or response was not HTML
    log_error('No usable HTML returned from {0}'.format(url))
    return pd.DataFrame()
  soup = BeautifulSoup(raw_response, 'html.parser')
  song_list = get_song_info(soup) # create list of (artist, song) tuples

  # processing songs: gather one single-row frame per matched song and
  # concatenate once at the end instead of growing a frame inside the loop
  song_frames = []
  for i, (artist, song) in enumerate(song_list):
    if get_song_id(artist, song) is None: # some may either not be songs (ads, stories), or may not be found
      continue
    song_frames.append(pd.DataFrame(get_song_features(artist, song), index=[i]))

  if not song_frames: # nothing matched; previously this hit an unbound df_songs
    return pd.DataFrame()
  return pd.concat(song_frames, ignore_index=True)

# test
# Runs the whole pipeline against the r/Music page and shows the first rows of the result.
df_songs_test = create_feature_info(url='https://www.reddit.com/r/Music/')
df_songs_test.head()

Unnamed: 0,acousticness,analysis_url,artist,danceability,duration_ms,energy,id,instrumentalness,key,liveness,...,mode,recommendations,song,speechiness,tempo,time_signature,track_href,type,uri,valence
0,0.586,https://api.spotify.com/v1/audio-analysis/35Vq...,The Dead Milkmen,0.549,181067,0.345,35VqQrwh16t58JMjdLxZDZ,6.1e-05,0,0.0537,...,1,"[Everything Sux, Stuart, Slip It In, The Path ...",Bitchin Camaro,0.463,155.368,4,https://api.spotify.com/v1/tracks/35VqQrwh16t5...,audio_features,spotify:track:35VqQrwh16t58JMjdLxZDZ,0.832
1,0.015,https://api.spotify.com/v1/audio-analysis/4KfS...,Heart,0.547,261933,0.691,4KfSdst7rW39C0sfhArdrz,0.105,4,0.144,...,0,"[False Alarm, Don't You Want Me, Hot Water - 2...",Barracuda,0.0369,137.148,4,https://api.spotify.com/v1/tracks/4KfSdst7rW39...,audio_features,spotify:track:4KfSdst7rW39C0sfhArdrz,0.667
2,0.0467,https://api.spotify.com/v1/audio-analysis/2mKj...,Sex Pistols,0.258,198160,0.932,2mKj8Em0GLFu8I78yM1CfU,8e-06,2,0.142,...,1,"[No Future - God Save the Queen, Motorcade, Ta...",Pretty Vacant,0.0702,145.582,4,https://api.spotify.com/v1/tracks/2mKj8Em0GLFu...,audio_features,spotify:track:2mKj8Em0GLFu8I78yM1CfU,0.334
3,0.578,https://api.spotify.com/v1/audio-analysis/26Pw...,Bob Marley,0.684,233560,0.248,26PwuMotZqcczKLHi4Htz3,0.0,0,0.0643,...,1,"[Coming In From The Cold, Parents, Saca Prende...",Redemption Song,0.0435,116.002,4,https://api.spotify.com/v1/tracks/26PwuMotZqcc...,audio_features,spotify:track:26PwuMotZqcczKLHi4Htz3,0.621
4,0.0904,https://api.spotify.com/v1/audio-analysis/0kEQ...,Everlast,0.678,303133,0.556,0kEQwPz9SrMN8E5iL9cxQL,0.0334,0,0.0866,...,1,"[Gone For Good, House Party, Tabletops, Wylin ...",What It's Like,0.0281,85.158,4,https://api.spotify.com/v1/tracks/0kEQwPz9SrMN...,audio_features,spotify:track:0kEQwPz9SrMN8E5iL9cxQL,0.405


## Output the DataFrame to Useful JSON Format

We need to include the following features:
- Classification (Pierre to-do)
- Other info optional?

In [0]:
# features to include in the data
# Names must match the DataFrame columns exactly (e.g. 'acousticness', 'duration_ms'),
# otherwise selecting df[feature_subset] raises KeyError.
feature_subset = ['acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo',
                  'valence', 'time_signature']

def prettify_song_dataframe(df):
  """
  Return a copy of `df` with a display-friendly 'Title' column.

  Parameters
  ----------
  df : pandas DataFrame
    Must contain string columns 'artist' and 'song'

  Returns
  ----------
  New DataFrame with an extra 'Title' column of the form 'artist - song'.
  The input frame is left untouched, so re-running earlier cells still
  shows the data they originally displayed.
  """
  # nice display name to be used; assign() builds a new frame instead of mutating df
  pretty_df = df.assign(Title=df['artist'] + ' - ' + df['song'])

  # optional - drop features? more transformation? (TO-DO)

  return pretty_df

def write_to_json(df, file_name='songs.json'):
  """
  Serialise `df` as a JSON array of records and save it to disk.

  Parameters
  ----------
  df : pandas DataFrame
  file_name : string or None
    Destination path for the JSON text. Pass None (or '') to skip writing
    and only get the string back.

  Returns
  ----------
  The JSON text (orient='records') as a string
  """
  json_text = df.to_json(orient='records')
  if file_name: # previously the file_name argument was silently ignored
    with open(file_name, 'w') as json_file:
      json_file.write(json_text)
  return json_text
  
# Add the display title, then serialise the frame with write_to_json (its return value is the cell output).
df_songs = prettify_song_dataframe(df_songs_test)
write_to_json(df_songs)

'[{"acousticness":0.586,"analysis_url":"https:\\/\\/api.spotify.com\\/v1\\/audio-analysis\\/35VqQrwh16t58JMjdLxZDZ","artist":"The Dead Milkmen","danceability":0.549,"duration_ms":181067,"energy":0.345,"id":"35VqQrwh16t58JMjdLxZDZ","instrumentalness":0.0000614,"key":0,"liveness":0.0537,"loudness":-16.77,"mode":1,"recommendations":["Everything Sux","Stuart","Slip It In","The Path Is Paved","On The Sly"],"song":"Bitchin Camaro","speechiness":0.463,"tempo":155.368,"time_signature":4,"track_href":"https:\\/\\/api.spotify.com\\/v1\\/tracks\\/35VqQrwh16t58JMjdLxZDZ","type":"audio_features","uri":"spotify:track:35VqQrwh16t58JMjdLxZDZ","valence":0.832,"Title":"The Dead Milkmen - Bitchin Camaro"},{"acousticness":0.015,"analysis_url":"https:\\/\\/api.spotify.com\\/v1\\/audio-analysis\\/4KfSdst7rW39C0sfhArdrz","artist":"Heart","danceability":0.547,"duration_ms":261933,"energy":0.691,"id":"4KfSdst7rW39C0sfhArdrz","instrumentalness":0.105,"key":4,"liveness":0.144,"loudness":-13.553,"mode":0,"recomme

In [0]:
df_songs.head()

Unnamed: 0,acousticness,analysis_url,artist,danceability,duration_ms,energy,id,instrumentalness,key,liveness,...,recommendations,song,speechiness,tempo,time_signature,track_href,type,uri,valence,Title
0,0.586,https://api.spotify.com/v1/audio-analysis/35Vq...,The Dead Milkmen,0.549,181067,0.345,35VqQrwh16t58JMjdLxZDZ,6.1e-05,0,0.0537,...,"[Everything Sux, Stuart, Slip It In, The Path ...",Bitchin Camaro,0.463,155.368,4,https://api.spotify.com/v1/tracks/35VqQrwh16t5...,audio_features,spotify:track:35VqQrwh16t58JMjdLxZDZ,0.832,The Dead Milkmen - Bitchin Camaro
1,0.015,https://api.spotify.com/v1/audio-analysis/4KfS...,Heart,0.547,261933,0.691,4KfSdst7rW39C0sfhArdrz,0.105,4,0.144,...,"[False Alarm, Don't You Want Me, Hot Water - 2...",Barracuda,0.0369,137.148,4,https://api.spotify.com/v1/tracks/4KfSdst7rW39...,audio_features,spotify:track:4KfSdst7rW39C0sfhArdrz,0.667,Heart - Barracuda
2,0.0467,https://api.spotify.com/v1/audio-analysis/2mKj...,Sex Pistols,0.258,198160,0.932,2mKj8Em0GLFu8I78yM1CfU,8e-06,2,0.142,...,"[No Future - God Save the Queen, Motorcade, Ta...",Pretty Vacant,0.0702,145.582,4,https://api.spotify.com/v1/tracks/2mKj8Em0GLFu...,audio_features,spotify:track:2mKj8Em0GLFu8I78yM1CfU,0.334,Sex Pistols - Pretty Vacant
3,0.578,https://api.spotify.com/v1/audio-analysis/26Pw...,Bob Marley,0.684,233560,0.248,26PwuMotZqcczKLHi4Htz3,0.0,0,0.0643,...,"[Coming In From The Cold, Parents, Saca Prende...",Redemption Song,0.0435,116.002,4,https://api.spotify.com/v1/tracks/26PwuMotZqcc...,audio_features,spotify:track:26PwuMotZqcczKLHi4Htz3,0.621,Bob Marley - Redemption Song
4,0.0904,https://api.spotify.com/v1/audio-analysis/0kEQ...,Everlast,0.678,303133,0.556,0kEQwPz9SrMN8E5iL9cxQL,0.0334,0,0.0866,...,"[Gone For Good, House Party, Tabletops, Wylin ...",What It's Like,0.0281,85.158,4,https://api.spotify.com/v1/tracks/0kEQwPz9SrMN...,audio_features,spotify:track:0kEQwPz9SrMN8E5iL9cxQL,0.405,Everlast - What It's Like
