# James Camagong's DJ Set Analysis - Data Scraping and Spotify API Call

This notebook explains how I scraped DJ set data from www.1001tracklists.com and passed those songs through the Spotify API to retrieve the audio features for analysis.

## Scraping the data using BeautifulSoup

In [None]:
# The DJs selected for this analysis. Each entry pairs the DJ's 1001tracklists index page (the page listing
# all of the tracklists available to view) with a snake_case label for that DJ. The individual tracklist
# URLs are scraped from these pages later.

dj_pages = [
    ['https://www.1001tracklists.com/dj/' + slug + '/index.html', label]
    for slug, label in (
        # (URL slug on 1001tracklists, label used in this notebook)
        ('alesso', 'alesso'),
        ('porterrobinson', 'porter_robinson'),
        ('kaskade', 'kaskade'),
        ('skrillex', 'skrillex'),
        ('diplo', 'diplo'),
        ('martingarrix', 'martin_garrix'),
        ('zedd', 'zedd'),
        ('djsnake', 'dj_snake'),
        ('illenium', 'illenium'),
        ('deadmau5', 'deadmau5'),
    )
]

### Note:
Websites usually have security measures that limit the number of requests to their servers, to protect against DDoS attacks and the like. Scraping the data was a bit of a struggle because www.1001tracklists.com has pretty good security, and I was only allowed a few requests before getting blocked. 

Here I fake a user agent and in later cells I use a free proxy service to cycle IPs. This code might not work anymore and depends on the quality of the proxy IPs provided and the security of the site. I thank www.1001tracklists.com for the data and I never wanted to be a strain on their server. Make sure to not spam too many requests at once and to have a wait time between requests so you do not cause problems with your web scraping :)

In [None]:
import random
from random import uniform
from time import sleep

import pandas as pd
import requests
import shadow_useragent
from bs4 import BeautifulSoup


# Scrape the URLs of the individual tracklists from the artist pages hardcoded in `dj_pages`.
# Result: `dj_tracklists` is a list of [dj_label, [tracklist_url, ...]] entries, keeping at most the
# first 9 tracklist links found on each artist page.
ua = shadow_useragent.ShadowUserAgent()
hdr = {"User-Agent": ua.random_nomobile}

dj_tracklists = []

for link in dj_pages:
    # Random pause between requests so we do not hammer the site
    sleep(random.uniform(1, 3))
    page_open = requests.get(link[0], headers=hdr).text
    # Name the parser explicitly (the same one the other cells use) so the result does not depend on
    # whichever parser BeautifulSoup happens to pick in the current environment.
    soup = BeautifulSoup(page_open, 'lxml')

    # Keep every anchor whose href points at a tracklist page, turned into an absolute URL
    tracklists = [
        'https://www.1001tracklists.com' + str(a['href'])
        for a in soup.find_all('a', href=True)
        if '/tracklist/' in str(a['href'])
    ]

    dj_tracklists.append([link[1], tracklists[:9]])

print(dj_tracklists)

In [None]:
from urllib.request import Request, urlopen
from fake_useragent import UserAgent
from IPython.core.display import clear_output


# This function scrapes a free proxy provider website for IPs to use while making requests. It returns a 
# list of IP addresses which will be used later.
# Thanks to the contributors here for this code:
# https://stackoverflow.com/questions/38785877/spoofing-ip-address-when-web-scraping-python?noredirect=1&lq=1

def get_proxies():
    """Scrape free proxies from sslproxies.org, probe some of them and return the survivors.

    The proxy table is read from https://www.sslproxies.org/. Up to 19 probe requests are then
    sent through randomly chosen proxies to http://icanhazip.com; a proxy whose probe fails is
    dropped from the pool. Not every proxy gets probed, so the returned pool can still contain
    dead proxies.

    Side effect: the result is also stored in the module-level ``converted_proxies`` list,
    which other cells read directly.

    Returns:
        list of dict: one ``{'https': 'https://<ip>:<port>'}`` mapping per remaining proxy,
        suitable for the ``proxies=`` argument of ``requests``. The list is empty when no
        proxy could be scraped or none survived the probes.
    """
    global converted_proxies

    # Imported locally under a private alias so that a notebook-global variable that happens to be
    # named ``urlopen`` cannot shadow the function used for the probes.
    from urllib.request import Request, urlopen as _open_url

    ua = UserAgent()  # Source of a random User-Agent string
    headers = {'User-Agent': ua.random, "Accept-Language": "en-US, en;q=0.5"}

    # Retrieve the latest proxy listing
    listing_html = requests.get('https://www.sslproxies.org/', headers=headers).text
    soup = BeautifulSoup(listing_html, 'lxml')
    proxies_table = soup.find(id='proxylisttable')

    proxies = []  # Will contain {'ip': ..., 'port': ...} dicts
    if proxies_table is None or proxies_table.tbody is None:
        print('Proxy table not found on https://www.sslproxies.org/ - no proxies scraped.')
    else:
        for row in proxies_table.tbody.find_all('tr'):
            cells = row.find_all('td')
            if len(cells) < 2:
                continue
            ip, port = cells[0].string, cells[1].string
            if ip and port:
                proxies.append({'ip': ip, 'port': port})

    # Choose a random proxy to start probing with
    if proxies:
        proxy_index = random.randint(0, len(proxies) - 1)
        proxy = proxies[proxy_index]

    for n in range(1, 20):
        if not proxies:
            break  # Nothing left to probe

        # Every 10 requests, switch to a new random proxy. This happens BEFORE the request is
        # built so that the proxy being probed is always the one removed on failure.
        if n % 10 == 0:
            proxy_index = random.randint(0, len(proxies) - 1)
            proxy = proxies[proxy_index]

        req = Request('http://icanhazip.com')
        req.set_proxy(proxy['ip'] + ':' + proxy['port'], 'http')

        try:
            # The timeout keeps an unresponsive proxy from stalling the probe indefinitely
            with _open_url(req, timeout=10) as response:
                my_ip = response.read().decode('utf8')
            print('#' + str(n) + ': ' + my_ip)
            clear_output(wait=True)
        except Exception:
            # Best-effort probe: any failure means this proxy is unusable, so drop it and move on
            del proxies[proxy_index]
            print('Proxy ' + proxy['ip'] + ':' + proxy['port'] + ' deleted.')
            if proxies:
                proxy_index = random.randint(0, len(proxies) - 1)
                proxy = proxies[proxy_index]

    converted_proxies = [
        {'https': 'https://' + str(entry['ip']) + ':' + str(entry['port'])}
        for entry in proxies
    ]

    return converted_proxies


In [None]:
from fake_useragent import UserAgent
import random
from requests.exceptions import ProxyError
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry


# This code retrieves the individual webpage URL for each track on 1001tracklists.com. These links are needed
# because the body of each track page has the unique Spotify ID of the track, which makes it easy to get the
# music features from the Spotify API.
#
# `converted_proxies` is the global list filled by get_proxies(); that function must have been run before
# this cell.

ua = UserAgent()

# Retry transient HTTP failures. urllib3 renamed `method_whitelist` to `allowed_methods` and later removed
# the old name, so try the new keyword first and fall back for older installations.
retry_kwargs = {
    'total': 5,
    'status_forcelist': [429, 500, 502, 503, 504],
}
retry_methods = ["HEAD", "GET", "OPTIONS"]
try:
    retry_strategy = Retry(allowed_methods=retry_methods, **retry_kwargs)
except TypeError:
    retry_strategy = Retry(method_whitelist=retry_methods, **retry_kwargs)

adapter = HTTPAdapter(max_retries=retry_strategy)
http = requests.Session()
http.mount("https://", adapter)
http.mount("http://", adapter)

tracks_from_tracklists = []
track_urls = []

for i in dj_tracklists:          # i == [dj_label, [tracklist_url, ...]]
    for k in i[1]:               # k == one tracklist URL
        exception = True
        while exception:
            exception = False
            proxy = None
            try:
                sleep(random.uniform(1, 3))
                headers = {'User-Agent': ua.random, "Accept-Language": "en-US, en;q=0.5"}
                # Only pick a proxy when there is one left; otherwise request the page directly
                if len(converted_proxies) > 0:
                    proxy = random.choice(converted_proxies)
                    page_open = http.get(k, headers=headers, proxies=proxy).text
                else:
                    page_open = http.get(k, headers=headers).text
                soup = BeautifulSoup(page_open, 'lxml')

                meta_tag = soup.find_all('meta', attrs={'itemprop': 'url'})
                counter = 0  # Position of the track within this tracklist
                for each in meta_tag:
                    if "/track/" in str(each):
                        # The first quoted attribute value of the tag is taken as the track's path
                        url = 'https://www.1001tracklists.com' + str(each).split('"')[1]
                        counter += 1
                        x = {'id': counter, 'tracklist_link': k, 'track_link': url, 'artist': str(i[0])}
                        print(x)
                        track_urls.append(x)

            except Exception as e:
                print(e)
                print(proxy)
                if proxy is not None:
                    # Drop the proxy that was in use and retry this tracklist with another one
                    converted_proxies.remove(proxy)
                    exception = True
                else:
                    # The direct (proxy-less) request failed; retrying would loop forever
                    print('Skipping ' + str(k))

df = pd.DataFrame(track_urls)
print(df)

In [None]:
from requests.exceptions import ProxyError


# This cell performs the largest scrape of the project, ~1000 requests. For every individual track URL on
# 1001tracklists we look up the Spotify ID of the track so that it can later be passed through the Spotify
# API. The work is done in chunks because the free proxy IPs get banned after ~20 requests; a fresh list of
# proxy IPs is fetched for every chunk until all of the requests have been made.

ua = UserAgent() # Source of a random User-Agent string for each request

# Collected results: one {'track_link': ..., 'spotify_id': ...} dict per track for which an ID was found
spotify_ids = []

def chunker(seq, size):
    """Return a generator over consecutive slices of `seq`, each at most `size` items long.

    The last slice is shorter when len(seq) is not a multiple of `size`.

    Thank you to the contributors here for this useful way of iterating over a list in chunks:
    https://stackoverflow.com/questions/434287/what-is-the-most-pythonic-way-to-iterate-over-a-list-in-chunks
    """
    # Computed at call time so that bad arguments fail immediately rather than on first iteration
    starts = range(0, len(seq), size)

    def _slices():
        for start in starts:
            yield seq[start:start + size]

    return _slices()

# NOTE(review): `track_links` is not defined in any cell shown here - presumably it is the list of
# 'track_link' URLs collected in `track_urls`; confirm before running.
counter = 0  # Number of Spotify IDs found so far
for chunk in chunker(track_links, 20):
    # Fetch a fresh list of proxies for every chunk of 20 track URLs
    proxy_pool = get_proxies()
    print(proxy_pool)
    for url in chunk:
        exception = True
        while exception:
            exception = False

            # Every proxy of this chunk has been used up: fetch a new list before continuing
            if not proxy_pool:
                proxy_pool = get_proxies()
                if not proxy_pool:
                    print('No proxies available, skipping ' + str(url))
                    break

            proxy = random.choice(proxy_pool)
            try:
                headers = {'User-Agent': ua.random, "Accept-Language": "en-US, en;q=0.5"}
                sleep(random.uniform(1, 3))
                # Deliberately not named `urlopen`: that would overwrite the urllib function of the
                # same name imported at the top of the notebook.
                page_html = http.get(url, headers=headers, proxies=proxy).text

                soup2 = BeautifulSoup(page_html, 'lxml')

                # Every CSS class name used anywhere on the page
                classes = []
                for element in soup2.find_all(class_=True):
                    classes.extend(element["class"])

                # The ID is taken from the first 31-character class name containing 'mediaItem';
                # whatever follows 'Item' in that name is stored as the Spotify ID.
                for cl in classes:
                    if 'mediaItem' in str(cl) and len(str(cl)) == 31:
                        j = {'track_link': url, 'spotify_id': cl.split('Item')[1]}
                        spotify_ids.append(j)
                        counter += 1
                        print(counter, j)
                        break

            except Exception as e:
                # Treat any failure as a bad proxy: drop it and retry this URL with another one
                print(e)
                proxy_pool.remove(proxy)
                exception = True

print(spotify_ids)

# And now we're finally done scraping data off 1001tracklists :)

## Retrieving data from the Spotify API

In [2]:
import spotipy 
from spotipy.oauth2 import SpotifyClientCredentials
import requests

# Spotipy is a great package that simplifies using the Spotify API. Here we create the client (`sp`) that the
# later cells use for their API calls. You will need to follow the Spotify instructions on their webpage on
# how to get a client ID and client secret.

# Placeholders: replace with your own client ID and client secret (and avoid committing the real values)
cid = 'insert here'
secret = 'insert here'
# requests_timeout is handed to spotipy for its HTTP requests (presumably a value in seconds - confirm)
client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret, requests_timeout=100)
sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)

In [4]:
# This is a simple loop to retrieve the audio features, which come back in dictionary form. The IDs are sent
# in chunks of 99 with a short random pause between the API calls.

audio_feature_dicts = []

for chunk in chunker(spotify_ids, 99):
    sleep(random.uniform(1, 3))
    # The scraping cell stores each result as a {'track_link': ..., 'spotify_id': ...} dict, while the API
    # call needs the bare IDs. Entries that are already plain ID strings are passed through unchanged.
    track_ids = [entry['spotify_id'] if isinstance(entry, dict) else entry for entry in chunk]
    audio_json = sp.audio_features(track_ids)
    # Skip empty (None) results so that only real feature dicts end up in the list
    audio_feature_dicts.extend(
        features for features in (audio_json or []) if features is not None
    )

In [5]:
# And here we construct a pandas DataFrame from the collected audio-feature dicts, ready for analysis
df_features = pd.DataFrame(audio_feature_dicts)
# Bare expression as the last line of the cell so that the notebook displays the DataFrame
df_features