In [1]:
import os
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Project directories, all resolved relative to the current working directory:
# 'Data' and 'Temp' are sub-folders of BASE_DIR.
BASE_DIR = os.getcwd()
DATA_DIR, TEMP_DIR = (os.path.join(BASE_DIR, folder) for folder in ('Data', 'Temp'))

In [3]:
class BeatportChartScraper(object):
    """
    Web scraper for acquiring data from Beatport DJ Charts.

    For every chart linked from a listing page it extracts a four-character
    date prefix (used as the publication year) and the genres of the listed
    tracks, then counts how often each genre occurs.
    """

    BASE_URL = 'https://www.beatport.com'
    # Seconds to wait for a response. Without a timeout ``requests`` may block
    # forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    def links(self, url):
        """
        Return the chart links found on a listing page.

        Parameters
        ----------
        url: str
            Address of the listing page

        Returns
        -------
        list of str or False
            Absolute chart URLs; False when the response status is not 2xx
        """
        r = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        if not 200 <= r.status_code < 300:
            return False
        soup = BeautifulSoup(r.content, 'html.parser')
        found = []
        for item in soup.select('li.bucket-item'):
            anchor = item.find('a')
            href = anchor.get('href') if anchor is not None else None
            if not href:
                # A bucket item without a usable link is skipped instead of
                # aborting the whole page.
                continue
            found.append(self.BASE_URL + href)
        return found

    def chart(self, link):
        """
        Extract data from a single chart page.

        Parameters
        ----------
        link: str
            Address of the chart page

        Returns
        -------
        pandas.Series or None
            Genre names, with the Series name set to the date prefix;
            None (after printing 'Invalid.') when the page cannot be parsed
        """
        r = requests.get(link, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(r.content, 'html.parser')
        try:
            # NOTE(review): presumably the first span.value holds the
            # publication date with the year first - confirm against the markup.
            # TypeError is caught too because `.string` may be None.
            date = soup.find('span', class_='value').string[:4]
            chart = []
            # The first matching element is skipped, presumably a header row
            # rather than a track - TODO confirm.
            for genre in soup.find_all('p', class_='buk-track-genre')[1:]:
                name = genre.find('a').string
                if name is None:
                    # An anchor without plain text carries no genre name.
                    continue
                # Plain str so the Series does not keep the parsed tree alive.
                chart.append(str(name))
        except (AttributeError, TypeError):
            print('Invalid.')
            return None
        return pd.Series(chart, dtype=str, name=date)

    def run(self, url):
        """
        Run the scraper for all links on a listing page.

        Parameters
        ----------
        url: str
            Address of the listing page

        Returns
        -------
        pandas.DataFrame
            Genre counts indexed by genre; the single column is named after
            the charts' shared date prefix. Empty when the page could not be
            fetched or no chart could be parsed.
        """
        links = self.links(url)
        if not links:
            # `links` returns False on a non-2xx response and may return [].
            print('No charts found.')
            return pd.DataFrame()
        charts = [c for c in (self.chart(link) for link in links) if c is not None]
        if not charts:
            print('No charts found.')
            return pd.DataFrame()
        data = pd.concat(charts)
        counts = data.value_counts()
        # pandas >= 2.0 names the result 'count' and moves the original name to
        # the index; set both explicitly so the column is the date prefix on
        # every pandas version.
        counts.name = data.name
        counts.index.name = None
        df = counts.to_frame()
        print('Done.')
        return df

    def scrape(self, year=None, start_page=1, pages=None, save=True, delay=3):
        """
        Main method used for scraping.
        Create a single DataFrame and write it to csv file.

        Parameters
        ----------
        year: int, required
            The publication year of charts (four digits)
        start_page: int, default 1
            Page number to start scraping from
        pages: int, required
            The number of pages to be scraped in given year
        save: bool, default True
            If False, returns a DataFrame without writing to csv
        delay: int or float, default 3
            Seconds to sleep after each page request

        Returns
        -------
        pandas.DataFrame
            Per-page genre counts concatenated in page order

        Raises
        ------
        AssertionError
            If year, start_page or pages is not a valid integer
        """
        # Validation stays as assertions so callers keep seeing AssertionError.
        # bool is excluded explicitly because it is a subclass of int.
        for label, value in (('year', year), ('start_page', start_page), ('pages', pages)):
            assert isinstance(value, int) and not isinstance(value, bool), \
                f'{label} must be an int, got {value!r}'
        assert 1000 <= year <= 9999, f'year must have four digits, got {year}'
        assert start_page >= 1, f'start_page must be >= 1, got {start_page}'
        assert pages >= 1, f'pages must be >= 1, got {pages}'

        dataframes = []
        page_numbers = range(start_page, start_page + pages)
        for done, i in enumerate(page_numbers, start=1):
            url = f'https://www.beatport.com/charts/all?page={i}&start-date={year}-01-01&end-date={year}-12-31'
            dataframes.append(self.run(url))
            # Progress is pages done out of pages requested, not the page number.
            print(f'{done}/{pages}')
            print(url)
            time.sleep(delay)
        df = pd.concat(dataframes)
        if not save:
            return df
        os.makedirs(TEMP_DIR, exist_ok=True)
        # NOTE(review): the file name omits start_page, so two runs with the
        # same year and page count overwrite each other.
        csv_path = os.path.join(TEMP_DIR, f'{year} {pages}.csv')
        df.to_csv(csv_path, index=True)
        print('Finished.')
        return df

In [4]:
# Create the scraper; the class takes no constructor arguments.
scraper = BeatportChartScraper()

In [5]:
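# Example run (kept commented out): charts from 2017, starting at listing page 50, one page.
# scraper.scrape(year=2017, start_page=50, pages=1)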

Done.
50/1
https://www.beatport.com/charts/all?page=50&start-date=2017-01-01&end-date=2017-12-31
Finished.


Unnamed: 0,2017
Tech House,60
Techno (Peak Time / Driving),54
Minimal / Deep Tech,24
House,18
Electronica,13
Big Room,12
Funky House,11
Deep House,10
Trance,7
Progressive House,6
