Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
85 lines (57 sloc) 2.17 KB
import logging
import requests
from bs4 import BeautifulSoup
class ArchiveCrawler(object):
    """
    Crawls through FreeMusicArchive.org.

    Main purpose is to return information about tracks based
    on genre. Listing pages are fetched one at a time; ``page`` holds
    the number of the next page that will be requested.
    """

    # Base address of the genre listings; the genre name and the page
    # number are appended to it when a page is requested.
    url = "http://freemusicarchive.org/genre/"
    # Seconds to wait for the server before giving up, so that a stalled
    # connection cannot block the crawler forever.
    timeout = 30
    logger = logging.getLogger(__name__)

    def __init__(self, genre):
        """Prepare a crawler for ``genre``, starting at the first page."""
        self.genre = genre
        # Lazy %-style arguments: the message is only built when the
        # record is actually emitted.
        self.logger.info("Setting up ArchiveCrawler for '%s' genre", genre)
        self.page = 1

    @staticmethod
    def _clean_text(tag):
        """Return the stripped text of ``tag``, or None if ``tag`` is None."""
        if tag is None:
            return None
        return tag.get_text().strip()

    def _page_url(self):
        """Build the address of the listing page ``self.page`` for the genre."""
        # Strip the trailing slash of the base so that the result does not
        # contain an empty path segment ("genre//<name>/").
        base = self.url.rstrip('/')
        return "{base}/{genre}/?page={page}".format(
            base=base, genre=self.genre, page=self.page)

    def _tracks_on_page(self, soup):
        """Extract all tracks found in ``soup`` and return them as a list.

        Every track is a dict with the keys ``url``, ``artist``, ``album``,
        ``name`` and ``genre``. A value is None when the corresponding
        element is missing from the track's markup, so that one incomplete
        entry does not abort the extraction of the whole page.
        """
        all_tracks = []
        for track_meta in soup.select("div[class^=play-item]"):
            download_link = track_meta.find(title="Download")
            track = {}
            track['url'] = (download_link.get('href')
                            if download_link is not None else None)
            track['artist'] = self._clean_text(
                track_meta.find(class_="ptxt-artist"))
            track['album'] = self._clean_text(
                track_meta.find(class_="ptxt-album"))
            track['name'] = self._clean_text(
                track_meta.find(class_="ptxt-track"))
            track['genre'] = self._clean_text(
                track_meta.find(class_="ptxt-genre"))
            all_tracks.append(track)
        return all_tracks

    def get_all_tracks(self):
        """Fetch the current page and return the list of its tracks.

        On success ``self.page`` is advanced by one, so the next call
        fetches the following page. The list is empty when the page
        contains no track entries.

        Raises:
            ValueError: if the server answers with a status other than 200.
        """
        url = self._page_url()
        r = requests.get(url, timeout=self.timeout)
        if r.status_code != 200:
            msg = "Problem while reading url: " + url
            self.logger.error(msg)
            raise ValueError(msg)
        soup = BeautifulSoup(r.text, 'html.parser')
        tracks = self._tracks_on_page(soup)
        # Only advance after the page was read and parsed successfully.
        self.page += 1
        return tracks

    def get_track(self):
        """Return a generator over tracks, fetching pages on demand.

        The generator ends at the first page that yields no tracks.
        """
        while True:
            all_tracks = self.get_all_tracks()
            # get_all_tracks() returns a list, never None, so an empty
            # list is the signal that there is nothing left to yield.
            if not all_tracks:
                break
            for track in all_tracks:
                yield track

    def get_track_url(self):
        """Fetch the current page and store its tracks on the instance.

        The tracks are kept in ``self.tracks_per_page``; nothing is
        returned.
        """
        self.tracks_per_page = self.get_all_tracks()