from __future__ import print_function
import logging
import os
import time
import urllib2
import re
import random
from bs4 import BeautifulSoup
from crawlers import ArchiveCrawler
from handler_db import MusicDatabase

logger = logging.getLogger(__name__)
def download_track(url, filename=None, path=None, force=False):
"""Downloads mp3 track and stores it in predefined path"""
logger.debug("Downloading mp3 from URL="+url)
# Append suffix if not present
    if filename is not None and not filename.lower().endswith(".mp3"):
        logger.debug("Appending '.mp3' to the name -> '%s.mp3'", filename)
        filename = os.path.splitext(filename)[0] + ".mp3"
# Make sure path exists
if not os.path.exists(path):
logger.debug("Creating path: "+str(path))
os.makedirs(path)
# Mimic browser (add header)
request = urllib2.Request(url)
request.add_header('User-agent', 'Mozilla/5.0')
opener = urllib2.build_opener()
try:
open_request = opener.open(request)
    except urllib2.HTTPError as e:
        logger.error("HTTP Error {}: {}".format(e.code, url))
        return False
    except urllib2.URLError as e:
        logger.error("URL Error {}: {}".format(e.reason, url))
        return False
    if filename is None:
        # Derive the file name from the Content-Disposition header,
        # falling back to the last segment of the URL
        cd = open_request.info().getheader('Content-Disposition') or ""
        quoted = re.findall('"(.+)"', cd)
        filename = quoted[0] if quoted else url.split('/')[-1]
    logger.info("Saving to file: %s", filename)
filepath = os.path.join(path, filename)
# Check if file exists
if not force and os.path.isfile(filepath):
logger.info("File already exists. Skipping.")
return True
    with open(filepath, 'wb') as f:
        logger.debug("Writing to file")
        # Reuse the response that is already open instead of fetching twice
        f.write(open_request.read())
return True
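
# Illustrative use of download_track (a sketch only; the URL and item name
# below are placeholders, not a real Archive.org resource):
#
#   download_track("https://archive.org/download/some-item/some-track.mp3",
#                  filename="some-track", path=os.path.join("Data", "Jazz"))
#
# With filename=None the name is taken from the Content-Disposition header
# of the response, or from the last segment of the URL as a fallback.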
def run():
genres = ["Soul-RB", "Blues", "Classical", "Country",
"Electronic", "Hip-Hop", "Instrumental", "Jazz",
"Pop", "Rock"]
db = MusicDatabase()
db.create()
for genre in genres:
logger.info("Current genre: "+genre)
crawler = ArchiveCrawler(genre)
tracks = crawler.get_track()
current_track_num = 0
        while current_track_num < MAX_TRACK_NUM:
current_track_num += 1
            # Get metadata of the next track; stop if the crawler runs out
            try:
                track = next(tracks)
            except StopIteration:
                logger.info("No more tracks for genre: %s", genre)
                break
            logger.info("Track: %s - %s (%s)", track['artist'],
                        track['name'], track['album'])
            # Download the mp3 and save it to a file
            url = track['url']
            filename = track['name'].replace(" ", "-")
            save_path = os.path.join("Data", genre)
            success = download_track(url, filename=filename, path=save_path)
if success:
db.add_track(track)
# Be nice! Don't block traffic!
sleep_time = SLEEP_TIME + random.random()*RANDOM_SLEEP_TIME
time.sleep(sleep_time)
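
# Throttling settings: each download is followed by a pause of SLEEP_TIME
# seconds plus up to RANDOM_SLEEP_TIME seconds of random jitter, and at most
# MAX_TRACK_NUM tracks are fetched per genre.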
SLEEP_TIME = 1.0 # s
RANDOM_SLEEP_TIME = 1.0 # s
MAX_TRACK_NUM = 200
if __name__ == "__main__":
    logging.basicConfig(filename='scraper.log', level=logging.DEBUG,
                        format='%(asctime)s %(module)s [%(levelname)s] %(message)s')
try:
run()
    except Exception:
        logger.exception("Something went horribly wrong, "
                         "but it worked on my machine...")
else:
logger.info("Finished successfully")