In [1]:
import requests
from bs4 import BeautifulSoup
import os
import sys
import time
import json
import re

In [8]:
class DMjazzCrawler():
    BASE_URL = 'http://www.bushgrafts.com/jazz'
    ROOT = 'archive'
    
    def __init__(self, sleep_time=0.1, log=True):
        self.sleep_time = sleep_time
        self.log = log

    def _request_url(self, url, doctype='html'):
        # set header
        try:
            response = requests.get(url, headers = {"Cache-Control":"max-age=0"})
        except:
            return None
        
        # sleep
        time.sleep(self.sleep_time)

        # return
        if doctype =='html':
            soup = BeautifulSoup(response.text, 'html.parser')
            return soup
        elif  doctype =='content':
            return response.content
        else:
            return response

    def _log_print(self, log, quite=False):
        if not quite:
            print(log)

        if self.log:
            with open("log.txt", "a") as f:
                print(log, file=f)
    
    def fetch_song(self):
        self.soup = self._request_url('http://bushgrafts.com/midi/')
        a_list = dmc.soup.find_all('a')
        midi_list = []
        name_list = []

        cnt = 0
        for idx, a in enumerate(a_list):
            str_ = a.get('href')
            if str_ and (str_ not in midi_list) and ('.mid' in str_):
                song_name = re.sub( '\s+', ' ' , a.text.replace( '\r\n' ,  '' )).strip(' ')
                
                if song_name:
                    midi_fn = str_.split('/')[-1]
                    midi_list.append(midi_fn )
                    name_list.append(song_name)
                    print('%3d | %-40s %s'%(idx, song_name, midi_fn))
                    cnt += 1
                    
#                     print(str_)
#                     print(a.text)
#                     print(song_name)
#                     break

        self._log_print('Total: %d'%cnt)
        
        return dict(zip(midi_list, name_list))
    
    def crawl_song(self, song_dict):
        for idx, k in enumerate(song_dict.keys()):
   
            url = self.BASE_URL + '/Midi%20site/' + k
            print('%3d %s' %(idx, url))
        
            time.sleep(5) 
            content = self._request_url(url, doctype='response')
            
            if content is None:
                url = self.BASE_URL + '/' + k
                print('%3d %s' %(idx, url))
                time.sleep(1) 
                content = self._request_url(url, doctype='response')
                if content is None:
                    break
            
#             with open(os.path.join(self.ROOT,k), "wb") as f:
#                 f.write(content)
            
            print(os.path.join(self.ROOT,k))
            with open(os.path.join(self.ROOT,k), 'wb') as target:
                for chunk in content.iter_content(chunk_size=128):
                    target.write(chunk)
            
    def run(self):
        
        song_dict = self.fetch_song()
        
        if not os.path.exists(self.ROOT):
            os.makedirs(self.ROOT)
        with open(os.path.join(self.ROOT, 'archive.json'), "w") as f:
                json.dump(song_dict, f)
        
        self.crawl_song(song_dict)
        
    def crawl_song_continue(self, song_dict):
        for idx, k in enumerate(song_dict.keys()):
           
            if idx < 170:
                continue
            
            url = self.BASE_URL + '/Midi%20site/' + k
            print('%3d %s' %(idx, url))
        
            time.sleep(5) 
            content = self._request_url(url, doctype='response')
            
            if content is None:
                url = self.BASE_URL + '/' + k
                print('%3d %s' %(idx, url))
                time.sleep(1) 
                content = self._request_url(url, doctype='response')
                if content is None:
                    break
            
#             with open(os.path.join(self.ROOT,k), "wb") as f:
#                 f.write(content)
            
            print(os.path.join(self.ROOT,k))
            with open(os.path.join(self.ROOT,k), 'wb') as target:
                for chunk in content.iter_content(chunk_size=128):
                    target.write(chunk)
    
    def run_continue(self):
        
        song_dict = self.fetch_song()
        
        if not os.path.exists(self.ROOT):
            os.makedirs(self.ROOT)
        with open(os.path.join(self.ROOT, 'archive.json'), "w") as f:
                json.dump(song_dict, f)
        
        self.crawl_song_continue(song_dict)


In [40]:
dmc = DMjazzCrawler()
dmc.run()

 42 | A Fine Romance – take 2                  afine-2.mid
 43 | A Ghost Of A Chance                      Aghostofachance.mid
 44 | A House Is Not A Home                    AHouseis.mid
 45 | A Nightingale Sang…                      Anighting.mid
 46 | A Remark You Made                        ARemark.mid
 47 | A Sleepin’ Bee                           A%20Sleepin'%20Bee.mid
 48 | After You’ve Gone                        AfterYou.mid
 49 | Alfie                                    alfiepno.mid
 50 | Alice In Wonderland                      AliceInWonderland.mid
 51 | All The Things You Are                   AllTheThings%20V2.mid
 52 | All The Things You Are/2                 All%20The%20Things%20You%20Are.mid
 53 | All The Things Reharmonized              AllTheThings%20Reharmonized.mid
 54 | Alone Together (trio)                    Alone%20Together.mid
 55 | Answer Me My Love                        Answer%20me%20My%20Love.mid
 56 | Ask Me Now (Monk)                        Ask%20Me%20Now%

archive/afine-2.mid
  1 http://www.bushgrafts.com/jazz/Midi%20site/Aghostofachance.mid
archive/Aghostofachance.mid
  2 http://www.bushgrafts.com/jazz/Midi%20site/AHouseis.mid
archive/AHouseis.mid
  3 http://www.bushgrafts.com/jazz/Midi%20site/Anighting.mid
archive/Anighting.mid
  4 http://www.bushgrafts.com/jazz/Midi%20site/ARemark.mid
  4 http://www.bushgrafts.com/jazz/ARemark.mid
archive/ARemark.mid
  5 http://www.bushgrafts.com/jazz/Midi%20site/A%20Sleepin'%20Bee.mid
archive/A%20Sleepin'%20Bee.mid
  6 http://www.bushgrafts.com/jazz/Midi%20site/AfterYou.mid
archive/AfterYou.mid
  7 http://www.bushgrafts.com/jazz/Midi%20site/alfiepno.mid
archive/alfiepno.mid
  8 http://www.bushgrafts.com/jazz/Midi%20site/AliceInWonderland.mid
archive/AliceInWonderland.mid
  9 http://www.bushgrafts.com/jazz/Midi%20site/AllTheThings%20V2.mid
archive/AllTheThings%20V2.mid
 10 http://www.bushgrafts.com/jazz/Midi%20site/All%20The%20Things%20You%20Are.mid
archive/All%20The%20Things%20You%20Are.mid
 11 http:

archive/Green%20Dolph%20solo.mid
 85 http://www.bushgrafts.com/jazz/Midi%20site/Gymnopedie-It%20Never%20Entered%20My%20Mind.mid
archive/Gymnopedie-It%20Never%20Entered%20My%20Mind.mid
 86 http://www.bushgrafts.com/jazz/Midi%20site/Have%20You%20Met%20-%20duet.mid
archive/Have%20You%20Met%20-%20duet.mid
 87 http://www.bushgrafts.com/jazz/Midi%20site/Have%20You%20Met%20Miss%20Jones%20-%20latin.mid
archive/Have%20You%20Met%20Miss%20Jones%20-%20latin.mid
 88 http://www.bushgrafts.com/jazz/Midi%20site/HowkumU.mid
archive/HowkumU.mid
 89 http://www.bushgrafts.com/jazz/Midi%20site/How%20Deep%20is%20the%20Ocean%20(Doug%20McKenzie).mid
archive/How%20Deep%20is%20the%20Ocean%20(Doug%20McKenzie).mid
 90 http://www.bushgrafts.com/jazz/Midi%20site/howdsolo.mid
archive/howdsolo.mid
 91 http://www.bushgrafts.com/jazz/Midi%20site/hwdptrio.mid
archive/hwdptrio.mid
 92 http://www.bushgrafts.com/jazz/Midi%20site/Hymn%20To%20Freedom.mid
archive/Hymn%20To%20Freedom.mid
 93 http://www.bushgrafts.com/jazz/Midi

archive/Mood%20Indigo%20-%20solo.mid
166 http://www.bushgrafts.com/jazz/Midi%20site/Moon%20and%20Sand.mid
archive/Moon%20and%20Sand.mid
167 http://www.bushgrafts.com/jazz/Midi%20site/Moon%20River%203.mid
archive/Moon%20River%203.mid
168 http://www.bushgrafts.com/jazz/Midi%20site/moonlightinvermont.mid
archive/moonlightinvermont.mid
169 http://www.bushgrafts.com/jazz/Midi%20site/MyFoolishHeart.mid
archive/MyFoolishHeart.mid
170 http://www.bushgrafts.com/jazz/Midi%20site/MyFunny3.mid
170 http://www.bushgrafts.com/jazz/MyFunny3.mid


In [9]:
dmc = DMjazzCrawler()
dmc.run_continue()

 42 | A Fine Romance – take 2                  afine-2.mid
 43 | A Ghost Of A Chance                      Aghostofachance.mid
 44 | A House Is Not A Home                    AHouseis.mid
 45 | A Nightingale Sang…                      Anighting.mid
 46 | A Remark You Made                        ARemark.mid
 47 | A Sleepin’ Bee                           A%20Sleepin'%20Bee.mid
 48 | After You’ve Gone                        AfterYou.mid
 49 | Alfie                                    alfiepno.mid
 50 | Alice In Wonderland                      AliceInWonderland.mid
 51 | All The Things You Are                   AllTheThings%20V2.mid
 52 | All The Things You Are/2                 All%20The%20Things%20You%20Are.mid
 53 | All The Things Reharmonized              AllTheThings%20Reharmonized.mid
 54 | Alone Together (trio)                    Alone%20Together.mid
 55 | Answer Me My Love                        Answer%20me%20My%20Love.mid
 56 | Ask Me Now (Monk)                        Ask%20Me%20Now%

archive/MyFunny3.mid
171 http://www.bushgrafts.com/jazz/Midi%20site/funny%20val%20solo.mid
archive/funny%20val%20solo.mid
172 http://www.bushgrafts.com/jazz/Midi%20site/myheartstoodstill%20edited%20a%20bit.mid
archive/myheartstoodstill%20edited%20a%20bit.mid
173 http://www.bushgrafts.com/jazz/Midi%20site/ManGone.mid
archive/ManGone.mid
174 http://www.bushgrafts.com/jazz/Midi%20site/My%20Old%20FlameGM.mid
archive/My%20Old%20FlameGM.mid
175 http://www.bushgrafts.com/jazz/Midi%20site/Myoneand.mid
archive/Myoneand.mid
176 http://www.bushgrafts.com/jazz/Midi%20site/My%20Romance.mid
archive/My%20Romance.mid
177 http://www.bushgrafts.com/jazz/Midi%20site/My%20Shining%20Hour.mid
archive/My%20Shining%20Hour.mid
178 http://www.bushgrafts.com/jazz/Midi%20site/MyShiningHoursolo.mid
archive/MyShiningHoursolo.mid
179 http://www.bushgrafts.com/jazz/Midi%20site/MyShip.mid
archive/MyShip.mid
180 http://www.bushgrafts.com/jazz/Midi%20site/Nardis.mid
archive/Nardis.mid
181 http://www.bushgrafts.com/jazz/

archive/The%20End%20of%20a%20Love%20Affair.mid
255 http://www.bushgrafts.com/jazz/Midi%20site/The%20folks%20who%20live%20on%20the%20hill.mid
archive/The%20folks%20who%20live%20on%20the%20hill.mid
256 http://www.bushgrafts.com/jazz/Midi%20site/TheManThatGotAway.mid
archive/TheManThatGotAway.mid
257 http://www.bushgrafts.com/jazz/Midi%20site/The%20Masquerade%20is%20Over.mid
archive/The%20Masquerade%20is%20Over.mid
258 http://www.bushgrafts.com/jazz/Midi%20site/moreicu.mid
archive/moreicu.mid
259 http://www.bushgrafts.com/jazz/Midi%20site/McKenzie-ThePeacocks.mid
archive/McKenzie-ThePeacocks.mid
260 http://www.bushgrafts.com/jazz/Midi%20site/Peanut%20Vendor%20(El%20Manisero).mid
archive/Peanut%20Vendor%20(El%20Manisero).mid
261 http://www.bushgrafts.com/jazz/Midi%20site/TheSongisYoutrio.mid
archive/TheSongisYoutrio.mid
262 http://www.bushgrafts.com/jazz/Midi%20site/Summer%20wind%202.mid
archive/Summer%20wind%202.mid
263 http://www.bushgrafts.com/jazz/Midi%20site/The%20Way%20You%20look%20t

In [None]:
dmc = DMjazzCrawler()
dmc.run_continue()