In [1]:
import requests
from bs4 import BeautifulSoup
import json
import re
import numpy as np
import pandas as pd

In [2]:
def Test_Crawl(url):
    """Debug helper: fetch ``url`` and print its raw page-state payload.

    Downloads the page, reads the ``data-content`` attribute of the first
    ``<div class="js-store">`` element, checks that it decodes as JSON, and
    prints the undecoded attribute string. Returns ``None``.
    """
    page_source = requests.get(url).text
    store_div = BeautifulSoup(page_source, 'html.parser').find('div', {'class': 'js-store'})
    raw_payload = store_div['data-content']
    # Decoded only so that a malformed payload raises here; the result is unused.
    json.loads(raw_payload)
    print(raw_payload)

In [None]:
# Smoke-test the helper on a single tab page (performs a live HTTP request).
Test_Crawl('https://tabs.ultimate-guitar.com/tab/tom-odell/another-love-chords-1198980')

In [2]:
def Crawl_One(url):
    """Fetch one tab page and extract the song's metadata and chord markup.

    Args:
        url: Address of a tab page whose HTML contains a
            ``<div class="js-store">`` element carrying a JSON
            ``data-content`` attribute.

    Returns:
        dict with the keys ``title``, ``artist``, ``key``, ``chords`` (the
        raw ``wiki_tab`` content string, markup included) and ``capo``
        (``''`` when the page metadata has no capo entry).

    Raises:
        requests.RequestException: on network failure, timeout, or an HTTP
            error status.
        KeyError, TypeError: when the page lacks the expected structure
            (e.g. no ``js-store`` element or a missing JSON field).
    """
    # Without a timeout a single stalled connection would block the whole crawl.
    response = requests.get(url, timeout=30)
    # Fail fast on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    data = soup.find('div', { 'class' : 'js-store' })['data-content']
    page_data = json.loads(data)['store']['page']['data']
    tab = page_data['tab']
    tab_view = page_data['tab_view']
    data_new = {
        'title': tab['song_name'],
        'artist': tab['artist_name'],
        'key': tab['tonality_name'],
        'chords': tab_view['wiki_tab']['content']
    }
    try:
        data_new['capo'] = tab_view['meta']['capo']
    except (KeyError, TypeError):
        # Capo is optional: 'meta' may be absent, lack the key, or not be a
        # mapping at all. Only those lookup failures are treated as "no capo";
        # anything else (including KeyboardInterrupt) propagates.
        data_new['capo'] = ''
    return data_new

In [3]:
def Parse_Links(stop, decade):
    """Collect tab URLs from the first ``stop`` pages of a decade's chord listing.

    Args:
        stop: Number of the last listing page to read; pages ``1..stop`` are
            requested in order. A value below 1 requests nothing.
        decade: Value placed in the ``decade[]`` query parameter (e.g. ``2010``).

    Returns:
        list of the ``tab_url`` values found on the requested pages, in page
        order.

    Raises:
        requests.RequestException: on network failure, timeout, or an HTTP
            error status.
        KeyError, TypeError: when a listing page lacks the expected structure.
    """
    links = []
    for i in range(1, stop+1):
        url = f"https://www.ultimate-guitar.com/explore?decade[]={decade}&page={i}&type[]=Chords"
        # Without a timeout a single stalled connection would block the whole run.
        response = requests.get(url, timeout=30)
        # Fail fast on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        data = soup.find('div', { 'class' : 'js-store' })['data-content']
        data_json = json.loads(data)
        links.extend(tab['tab_url'] for tab in data_json['store']['page']['data']['data']['tabs'])
        print(f"Parsed links for page {i}. Current number of links: {len(links)}")
    return links

In [4]:
def Parse_Song(song):
    """Extract chord names from bracket-tagged tab markup, grouped into lines.

    The markup is scanned one character at a time:

    * Text between ``[ch]`` and ``[/ch]`` is one chord. Spaces inside it are
      dropped; ``/`` is kept (e.g. ``D/F#``).
    * Each ``[/tab]`` ends the current line. Chords seen since the previous
      line end (including any that appeared before the ``[tab]``) form that
      line. Lines without chords are skipped.
    * Chords still pending when the input ends are emitted as a final line,
      so songs whose last chords are not wrapped in ``[tab]`` are not
      truncated.
    * Everything else (lyrics, other tags such as section headers) is ignored.

    Args:
        song: The markup string.

    Returns:
        list of lists of chord-name strings, one inner list per line.
    """
    lines = []
    # True right after '[': the next character decides opening vs. closing tag.
    tag_verification_needed = False
    current_opening_tag = ''
    current_closing_tag = ''
    parsing_opening_tag = False
    parsing_closing_tag = False
    current_content = ''
    parsing_content = False
    current_line = []
    for char in song:
        if char == "[":
            tag_verification_needed = True
            parsing_content = False
        elif char == "]":  # a tag has just ended
            if parsing_opening_tag:  # it was an opening tag
                parsing_opening_tag = False
                if current_opening_tag == 'ch':
                    parsing_content = True
            else:  # it was a closing tag
                parsing_closing_tag = False
                if current_closing_tag == 'ch':  # a chord was closed
                    current_line.append(current_content)
                    current_opening_tag = ''
                    current_closing_tag = ''
                    current_content = ''
                elif current_closing_tag == 'tab':  # a tab block was closed
                    if current_line != []:
                        lines.append(current_line)
                    current_opening_tag = ''
                    current_closing_tag = ''
                    current_content = ''
                    current_line = []
                else:
                    # Unrecognised closing tag: forget its name so it is not
                    # prepended to the next closing tag (which would turn a
                    # later '[/ch]' into e.g. 'bch' and lose that chord).
                    current_closing_tag = ''
        elif char == "/":
            if tag_verification_needed:
                # '/' directly after '[' marks a closing tag.
                tag_verification_needed = False
                parsing_opening_tag = False
                parsing_closing_tag = True
            elif parsing_content:
                # Slash inside a chord name (e.g. a slash chord).
                current_content += char
        else:  # an ordinary character
            if char != ' ':
                if tag_verification_needed:
                    if len(current_opening_tag) > 0:
                        # A previous opening tag is still recorded: discard it
                        # and its content, then start parsing the new tag.
                        current_opening_tag = ''
                        current_content = ''
                        tag_verification_needed = False
                        parsing_opening_tag = True
                        current_opening_tag += char
                    else:
                        tag_verification_needed = False
                        parsing_opening_tag = True
                        parsing_closing_tag = False
                        current_opening_tag += char
                elif parsing_opening_tag:
                    current_opening_tag += char
                elif parsing_closing_tag:
                    current_closing_tag += char
                elif parsing_content:
                    current_content += char
    # Flush chords that were never followed by a '[/tab]'; previously they
    # were silently dropped.
    if current_line:
        lines.append(current_line)
    return lines

In [5]:
def Crawl(stop, decade):
    """Crawl every song linked from the first ``stop`` listing pages of ``decade``.

    Links come from ``Parse_Links(stop, decade)``. Each link is fetched with
    ``Crawl_One`` and its ``'chords'`` entry is replaced by the result of
    ``Parse_Song``. A song that raises any ``Exception`` is reported on
    stdout and skipped, so one bad page does not abort the run.

    Returns:
        list of the per-song dicts that were processed without error.
    """
    links = Parse_Links(stop, decade)
    total = len(links)
    collected = []
    for position, link in enumerate(links):
        try:
            print(f'Crawling {position} out of {total}...', end='')
            record = Crawl_One(link)
            record['chords'] = Parse_Song(record['chords'])
            print('Success.')
            collected.append(record)
        except Exception as err:
            # Best-effort: log the failing URL and reason, then continue.
            print('Failed.')
            print(f'Error at URL: {link}')
            print(str(err))
    return collected

In [6]:
# Crawl + save CSV file.
# Gathers links from listing pages 1-100 for decade 2010 and crawls each one;
# the result is a list of dicts, one per song that was crawled without error.
# NOTE: this issues one live HTTP request per listing page and per song.
raw_2010 = Crawl(100, 2010)
df_2010 = pd.DataFrame(raw_2010)
# Written relative to the current working directory. With no index argument,
# pandas also writes the row index as the first CSV column.
df_2010.to_csv('2010.csv')

Parsed links for page 1. Current number of links: 50
Parsed links for page 2. Current number of links: 100
Parsed links for page 3. Current number of links: 150
Parsed links for page 4. Current number of links: 200
Parsed links for page 5. Current number of links: 250
Parsed links for page 6. Current number of links: 300
Parsed links for page 7. Current number of links: 350
Parsed links for page 8. Current number of links: 400
Parsed links for page 9. Current number of links: 450
Parsed links for page 10. Current number of links: 500
Parsed links for page 11. Current number of links: 550
Parsed links for page 12. Current number of links: 600
Parsed links for page 13. Current number of links: 650
Parsed links for page 14. Current number of links: 700
Parsed links for page 15. Current number of links: 750
Parsed links for page 16. Current number of links: 800
Parsed links for page 17. Current number of links: 850
Parsed links for page 18. Current number of links: 900
Parsed links for pag