Just running some experiments in a notebook for convenience

In [None]:
import re
from datetime import datetime

import bs4
import requests

In [None]:
# Scheme + host of Metal Archives; used as the prefix when building request URLs.
MA_BASE_URL = 'https://www.metal-archives.com'

## How to get a list of all bands?

Based on https://github.com/jonchar/ma-scraper,
it looks like MA uses AJAX internally.

So browse by band alphabetically and open dev tools.  The Network tab shows the following request:
https://www.metal-archives.com/browse/ajax-letter/l/A/json/1?sEcho=1&iColumns=4&sColumns=&iDisplayStart=0&iDisplayLength=500&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2&mDataProp_3=3&iSortCol_0=0&sSortDir_0=asc&iSortingCols=1&bSortable_0=true&bSortable_1=true&bSortable_2=true&bSortable_3=false&_=1551813490451

This returns a JSON object with all the info used to populate the table.
It looks like we can use `iDisplayStart` and `iDisplayLength` to page through all the data.

Looks like `iDisplayLength` must be 500 and `iDisplayStart` must be a multiple of 500.
Otherwise metal-archives will just return the first page of data.
As Jon Charest noticed, we must also specify `sEcho` to get a valid JSON response.
Looks like the value can be just `''`.

In [None]:
def ma_band_list_url(letter):
    """
    Build the URL of the AJAX endpoint behind the alphabetical band list.

    `letter` picks the listing to browse; A-Z, NBR and ~ appear to be accepted.
    The query string (paging etc.) is not included and must be supplied separately.
    """
    pieces = [MA_BASE_URL, '/browse/ajax-letter/l/', str(letter), '/json/1']
    return ''.join(pieces)

# Some user agent has to be sent, otherwise the request is answered with a 403.
headers = {'user-agent': 'bot'}

# sEcho must be present (an empty value is enough); start at row 500 with 500 rows per page.
params = dict(sEcho='', iDisplayStart=500, iDisplayLength=500)

band_list_url = ma_band_list_url('A')
print(band_list_url)
response = requests.get(band_list_url, headers=headers, params=params)
print(response)

In [None]:
# Peek at the first entry of the 'aaData' array in the decoded JSON payload.
response.json()['aaData'][0]

In [None]:
# Inspect the HTTP headers that came back with the response.
response.headers

## Okay, now let's munge the response data a little bit.

In [None]:
# Hand-copied sample row: name link, country, genre, status markup.
band_info = [
    "<a href='https://www.metal-archives.com/bands/Abducted/3540381624'>Abducted</a>",
    'Spain',
    'Thrash Metal',
    '<span class="split_up">Split-up</span>',
]
band_info

In [None]:
# Only the name cell (index 0) and the status cell (index 3) contain markup to parse.
soup, soup2 = (bs4.BeautifulSoup(band_info[idx], 'html5lib') for idx in (0, 3))

In [None]:
# The name cell is a single link: its text is the band name, its href the band page.
name_link = soup.a
name, url = name_link.text, name_link.get('href')

# Country and genre are plain strings; the status text sits inside a <span>.
country, genre = band_info[1], band_info[2]
status = soup2.span.text

print(name, url, country, genre, status)

## Similar munging for the review JSON

In [None]:
# Hand-copied sample review row, one cell per line.
review_info = [
    'January 31',  # date (month name + day)
    '<a href="https://www.metal-archives.com/reviews/Toxik_Attack/Assassinos_em_S%C3%A9rie/746017/Cosmic_Mystery/407515" title="ole skool thrash metal!" class="iconContainer ui-state-default ui-corner-all"><span class="ui-icon ui-icon-search">Read</span></a>',  # review link
    '<a href="https://www.metal-archives.com/bands/Toxik_Attack/3540389184">Toxik Attack</a>',  # band link
    '<a href="https://www.metal-archives.com/albums/Toxik_Attack/Assassinos_em_S%C3%A9rie/746017">Assassinos em Série</a>',  # album link
    '67%',  # score
    '<a href="https://www.metal-archives.com/users/Cosmic%20Mystery" class="profileMenu">Cosmic Mystery</a>',  # reviewer link
    '23:18',  # time of day
]

In [None]:
# The row carries no year, so it still has to be supplied by hand.
year = 2019

# Read month, day, hour and minute from the row itself instead of hard-coding
# the month, so rows from any month are parsed correctly.  The year is made
# part of the parsed string so a "February 29" row is checked against it.
# NOTE(review): %B expects English month names, i.e. a C/English locale.
posted = datetime.strptime(f'{review_info[0]} {year} {review_info[6]}', '%B %d %Y %H:%M')
month, day, hour, minute = posted.month, posted.day, posted.hour, posted.minute

# Drop the percent sign explicitly rather than blindly cutting the last character.
score = int(review_info[4].rstrip('%'))

# Cells 1, 2, 3 and 5 each consist of a single <a> element.
review_link = bs4.BeautifulSoup(review_info[1], 'html5lib').a
band_link = bs4.BeautifulSoup(review_info[2], 'html5lib').a
album_link = bs4.BeautifulSoup(review_info[3], 'html5lib').a
reviewer_link = bs4.BeautifulSoup(review_info[5], 'html5lib').a

# The review's headline is stored in the link's title attribute.
review_url = review_link.get('href')
review = review_link.get('title')

band_url = band_link.get('href')
band = band_link.text

album_url = album_link.get('href')
album = album_link.text

reviewer_url = reviewer_link.get('href')
reviewer = reviewer_link.text

print(year, month, day, hour, minute, band, band_url, album, album_url, review, review_url, score, reviewer, reviewer_url)

In [None]:
def get_ID_from_band_URL(band_url):
    """
    Return the numeric band ID, i.e. the last path segment of a band URL.

    Trailing slashes are ignored, so '.../bands/Name/123' and
    '.../bands/Name/123/' both give 123.

    Raises ValueError if the last segment is not an integer.
    """
    # Without the rstrip a trailing '/' would leave an empty last segment.
    return int(band_url.rstrip('/').split('/')[-1])
# Sanity check: show the ID parsed from the URL currently held in `band_url`.
get_ID_from_band_URL(band_url)

## Now let's scrape a band page & relevant AJAX

In [None]:
# Band page to experiment with.  Another page to try:
#   'https://www.metal-archives.com/bands/Panopticon/126117'
band_url = 'https://www.metal-archives.com/bands/Cattle_Decapitation/2840'
headers = {'user-agent': 'bot'} # give a user agent so we don't 403
response = requests.get(band_url, headers=headers)
print(response.status_code)

# Name the parser explicitly so the resulting tree does not depend on which
# parsers happen to be installed; html5lib matches the other cells and always
# inserts <tbody>, which the table walks further down rely on.
soup = bs4.BeautifulSoup(response.text, 'html5lib')

### Get the band name (to eventually check that it matches what we requested)

In [None]:
# The band name is expected in exactly one <h1 class="band_name"> element.
band_name_list = soup.find_all('h1', class_='band_name')
assert len(band_name_list) == 1, 'should only have one band name header element'
(band_name_tag,) = band_name_list

# Show the raw tag, the link it wraps, and the visible name, one per line.
print(band_name_tag, band_name_tag.a.get('href'), band_name_tag.text, sep='\n')

### Get lyrical themes

In [None]:
#print(soup.body)
stats_div = soup.find('div', {'id': 'band_stats'})
right_stuff = stats_div.find('dl', 'float_right')
dds = right_stuff.find_all('dd')
themes = dds[1].text
print(themes)

### Get current label

In [None]:
# The label is taken from the third <dd>; normally it wraps a link to the label page.
label_dd = dds[2]
label_tag = label_dd.a

if label_tag is None:
    # No link inside the cell (presumably an unsigned band -- TODO confirm):
    # fall back to the cell's plain text and leave the URL empty.
    label = label_dd.text.strip()
    label_url = None
else:
    label = label_tag.text
    label_url = label_tag.get('href')

print(label, label_url)

### Get the added/modified dates

In [None]:
audit_div = soup.find('div', {'id': 'auditTrail'})
td_list = audit_div.find_all('td')
# Position-based lookup: third cell holds the added date, fourth the modified date.
added_on_td = td_list[2]
modified_on_td = td_list[3]

# Timestamps have the form YYYY-MM-DD HH:MM:SS.
date_matcher = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}')
added_on_matcher = re.compile(r'Added on: (\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})')

# search() instead of match(): the cell text may start with whitespace, and
# match() only succeeds at the very beginning of the string.
added_on_match = added_on_matcher.search(added_on_td.text)
added_on = added_on_match.group(1) if added_on_match else None
print(added_on)

# The modified cell is handled with the bare timestamp pattern so that no
# assumption about its label text is needed; None means no timestamp found.
modified_on_match = date_matcher.search(modified_on_td.text)
modified_on = modified_on_match.group(0) if modified_on_match else None
print(modified_on)

### Get band members

In [None]:
# The members box starts with a <ul> whose <li> items are the lineup tabs.
band_members_tag = soup.find('div', {'id': 'band_members'})
tab_links = band_members_tag.ul.find_all('li')

# Display the first tab's link (the original line ended in a dangling '.',
# which is a syntax error and kept the whole cell from running).
tab_links[0].a

In [None]:
complete_members_tag = soup.find('div', {'id': 'band_tab_members_all'}) # complete lineup
current_members_tag = soup.find('div', {'id': 'band_tab_members_current'}) # current lineup
#past_members_tag = soup.find('div', {'id': 'band_tab_members_past'}) # past lineup
#live_members_tag = soup.find('div', {'id': 'band_tab_members_live'}) # live lineup

# Complete members table has everybody (duh) and is delimited by Current, Past, Current (Live), Past (Live)
# We should be able to get everybody by iterating over the complete members table.
for row in complete_members_tag.table.tbody.find_all('tr'):
    # A row without a class attribute yields None from get(); treat it like
    # any other unknown row so the error below names the problem clearly.
    row_classes = row.get('class') or []
    c = row_classes[0] if row_classes else None

    if c == 'lineupHeaders':
        # Section header: trim it and collapse internal runs of whitespace.
        text = re.sub(r'\s+', ' ', row.td.text.strip())
        print(text)
    elif c == 'lineupRow':
        # Member row: the links in the first cell identify the artist.
        artist_tag = row.td
        print(artist_tag.find_all('a'))
    elif c == 'lineupBandsRow':
        # Intentionally ignored for now.
        pass
    else:
        raise NotImplementedError(f'unexpected lineup row class: {row_classes!r}')

### Get the band comment/"read-more", which requires a separate request

In [None]:
# The band comment is served from its own endpoint, keyed by the band ID.
band_id = get_ID_from_band_URL(band_url)
read_more_url = ''.join(['https://www.metal-archives.com/band/read-more/id/', str(band_id)])
response = requests.get(url=read_more_url, headers=headers)
print(response.status_code)

soup = bs4.BeautifulSoup(response.text, 'html5lib')
# First with the enclosing <body> tag ...
print(str(soup.body))
# ... then only what is inside it.
print(''.join(str(child) for child in soup.body.children))
# Probably worth keeping the whole body HTML for later: it can contain links
# (e.g. A. Lunn's artist page), so artist, band, etc. pages could be looked up from it.


### Get all similar artists by users' votes

In [None]:
band_id = get_ID_from_band_URL(band_url)

# showMoreSimilar=1 is passed through `params` rather than being appended to
# the URL by hand; response.url below shows the final URL that was requested.
similar_bands_url = ''.join(['https://www.metal-archives.com/band/ajax-recommendations/id/', str(band_id)])
params = dict(showMoreSimilar='1')
response = requests.get(url=similar_bands_url, params=params, headers=headers)

print(response.status_code, response.url, sep='\n')

In [None]:
# Name the parser explicitly: without it the choice depends on what is
# installed, and only html5lib guarantees the <tbody> that is accessed below.
soup = bs4.BeautifulSoup(response.text, 'html5lib')
table = soup.find('table', {'id': 'artist_list'})

for row in table.tbody.find_all('tr'):
    cells = row.find_all('td')
    if len(cells) < 4:
        # A row with fewer than four cells is not an artist row; stop there.
        break
    # First cell holds the artist, fourth cell the score.
    artist_tag = cells[0]
    score = cells[3].text
    print(artist_tag.text, score)

### Get discography

In [None]:
# Full discography ("all" tab) for the band, fetched from its own endpoint.
discog_url = f'https://www.metal-archives.com/band/discography/id/{band_id}/tab/all'
response = requests.get(url=discog_url, headers=headers)

# Anything other than a plain 200 aborts the run.
if response.status_code != 200:
    message = 'Got response status {}, bailing.'.format(response.status_code)
    raise RuntimeError(message)

In [None]:
soup = bs4.BeautifulSoup(response.text, 'html5lib')
table = soup.find('table', {'class': 'display discog'})

# Columns used per row: 0 = release link, 1 = release type, 2 = year.
for row in table.tbody.find_all('tr'):
    cells = row.find_all('td')

    album_tag = cells[0].a
    album, album_url = album_tag.text, album_tag.get('href')
    # TODO: album_id still has to be derived.

    type_str = cells[1].text
    year = int(cells[2].text)

    print(album, album_url, type_str, year)