Just running some experiments in a notebook for convenience

In [None]:
import re, requests

import bs4

In [None]:
# Root URL of the Metal Archives site, used as the prefix when building request URLs.
MA_BASE_URL = 'https://www.metal-archives.com'

## How to get a list of all bands?

Based on https://github.com/jonchar/ma-scraper,
it looks like MA uses AJAX internally.

So browse by band alphabetically and open dev tools.  The Network tab shows the following request:
https://www.metal-archives.com/browse/ajax-letter/l/A/json/1?sEcho=1&iColumns=4&sColumns=&iDisplayStart=0&iDisplayLength=500&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2&mDataProp_3=3&iSortCol_0=0&sSortDir_0=asc&iSortingCols=1&bSortable_0=true&bSortable_1=true&bSortable_2=true&bSortable_3=false&_=1551813490451

This returns a JSON object with all the info used to populate the table.
It looks like we can use `iDisplayStart` and `iDisplayLength` to page through all the data.

Looks like `iDisplayLength` must be 500 and `iDisplayStart` must be a multiple of 500.
Otherwise metal-archives will just return the first page of data.
As Jon Charest noticed, we must also specify `sEcho` to get a valid JSON response.
Looks like the value can be just `''`.

In [None]:
def ma_band_list_url(letter):
    """
    Return the GET request URL of the AJAX endpoint behind the alphabetical band list.

    `letter` selects which page of the alphabetical browse to fetch; A-Z, NBR
    and ~ appear to be the accepted values.  It is converted with str() before
    being placed in the path.
    """
    path_parts = ('browse', 'ajax-letter', 'l', str(letter), 'json', '1')
    return MA_BASE_URL + '/' + '/'.join(path_parts)

# Send a user agent; without one the site answers 403.
headers = {'user-agent': 'bot'}
# Paging parameters, per the notes above:
#   sEcho          - has to be present (an empty value is enough) to get valid JSON back
#   iDisplayStart  - row offset, a multiple of 500; 500 asks for the second page
#   iDisplayLength - page size, which has to be 500
params = {
    'sEcho': '',
    'iDisplayStart': 500,
    'iDisplayLength': 500,
}
# Build the URL once so the printed URL and the requested URL cannot drift apart.
band_list_url = ma_band_list_url('A')
print(band_list_url)
# requests waits forever by default; a timeout keeps a stalled connection
# from hanging the kernel.
response = requests.get(band_list_url, params=params, headers=headers, timeout=30)
print(response)

In [None]:
# Peek at the first entry of the 'aaData' field in the JSON payload.
response.json()['aaData'][0]

In [None]:
# Inspect the HTTP headers that came back with the response.
response.headers

## Okay, now let's munge the response data a little bit.

In [None]:
# Hard-coded sample row for experimenting with the parsing
# (presumably one 'aaData' entry from the band list response - confirm).
band_info = ["<a href='https://www.metal-archives.com/bands/Abducted/3540381624'>Abducted</a>",  # anchor: href is the band page, text is the band name
 'Spain',  # country
 'Thrash Metal',  # genre
 '<span class="split_up">Split-up</span>']  # status, wrapped in a span
band_info

In [None]:
# Only the first and last cells of the row contain markup; parse each one with html5lib.
soup, soup2 = (bs4.BeautifulSoup(band_info[idx], 'html5lib') for idx in (0, 3))

In [None]:
# The anchor in the first cell carries both the band name (its text) and the band page URL (its href).
name, url = soup.a.text, soup.a.get('href')
# Country and genre are the two plain-string cells in the middle of the row.
country, genre = band_info[1:3]
# The status label is the text of the span parsed from the last cell.
status = soup2.span.text
print(name, url, country, genre, status)

## Similar munging for the review JSON

In [None]:
# Hard-coded sample review row for experimenting with the parsing.  Cells by index:
#   0 - date as '<month name> <day>'
#   1 - anchor to the review; its title attribute holds the review title
#   2 - anchor to the band page; text is the band name
#   3 - anchor to the album page; text is the album name
#   4 - score as a percentage string
#   5 - anchor to the reviewer's profile; text is the user name
#   6 - time of day as 'HH:MM'
review_info = ['January 31',
               '<a href="https://www.metal-archives.com/reviews/Toxik_Attack/Assassinos_em_S%C3%A9rie/746017/Cosmic_Mystery/407515" title="ole skool thrash metal!" class="iconContainer ui-state-default ui-corner-all"><span class="ui-icon ui-icon-search">Read</span></a>',
               '<a href="https://www.metal-archives.com/bands/Toxik_Attack/3540389184">Toxik Attack</a>',
               '<a href="https://www.metal-archives.com/albums/Toxik_Attack/Assassinos_em_S%C3%A9rie/746017">Assassinos em Série</a>',
               '67%',
               '<a href="https://www.metal-archives.com/users/Cosmic%20Mystery" class="profileMenu">Cosmic Mystery</a>',
               '23:18']

In [None]:
# Year and month are filled in by hand; the day and the clock time come from the row itself.
year, month = 2019, 1
day = int(review_info[0].split()[1])
hour, minute = (int(part) for part in review_info[6].split(':'))

# The score cell looks like '67%': drop the last character before converting.
score = int(review_info[4][:-1])

# One soup per HTML cell.  Cell 4 is plain text, which is why there is no soup4.
soup, soup2, soup3, soup5 = (
    bs4.BeautifulSoup(review_info[idx], 'html5lib') for idx in (1, 2, 3, 5)
)

# Each anchor gives a URL (href) plus a label (title attribute for the review, link text otherwise).
review_url, review = soup.a.get('href'), soup.a.get('title')
band_url, band = soup2.a.get('href'), soup2.a.text
album_url, album = soup3.a.get('href'), soup3.a.text
reviewer_url, reviewer = soup5.a.get('href'), soup5.a.text

print(year, month, day, hour, minute, band, band_url, album, album_url, review, review_url, score, reviewer, reviewer_url)

In [None]:
def get_ID_from_band_URL(band_url):
    """
    Extract the numeric band ID from a band page URL.

    The ID is the last path segment, e.g.
    'https://www.metal-archives.com/bands/Panopticon/126117' -> 126117.
    A trailing slash, query string or fragment is ignored.

    Raises ValueError if the last path segment is not an integer.
    """
    # Drop any '#fragment' and '?query' so only the path part is inspected.
    path = band_url.split('#', 1)[0].split('?', 1)[0]
    # rstrip('/') makes '.../126117/' behave like '.../126117'.
    last_segment = path.rstrip('/').split('/')[-1]
    try:
        return int(last_segment)
    except ValueError:
        raise ValueError('no numeric band ID at the end of URL: %r' % (band_url,))
# Try it on the current band_url (the review row's band link when the notebook is run top to bottom).
get_ID_from_band_URL(band_url)

## Now let's scrape a band page

In [None]:
# Build the band page URL from the shared base so the domain is defined in one place.
band_url = MA_BASE_URL + '/bands/Panopticon/126117'
headers = {'user-agent': 'bot'} # give a user agent so we don't 403
# requests waits forever by default; a timeout keeps a stalled connection
# from hanging the kernel.
response = requests.get(band_url, headers=headers, timeout=30)

In [None]:
# Check the HTTP status code of the response.
print(response.status_code)
# Uncomment to dump the full response body:
#print(response.text)

In [None]:
# Name the parser explicitly: without it bs4 emits a warning and picks whichever
# parser happens to be installed, so the resulting tree can differ between machines.
# 'html5lib' matches the parser used elsewhere in this notebook.
soup = bs4.BeautifulSoup(response.text, 'html5lib')

Get the band name (to eventually check that it matches what we requested)

In [None]:
# Look up every <h1> whose CSS class is 'band_name'; exactly one is expected.
band_name_list = soup.find_all('h1', class_='band_name')
assert len(band_name_list) == 1, 'should only have one band name header element'
band_name_tag = band_name_list[0]
# Show the raw tag, the link it wraps, and its visible text.
print(band_name_tag)
print(band_name_tag.a.get('href'))
print(band_name_tag.get_text())

Get the band comment/"read-more"

In [None]:
# The read-more endpoint is addressed by the numeric band ID taken from the band URL.
band_id = get_ID_from_band_URL(band_url)
# Build on MA_BASE_URL so the domain is defined in one place.
read_more_url = MA_BASE_URL + '/band/read-more/id/' + str(band_id)
# requests waits forever by default; a timeout keeps a stalled connection
# from hanging the kernel.
response = requests.get(read_more_url, headers=headers, timeout=30)

In [None]:
# Check the HTTP status code of the response.
print(response.status_code)

In [None]:
# Name the parser explicitly: without it bs4 emits a warning and picks whichever
# parser happens to be installed.  html5lib always builds a full document with a
# <body> element, which the soup.body lookup that follows relies on; html.parser
# would not add one if the response is a bare fragment (unverified here).
soup = bs4.BeautifulSoup(response.text, 'html5lib')

In [None]:
# Display the <body> element of the parsed document.
soup.body