# Installs

In [None]:
pip install requests

In [None]:
pip install beautifulsoup4

# Imports

In [None]:
import re
import json
from requests import get
from bs4 import BeautifulSoup
from pprint import pprint as pp

# Setup

In [None]:
# Pinned gist revision whose JSON payload supplies the country codes that the
# scrape loop iterates over.
COUNTRY_CODES_URL = "https://gist.githubusercontent.com/jgphilpott/e49c4f53384c8e6528a4762218797355/raw/0a8d9c4e9970d2ce37810cfd7807105c7efd94b5/geoNamesCodes.js"

# Use a timeout so a stalled connection cannot hang the notebook, and check the
# HTTP status so a failed download is reported as such instead of surfacing
# later as a confusing JSON decoding error.
_codes_response = get(COUNTRY_CODES_URL, timeout=30)
_codes_response.raise_for_status()
countryCodes = json.loads(_codes_response.content.decode("utf-8"))

# Base search URL plus the query-string fragments appended to every request.
api = "https://www.geonames.org/advanced-search.html?q=museum"
feature_class = "&featureClass=S"
max_rows = "&maxRows=500"

# Scrape Raw Data

In [None]:
# Accumulates one dict per scraped museum, across all countries.
data = []

# Seconds to wait for a response before giving up on a request; without a
# timeout a single stalled connection would hang the whole scrape.
REQUEST_TIMEOUT = 30

# Paging step; must agree with the maxRows value carried in `max_rows`.
PAGE_SIZE = 500

# Highest startRow offset that is still requested.
# NOTE(review): presumably mirrors a paging limit of the search page - confirm.
MAX_START_ROW = 5000


def _parse_museum_row(row, museum_id):
    """Convert one result-table row into a museum record.

    Args:
        row: A BeautifulSoup ``<tr>`` tag taken from the results table.
        museum_id: Sequential identifier stored under the ``"id"`` key.

    Returns:
        A dict with the keys ``id``, ``name``, ``wiki``, ``country``, ``zone``,
        ``sub_zone``, ``feature``, ``elevation_m``, ``latitude`` and
        ``longitude``. Missing text fields are ``""`` and a missing elevation
        is ``None``, so every record has the same set of keys.
    """
    # The first <td> of each row is skipped; the remaining cells hold the data.
    cells = row.find_all("td")[1:]

    museum = {"id": museum_id}

    # First link in the name cell is the place name, an optional second link
    # is stored as the wiki URL.
    name_links = cells[0].find_all("a")
    museum["name"] = name_links[0].text.strip() if len(name_links) >= 1 else ""
    museum["wiki"] = name_links[1]["href"].strip() if len(name_links) >= 2 else ""

    country_links = cells[1].find_all("a")
    museum["country"] = country_links[0].text.strip() if len(country_links) >= 1 else ""

    # Default both zone fields up front so "sub_zone" is present even when the
    # cell carries no zone information at all.
    museum["zone"] = ""
    museum["sub_zone"] = ""
    location_parts = cells[1].contents
    if len(location_parts) >= 2:
        # Strip punctuation from the text node that follows the country link.
        museum["zone"] = re.sub(r"[^\w\s]", "", location_parts[1]).strip()
        if len(location_parts) >= 4:
            museum["sub_zone"] = location_parts[3].text.strip()

    feature_parts = cells[2].contents
    if len(feature_parts) >= 1 and str(feature_parts[0]) != "<br/>":
        museum["feature"] = feature_parts[0].strip()
    else:
        museum["feature"] = ""

    museum["elevation_m"] = None
    if len(feature_parts) > 3:
        digits = "".join(char for char in feature_parts[2].text if char.isdigit())
        # int("") raises ValueError; an elevation without digits is recorded
        # as missing rather than aborting the rest of the country.
        if digits:
            museum["elevation_m"] = int(digits)

    museum["latitude"] = cells[3].text.strip()
    museum["longitude"] = cells[4].text.strip()

    return museum


for countryCode in countryCodes:

    country = "&country={}".format(countryCode)
    base_url = api + feature_class + max_rows + country

    # Reset for every country so the log messages below always have a valid,
    # current offset (it used to be undefined or stale outside the data branch).
    row_count = 0

    try:

        soup = BeautifulSoup(get(base_url, timeout=REQUEST_TIMEOUT).content.decode("utf-8"))

        no_records = soup.find_all(text="no records found in geonames database, showing wikipedia results")

        # Only a page without the "no records" notice and with at least two
        # "restable" tables is treated as having results; the second table is
        # the one parsed below.
        if no_records or len(soup.find_all("table", class_="restable")) < 2:
            print("No data at: " + base_url + "&startRow={}".format(row_count))
            continue

        # The total record count is taken from the digits of the <small> summary text.
        meta = soup.find("small").text
        records = int("".join(char for char in meta if char.isdigit()))

        while row_count < records and row_count <= MAX_START_ROW:

            page_url = base_url + "&startRow={}".format(row_count)
            print("Scraping: " + page_url)

            soup = BeautifulSoup(get(page_url, timeout=REQUEST_TIMEOUT).content.decode("utf-8"))

            table = soup.find_all("table", class_="restable")[1]

            # Skip the first two rows and the last row of the table; only the
            # rows in between are parsed as records.
            for row in table.find_all("tr")[2:-1]:
                data.append(_parse_museum_row(row, len(data) + 1))

            row_count += PAGE_SIZE

    except Exception as error:

        # Best effort: report the failing country/offset together with the
        # cause and move on. Catching Exception (not a bare except) keeps
        # Ctrl-C / kernel interrupts working.
        print("Error at: " + base_url + "&startRow={}".format(row_count) + " ({!r})".format(error))

print("Done!")

# Save Raw Data

In [None]:
# Persist every scraped record as indented JSON with alphabetically sorted keys.
with open("raw_museum_data.json", mode="w") as file:
    file.write(json.dumps(data, sort_keys=True, indent=2))