In [2]:
import csv
import io
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [117]:
# English-Wikipedia "list of countries by ..." pages, written as page slugs and
# expanded to full article URLs (same URLs, same order).
official_wiki_url_list = [
    "https://en.wikipedia.org/wiki/" + page_slug
    for page_slug in (
        "List_of_countries_by_total_renewable_water_resources",
        "List_of_countries_by_average_elevation",
        "List_of_countries_by_ecological_footprint",
        "List_of_countries_by_length_of_coastline",
        "List_of_countries_by_carbon_intensity_of_GDP",
        "List_of_countries_by_average_annual_precipitation",
        "List_of_countries_and_dependencies_by_area",
        "List_of_countries_and_dependencies_by_population",
        "List_of_countries_and_dependencies_by_population_density",
        "List_of_elevation_extremes_by_country",
    )
]

In [119]:
# URLs the scraping loop iterates over. "NULL" is a placeholder, not a real URL;
# replace it with the pages to (re-)download before running the loop.
wiki_url_list = ["NULL"]

In [120]:
def _wikitable_to_frame(table):
    """Convert a BeautifulSoup ``<table>`` tag into a DataFrame of stripped cell text.

    Column names come from the first row that contains only ``<th>`` cells
    (spaces replaced by underscores). If their count does not match the width
    of the data rows, the names are dropped and pandas' default integer
    columns are used instead of raising.
    """
    header = []
    rows = []
    for table_row in table.find_all("tr"):
        if not table_row.find_all("td"):
            # A row made only of <th> cells is a header row; the first one names the columns.
            if not header:
                header = [
                    th.get_text().strip().replace(" ", "_")
                    for th in table_row.find_all("th")
                ]
            continue
        # Row-header <th> cells are part of the data, and empty cells are kept
        # so that later values stay in their own column.
        cells = [cell.get_text().strip() for cell in table_row.find_all(["th", "td"])]
        if any(cells):
            rows.append(cells)

    width = max((len(cells) for cells in rows), default=0)
    if rows and len(header) != width:
        # Mismatched header (e.g. multi-row or spanning header cells): keep the data anyway.
        print(f"Warning: {len(header)} header cells for {width} columns; column names dropped")
        header = None
    return pd.DataFrame(rows, columns=header)


def backup_request(url):
    """Scrape the first ``wikitable`` of a page with BeautifulSoup and save it as CSV.

    Backup for pages that ``pandas.read_html`` cannot handle. The CSV is written
    to ``data/<page title with underscores>.csv``.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    None
        Best effort: any failure (download, missing heading/table, write error)
        is printed as ``Error: ...`` and swallowed so the caller can continue.
    """
    try:
        # Download the page; fail early on HTTP errors instead of parsing an error page
        page = requests.get(url, timeout=30)
        page.raise_for_status()
        soup = BeautifulSoup(page.content, "html.parser")

        # Use the page heading as the csv file name
        heading = soup.find("h1", id="firstHeading")
        if heading is None:
            raise ValueError(f"no firstHeading title found at {url}")
        # "/" would be read as a sub-directory separator in the output path
        title_html = heading.get_text().strip().replace(" ", "_").replace("/", "_")
        title = "data/" + title_html + ".csv"
        print(f"{url}, title: {title}")

        # Only the first table with class "wikitable" is exported
        table = soup.find("table", class_="wikitable")
        if table is None:
            raise ValueError(f"no wikitable found at {url}")

        df = _wikitable_to_frame(table)

        # to_csv does not create missing directories
        os.makedirs("data", exist_ok=True)
        df.to_csv(title, index=False)

    # Deliberate best effort: report the problem and let the caller move on
    except Exception as e:
        print(f"Error: {e}")

In [147]:
# Download every page in wiki_url_list, keep its largest table and save it as
# data/<Page_title>.csv. Pages pandas cannot parse are retried with backup_request().
dataframe = []  # tables successfully parsed by pandas, in processing order
os.makedirs("data", exist_ok=True)  # to_csv does not create missing directories
for url in wiki_url_list:
    try:
        # Fetch the page once; the same HTML feeds both pandas and BeautifulSoup
        page = requests.get(url, timeout=30)
        page.raise_for_status()

        # Raises ValueError when the page contains no parsable table
        tables = pd.read_html(io.StringIO(page.text))

        soup = BeautifulSoup(page.content, "html.parser")
        heading = soup.find("h1", id="firstHeading")
        if heading is None:
            raise ValueError(f"no firstHeading title found at {url}")
        # "/" would be read as a sub-directory separator in the output path
        title_html = heading.get_text().strip().replace(" ", "_").replace("/", "_")
        title = "data/" + title_html + ".csv"
        print(f"{url}, title: {title}")

        # The table with the most rows is taken to be the main data table
        # (first one wins on ties)
        df = max(tables, key=len)
        dataframe.append(df)
        df.to_csv(title, index=False)

    except ValueError as e:
        # pandas could not handle the page: fall back to the BeautifulSoup scraper
        print(f"Error: {e}")
        try:
            backup_request(url)
        except Exception as backup_error:
            print(backup_error)
    except OSError as e:
        # Network or file-system failure: report it and continue with the next URL
        print(f"Error: {e}")

https://en.wikipedia.org/wiki/List_of_countries_by_ecological_footprint, title: data/List_of_countries_by_ecological_footprint.csv
https://en.wikipedia.org/wiki/List_of_countries_by_carbon_intensity_of_GDP, title: data/List_of_countries_by_carbon_intensity_of_GDP.csv
https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_area, title: data/List_of_countries_and_dependencies_by_area.csv
