In [1]:
import re
from datetime import datetime as dt
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as BS

In [2]:
# Download the Wikipedia list of US national parks.
# The timeout keeps a stalled connection from hanging the kernel indefinitely.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_national_parks_of_the_United_States',
    timeout=30,
)
response.raise_for_status()  # fail here on an HTTP error instead of parsing an error page
response.status_code

200

In [3]:
# Parse the downloaded HTML. Naming the parser explicitly keeps the result the same
# on every machine (bs4 otherwise picks whichever parser is installed and warns).
soup = BS(response.text, 'html.parser')

In [4]:
# Collect every <table> whose class attribute is exactly 'wikitable sortable'.
states_table = soup.find_all('table', class_='wikitable sortable')

In [5]:
# Parse the first matching table's HTML into a DataFrame.
# The markup is wrapped in StringIO because passing a literal HTML string
# to read_html is deprecated in recent pandas releases.
parks_by_state = pd.read_html(StringIO(str(states_table[0])))[0]

In [6]:
# Preview the first five rows of the per-state table.
parks_by_state.head(n=5)

Unnamed: 0,State,Total parks,Exclusive parks,Shared parks
0,California,9,8,1
1,Alaska,8,8,—
2,Utah,5,5,—
3,Colorado,4,4,—
4,Arizona,3,3,—


In [7]:
# Turn the em-dash placeholder into a real missing value.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
parks_by_state = parks_by_state.replace('—', np.nan)

In [8]:
# Display the frame to confirm the '—' placeholders now appear as missing values.
parks_by_state

Unnamed: 0,State,Total parks,Exclusive parks,Shared parks
0,California,9,8.0,1.0
1,Alaska,8,8.0,
2,Utah,5,5.0,
3,Colorado,4,4.0,
4,Arizona,3,3.0,
5,Florida,3,3.0,
6,Washington,3,3.0,
7,Hawaii,2,2.0,
8,New Mexico,2,2.0,
9,South Dakota,2,2.0,


In [9]:
# Swap the human-readable headers for snake_case identifiers, then show the new labels.
parks_by_state = parks_by_state.rename(
    columns={
        'State': 'state',
        'Total parks': 'total_parks',
        'Exclusive parks': 'exclusive_parks',
        'Shared parks': 'shared_parks',
    }
)
parks_by_state.columns

Index(['state', 'total_parks', 'exclusive_parks', 'shared_parks'], dtype='object')

In [42]:
# Persist the per-state summary without the row index.
parks_by_state.to_csv(
    path_or_buf='../data/wikipedia/parks_by_state.csv',
    index=False,
)

In [11]:
# Gather every <table> element on the page, regardless of class.
parks_table = soup.find_all(name='table')

In [12]:
# Parse the first <table> on the page into a DataFrame.
# The tag is looked up from `soup` directly so this cell can be re-run after
# `parks_table` has been rebound to a DataFrame, and the markup is wrapped in
# StringIO because passing a literal HTML string to read_html is deprecated.
parks_table = pd.read_html(StringIO(str(soup.find_all('table')[0])))[0]

In [13]:
# Preview the first 21 rows of the parks table.
parks_table.head(n=21)

Unnamed: 0,Name,Image,Location,Date established as park[12],Area (2023)[8],Recreation visitors (2022)[11],Description
0,Acadia,,"Maine .mw-parser-output .geo-default,.mw-parse...","February 26, 1919","49,071.40 acres (198.6 km2)",3970260,Covering most of Mount Desert Island and other...
1,American Samoa,,American Samoa 14°15′S 170°41′W﻿ / ﻿14.25°S 17...,"October 31, 1988","8,256.67 acres (33.4 km2)",12135,The southernmost national park is on three Sam...
2,Arches,,Utah 38°41′N 109°34′W﻿ / ﻿38.68°N 109.57°W,"November 12, 1971","76,678.98 acres (310.3 km2)",1460652,"This site features more than 2,000 natural san..."
3,Badlands,,South Dakota 43°45′N 102°30′W﻿ / ﻿43.75°N 102....,"November 10, 1978","242,755.94 acres (982.4 km2)",1006809,"The Badlands are a collection of buttes, pinna..."
4,Big Bend †,,Texas 29°15′N 103°15′W﻿ / ﻿29.25°N 103.25°W,"June 12, 1944","801,163.21 acres (3,242.2 km2)",514107,Named for the prominent bend in the Rio Grande...
5,Biscayne,,Florida 25°39′N 80°05′W﻿ / ﻿25.65°N 80.08°W,"June 28, 1980","172,971.11 acres (700.0 km2)",701023,"The central part of Biscayne Bay, this mostly ..."
6,Black Canyon of the Gunnison,,Colorado 38°34′N 107°43′W﻿ / ﻿38.57°N 107.72°W,"October 21, 1999","30,779.83 acres (124.6 km2)",297257,The park protects a quarter of the Gunnison Ri...
7,Bryce Canyon,,Utah 37°34′N 112°11′W﻿ / ﻿37.57°N 112.18°W,"February 25, 1928","35,835.08 acres (145.0 km2)",2354660,Bryce Canyon is a geological amphitheater on s...
8,Canyonlands,,Utah 38°12′N 109°56′W﻿ / ﻿38.2°N 109.93°W,"September 12, 1964","337,597.83 acres (1,366.2 km2)",779147,This landscape was eroded into a maze of canyo...
9,Capitol Reef,,Utah 38°12′N 111°10′W﻿ / ﻿38.20°N 111.17°W,"December 18, 1971","241,904.50 acres (979.0 km2)",1227608,The park's Waterpocket Fold is a 100-mile (160...


In [14]:
# Rename the columns we keep to snake_case.
# The scraped headers embed a year and a footnote number (e.g. 'Area (2023)[8]'),
# which may change when the article is edited, so each header is matched by its
# stable leading text rather than by its full label. Other columns are untouched.
header_prefixes = {
    'Name': 'park_name',
    'Location': 'location',
    'Date established': 'date_established',
    'Area': 'area',
    'Description': 'description',
}
parks_table = parks_table.rename(columns={
    header: new_name
    for header in parks_table.columns
    for prefix, new_name in header_prefixes.items()
    if str(header).startswith(prefix)
})

In [15]:
# Show a single row to check the renamed headers.
parks_table.head(n=1)

Unnamed: 0,park_name,Image,location,date_established,area,Recreation visitors (2022)[11],description
0,Acadia,,"Maine .mw-parser-output .geo-default,.mw-parse...","February 26, 1919","49,071.40 acres (198.6 km2)",3970260,Covering most of Mount Desert Island and other...


In [16]:
# Drop the image column and the visitor-count column.
# The visitor header embeds a year and footnote number ('Recreation visitors (2022)[11]'),
# so it is matched by prefix; selecting only columns that exist also lets this cell be
# re-run without a KeyError.
unused_columns = [
    header for header in parks_table.columns
    if str(header) == 'Image' or str(header).startswith('Recreation visitors')
]
parks_table = parks_table.drop(columns=unused_columns)
parks_table.head(1)

Unnamed: 0,park_name,location,date_established,area,description
0,Acadia,"Maine .mw-parser-output .geo-default,.mw-parse...","February 26, 1919","49,071.40 acres (198.6 km2)",Covering most of Mount Desert Island and other...


In [17]:
# Inspect the row at position 57; its date carries a trailing footnote marker.
parks_table.iloc[57, :]

park_name                                                 White Sands
location             New Mexico 32°47′N 106°10′W﻿ / ﻿32.78°N 106.17°W
date_established                               December 20, 2019[111]
area                                     146,344.31 acres (592.2 km2)
description         Located in the mountain-ringed Tularosa Basin,...
Name: 57, dtype: object

In [18]:
# Overwrite the date at row position 20 with a clean value.
# The write goes through a single .iloc[row, col] call: setting an attribute on
# `parks_table.iloc[20]` targets the temporary row Series that .iloc returns, which
# is not guaranteed to write back to the frame (and never does under Copy-on-Write).
parks_table.iloc[20, parks_table.columns.get_loc('date_established')] = 'February 22, 2018'

In [19]:
# Replace 'December 20, 2019[111]' at row position 57 with the bare date.
# The write goes through a single .iloc[row, col] call: setting an attribute on
# `parks_table.iloc[57]` targets the temporary row Series that .iloc returns, which
# is not guaranteed to write back to the frame (and never does under Copy-on-Write).
parks_table.iloc[57, parks_table.columns.get_loc('date_established')] = 'December 20, 2019'

In [20]:
# Convert the establishment dates from text such as 'February 26, 1919' to datetimes.
# Some cells end with a bracketed footnote marker (e.g. 'December 20, 2019[111]') that
# does not fit the strict format below, so bracketed markers are stripped first.
established = parks_table['date_established']
if not pd.api.types.is_datetime64_any_dtype(established):  # already parsed when the cell is re-run
    established = established.str.replace(r'\[[^\]]*\]', '', regex=True).str.strip()
parks_table['date_established'] = pd.to_datetime(established, format='%B %d, %Y')

In [21]:
# Display the frame to check that date_established now holds parsed dates.
parks_table

Unnamed: 0,park_name,location,date_established,area,description
0,Acadia,"Maine .mw-parser-output .geo-default,.mw-parse...",1919-02-26,"49,071.40 acres (198.6 km2)",Covering most of Mount Desert Island and other...
1,American Samoa,American Samoa 14°15′S 170°41′W﻿ / ﻿14.25°S 17...,1988-10-31,"8,256.67 acres (33.4 km2)",The southernmost national park is on three Sam...
2,Arches,Utah 38°41′N 109°34′W﻿ / ﻿38.68°N 109.57°W,1971-11-12,"76,678.98 acres (310.3 km2)","This site features more than 2,000 natural san..."
3,Badlands,South Dakota 43°45′N 102°30′W﻿ / ﻿43.75°N 102....,1978-11-10,"242,755.94 acres (982.4 km2)","The Badlands are a collection of buttes, pinna..."
4,Big Bend †,Texas 29°15′N 103°15′W﻿ / ﻿29.25°N 103.25°W,1944-06-12,"801,163.21 acres (3,242.2 km2)",Named for the prominent bend in the Rio Grande...
...,...,...,...,...,...
58,Wind Cave,South Dakota 43°34′N 103°29′W﻿ / ﻿43.57°N 103....,1903-01-09,"33,970.84 acres (137.5 km2)",Wind Cave is distinctive for its calcite fin f...
59,Wrangell–St. Elias *,Alaska 61°00′N 142°00′W﻿ / ﻿61.00°N 142.00°W,1980-12-02,"8,323,146.48 acres (33,682.6 km2)",The largest national park in the system protec...
60,Yellowstone ‡,"Wyoming, Montana, Idaho 44°36′N 110°30′W﻿ / ﻿4...",1872-03-01,"2,219,790.71 acres (8,983.2 km2)","Situated on the Yellowstone Caldera, the park ..."
61,Yosemite *,California 37°50′N 119°30′W﻿ / ﻿37.83°N 119.50°W,1890-10-01,"761,747.50 acres (3,082.7 km2)","Yosemite features sheer granite cliffs, except..."


In [22]:
# Split 'X acres (Y km2)' at the first ' acres ' into the acre figure and the km2 remainder.
area_parts = parks_table['area'].str.split(pat=' acres ', n=1, expand=True)
parks_table[['area_acres', 'area_km2']] = area_parts

In [23]:
# Keep only the number: drop the leading '(' and the five trailing characters (' km2)').
parks_table['area_km2'] = parks_table['area_km2'].str.slice(start=1, stop=-5)

In [24]:
# The combined 'area' text has been split into its parts, so remove it.
parks_table = parks_table.drop('area', axis='columns')

In [25]:
# Remove thousands separators from the acre figures (values remain strings).
acres_text = parks_table['area_acres']
parks_table['area_acres'] = acres_text.str.replace(pat=',', repl='')

In [26]:
# Remove thousands separators from the km2 figures (values remain strings).
km2_text = parks_table['area_km2']
parks_table['area_km2'] = km2_text.str.replace(pat=',', repl='')

In [27]:
# Display the frame to check the new area_acres / area_km2 columns.
parks_table

Unnamed: 0,park_name,location,date_established,description,area_acres,area_km2
0,Acadia,"Maine .mw-parser-output .geo-default,.mw-parse...",1919-02-26,Covering most of Mount Desert Island and other...,49071.40,198.6
1,American Samoa,American Samoa 14°15′S 170°41′W﻿ / ﻿14.25°S 17...,1988-10-31,The southernmost national park is on three Sam...,8256.67,33.4
2,Arches,Utah 38°41′N 109°34′W﻿ / ﻿38.68°N 109.57°W,1971-11-12,"This site features more than 2,000 natural san...",76678.98,310.3
3,Badlands,South Dakota 43°45′N 102°30′W﻿ / ﻿43.75°N 102....,1978-11-10,"The Badlands are a collection of buttes, pinna...",242755.94,982.4
4,Big Bend †,Texas 29°15′N 103°15′W﻿ / ﻿29.25°N 103.25°W,1944-06-12,Named for the prominent bend in the Rio Grande...,801163.21,3242.2
...,...,...,...,...,...,...
58,Wind Cave,South Dakota 43°34′N 103°29′W﻿ / ﻿43.57°N 103....,1903-01-09,Wind Cave is distinctive for its calcite fin f...,33970.84,137.5
59,Wrangell–St. Elias *,Alaska 61°00′N 142°00′W﻿ / ﻿61.00°N 142.00°W,1980-12-02,The largest national park in the system protec...,8323146.48,33682.6
60,Yellowstone ‡,"Wyoming, Montana, Idaho 44°36′N 110°30′W﻿ / ﻿4...",1872-03-01,"Situated on the Yellowstone Caldera, the park ...",2219790.71,8983.2
61,Yosemite *,California 37°50′N 119°30′W﻿ / ﻿37.83°N 119.50°W,1890-10-01,"Yosemite features sheer granite cliffs, except...",761747.50,3082.7


In [28]:
# Remove one trailing marker symbol (*, † or ‡) from each park name, then trim
# the whitespace that was left in front of it.
names_without_marker = parks_table['park_name'].str.replace(r'[\*†‡]$', '', regex=True)
parks_table['park_name'] = names_without_marker.str.strip()

In [29]:
# Display the frame to check that the trailing markers are gone from park_name.
parks_table

Unnamed: 0,park_name,location,date_established,description,area_acres,area_km2
0,Acadia,"Maine .mw-parser-output .geo-default,.mw-parse...",1919-02-26,Covering most of Mount Desert Island and other...,49071.40,198.6
1,American Samoa,American Samoa 14°15′S 170°41′W﻿ / ﻿14.25°S 17...,1988-10-31,The southernmost national park is on three Sam...,8256.67,33.4
2,Arches,Utah 38°41′N 109°34′W﻿ / ﻿38.68°N 109.57°W,1971-11-12,"This site features more than 2,000 natural san...",76678.98,310.3
3,Badlands,South Dakota 43°45′N 102°30′W﻿ / ﻿43.75°N 102....,1978-11-10,"The Badlands are a collection of buttes, pinna...",242755.94,982.4
4,Big Bend,Texas 29°15′N 103°15′W﻿ / ﻿29.25°N 103.25°W,1944-06-12,Named for the prominent bend in the Rio Grande...,801163.21,3242.2
...,...,...,...,...,...,...
58,Wind Cave,South Dakota 43°34′N 103°29′W﻿ / ﻿43.57°N 103....,1903-01-09,Wind Cave is distinctive for its calcite fin f...,33970.84,137.5
59,Wrangell–St. Elias,Alaska 61°00′N 142°00′W﻿ / ﻿61.00°N 142.00°W,1980-12-02,The largest national park in the system protec...,8323146.48,33682.6
60,Yellowstone,"Wyoming, Montana, Idaho 44°36′N 110°30′W﻿ / ﻿4...",1872-03-01,"Situated on the Yellowstone Caldera, the park ...",2219790.71,8983.2
61,Yosemite,California 37°50′N 119°30′W﻿ / ﻿37.83°N 119.50°W,1890-10-01,"Yosemite features sheer granite cliffs, except...",761747.50,3082.7


In [30]:
# Look at the full description text of the first row (it ends with citation markers).
parks_table['description'].iloc[0]

'Covering most of Mount Desert Island and other coastal islands, Acadia features the tallest mountain on the Atlantic coast of the United States, granite peaks, ocean shoreline, woodlands, and lakes. There are freshwater, estuary, forest, and intertidal habitats.[13][14]'

In [31]:
# Remove the bracketed citation markers (e.g. '[13][14]') from the descriptions.
# Only the markers themselves are matched: a pattern that runs from the first '['
# to the end of the string would also discard any prose that follows a mid-text citation.
parks_table['description'] = parks_table['description'].str.replace(r'\[[^\]]*\]', '', regex=True)

In [32]:
# Re-check the first description now that the citation markers have been removed.
parks_table['description'].iloc[0]

'Covering most of Mount Desert Island and other coastal islands, Acadia features the tallest mountain on the Atlantic coast of the United States, granite peaks, ocean shoreline, woodlands, and lakes. There are freshwater, estuary, forest, and intertidal habitats.'

In [33]:
# Look at the raw location text of the first row: state name followed by CSS and coordinates.
parks_table['location'].iloc[0]

'Maine .mw-parser-output .geo-default,.mw-parser-output .geo-dms,.mw-parser-output .geo-dec{display:inline}.mw-parser-output .geo-nondefault,.mw-parser-output .geo-multi-punct,.mw-parser-output .geo-inline-hidden{display:none}.mw-parser-output .longitude,.mw-parser-output .latitude{white-space:nowrap}44°21′N 68°13′W\ufeff / \ufeff44.35°N 68.21°W'

In [34]:
def getState(location, states=None):
    """Return the state name that a park's location text begins with.

    Parameters
    ----------
    location : str
        Raw location text, e.g. 'Utah 38°41′N 109°34′W ...'. Non-string values
        (such as NaN for a missing cell) yield None instead of raising.
    states : iterable of str, optional
        Candidate state names. When omitted, the ``state`` column of the
        notebook-level ``parks_by_state`` frame is used.

    Returns
    -------
    str or None
        The longest candidate that ``location`` starts with, or None when no
        candidate matches. For a multi-state location such as
        'Wyoming, Montana, Idaho ...' only the leading state is returned.
    """
    if not isinstance(location, str):
        return None
    if states is None:
        states = parks_by_state.state
    # Prefer the longest match so a name that is a prefix of another cannot win
    # merely because it comes first in the candidate list.
    prefixes = [name for name in states if isinstance(name, str) and location.startswith(name)]
    return max(prefixes, key=len, default=None)

In [35]:
# Derive each park's state from the start of its location text.
parks_table['state'] = parks_table['location'].map(getState)

In [36]:
# Display the frame to check the newly added state column.
parks_table

Unnamed: 0,park_name,location,date_established,description,area_acres,area_km2,state
0,Acadia,"Maine .mw-parser-output .geo-default,.mw-parse...",1919-02-26,Covering most of Mount Desert Island and other...,49071.40,198.6,Maine
1,American Samoa,American Samoa 14°15′S 170°41′W﻿ / ﻿14.25°S 17...,1988-10-31,The southernmost national park is on three Sam...,8256.67,33.4,American Samoa
2,Arches,Utah 38°41′N 109°34′W﻿ / ﻿38.68°N 109.57°W,1971-11-12,"This site features more than 2,000 natural san...",76678.98,310.3,Utah
3,Badlands,South Dakota 43°45′N 102°30′W﻿ / ﻿43.75°N 102....,1978-11-10,"The Badlands are a collection of buttes, pinna...",242755.94,982.4,South Dakota
4,Big Bend,Texas 29°15′N 103°15′W﻿ / ﻿29.25°N 103.25°W,1944-06-12,Named for the prominent bend in the Rio Grande...,801163.21,3242.2,Texas
...,...,...,...,...,...,...,...
58,Wind Cave,South Dakota 43°34′N 103°29′W﻿ / ﻿43.57°N 103....,1903-01-09,Wind Cave is distinctive for its calcite fin f...,33970.84,137.5,South Dakota
59,Wrangell–St. Elias,Alaska 61°00′N 142°00′W﻿ / ﻿61.00°N 142.00°W,1980-12-02,The largest national park in the system protec...,8323146.48,33682.6,Alaska
60,Yellowstone,"Wyoming, Montana, Idaho 44°36′N 110°30′W﻿ / ﻿4...",1872-03-01,"Situated on the Yellowstone Caldera, the park ...",2219790.71,8983.2,Wyoming
61,Yosemite,California 37°50′N 119°30′W﻿ / ﻿37.83°N 119.50°W,1890-10-01,"Yosemite features sheer granite cliffs, except...",761747.50,3082.7,California


In [37]:
# The state has been extracted, so the raw location text is no longer needed.
parks_table = parks_table.drop('location', axis='columns')

In [38]:
# Display the final frame before it is written to disk.
parks_table

Unnamed: 0,park_name,date_established,description,area_acres,area_km2,state
0,Acadia,1919-02-26,Covering most of Mount Desert Island and other...,49071.40,198.6,Maine
1,American Samoa,1988-10-31,The southernmost national park is on three Sam...,8256.67,33.4,American Samoa
2,Arches,1971-11-12,"This site features more than 2,000 natural san...",76678.98,310.3,Utah
3,Badlands,1978-11-10,"The Badlands are a collection of buttes, pinna...",242755.94,982.4,South Dakota
4,Big Bend,1944-06-12,Named for the prominent bend in the Rio Grande...,801163.21,3242.2,Texas
...,...,...,...,...,...,...
58,Wind Cave,1903-01-09,Wind Cave is distinctive for its calcite fin f...,33970.84,137.5,South Dakota
59,Wrangell–St. Elias,1980-12-02,The largest national park in the system protec...,8323146.48,33682.6,Alaska
60,Yellowstone,1872-03-01,"Situated on the Yellowstone Caldera, the park ...",2219790.71,8983.2,Wyoming
61,Yosemite,1890-10-01,"Yosemite features sheer granite cliffs, except...",761747.50,3082.7,California


In [41]:
# Persist the cleaned park details without the row index.
parks_table.to_csv(
    path_or_buf='../data/wikipedia/parks_info.csv',
    index=False,
)