# Scraping exploration

Work out the scraping commands needed to extract the desired information from a Disc Golf Course Review (dgcoursereview.com) course page.

In [6]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import json
import re

In [7]:
# Get html
# A timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() surfaces an HTTP error here instead of letting an error page
# be parsed as if it were the course page.
r = requests.get('https://www.dgcoursereview.com/course.php?id=1', timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')

In [8]:
# Display the whole parsed page to look for useful tags (very long output).
soup

<!DOCTYPE html>

<html class="has-no-js template-dg_course_view" data-app="public" data-container-key="" data-content-key="" data-cookie-prefix="xf_" data-csrf="1723640587,3413215958cc2229644f25dedc5fbf42" data-logged-in="false" data-template="dg_course_view" dir="LTR" id="XF" lang="en-US">
<head>
<meta charset="utf-8"/>
<meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1, viewport-fit=cover" name="viewport"/>
<title>Prompton State Park | Disc Golf Course Review</title>
<link href="/webmanifest.php" rel="manifest"/>
<meta content="DGCR" name="apple-mobile-web-app-title"/>
<link href="https://cdn.dgcoursereview.com/data/assets/logo/dgcr-icon-192.png" rel="apple-touch-icon"/>
<meta content="Prompton State Park" property="og:title">
<meta content="Prompton State Park" property="twitter:title">
<meta content="A very scenic, mostly open course overlooking Prompton Lake with considerable elevation changes.  Each hole has 2 baskets, yellow

In [9]:
# Course description: content of the first element whose `property` attribute is 'og:description'.
soup.find(attrs={'property': 'og:description'}).get('content')

'A very scenic, mostly open course overlooking Prompton Lake with considerable elevation changes.  Each hole has 2 baskets, yellows are short, reds are long. The lake comes into play on three holes as well.'

In [10]:
# A bunch of useful information is contained in the block of type 'application/ld+json'
dict=json.loads(soup.find('script', type="application/ld+json").text)
print(dict)

{'@context': 'https://schema.org', '@type': 'SportsActivityLocation', 'name': 'Prompton State Park', '@id': 'https://www.dgcoursereview.com/courses/prompton-state-park.1', 'description': 'A very scenic, mostly open course overlooking Prompton Lake with considerable elevation changes.  Each hole has 2 baskets, yellows are short, reds are long. The lake comes into play on three holes as well.', 'isAccessibleForFree': True, 'url': 'https://www.dgcoursereview.com/courses/prompton-state-park.1', 'address': {'@type': 'PostalAddress', 'streetAddress': '[unknown]', 'addressLocality': 'Honesdale', 'addressRegion': 'Pennsylvania', 'postalCode': '18456', 'addressCountry': 'United States'}, 'geo': {'@type': 'GeoCoordinates', 'latitude': 41.591367, 'longitude': -75.330167}, 'hasMap': 'https://www.google.com/maps/search/?api=1&query=41.591367,-75.330167', 'image': 'https://cdn.dgcoursereview.com/data/attach/77/77581-74568552.jpg', 'aggregateRating': {'@type': 'AggregateRating', 'ratingValue': 3.95, 

In [11]:
dict['name']

'Prompton State Park'

In [12]:
# Raw text of the same JSON-LD <script> block, before json.loads parses it.
soup.find('script', type="application/ld+json").text

'\n        {\n    "@context": "https://schema.org",\n    "@type": "SportsActivityLocation",\n    "name": "Prompton State Park",\n    "@id": "https://www.dgcoursereview.com/courses/prompton-state-park.1",\n    "description": "A very scenic, mostly open course overlooking Prompton Lake with considerable elevation changes.  Each hole has 2 baskets, yellows are short, reds are long. The lake comes into play on three holes as well.",\n    "isAccessibleForFree": true,\n    "url": "https://www.dgcoursereview.com/courses/prompton-state-park.1",\n    "address": {\n        "@type": "PostalAddress",\n        "streetAddress": "[unknown]",\n        "addressLocality": "Honesdale",\n        "addressRegion": "Pennsylvania",\n        "postalCode": "18456",\n        "addressCountry": "United States"\n    },\n    "geo": {\n        "@type": "GeoCoordinates",\n        "latitude": 41.591367,\n        "longitude": -75.330167\n    },\n    "hasMap": "https://www.google.com/maps/search/?api=1&query=41.591367,-7

In [13]:
# First attempt at the hole count: stepping back from the 'Holes' label's parent
# lands on an element whose text is just the label again (see output), not the number.
soup.find(string='Holes').find_parent().find_previous().text

'\nHoles\n'

In [14]:
# Designer names: text of the element that follows the parent of the 'Designer' label.
soup.find(string = re.compile('Designer')).find_parent().find_next().text

'John Harvey Dave Harvey'

In [15]:
# Water-in-play count: text of the element that precedes the label's parent.
soup.find(string = re.compile('Water in play')).find_parent().find_previous().text

'5'

In [16]:
# The sibling right after the 'Par Info' label's parent is only a newline text node
# (see output), so this does not reach the par values.
soup.find(string=re.compile('Par Info')).find_parent().next_sibling

'\n'

In [17]:
# Collect every <div> with class 'c-course-details-row'.
D = soup.find_all('div', class_ = 'c-course-details-row')

In [18]:
# On this page index 4 is the "DGCR SSE" row; the lookup is positional, so it
# depends on the rows appearing in this order.
D[4].text

'\nDGCR SSE: [ ? ]\n\n\n\n46.8\n\n\n\n54.3\n\n\n'

In [19]:
# Climb two levels from the 'Par Info' text node to get its enclosing details row.
soup.find(string = re.compile('Par Info')).find_parent().find_parent()

<div class="c-course-details-row">
<span>Par Info:</span>
<span>
<span class="c-bullet">
<span class="c-bullet-icon" style="background-color:#FFFF00;"></span>
57
</span>
<span class="c-bullet">
<span class="c-bullet-icon" style="background-color:#FF0000;"></span>
67
</span>
</span>
</div>

In [20]:
# Every course-details row <div> (same query that populates D).
C = soup.find_all('div', class_='c-course-details-row')

In [21]:
# Try to select the DGCR row directly by combining a class filter with a text pattern.
# NOTE(review): A is never inspected in the visible cells; presumably the string=
# filter only matches elements with a single text child, so this may return an
# empty list for rows with nested spans — confirm.
A = soup.find_all(class_='c-course-details-row', string=re.compile('DGCR'))

In [22]:
# SSE values: climb from the 'SSE' text to its details row, then read every bullet in it.
S = soup.find(string=re.compile('SSE')).find_parent().find_parent().find_all(class_="c-bullet")

# Stripped text of each bullet, in document order.
q = [bullet.text.strip() for bullet in S]

In [23]:
# Show the stripped SSE bullet values.
q

['46.8', '54.3']

In [24]:
# NOTE(review): disabled experiment for the "Course conditions:" block — restore or delete.
#soup.find('h3', string="Course conditions:").find_parent().find(class_='active').text

In [25]:
# Print the raw (unstripped) text of every course-stat element.
for stat in soup.find_all(class_="c-course-stat"):
    print(stat.get_text())


5864 ft. - 8071 ft. Metric


Moderately Hilly


Lightly Wooded


18
Holes


18
Baskets


5
Water in play



In [26]:
# Hole count: match the label span by class and exact text, then read the element just before it.
soup.find(class_='c-course-stat-label', string='Holes').find_previous().text

'18'

In [27]:
# Check that the 'Water in play' label span can be matched by class and exact text.
soup.find(class_='c-course-stat-label', string='Water in play')

<span class="c-course-stat-label">Water in play</span>

In [28]:
# Water-in-play count: the element just before the matched label holds the number.
soup.find(class_='c-course-stat-label', string='Water in play').find_previous().text

'5'

In [29]:
# Scrape length: every 'dg_unit' element carries its value in a data-meters attribute.
S = soup.find_all(class_="dg_unit")
for unit in S:
    print(unit['data-meters'])

1787
2460


In [30]:
# All course stats inside the course-info panel (length, terrain, woods, holes,
# baskets, water in play), each as stripped text.
[v.text.strip() for v in soup.find(class_="c-course-course_info").find_all(class_="c-course-stat")]

['5864 ft.\xa0-\xa08071 ft. Metric',
 'Moderately Hilly',
 'Lightly Wooded',
 '18\nHoles',
 '18\nBaskets',
 '5\nWater in play']

In [31]:
# Rounds recorded: stripped text of each child node of the element that follows
# the parent of the 'Rounds Recorded' label.
Q = [node.text.strip() for node in soup.find(string=re.compile('Rounds Recorded')).find_parent().find_next()]

In [32]:
# Same extraction as a single comprehension; the empty strings in the output sit
# between the real values (presumably whitespace-only nodes between the child tags).
[s.text.strip() for s in soup.find(string=re.compile('Rounds Recorded')).find_parent().find_next()]

['', '101 / 62', '', '77 / 60', '', '20 / 67', '']

In [33]:
# Print the href of the element wrapping each link icon.
for icon in soup.find_all(class_='fas fa-link'):
    wrapper = icon.find_parent()
    print(wrapper['href'])

http://www.dcnr.state.pa.us/stateparks/findapark/prompton/index.htm


In [34]:
# Collect the href of the element wrapping each link icon.
links = [icon.find_parent()['href'] for icon in soup.find_all(class_='fas fa-link')]

In [35]:
# Show the collected link targets.
links

['http://www.dcnr.state.pa.us/stateparks/findapark/prompton/index.htm']

In [36]:
# Same link extraction written as a single list comprehension.
[tag.find_parent()['href'] for tag in soup.find_all(class_='fas fa-link')]

['http://www.dcnr.state.pa.us/stateparks/findapark/prompton/index.htm']

In [37]:
# Count elements matched by class_='c-course _extinct' (0 on this page).
# NOTE(review): presumably a marker for extinct courses, and a space-separated class_
# string is presumably compared against the exact attribute value — confirm both.
len(soup.find_all(class_='c-course _extinct'))

0

In [38]:
# Rebind Q (a list of strings in the earlier cell) to the element that follows the
# parent of the exact 'Rounds Recorded / Average Score:' label.
Q = soup.find(string='Rounds Recorded / Average Score:').find_parent().find_next()

In [39]:
# Inspect the markup of that element: one link plus one c-bullet span per colour.
print(Q)

<span>
<a href="/courses/prompton-state-park.1/rounds">101 / 62</a>
<span class="c-bullet"><span class="c-bullet-icon" style="background-color:#FFFF00;"></span>77 / 60</span>
<span class="c-bullet"><span class="c-bullet-icon" style="background-color:#FF0000;"></span>20 / 67</span>
</span>


In [40]:
# Text of the link inside Q (the first "rounds / average" pair).
Q.find('a').text

'101 / 62'

In [41]:
# Map each bullet colour (hex code taken from the bullet icon's inline style) to the
# bullet's "rounds / average" text.
d = {}
for bullet in Q.find_all(class_="c-bullet"):
    # The loop variable is deliberately not `q`, so the list `q` built in an earlier
    # cell is not overwritten with a Tag.
    # Look the icon up by class instead of relying on it being the very next parsed
    # node, which breaks if whitespace precedes it.
    icon = bullet.find(class_="c-bullet-icon")
    # The style looks like "background-color:#FFFF00;": keep what follows the last
    # colon and drop an optional trailing semicolon and surrounding whitespace.
    color = icon['style'].split(':')[-1].strip().rstrip(';').strip()
    d[color] = bullet.text.strip()

In [46]:
# Show the colour -> "rounds / average" mapping.
d

{'#FFFF00': '77 / 60', '#FF0000': '20 / 67'}