Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also compare across forks.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks.
base fork: jstuckey/Nice-Time-on-Ice
...
head fork: jstuckey/Nice-Time-on-Ice
  • 2 commits
  • 2 files changed
  • 0 commit comments
  • 1 contributor
Showing with 127 additions and 132 deletions.
  1. +60 −65 data/dailyscraper.py
  2. +67 −67 data/gamescraper.py
View
125 data/dailyscraper.py
@@ -1,4 +1,5 @@
import os
+import sys
import re
import time
import urllib2
@@ -18,20 +19,19 @@
else:
os.system('rm dailyscrape.txt')
- # Open the file to write to
+ # Open the file to write to
if rootDir:
scrapeFile = open('data/dailyscrape.txt', 'a')
else:
scrapeFile = open('dailyscrape.txt', 'a')
+ # Regular expression for parsing game preview links to extract the game ID
+ gameLinkPattern = re.compile('/ice/preview\.htm\?id=')
# Hardcode this season's seasonID and gameType
seasonID = '20112012'
gameType = 'Playoffs'
- # Regular expression for parsing game preview links to extract the game ID
- gameLinkPattern = re.compile('/ice/preview\.htm\?id=')
-
print 'Connecting to nhl.com daily schedule...'
# Make some soup out of the nhl.com daily schedule page
@@ -39,16 +39,18 @@
print 'Scraping nhl.com schedule for today\'s games...'
- # Today's date is in the h3 tag inside teh contentBlock div
+ # Check if there are games scheduled for today
+ noGamesDiv = soup('div', {'id' : 'noGamesScheduled'})
+ if noGamesDiv:
+ print 'No games scheduled for today'
+ sys.exit()
+
+ # Today's date is in the h3 tag inside the contentBlock div
# Date format is EEEE MMMM dd, yyyy - Needs to be EEE MMM dd, yyyy for consistency
- # Python's date formats suck....
gameDateRaw = soup('div', {'class' : 'contentBlock'})[0].h3.contents[0]
gameDateObj = time.strptime(gameDateRaw.strip(), '%A %B %d, %Y')
gameDate = time.strftime('%a %b %d, %Y', gameDateObj)
-
- print gameDate
-
# Get table elements with class == schedTbl
scheduleTables = soup('table', {'class' : 'schedTbl'})
if len(scheduleTables) > 1:
@@ -60,65 +62,58 @@
# Loop through the tr tags in the schedule table
for row in scheduleTable.tbody('tr'):
- # Variables to scrape
- homeTeam = None
- awayTeam = None
- nhlGameID = None
-
- # Get the td tags from the current tr tag
- tds = row('td')
-
- # Loop through td tags and look for the date, away team abbreviation, home team abbreviation, and nhl game ID
- for col in tds:
- # td classes are:
- # - team (x2)
- # - time
- # - tvInfo
- # - skedLinks
-
- # Try to get the div that contains the team name (either away or home)
- teamNameDivs = col('div', {'class' : 'teamName'})
- if teamNameDivs:
- # Loop through the team divs and try to pull out the team abbreviation
- for div in teamNameDivs:
- # Get the anchor inside the div that holds the team abbreviation
- # The anchor should be the first child element of the div
- teamNameAnchor = div.contents[0]
+ # Variables to scrape
+ homeTeam = None
+ awayTeam = None
+ nhlGameID = None
+
+ # Get the td tags from the current tr tag
+ tds = row('td')
+
+ # Loop through td tags and look for the date, away team abbreviation, home team abbreviation, and nhl game ID
+ for col in tds:
+ # td classes are:
+ # - team (x2)
+ # - time
+ # - tvInfo
+ # - skedLinks
+
+ # Try to get the div that contains the team name (either away or home)
+ teamNameDivs = col('div', {'class' : 'teamName'})
+ if teamNameDivs:
+ # Loop through the team divs and try to pull out the team abbreviation
+ for div in teamNameDivs:
+ # Get the anchor inside the div that holds the team abbreviation
+ # The anchor should be the first child element of the div
+ teamNameAnchor = div.contents[0]
- # Check if we actually got the anchor and that it has a rel attribute
- # The rel attribute holds the team abbreviation
- # The All Star Game tds have a different structure, so this check is necessary. Silly All Star Game...
- if isinstance(teamNameAnchor, Tag) and 'rel' in teamNameAnchor.attrs:
- # Pull out the rel attribute for the away team and the home team
- # The away team td is always before the home team td
- if not awayTeam:
- awayTeam = teamNameAnchor['rel'][0]
- elif not homeTeam:
- homeTeam = teamNameAnchor['rel'][0]
+ # Check if we actually got the anchor and that it has a rel attribute
+ # The rel attribute holds the team abbreviation
+ # The All Star Game tds have a different structure, so this check is necessary. Silly All Star Game...
+ if isinstance(teamNameAnchor, Tag) and 'rel' in teamNameAnchor.attrs:
+ # Pull out the rel attribute for the away team and the home team
+ # The away team td is always before the home team td
+ if not awayTeam:
+ awayTeam = teamNameAnchor['rel'][0]
+ elif not homeTeam:
+ homeTeam = teamNameAnchor['rel'][0]
- # Try to get the anchor that contains the game preview link
- # The preview link contains the nhl game ID
- skedLinkAnchors = col('a', {'class', 'btn'})
- if skedLinkAnchors:
- # Loop through the anchors and look for the preview link
- for anchor in skedLinkAnchors:
- # Use the reg ex compiled above to check if it is a preview link
- regExMatch = gameLinkPattern.match(anchor['href'])
- if regExMatch:
- # Extract the nhl game ID from the end of the preview link
- nhlGameID = anchor['href'][regExMatch.end():]
+ # Try to get the anchor that contains the game preview link
+ # The preview link contains the nhl game ID
+ skedLinkAnchors = col('a', {'class', 'btn'})
+ if skedLinkAnchors:
+ # Loop through the anchors and look for the preview link
+ for anchor in skedLinkAnchors:
+ # Use the reg ex compiled above to check if it is a preview link
+ regExMatch = gameLinkPattern.match(anchor['href'])
+ if regExMatch:
+ # Extract the nhl game ID from the end of the preview link
+ nhlGameID = anchor['href'][regExMatch.end():]
- # Check if we parsed out all the info from the current tr
- if gameDate and awayTeam and homeTeam and nhlGameID:
- scrapeFile.write(seasonID + '***' + ('Regular' if gameType == '2' else 'Playoffs' ) + '***' + awayTeam + '***' + homeTeam + '***' + gameDate + '***' + nhlGameID + '\n')
- print 'Wrote to scrape file: ' + awayTeam + ' at ' + homeTeam + ' on ' + gameDate + ' (' + nhlGameID + ')'
- else:
- print "Missing something"
- print awayTeam
- print homeTeam
- print gameDate
- print nhlGameID
-
+ # Check if we parsed out all the info from the current tr
+ if gameDate and awayTeam and homeTeam and nhlGameID:
+ scrapeFile.write(seasonID + '***' + ('Regular' if gameType == '2' else 'Playoffs' ) + '***' + awayTeam + '***' + homeTeam + '***' + gameDate + '***' + nhlGameID + '\n')
+ print 'Wrote to scrape file: ' + awayTeam + ' at ' + homeTeam + ' on ' + gameDate + ' (' + nhlGameID + ')'
# Close the file connection
scrapeFile.close()
View
134 data/gamescraper.py
@@ -33,73 +33,73 @@ def scrapeForSeason(seasonID, gameType):
# Loop through the tr tags in the schedule table
for row in scheduleTable.tbody('tr'):
- # Variables to scrape
- gameDate = None
- homeTeam = None
- awayTeam = None
- nhlGameID = None
-
- # Get the td tags from the current tr tag
- tds = row('td')
-
- # Loop through td tags and look for the date, away team abbreviation, home team abbreviation, and nhl game ID
- for col in tds:
- # td classes are:
- # - date
- # - team (x2)
- # - time
- # - tvInfo
- # - skedLinks
-
- # Try to get the div that contains the game date
- dateDivs = col('div', {'class' : 'skedStartDateSite'})
- if dateDivs and not gameDate:
- # Found the game date div (More than one. Use the first div)
- gameDate = unicode(dateDivs[0].string)
-
- # Try to get the div that contains the team name (either away or home)
- teamNameDivs = col('div', {'class' : 'teamName'})
- if teamNameDivs:
- # Loop through the team divs and try to pull out the team abbreviation
- for div in teamNameDivs:
- # Get the anchor inside the div that holds the team abbreviation
- # The anchor should be the first child element of the div
- teamNameAnchor = div.contents[0]
-
- # Check if we actually got the anchor and that it has a rel attribute
- # The rel attribute holds the team abbreviation
- # The All Star Game tds have a different structure, so this check is necessary. Silly All Star Game...
- # Also check for Atlanta for seasons before 2011-2012. Atlanta will not have an anchor tag. Poor Thrashers...
- if isinstance(teamNameAnchor, Tag) and 'rel' in teamNameAnchor.attrs:
- # Pull out the rel attribute for the away team and the home team
- # The away team td is always before the home team td
- if not awayTeam:
- awayTeam = teamNameAnchor['rel'][0]
- elif not homeTeam:
- homeTeam = teamNameAnchor['rel'][0]
- elif div.string == 'Atlanta':
- # Hardcode ATL as the away/home team for Atlanta
- # The away team td is always before the home team td
- if not awayTeam:
- awayTeam = 'ATL'
- elif not homeTeam:
- homeTeam = 'ATL'
- # Try to get the anchor that contains the game recap link
- # The recap link contains the nhl game ID
- skedLinkAnchors = col('a', {'class', 'btn'})
- if skedLinkAnchors:
- # Loop through the anchors and look for the recap link
- for anchor in skedLinkAnchors:
- # Use the reg ex compiled above to check if it is a recap link
- regExMatch = gameLinkPattern.match(anchor['href'])
- if regExMatch:
- # Extract the nhl game ID from the end of the recap link
- nhlGameID = anchor['href'][regExMatch.end():]
-
- # Check if we parsed out all the info from the current tr
- if gameDate and awayTeam and homeTeam and nhlGameID:
- scrapeFile.write(seasonID + '***' + ('Regular' if gameType == '2' else 'Playoffs' ) + '***' + awayTeam + '***' + homeTeam + '***' + gameDate + '***' + nhlGameID + '\n')
- print 'Wrote to scrape file: ' + awayTeam + ' at ' + homeTeam + ' on ' + gameDate + ' (' + nhlGameID + ')'
+ # Variables to scrape
+ gameDate = None
+ homeTeam = None
+ awayTeam = None
+ nhlGameID = None
+
+ # Get the td tags from the current tr tag
+ tds = row('td')
+
+ # Loop through td tags and look for the date, away team abbreviation, home team abbreviation, and nhl game ID
+ for col in tds:
+ # td classes are:
+ # - date
+ # - team (x2)
+ # - time
+ # - tvInfo
+ # - skedLinks
+
+ # Try to get the div that contains the game date
+ dateDivs = col('div', {'class' : 'skedStartDateSite'})
+ if dateDivs and not gameDate:
+ # Found the game date div (More than one. Use the first div)
+ gameDate = unicode(dateDivs[0].string)
+
+ # Try to get the div that contains the team name (either away or home)
+ teamNameDivs = col('div', {'class' : 'teamName'})
+ if teamNameDivs:
+ # Loop through the team divs and try to pull out the team abbreviation
+ for div in teamNameDivs:
+ # Get the anchor inside the div that holds the team abbreviation
+ # The anchor should be the first child element of the div
+ teamNameAnchor = div.contents[0]
+
+ # Check if we actually got the anchor and that it has a rel attribute
+ # The rel attribute holds the team abbreviation
+ # The All Star Game tds have a different structure, so this check is necessary. Silly All Star Game...
+ # Also check for Atlanta for seasons before 2011-2012. Atlanta will not have an anchor tag. Poor Thrashers...
+ if isinstance(teamNameAnchor, Tag) and 'rel' in teamNameAnchor.attrs:
+ # Pull out the rel attribute for the away team and the home team
+ # The away team td is always before the home team td
+ if not awayTeam:
+ awayTeam = teamNameAnchor['rel'][0]
+ elif not homeTeam:
+ homeTeam = teamNameAnchor['rel'][0]
+ elif div.string == 'Atlanta':
+ # Hardcode ATL as the away/home team for Atlanta
+ # The away team td is always before the home team td
+ if not awayTeam:
+ awayTeam = 'ATL'
+ elif not homeTeam:
+ homeTeam = 'ATL'
+ # Try to get the anchor that contains the game recap link
+ # The recap link contains the nhl game ID
+ skedLinkAnchors = col('a', {'class', 'btn'})
+ if skedLinkAnchors:
+ # Loop through the anchors and look for the recap link
+ for anchor in skedLinkAnchors:
+ # Use the reg ex compiled above to check if it is a recap link
+ regExMatch = gameLinkPattern.match(anchor['href'])
+ if regExMatch:
+ # Extract the nhl game ID from the end of the recap link
+ nhlGameID = anchor['href'][regExMatch.end():]
+
+ # Check if we parsed out all the info from the current tr
+ if gameDate and awayTeam and homeTeam and nhlGameID:
+ scrapeFile.write(seasonID + '***' + ('Regular' if gameType == '2' else 'Playoffs' ) + '***' + awayTeam + '***' + homeTeam + '***' + gameDate + '***' + nhlGameID + '\n')
+ print 'Wrote to scrape file: ' + awayTeam + ' at ' + homeTeam + ' on ' + gameDate + ' (' + nhlGameID + ')'
# Close the file connection
scrapeFile.close()

No commit comments for this range

Something went wrong with that request. Please try again.