-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.py
158 lines (143 loc) · 5.02 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/usr/bin/python
# Ian Young
# December 2008
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
##
## Scrape the website for 89.3 The Current, retrieving all songs played
## during the date range hard-coded at the bottom of this file.
##
## Accepts one optional argument, the name of the database to write to
## (defaults to ./tmpdb when omitted)
##
import urllib
from BeautifulSoup import BeautifulSoup
import sqlite3
import re
import datetime
import sys
# Initialize a DB, and scrape all songs from startdate through enddate into it
def populate_songs(db, startdate, enddate):
    """Rebuild the song tables in `db` and fill them from the scraped playlists.

    Drops and recreates the `songs` and `parsed_info` tables, then walks
    every day from `startdate` through `enddate` (both inclusive).  Each
    (time, artist, title) tuple yielded by songs_for_day() becomes one row
    in `songs`, and one row per day holding the number of songs seen goes
    into `parsed_info`.  Rows that repeat an earlier row's date, time,
    artist and title are deleted before the single commit at the end.

    db        -- database path handed to sqlite3.connect()
    startdate -- first datetime.date to scrape
    enddate   -- last datetime.date to scrape; nothing is scraped when it
                 is earlier than startdate

    Nothing is committed until every day has been processed, so an
    exception part-way through leaves no scraped rows behind.
    """
    connection = sqlite3.connect(db)
    try:
        c = connection.cursor()

        # Comment all these statements if not starting with a blank slate
        c.execute('drop table if exists songs')
        c.execute('''create table songs
                     (id integer primary key,
                      time_played time,
                      date_played date,
                      artist text,
                      title text)''')
        # parsed_info just tracks some summary information, it's not very
        # crucial
        c.execute('drop table if exists parsed_info')
        c.execute('''create table parsed_info
                     (day date,
                      number_songs int)''')

        one_day = datetime.timedelta(1)
        thisdate = startdate
        while thisdate <= enddate:
            # Progress report once per month
            if thisdate.day == 1:
                print("Working on "+str(thisdate.month)+"-"+str(thisdate.year))
            day = thisdate.isoformat()
            count = 0
            for song in songs_for_day(thisdate):
                c.execute('insert into songs '
                          '(date_played, time_played, artist, title) '
                          'values (?, ?, ?, ?)', (day,) + song)
                count += 1
            c.execute('insert into parsed_info values (?, ?)', (day, count))
            thisdate += one_day

        # Clean up duplicate entries that exist in the Current's playlists
        # (SLOW): of each set of identical rows only the lowest id survives.
        c.execute('''delete from songs where id in
                     (select b.id from songs a, songs b
                      where a.date_played=b.date_played
                      and a.time_played=b.time_played
                      and a.artist=b.artist
                      and a.title=b.title
                      and a.id<b.id)''')
        connection.commit()
        c.close()
    finally:
        # Release the database handle even when scraping fails part-way
        connection.close()
# unicode() only exists on Python 2; fall back to str where it is missing.
try:
    _text_type = unicode
except NameError:
    _text_type = str

# Compiled once at import time rather than on every normalize() call.
# Raw strings keep the backslash classes from being read as string escapes.
_NOT_WORD_OR_SPACE_RE = re.compile(r'[^\w\s]')
_WHITESPACE_RUN_RE = re.compile(r'\s+')


def normalize(string):
    """Return a canonical form of `string` for comparing artists and titles.

    The argument is converted to text, every character that is neither a
    word character nor whitespace is removed, each run of whitespace is
    collapsed to a single space, and the result is lowercased and stripped
    of leading/trailing whitespace.

    NOTE(review): no re.UNICODE flag is passed, so on Python 2 only ASCII
    letters, digits and "_" count as word characters and accented letters
    are dropped -- confirm whether that is intended.
    """
    string = _text_type(string)
    # Filter anything out of the ordinary, including punctuation
    string = _NOT_WORD_OR_SPACE_RE.sub('', string)
    # Compress whitespace
    string = _WHITESPACE_RUN_RE.sub(' ', string)
    # Lowercase everything, then trim whitespace
    return string.lower().strip()
# Get all the songs for the given day from The Current's site
# Generator that yields (time, artist, title) tuples
# Hour-header patterns, compiled once instead of once per header.
# A header is treated as PM when it contains "PM" or an 11 ... 12 ... AM
# span (presumably the hour ending at midnight) ...
_PM_HEADER_RE = re.compile('(11.*12.*AM)|PM')
# ... unless it is an 11 ... 12 ... PM span (presumably the hour ending at
# noon), whose songs are still AM.
_FAKE_PM_HEADER_RE = re.compile('(11.*12.*PM)')


def _parse_play_time(text, post_meridiem):
    """Turn a 12-hour "H:MM" string into a datetime.time, or None if unusable."""
    if text is None or post_meridiem is None:
        return None
    timebits = [int(bit) for bit in text.split(':')]
    hour, minute = timebits[0], timebits[1]
    # Make the hour military time
    if post_meridiem and hour < 12:
        hour += 12
    # Midnight becomes 0:00
    if not post_meridiem and hour == 12:
        hour = 0
    return datetime.time(hour, minute)


def songs_for_day(date):
    """Yield (time, artist, title) tuples for the playlist page of `date`.

    Downloads the playlist page for the given datetime.date, parses it with
    BeautifulSoup and walks the "hourH3", "playTime" and "songInfo" elements
    in document order.  Hour headers decide whether following times are AM
    or PM, each play time applies to the songs after it, and every song is
    yielded as (play time in ISO format, normalized artist, normalized
    title).

    Entries that cannot be interpreted -- a time before any hour header, a
    song before any usable time, a song without an <h4>, or one without
    both an artist and a title -- are skipped rather than raising.
    """
    # nab the page
    url = ("http://minnesota.publicradio.org/radio/services/the_current/"
           "playlist/playlist.php?month=" + str(date.month)
           + "&day=" + str(date.day) + "&year=" + str(date.year))
    page = urllib.urlopen(url)
    try:
        html = page.read()
    finally:
        # Don't hold one open HTTP handle per scraped day
        page.close()

    # Use BeautifulSoup to parse the page structure
    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
    divs = soup.findAll(['div', 'span'],
                        {"class": ["playTime", "songInfo", "hourH3"]})

    post_meridiem = None  # unknown until the first hour header is seen
    play_time = None      # unknown until the first usable play time is seen
    for div in divs:
        kind = div['class']
        if kind == "hourH3":
            # This is an hour header, do all sorts of black magic to figure
            # out if we're in AM or PM time
            header = div.string
            post_meridiem = bool(header
                                 and _PM_HEADER_RE.search(header)
                                 and not _FAKE_PM_HEADER_RE.search(header))
        elif kind == "playTime":
            # If it's a time, parse it into a time object
            play_time = _parse_play_time(div.string, post_meridiem)
        elif kind == "songInfo":
            # Otherwise it's an artist and title string, parse it
            heading = div.h4
            # sanity check: need both a heading and a time to attach it to
            if heading is None or play_time is None:
                continue
            loose_text = heading.findAll(text=True, recursive=False)
            if heading.a is not None:
                # Title is the link text, artist is the text around the link
                song_title = heading.a.string
                # sanity check
                if song_title is None:
                    continue
                artist = ''.join(loose_text)
            else:
                # sanity check
                if len(loose_text) < 2:
                    continue
                artist = loose_text[0]
                song_title = loose_text[1]
            yield (play_time.isoformat(),
                   normalize(artist),
                   normalize(song_title))
# Just spit out some summary info to make sure things went okay
def dump_db(db_path=None):
    """Print every row of the parsed_info summary table, one row per line.

    db_path -- database file to read; when omitted, the module-level `db`
               path is used, which is what the no-argument call always did.
    """
    if db_path is None:
        db_path = db
    connection = sqlite3.connect(db_path)
    try:
        c = connection.cursor()
        c.execute('select * from parsed_info')
        for line in c:
            # Function-call form: valid on Python 2 and 3 alike
            print(line)
        c.close()
    finally:
        # Release the database handle even if the query fails
        connection.close()
# Get the DB from cli argument, or just use a default
db = sys.argv[1] if len(sys.argv) > 1 else './tmpdb'

# Fixed scrape window: first and last day (inclusive) to fetch.
# NOTE(review): this runs at import time as well as when executed as a
# script, because there is no __main__ guard.
start = datetime.date(2005, 12, 22)
end = datetime.date(2009, 5, 22)

# Rebuild the database for the whole window, then print the per-day summary
populate_songs(db, start, end)
dump_db()