-
Notifications
You must be signed in to change notification settings - Fork 0
/
course_scraper.py
224 lines (207 loc) · 7.04 KB
/
course_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
"""
Scrapes NEU course information.
"""
import json
import itertools
import os
import re
import sys
import threading
import traceback
import urllib
import urllib2
import lxml.html
# Regex for extracting the 4-digit course number (group 1) and the title
# (group 2) from a title cell, expected to look like "SUBJ 1234 - Title".
# The leading wildcard is lazy and the gap before the separator excludes
# hyphens, so the split happens at the FIRST hyphen after the number and
# hyphenated titles such as "Co-op Work Experience" are kept whole.
TITLE_INFO_REGEX = re.compile(r'.*?([0-9]{4})[^-]*-(.*)')

# Base URL for the course display page
DISPLAY_COURSES_URL = 'https://wl11gp.neu.edu/udcprod8/bwckctlg.p_display_courses?'

# Default parameters for the course display page.  get_course_page() copies
# this dict and adds the real "sel_subj" value before encoding it.
# NOTE(review): "term_in" is hard-coded; presumably a term code -- confirm
# before reusing this for another term.
DEFAULT_COURSES_PARAMS = {
    "sel_levl": "dummy",
    "call_proc_in": "",
    "sel_to_cred": "",
    "sel_crse_strt": "",
    "sel_from_cred": "",
    "sel_divs": "dummy",
    "sel_schd": "dummy",
    "sel_attr": "dummy",
    "term_in": "201410",
    "sel_coll": "dummy",
    "sel_dept": "",
    "sel_crse_end": "",
    "sel_title": ""
}
def get_dom(url, data=None):
    """
    Opens the url (passing data through to urllib2.urlopen), parses the
    response body as HTML and returns the root LXML element.

    The response is always closed, even if parsing fails.
    """
    handle = urllib2.urlopen(url, data=data)
    try:
        document = lxml.html.parse(handle)
        root = document.getroot()
    finally:
        handle.close()
    return root
def get_course_page(department):
    """
    Fetches the course listing page for one department and returns its DOM
    """
    query = DEFAULT_COURSES_PARAMS.copy()
    query["sel_subj"] = department
    encoded_query = urllib.urlencode(query)
    # sel_subj has to appear twice in the query string: once as a "dummy"
    # placeholder and once with the real department.
    return get_dom(DISPLAY_COURSES_URL + 'sel_subj=dummy&' + encoded_query)
def get_departments():
    """
    Loads and returns the parsed contents of department_list.json, which is
    looked up in the same directory as this file
    """
    # XXX: Do this by scraping.
    print('Getting departments (fake)')
    module_path = os.path.abspath(__file__)
    json_path = os.path.join(os.path.dirname(module_path),
                             'department_list.json')
    with open(json_path, 'r') as handle:
        return json.load(handle)
def elems_to_content(elems):
    """
    Calls text_content() on each element and returns the results as a list,
    in the same order as the input
    """
    contents = []
    for node in elems:
        contents.append(node.text_content())
    return contents
def parse_hours(text):
    """
    Parses a credit hours or lecture hours string into a dict.

    "N"       -> {'type': 'single', 'value': N}
    "A TO B"  -> {'type': 'range', 'min': A, 'max': B}
    "A OR B"  -> {'type': 'options', 'options': [A, B]}

    Numbers are converted with float().  Any other shape returns None.
    """
    tokens = text.split()
    if len(tokens) == 1:
        return {'type': 'single', 'value': float(tokens[0])}
    if len(tokens) != 3:
        return None
    low, keyword, high = tokens
    if keyword == 'TO':
        return {'type': 'range', 'min': float(low), 'max': float(high)}
    if keyword == 'OR':
        return {'type': 'options', 'options': [float(low), float(high)]}
    return None
def iter_child_textnodes(elem, strip_whitespace=False, remove_empty=False):
    """
    Returns a lazy, single-pass iterator over the text that sits directly
    inside elem (not inside its child elements), in document order.

    strip_whitespace: strip() each string before yielding it.
    remove_empty: drop strings that are empty (after stripping, if enabled).
    """
    # Text before the first child lives in elem.text; text after each child
    # is stored in that child's tail property rather than in the list of
    # children.
    leading = (elem.text,)
    tails = (child.tail for child in elem.iterchildren())
    # Either slot is None when there is no text there (e.g. the element opens
    # straight into a child tag), so filter both; otherwise strip() below
    # would raise AttributeError on None.
    texts = (t for t in itertools.chain(leading, tails) if t is not None)
    if strip_whitespace:
        texts = (t.strip() for t in texts)
    if remove_empty:
        texts = (t for t in texts if t)
    return texts
def splitstrip(text, separator):
    """
    Splits text on separator and returns the pieces with surrounding white
    space removed, leaving out any piece that ends up empty.
    """
    trimmed = (piece.strip() for piece in text.split(separator))
    return [piece for piece in trimmed if piece]
def parse_description(description):
    """
    Parses a description element into a dictionary.

    Keys always present:
        'prereqstr'   - text of the first <i> descendant, or None
        'description' - direct text nodes before the credit hours line,
                        joined with single spaces
    Keys present only when found:
        'creditHours', 'lectureHours' - parse_hours() results
        'levels', 'scheduleTypes', 'attributes' - lists of strings
    """
    info = {}
    italics = description.cssselect('i')
    info['prereqstr'] = italics[0].text_content() if italics else None

    text_parts_iter = iter_child_textnodes(description, True, True)

    # Get the actual description string. (Ends before line with credit hours)
    desc_parts = []
    hours_parts = []
    for part in text_parts_iter:
        if 'Credit hours' in part:
            # The iterator is single-pass, so this part is already consumed.
            # Keep it, otherwise the credit hours would never be parsed.
            hours_parts.append(part)
            break
        desc_parts.append(part)
    info['description'] = ' '.join(desc_parts)

    # Extract the credit and lecture hours from the terminating part plus
    # whatever is left in the iterator.
    for part in itertools.chain(hours_parts, text_parts_iter):
        if 'Credit hours' in part:
            info['creditHours'] = parse_hours(part.partition('Credit')[0])
        elif 'Lecture hours' in part:
            info['lectureHours'] = parse_hours(part.partition('Lecture')[0])

    # Get the fieldlabeltext labeled parameters using DOM siblings.  Missing
    # text, tails or siblings are treated as empty values instead of raising.
    for elem in description.cssselect('.fieldlabeltext'):
        label = (elem.text or '').strip()
        if label == 'Levels:':
            # Value is the text right after the label element.
            info['levels'] = splitstrip(elem.tail or '', ',')
        elif label == 'Schedule Types:':
            # Value is the text inside the element following the label.
            sibling = elem.getnext()
            sibling_text = sibling.text if sibling is not None else None
            info['scheduleTypes'] = splitstrip(sibling_text or '', ',')
        elif label == 'Course Attributes:':
            # Value is the text right after the element following the label.
            sibling = elem.getnext()
            sibling_tail = sibling.tail if sibling is not None else None
            info['attributes'] = splitstrip(sibling_tail or '', ',')
    return info
def get_course_info(dept):
    """
    Fetches the course listing page for a department and parses each course.

    Returns a list with one single-entry dict per course, in page order.
    Each dict maps the integer course number to that course's info dict
    (the result of parse_description plus a 'title' key).

    Raises ValueError if a title cell does not match TITLE_INFO_REGEX.
    """
    dom = get_course_page(dept)
    titles = elems_to_content(dom.cssselect('td.nttitle'))
    # zip() stops at the shorter sequence, so any td.ntdefault cells beyond
    # the number of titles are ignored.
    descriptions = dom.cssselect('td.ntdefault')
    course_array = []
    for raw_title, description in zip(titles, descriptions):
        match = TITLE_INFO_REGEX.match(raw_title)
        if match is None:
            # Fail with the offending text rather than an AttributeError on
            # None, so the aggregated error report is actionable.
            raise ValueError('Unrecognized course title for %s: %r'
                             % (dept, raw_title))
        course_num, title = match.groups()
        course_num = int(course_num)
        info = parse_description(description)
        info['title'] = title.strip()
        course_array.append({course_num: info})
    return course_array
def get_course_info_by_dept(depts, threadcount=10):
    """
    Fetches course information for every department using worker threads.

    depts: iterable of department strings.
    threadcount: number of worker threads to start.

    Returns a dict mapping each department to the result of
    get_course_info() for it.

    Raises Exception if any worker failed; the message lists each error
    followed by its traceback.
    """
    # Lock guarding the shared department iterator
    dlock = threading.Lock()
    output = {}
    errors = []
    dept_iter = iter(depts)

    def threadfunc():
        """
        Runs inside of the threads. Gets information for departments in
        a loop and returns when no departments remain in the queue, or
        after recording the first error this worker hits.
        """
        while True:
            try:
                # Only pulling the next department is serialized; the fetch
                # itself runs outside the lock so workers overlap.
                with dlock:
                    try:
                        dept = next(dept_iter)
                    except StopIteration:
                        return
                    print('Getting: %s' % dept)
                output[dept] = get_course_info(dept)
            except Exception as err:
                errors.append((err, traceback.format_exc()))
                return

    threads = [threading.Thread(target=threadfunc)
               for _ in range(threadcount)]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    if errors:
        errstr = '\n'.join('%s\n%s' % pair for pair in errors)
        raise Exception('Errors occurred fetching data:\n%s' % errstr)
    return output
def main():
    """
    Scrapes course information for every department and writes it, as
    indented JSON with sorted keys, to all_data_new.json in the current
    working directory
    """
    departments = get_departments()
    dept_dict = get_course_info_by_dept(departments)
    with open('all_data_new.json', 'w') as out_file:
        json.dump(dept_dict, out_file, indent=4, sort_keys=True)


# Only scrape when run as a script, not when imported.
if __name__ == '__main__':
    main()