/
news_scraper.py
83 lines (70 loc) · 2.72 KB
/
news_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
from lxml import html
from bs4 import BeautifulSoup
import cProfile
import pstats
# Load the saved page once at import time; every scraper below parses this
# same module-level string. The with-block closes the file handle promptly.
with open('data/nyt_mobile.html') as page_file:
    page = page_file.read()
def run_lxml_xpath():
    """Scrape the article list from ``page`` using lxml and XPath.

    Returns:
        A list with one dict per ``ol/li`` under the ``div`` whose id is
        "articles". Each dict has 'title', 'link' and 'blurb' keys holding
        the first matching text node / href, or '' when nothing matched.

    Raises:
        IndexError: if the page has no ``div`` with id "articles".
    """
    all_news = []
    tree = html.document_fromstring(page)
    articles = tree.xpath('.//div[@id="articles"]')[0]
    for story in articles.xpath('ol/li'):
        # Each query returns a (possibly empty) list of matches.
        title = story.xpath('a/span[@class="title"]//text()')
        link = story.xpath('a/@href')
        blurb = story.xpath('a/p//text()')
        all_news.append({
            'title': title[0] if title else '',
            'link': link[0] if link else '',
            'blurb': blurb[0] if blurb else '',
        })
    # Hand the result back so callers can inspect or compare scraper output.
    return all_news
def run_lxml_css():
    """Scrape the article list from ``page`` using lxml and CSS selectors.

    Returns:
        A list with one dict per ``ol li`` under ``div#articles``. Each dict
        has 'title', 'link' and 'blurb' keys; a value is '' when the
        corresponding element (or the href attribute) is absent.

    Raises:
        IndexError: if the page has no ``div#articles`` element.
    """
    all_news = []
    tree = html.document_fromstring(page)
    articles = tree.cssselect('div#articles')[0]
    for story in articles.cssselect('ol li'):
        title = story.cssselect('span.title')
        # find() returns the first direct <a> child itself (or None), not a
        # list, so it must not be indexed: link[0] would be the anchor's
        # first child element rather than the anchor.
        link = story.find('a')
        blurb = story.cssselect('p')
        all_news.append({
            'title': title[0].text if title else '',
            'link': link.get('href', '') if link is not None else '',
            'blurb': blurb[0].text_content() if blurb else '',
        })
    # Hand the result back so callers can inspect or compare scraper output.
    return all_news
def run_beautiful_soup():
    """Scrape the article list from ``page`` using Beautiful Soup.

    Returns:
        A list with one dict per ``li`` found anywhere under the ``div``
        whose id is "articles". Each dict has 'title', 'link' and 'blurb'
        keys; a value is '' when the corresponding element (or the href
        attribute) is absent.

    Raises:
        AttributeError: if the page has no ``div`` with id "articles"
            (``find`` returns None in that case).
    """
    all_news = []
    # Name the parser explicitly: without it bs4 guesses from whatever is
    # installed and emits a warning. lxml is already a dependency of this
    # file, so the choice is deterministic.
    tree = BeautifulSoup(page, 'lxml')
    articles = tree.find('div', {'id': 'articles'})
    for story in articles.find_all('li'):
        title = story.find('span', {'class': 'title'})
        link = story.find('a')
        blurb = story.find('p')
        all_news.append({
            'title': title.text if title is not None else '',
            'link': link.get('href', '') if link is not None else '',
            'blurb': blurb.text if blurb is not None else '',
        })
    # Hand the result back so callers can inspect or compare scraper output.
    return all_news
def get_averages(func, dict, rng):
    """Profile a statement repeatedly and record totals and per-run averages.

    Args:
        func: Source string of the statement to profile, e.g.
            "run_lxml_css()". It is handed to cProfile for execution and is
            also the key under which results are stored in ``dict``.
        dict: Mapping of statement string -> stats mapping. The entry for
            ``func`` is created if missing and filled in place with
            'num_runs', 'ttl_calls', 'ttl_time', 'avg_time' and 'avg_calls'.
        rng: Number of profiling runs to perform.

    Returns:
        None. Results are written into ``dict[func]``.
    """
    entry = dict.setdefault(func, {})
    entry['num_runs'] = 0.0
    entry['ttl_calls'] = 0.0
    entry['ttl_time'] = 0.0
    for _ in range(rng):
        # Keep the profile in memory rather than dumping it to a 'stats'
        # file and reading it back on every iteration.
        profiler = cProfile.Profile()
        profiler.run(func)
        stats = pstats.Stats(profiler)
        entry['ttl_calls'] += stats.total_calls
        entry['ttl_time'] += stats.total_tt
        entry['num_runs'] += 1
    runs = entry['num_runs']
    # With zero runs there is nothing to average; report 0.0 instead of
    # dividing by zero.
    entry['avg_time'] = entry['ttl_time'] / runs if runs else 0.0
    entry['avg_calls'] = entry['ttl_calls'] / runs if runs else 0.0
if __name__ == '__main__':
    # For a one-off detailed report on a single scraper, call e.g.
    # cProfile.run('run_beautiful_soup()') directly instead of averaging.
    # Keys are the statements to profile; get_averages fills in each value.
    averages = {
        "run_beautiful_soup()": {},
        "run_lxml_css()": {},
        "run_lxml_xpath()": {},
    }
    for func in averages:
        get_averages(func, averages, 500)
    # Parenthesised single-argument form prints identically under the
    # Python 2 print statement and the Python 3 print function.
    print(averages)