coindesk.py

from lxml import html
import requests
import re
import time
import os
from handler import Handler
from formatter import TextFormatter
from dateutil import parser
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
URL = 'https://www.coindesk.com/news'
HEADER = {'Connection': 'keep-alive',
          'Expires': '-1',
          'Upgrade-Insecure-Requests': '1',
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) \
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
          }
PAGES = ['news', 'features', 'opinion', 'category/markets', 'category/tech',
         'category/business', 'category/policy-regulation', 'category/people']
BASE_URL = 'https://www.coindesk.com/'


class ContentError(Exception):
    pass


class Coindesk:
    def __init__(self):
        self.__version__ = "0.3"
        self.header = HEADER
        self.handler = Handler()
        options = Options()
        options.headless = True
        self.browser = webdriver.Firefox(options=options,
                                         service_log_path=os.path.devnull)
        self.assets_keys = ['title', 'author', 'published', 'updated', 'tags',
                            'link', 'text']

    def __del__(self):
        self.browser.quit()

    def _download(self, link, clicks=None):
        """
        Gets the webpage source code by acting like a real person - scrolling
        to and clicking the "another set of articles" element.
        """
        # Open the link in the headless browser
        self.browser.get(link)
        if clicks:
            # Set up action chains for the browser
            action = webdriver.ActionChains(self.browser)
            while clicks != 0:
                # Wait 5 seconds so all elements are loaded
                time.sleep(5)
                # Get the height of the loaded window
                last_height = self.browser.execute_script("return document.body.scrollHeight")
                try:
                    # Find the "more articles" element
                    #element = self.browser.find_element_by_xpath('//div[@class="cta-content"]')
                    element = self.browser.find_element_by_xpath('//div[@class="cta-story-stack"]')
                except Exception:
                    return None
                # Position of the element - its top left corner in the window
                pos = element.location['y'] - element.size['height']
                # Scroll the window to that position
                self.browser.execute_script(f"window.scrollTo(0, {pos});")
                #action = webdriver.ActionChains(self.browser)
                # Move the cursor over the element
                action.move_to_element(element).perform()
                # Wait 1 second, just like a real user
                time.sleep(1)
                try:
                    # Click the element to load the next batch of articles
                    element.click()
                    #self.browser.find_element_by_xpath('//div[@class="cta-content"]').click()
                    # Count down the remaining clicks
                    clicks -= 1
                except Exception:
                    return None
        return self.browser.page_source
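
    # Note: find_element_by_xpath is the Selenium 3 style API used throughout
    # this file. If this were run against Selenium 4 (an assumption, not
    # something this project states), the equivalent lookup would be roughly:
    #
    #     from selenium.webdriver.common.by import By
    #     element = self.browser.find_element(By.XPATH, '//div[@class="cta-story-stack"]')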

    def front_articles(self, link, clicks=None):
        content = self._download(link, clicks=clicks)
        if not content:
            return None
        tree = html.fromstring(content)
        headings = tree.xpath('//section[@class="list-body"]//h4[@class="heading"]/text()')
        authors = tree.xpath('//span[@class="credit"]/a/text()')
        times = tree.xpath('//section[@class="list-body"]//time[@class="time"]/text()')
        links = tree.xpath('//section[@class="list-body"]//div[@class="text-content"]/a[2]/@href')
        links = ['https://www.coindesk.com' + i for i in links]
        tags = tree.xpath('//section[@class="list-body"]//div[@class="text-content"]/a[1]/span/text()')
        return list(zip(headings, authors, times, links, tags))

    def article(self, link):
        content = self._download(link)
        tree = html.fromstring(content)
        # They change this element often!!!
        times = tree.xpath('//div[@class="timestamps"]/time/text()')
        if not times:
            times = tree.xpath('//div[@class="datetime"]/time/text()')
        if not times:
            times = tree.xpath('//div[@class="article-hero-datetime"]/time/text()')
        # Needs a redo: the positions in this list are hard-coded - it will break fast!!!
        try:
            published = times[0]
            published = parser.parse(published)
            if len(times) == 2:
                updated = times[1].strip('Updated ')
                updated = parser.parse(updated)
            elif len(times) == 3:
                updated = times[2]
                updated = parser.parse(updated)
            else:
                updated = times[0]
                updated = parser.parse(updated)
        except Exception:
            print(f"Error: No date at: {link}")
            published = ""
            updated = ""
        tf = TextFormatter()
        text = tf.format(content, link)
        return [published, updated, text]

    def aggregator(self, link, limit=50):
        """ Keeps loading more articles until the number of downloaded
        articles reaches the limit. """
        clicks = 0
        articles = self.front_articles(link, clicks=clicks)
        assets, items = self.back_articles(articles)
        while items < limit:
            clicks += 1
            articles = self.front_articles(link, clicks=clicks)
            if not articles:
                return None, None
            assets, items = self.back_articles(articles)
        return assets, items

    def back_articles(self, articles):
        assets = {}
        counter = 0
        for item in articles:
            article = {}
            article[self.assets_keys[0]] = item[0]  # title
            id = self.handler.check_article(article)
            if not id:
                # Article is not in the database yet - scrape and store it
                article[self.assets_keys[1]] = item[1]  # author
                article[self.assets_keys[5]] = item[3]  # link
                article[self.assets_keys[4]] = item[4]  # tags
                article_components = self.article(item[3])
                article[self.assets_keys[2]] = article_components[0]  # published
                article[self.assets_keys[3]] = article_components[1]  # updated
                article[self.assets_keys[6]] = article_components[2]  # text
                id = self.handler.insert_article(article)
                print(f"{id}: {article[self.assets_keys[0]]}.")
            else:
                # Already stored - fetch it from the database instead
                article = self.get_article(id)
            assets[id] = article
            counter += 1
        return assets, counter

    def get_article(self, id):
        items = self.handler.get_article(id)
        if items:
            article = {}
            article[self.assets_keys[0]] = items[0]
            article[self.assets_keys[1]] = items[1]
            article[self.assets_keys[2]] = items[2]
            article[self.assets_keys[3]] = items[3]
            article[self.assets_keys[4]] = items[4]
            article[self.assets_keys[5]] = items[5]
            article[self.assets_keys[6]] = items[6]
            return article

    def loop_text_handler(self, link):
        """ Function for handling article text parsing. """
        content = self._download(link)
        tf = TextFormatter()
        return tf.format(content, link)

    def main(self, limit=50, links=None):
        """ Main function for looping through the subpages of CoinDesk. """
        if not links:
            links = [BASE_URL + i for i in PAGES]
        for link in links:
            print(f"Getting articles in {link}.")
            _, _ = self.aggregator(link, limit=limit)

    def reparse_article_texts(self, forced=False):
        """ Utility function for reformatting old article texts into the new
        paragraph-aware format. Does not need to be run again. """
        ids = self.handler.get_ids()
        for id in ids:
            content = self.handler.get_content_by_id(id)
            if '\n' not in content or forced:
                link = self.handler.get_link_by_id(id)
                text = self.loop_text_handler(link)
                self.handler.update_content_by_id(id, text)
                print(f"Article ID: {id} updated.")
                time.sleep(5)
            else:
                continue


if __name__ == '__main__':
    import sys
    coindesk = Coindesk()
    if len(sys.argv) >= 2:
        coindesk.main(limit=int(sys.argv[1]))
    else:
        coindesk.main()
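
# Usage sketch (hedged): the command-line entry point above takes an optional
# per-subpage article limit, e.g.
#
#     python coindesk.py 20
#
# Programmatic use would presumably look like the following, assuming the
# project's Handler and TextFormatter modules and a geckodriver on PATH are
# available (assumptions, since neither is shown here):
#
#     from coindesk import Coindesk
#     cd = Coindesk()
#     assets, count = cd.aggregator('https://www.coindesk.com/news', limit=10)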