-
Notifications
You must be signed in to change notification settings - Fork 45
/
view.py
129 lines (95 loc) · 3.75 KB
/
view.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import tornado.web
import tornado.autoreload
import tornado
import os
import shutil
from sky.crawler import crawl
from sky.crawler.crawling import get_image_set
from sky.configs import DEFAULT_CRAWL_CONFIG
from sky.helper import extractDomain
from sky.scraper import Scraper
import json
# from textblob import TextBlob
def is_numeric(x):
    """Return True if *x* can be parsed as a base-10 integer, else False."""
    try:
        int(x)
    except ValueError:
        return False
    return True
class MainHandler(tornado.web.RequestHandler):
    """Single-page handler for the skyViewer demo.

    GET renders an empty results page; POST runs a crawl + scrape for the
    submitted URL and renders the extracted items.
    """

    def get(self):
        # Initial page load: no items yet, nothing cached.
        self.render('page_template.html', items=[], cached=False)

    def post(self):
        # Start from a copy so the shared DEFAULT_CRAWL_CONFIG is not mutated.
        CRAWL_CONFIG = DEFAULT_CRAWL_CONFIG.copy()
        CRAWL_CONFIG.update({
            'collections_path': os.path.join(os.path.expanduser('~'), 'sky_view_collections/'),
            # 'max_workers': 10,
        })
        args = self.request.arguments
        print(args)
        # Copy every submitted form field (except the URL itself and the
        # cache checkbox) into the crawl config, coercing the string value
        # to match the type already present in the config:
        #   list-valued keys   -> [int] or a ', '-split list of strings
        #   scalar-valued keys -> int or the first ', '-split token
        for arg in args:
            value = args[arg][0].decode('utf8')
            if value and arg != 'url' and arg != 'checkboxcache':
                print('pre', arg, CRAWL_CONFIG[arg])
                if isinstance(CRAWL_CONFIG[arg], list):
                    CRAWL_CONFIG[arg] = [int(value)] if is_numeric(value) else value.split(', ')
                else:
                    CRAWL_CONFIG[arg] = int(value) if is_numeric(value) else value.split(', ')[0]
                print('post', arg, CRAWL_CONFIG[arg])
        url = self.get_argument('url', '')
        use_cache = self.get_argument('checkboxcache', '')
        domain = extractDomain(url)
        CRAWL_CONFIG['seed_urls'] = [url]
        # Collection is named after the host portion of the domain string
        # (index 2 after '//' when a scheme is present, index 0 otherwise).
        if domain.startswith("http"):
            CRAWL_CONFIG['collection_name'] = domain.split("/")[2]
        else:
            CRAWL_CONFIG['collection_name'] = domain.split("/")[0]
        # Unless the cache checkbox is on, delete any previous results for
        # this collection and re-crawl from scratch.
        if use_cache != 'on':
            col_path = os.path.join(CRAWL_CONFIG['collections_path'],
                                    CRAWL_CONFIG['collection_name'])
            print(col_path)
            if os.path.exists(col_path):
                shutil.rmtree(col_path)
            crawl.start(CRAWL_CONFIG)
        # Scrape the (possibly cached) crawled pages with template detection.
        SCRAPE_CONFIG = CRAWL_CONFIG.copy()
        SCRAPE_CONFIG.update({
            'template_proportion': 0.4,
            'max_templates': 100,
        })
        skindex = Scraper(SCRAPE_CONFIG)
        skindex.load_local_pages()
        skindex.add_template_elements()
        res = skindex.process_all(remove_visuals=True,
                                  maxn=CRAWL_CONFIG['max_saved_responses'])
        items = []
        # Cap the rendered items at max_saved_responses.
        for num, url in enumerate(res):
            if num == CRAWL_CONFIG['max_saved_responses']:
                break
            dc = res[url]
            dc['url'] = url
            dc['source_name'] = domain
            # Keep at most the first 5 images, reversed.
            dc['images'] = [x for x in reversed(dc['images'][:5])]
            # dc['blobs'] = [TextBlob(x) for x in dc['body'] if dc['body']]
            items.append(dc)
        # this is quite out of place like this
        print('num unique images', len(get_image_set({x['url']: x for x in items})))
        # When items carry a 'money' field, show the richest matches first.
        if items and 'money' in items[0]:
            items = sorted(items, key=lambda x: len(x['money']), reverse=True)
        self.render('page_template.html', items=items, cached=False)
# Tornado application settings: template and static directories live
# alongside this file.
settings = dict(
    template_path=os.path.join(os.path.dirname(__file__), 'templates'),
    static_path=os.path.join(os.path.dirname(__file__), 'static'),
)
def main(host='127.0.0.1', port=7900):
    """Start the skyViewer Tornado server and block on its event loop.

    To run the server: $ python view.py

    :param host: interface to bind, defaults to localhost only
    :param port: TCP port to listen on (coerced with int())
    """
    application = tornado.web.Application([
        (r"/", MainHandler),
    ], **settings)
    application.listen(int(port), host)
    # Fix: the original called tornado.ioloop.IOLoop().instance(), which
    # constructs a throwaway IOLoop just to reach the singleton accessor.
    # IOLoop.current() returns the running/singleton loop directly.
    ioloop = tornado.ioloop.IOLoop.current()
    print('serving skyViewer at "{}:{}" from file: {}'.format(host, port, __file__))
    ioloop.start()
if __name__ == '__main__':
main()