---
log:
  # fancy some colors? disable if output is redirected to a file
  ansi: false
  # base-line log level
  level: warn
  # my_crate=info,my_crate::my_mod=debug,[my_span]=trace
  # see https://tracing.rs/tracing_subscriber/filter/struct.envfilter
  #filter: ["[task{name=Crusty::go}]=info", "[task{name=Crusty::job_reader}]=info"]
host: crawler-1 # for metrics
app_id: crusty # for metrics
# Clickhouse database settings
clickhouse:
  url: http://clickhouse:8123
  username: default
  password: ""
  database: crusty
  # We persist various queue metrics
  metrics_queue:
    table_name: metrics_queue
    label: ""
    # we always try to write in bulk, buffering up to this many items before writing
    buffer_capacity: 1000
    # while waiting for the buffer to fill, wake once in a while to check force_write_duration
    check_for_force_write_duration: 100ms
    # if force_write_duration has elapsed since the last write but buffer_capacity has not been reached yet, force the write anyway
    force_write_duration: 500ms
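    # for illustration (not a setting): with the values above, a write happens either once 1000 items
    # have accumulated, or roughly 500ms after the previous write if anything is buffered, whichever comes first;
    # the 100ms check interval only bounds how late that forced write can be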
  # We persist some db metrics for further analysis
  metrics_db:
    table_name: metrics_db
    label: ""
    buffer_capacity: 1000
    check_for_force_write_duration: 100ms
    force_write_duration: 500ms
  # We persist metrics and various meta-data for each visited page
  metrics_task:
    table_name: metrics_task
    label: ""
    buffer_capacity: 10000
    check_for_force_write_duration: 100ms
    force_write_duration: 500ms
  # We persist candidates for newly discovered domains; the db will perform the final deduplication
  domain_discovery_insert:
    table_name: domain_discovery
    label: insert
    buffer_capacity: 10000
    check_for_force_write_duration: 500ms
    force_write_duration: 2500ms
  # We persist confirmations that a domain has been checked so that it won't be re-selected unless special criteria are met
  domain_discovery_update:
    table_name: domain_discovery
    label: update
    buffer_capacity: 10000
    check_for_force_write_duration: 500ms
    force_write_duration: 2500ms
# resolver settings
# leave empty for auto-conf
#resolver:
  # number of concurrent green threads for name resolution (be mindful of your dns server capacity)
  # this should be configured carefully: a low setting will lead to job starvation (inability to satisfy the requested concurrency_profile.domain_concurrency)
  #concurrency: 64
# domain discovery cache capacity; this cache helps ease the load on clickhouse (so we do not insert billions of duplicated records),
# but because the cache is local, its effectiveness will drop as new crawler nodes are added,
# so running this at google scale would most likely require a dedicated dedup layer in front of clickhouse
ddc_cap: 25000000
# recently discovered domains live in the cache for up to this duration
ddc_lifetime: 1h
# We monitor various internal queues and persist their status to db
queue_monitor_interval: 1s
# We parse HTML in a separate thread pool, stack size is configurable
# apparently even 32mib is not enough given max_response_size of 2mib...
parser_processor_stack_size: 128mib
# Fancy local address binding for monster setups with several NICs (local port limitation)
networking_profile:
  values:
    bind_local_ipv4:
    bind_local_ipv6:
    socket_read_buffer_size: 32kib
    socket_write_buffer_size: 32kib
    connect_timeout: 5s
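    # purely illustrative (the exact value format for bind_local_ipv4 is an assumption, check the source before relying on it):
    # a multi-NIC host could pin outgoing connections to a specific local address, e.g.
    #bind_local_ipv4: 192.168.0.10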
# leave commented for auto-conf
#concurrency_profile:
  # = N of physical cores by default
  #parser_concurrency:
  # we crawl multiple domains concurrently; set this to saturate your hardware (cpu/network bound)
  #domain_concurrency: 100
# We select new jobs (domains) from a queue-like structure hosted in clickhouse
job_reader:
  domain_table_name: domain_discovery
  # we resolve the IPs of every discovered domain and calculate its addr_key:
  # 1. take only the ipv4 addresses
  # 2. sort them
  # 3. take the first IP and apply addr_key_mask masking
  # 4. addr_key = addr_key | addr_key_4_mask;
  # addr_key is then used in shard calculation, and we never select more than domain_top_n domains from a given addr_key
  # this ensures we are being polite to websites with different domains hosted on the same IP (or subnet, depending on addr_key_mask)
  addr_key_mask: 24 # read as /24, meaning the first 24 bits are significant while the last 8 are not (they will be masked off)
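  # worked example (illustrative, IP chosen arbitrarily): with addr_key_mask: 24, a domain resolving to
  # 203.0.113.57 gets an addr_key derived from 203.0.113.0/24, so any other domain resolving into
  # 203.0.113.0-203.0.113.255 shares that addr_key and competes for the same domain_top_n slots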
  # re-select already-checked domains after this many days
  re_after_days: 3
  # the queue is sharded; do not ask the same shard for jobs unless this duration has passed since we last asked
  shard_min_last_read: 1s
  # min shard number we have access to
  shard_min: 1
  # max shard number we have access to
  shard_max: 25
  # total number of all shards; in a multi-node setup shard_total > shard_max - shard_min + 1 (always)
  shard_total: 25
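  # illustrative multi-node layout (node names and shard split are an assumption, not a recommendation):
  # with shard_total: 50, crawler-1 could take shard_min: 1 / shard_max: 25 and crawler-2
  # shard_min: 26 / shard_max: 50, so each node only reads its own slice of the queue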
  # select up to N domains from a shard at once
  shard_select_limit: 100000
  # buffer up to N domains and do not try to fetch new ones while we have enough
  job_buffer: 100000
  # select up to N domains belonging to the same IP (with 2: a.tumblr.com and b.tumblr.com, but not c.tumblr.com)
  domain_top_n: 2
# these settings apply to a crawler working on any particular domain
default_crawling_settings:
  # fetch up to N pages concurrently; keep this number low to avoid putting excess stress on a site
  concurrency: 2
  internal_read_buffer_size: 32kib
  max_response_size: 2mib
  # follow up to N redirects before giving up
  max_redirect: 5
  # 1s-5s is a safe bet to avoid extra stress
  delay: 1s
  # vary the delay by this jitter (0..)
  delay_jitter: 1s
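  # in other words (assuming the jitter is added on top of the base delay): with delay: 1s and delay_jitter: 1s
  # the effective pause between requests to the same domain falls somewhere in the 1s-2s range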
  # timeout for page loading and buffering
  load_timeout: 10s
  # after the soft timeout elapses we no longer queue new tasks for the domain
  job_soft_timeout: 30s
  # after the hard timeout elapses we forcibly stop the crawling job for this domain
  job_hard_timeout: 60s
  user_agent: "crusty/0.12.0"
  compression: true
  # custom headers are supported
  custom_headers:
    accept:
      - "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
# initial list of seed URLs to start the broad crawl from; additionally, seeds are also read from the CRUSTY_SEEDS env. variable
seeds: []
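# illustrative only (the URL is a placeholder; the exact CRUSTY_SEEDS format is an assumption):
#seeds: ["https://example.com"]
# or via the environment: CRUSTY_SEEDS=https://example.com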