/
PreSieve.cpp
452 lines (404 loc) Β· 17.4 KB
/
PreSieve.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
///
/// @file PreSieve.cpp
/// @brief Pre-sieve multiples of small primes < 100 to speed up the
/// sieve of Eratosthenes. The idea is to allocate several
/// arrays (buffers_) and remove the multiples of small primes
/// from them at initialization. Each buffer is assigned
/// different primes, for example:
///
/// Buffer 0 removes multiplies of: { 7, 67, 71 }
/// Buffer 1 removes multiplies of: { 11, 41, 73 }
/// Buffer 2 removes multiplies of: { 13, 43, 59 }
/// Buffer 3 removes multiplies of: { 17, 37, 53 }
/// Buffer 4 removes multiplies of: { 19, 29, 61 }
/// Buffer 5 removes multiplies of: { 23, 31, 47 }
/// Buffer 6 removes multiplies of: { 79, 97 }
/// Buffer 7 removes multiplies of: { 83, 89 }
///
/// Then whilst sieving, we perform a bitwise AND on the
/// buffers_ arrays and store the result in the sieve array.
/// Pre-sieving provides a speedup of up to 30% when
/// sieving the primes < 10^10 using primesieve.
///
/// Copyright (C) 2023 Kim Walisch, <kim.walisch@gmail.com>
/// Copyright (C) 2022 @zielaj, https://github.com/zielaj
///
/// This file is distributed under the BSD License. See the COPYING
/// file in the top level directory.
///
#include <primesieve/PreSieve.hpp>
#include <primesieve/EratSmall.hpp>
#include <primesieve/macros.hpp>
#include <stdint.h>
#include <algorithm>
#include <cmath>
#include <initializer_list>
/// All x64 CPUs support the SSE2 vector instruction set
#if defined(__SSE2__) && \
__has_include(<emmintrin.h>)
#include <emmintrin.h>
#define HAS_SSE2
#endif
// All ARM64 CPUs support the NEON vector instruction set
#if (defined(__ARM_NEON) || defined(__aarch64__)) && \
__has_include(<arm_neon.h>)
#include <arm_neon.h>
#define HAS_ARM_NEON
#endif
using std::copy_n;
using primesieve::Array;
namespace {
/// Pre-sieve with the primes <= 13
const Array<uint8_t, 7*11*13> buffer_7_11_13 =
{
0xf8, 0xef, 0x77, 0x3f, 0xdb, 0xed, 0x9e, 0xfc, 0xea, 0x37,
0xaf, 0xf9, 0xf5, 0xd3, 0x7e, 0x4f, 0x77, 0x9e, 0xeb, 0xf9,
0xdd, 0xee, 0xad, 0x77, 0xb7, 0x73, 0xd9, 0xdf, 0x3e, 0xef,
0x53, 0xaf, 0xeb, 0xfd, 0xde, 0xb6, 0x6f, 0x57, 0xb7, 0xba,
0xfd, 0x5b, 0xfe, 0xcf, 0x65, 0xbf, 0xf1, 0x7c, 0x9f, 0xfe,
0xae, 0x77, 0xbb, 0xfb, 0x6d, 0xdd, 0xde, 0xe7, 0x77, 0x9d,
0xfa, 0xbc, 0xdf, 0xfa, 0xe7, 0x63, 0xbd, 0x7b, 0xf5, 0x5f,
0xce, 0xef, 0x34, 0xbe, 0xbb, 0xfd, 0xcf, 0xf4, 0xeb, 0x77,
0x3f, 0xdb, 0xdd, 0x8e, 0xfe, 0xe9, 0x76, 0xaf, 0xf9, 0xfd,
0xd7, 0x7a, 0xcf, 0x77, 0xbe, 0xdb, 0xe9, 0xdf, 0xec, 0xec,
0x37, 0xb7, 0x7b, 0xd5, 0xdb, 0xbe, 0x6f, 0x73, 0x9f, 0xeb,
0xfd, 0xdd, 0xf6, 0x2f, 0x57, 0xbf, 0xb2, 0xf9, 0xdb, 0x7e,
0xef, 0x55, 0xaf, 0xf3, 0x7d, 0xde, 0xbe, 0xae, 0x77, 0xb3,
0xfb, 0xed, 0x5d, 0xfe, 0xc7, 0x67, 0x9f, 0xf9, 0xbc, 0x9f,
0xfa, 0xef, 0x67, 0xb9, 0xfb, 0x75, 0x5f, 0xde, 0xef, 0x36,
0xbd, 0xfa, 0xbd, 0xcf, 0xfc, 0xe7, 0x73, 0x3f, 0x5b, 0xfd,
0x9e, 0xee, 0xeb, 0x75, 0xae, 0xb9, 0xfd, 0xd7, 0x76, 0xcb,
0x77, 0x3e, 0xfb, 0xd9, 0xcf, 0xee, 0xed, 0x76, 0xb7, 0x7b,
0xdd, 0xd7, 0xba, 0xef, 0x73, 0xbf, 0xcb, 0xed, 0xdf, 0xf4,
0x6e, 0x17, 0xbf, 0xba, 0xf5, 0xdb, 0xfe, 0x6f, 0x75, 0x9f,
0xe3, 0x7d, 0xdd, 0xfe, 0xae, 0x77, 0xbb, 0xf3, 0xe9, 0xdd,
0x7e, 0xe7, 0x57, 0x8f, 0xfb, 0xbc, 0xde, 0xba, 0xef, 0x67,
0xb5, 0xfb, 0xf5, 0x5f, 0xde, 0xcf, 0x26, 0xbf, 0xf9, 0xfc,
0x8f, 0xfc, 0xef, 0x77, 0x3b, 0xdb, 0x7d, 0x9e, 0xde, 0xeb,
0x77, 0xad, 0xf8, 0xbd, 0xd7, 0x7e, 0xc7, 0x73, 0xbe, 0x7b,
0xf9, 0xdf, 0xee, 0xed, 0x75, 0xb6, 0x3b, 0xdd, 0xdf, 0xb6,
0xeb, 0x73, 0x3f, 0xeb, 0xdd, 0xcf, 0xf6, 0x6d, 0x56, 0xbf,
0xba, 0xfd, 0xd3, 0xfa, 0xef, 0x75, 0xbf, 0xd3, 0x6d, 0xdf,
0xfc, 0xae, 0x37, 0xbb, 0xfb, 0xe5, 0xd9, 0xfe, 0x67, 0x77,
0x9f, 0xeb, 0xbc, 0xdd, 0xfa, 0xaf, 0x67, 0xbd, 0xf3, 0xf1,
0x5f, 0x5e, 0xef, 0x16, 0xaf, 0xfb, 0xfd, 0xce, 0xbc, 0xef,
0x77, 0x37, 0xdb, 0xfd, 0x1e, 0xfe, 0xcb, 0x67, 0xaf, 0xf9,
0xfc, 0x97, 0x7e, 0xcf, 0x77, 0xba, 0xfb, 0x79, 0xdf, 0xce,
0xed, 0x77, 0xb5, 0x7a, 0x9d, 0xdf, 0xbe, 0xe7, 0x73, 0xbf,
0x6b, 0xfd, 0xdf, 0xe6, 0x6f, 0x55, 0xbe, 0xba, 0xfd, 0xdb,
0xf6, 0xeb, 0x75, 0x3f, 0xf3, 0x5d, 0xcf, 0xfe, 0xac, 0x76,
0xbb, 0xfb, 0xed, 0xd5, 0xfa, 0xe7, 0x77, 0x9f, 0xdb, 0xac,
0xdf, 0xf8, 0xee, 0x27, 0xbd, 0xfb, 0xf5, 0x5b, 0xde, 0x6f,
0x36, 0x9f, 0xeb, 0xfd, 0xcd, 0xfc, 0xaf, 0x77, 0x3f, 0xd3,
0xf9, 0x9e, 0x7e, 0xeb, 0x57, 0xaf, 0xf9, 0xfd, 0xd6, 0x3e,
0xcf, 0x77, 0xb6, 0xfb, 0xf9, 0x5f, 0xee, 0xcd, 0x67, 0xb7,
0x79, 0xdc, 0x9f, 0xbe, 0xef, 0x73, 0xbb, 0xeb, 0x7d, 0xdf,
0xd6, 0x6f, 0x57, 0xbd, 0xba, 0xbd, 0xdb, 0xfe, 0xe7, 0x71,
0xbf, 0x73, 0x7d, 0xdf, 0xee, 0xae, 0x75, 0xba, 0xbb, 0xed,
0xdd, 0xf6, 0xe3, 0x77, 0x1f, 0xfb, 0x9c, 0xcf, 0xfa, 0xed,
0x66, 0xbd, 0xfb, 0xf5, 0x57, 0xda, 0xef, 0x36, 0xbf, 0xdb,
0xed, 0xcf, 0xfc, 0xee, 0x37, 0x3f, 0xdb, 0xf5, 0x9a, 0xfe,
0x6b, 0x77, 0x8f, 0xe9, 0xfd, 0xd5, 0x7e, 0x8f, 0x77, 0xbe,
0xf3, 0xf9, 0xdf, 0x6e, 0xed, 0x57, 0xa7, 0x7b, 0xdd, 0xde,
0xbe, 0xef, 0x73, 0xb7, 0xeb, 0xfd, 0x5f, 0xf6, 0x4f, 0x47,
0xbf, 0xb8, 0xfc, 0x9b, 0xfe, 0xef, 0x75, 0xbb, 0xf3, 0x7d,
0xdf, 0xde, 0xae, 0x77, 0xb9, 0xfa, 0xad, 0xdd, 0xfe, 0xe7,
0x73, 0x9f, 0x7b, 0xbc, 0xdf, 0xea, 0xef, 0x65, 0xbc, 0xbb,
0xf5, 0x5f, 0xd6, 0xeb, 0x36, 0x3f, 0xfb, 0xdd, 0xcf, 0xfc,
0xed, 0x76, 0x3f, 0xdb, 0xfd, 0x96, 0xfa, 0xeb, 0x77, 0xaf,
0xd9, 0xed, 0xd7, 0x7c, 0xce, 0x37, 0xbe, 0xfb, 0xf1, 0xdb,
0xee, 0x6d, 0x77, 0x97, 0x6b, 0xdd, 0xdd, 0xbe, 0xaf, 0x73,
0xbf, 0xe3, 0xf9, 0xdf, 0x76, 0x6f, 0x57, 0xaf, 0xba, 0xfd,
0xda, 0xbe, 0xef, 0x75, 0xb7, 0xf3, 0x7d, 0x5f, 0xfe, 0x8e,
0x67, 0xbb, 0xf9, 0xec, 0x9d, 0xfe, 0xe7, 0x77, 0x9b, 0xfb,
0x3c, 0xdf, 0xda, 0xef, 0x67, 0xbd, 0xfa, 0xb5, 0x5f, 0xde,
0xe7, 0x32, 0xbf, 0x7b, 0xfd, 0xcf, 0xec, 0xef, 0x75, 0x3e,
0x9b, 0xfd, 0x9e, 0xf6, 0xeb, 0x77, 0x2f, 0xf9, 0xdd, 0xc7,
0x7e, 0xcd, 0x76, 0xbe, 0xfb, 0xf9, 0xd7, 0xea, 0xed, 0x77,
0xb7, 0x5b, 0xcd, 0xdf, 0xbc, 0xee, 0x33, 0xbf, 0xeb, 0xf5,
0xdb, 0xf6, 0x6f, 0x57, 0x9f, 0xaa, 0xfd, 0xd9, 0xfe, 0xaf,
0x75, 0xbf, 0xf3, 0x79, 0xdf, 0x7e, 0xae, 0x57, 0xab, 0xfb,
0xed, 0xdc, 0xbe, 0xe7, 0x77, 0x97, 0xfb, 0xbc, 0x5f, 0xfa,
0xcf, 0x67, 0xbd, 0xf9, 0xf4, 0x1f, 0xde, 0xef, 0x36, 0xbb,
0xfb, 0x7d, 0xcf, 0xdc, 0xef, 0x77, 0x3d, 0xda, 0xbd, 0x9e,
0xfe, 0xe3, 0x73, 0xaf, 0x79, 0xfd, 0xd7, 0x6e, 0xcf, 0x75,
0xbe, 0xbb, 0xf9, 0xdf, 0xe6, 0xe9, 0x77, 0x37, 0x7b, 0xdd,
0xcf, 0xbe, 0xed, 0x72, 0xbf, 0xeb, 0xfd, 0xd7, 0xf2, 0x6f,
0x57, 0xbf, 0x9a, 0xed, 0xdb, 0xfc, 0xee, 0x35, 0xbf, 0xf3,
0x75, 0xdb, 0xfe, 0x2e, 0x77, 0x9b, 0xeb, 0xed, 0xdd, 0xfe,
0xa7, 0x77, 0x9f, 0xf3, 0xb8, 0xdf, 0x7a, 0xef, 0x47, 0xad,
0xfb, 0xf5, 0x5e, 0x9e, 0xef, 0x36, 0xb7, 0xfb, 0xfd, 0x4f,
0xfc, 0xcf, 0x67, 0x3f, 0xd9, 0xfc, 0x9e, 0xfe, 0xeb, 0x77,
0xab, 0xf9, 0x7d, 0xd7, 0x5e, 0xcf, 0x77, 0xbc, 0xfa, 0xb9,
0xdf, 0xee, 0xe5, 0x73, 0xb7, 0x7b, 0xdd, 0xdf, 0xae, 0xef,
0x71, 0xbe, 0xab, 0xfd, 0xdf, 0xf6, 0x6b, 0x57, 0x3f, 0xba,
0xdd, 0xcb, 0xfe, 0xed, 0x74, 0xbf, 0xf3, 0x7d, 0xd7, 0xfa,
0xae, 0x77, 0xbb, 0xdb, 0xed, 0xdd, 0xfc, 0xe6, 0x37, 0x9f,
0xfb, 0xb4, 0xdb, 0xfa, 0x6f, 0x67, 0x9d, 0xeb, 0xf5, 0x5d,
0xde, 0xaf, 0x36, 0xbf, 0xf3, 0xf9, 0xcf, 0x7c, 0xef, 0x57,
0x2f, 0xdb, 0xfd, 0x9e, 0xbe, 0xeb, 0x77, 0xa7, 0xf9, 0xfd,
0x57, 0x7e, 0xcf, 0x67, 0xbe, 0xf9, 0xf8, 0x9f, 0xee, 0xed,
0x77, 0xb3, 0x7b, 0x5d, 0xdf, 0x9e, 0xef, 0x73, 0xbd, 0xea,
0xbd, 0xdf, 0xf6, 0x67, 0x53, 0xbf, 0x3a, 0xfd, 0xdb, 0xee,
0xef, 0x75, 0xbe, 0xb3, 0x7d, 0xdf, 0xf6, 0xaa, 0x77, 0x3b,
0xfb, 0xcd, 0xcd, 0xfe, 0xe5, 0x76, 0x9f, 0xfb, 0xbc, 0xd7,
0xfa, 0xef, 0x67, 0xbd, 0xdb, 0xe5, 0x5f, 0xdc, 0xee, 0x36,
0xbf, 0xfb, 0xf5, 0xcb, 0xfc, 0x6f, 0x77, 0x1f, 0xcb, 0xfd,
0x9c, 0xfe, 0xab, 0x77, 0xaf, 0xf1, 0xf9, 0xd7, 0x7e, 0xcf,
0x57, 0xae, 0xfb, 0xf9, 0xde, 0xae, 0xed, 0x77, 0xb7, 0x7b,
0xdd, 0x5f, 0xbe, 0xcf, 0x63, 0xbf, 0xe9, 0xfc, 0x9f, 0xf6,
0x6f, 0x57, 0xbb, 0xba, 0x7d, 0xdb, 0xde, 0xef, 0x75, 0xbd,
0xf2, 0x3d, 0xdf, 0xfe, 0xa6, 0x73, 0xbb, 0x7b, 0xed, 0xdd,
0xee, 0xe7, 0x75, 0x9e, 0xbb, 0xbc, 0xdf, 0xf2, 0xeb, 0x67,
0x3d, 0xfb, 0xd5, 0x4f, 0xde, 0xed, 0x36, 0xbf, 0xfb, 0xfd,
0xc7
};
/// Pre-sieve with the primes < 100
const Array<std::initializer_list<uint64_t>, 8> bufferPrimes =
{{
{ 7, 67, 71 }, // 32 KiB
{ 11, 41, 73 }, // 32 KiB
{ 13, 43, 59 }, // 32 KiB
{ 17, 37, 53 }, // 32 KiB
{ 19, 29, 61 }, // 32 KiB
{ 23, 31, 47 }, // 32 KiB
{ 79, 97 }, // 8 KiB
{ 83, 89 } // 7 KiB
}};
/// Each byte represents an interval of 30 integers
const uint64_t buffersDist =
( 7 * 67 * 71) * 30 +
(11 * 41 * 73) * 30 +
(13 * 43 * 59) * 30 +
(17 * 37 * 53) * 30 +
(19 * 29 * 61) * 30 +
(23 * 31 * 47) * 30 +
(79 * 97) * 30 +
(83 * 89) * 30;
#if defined(HAS_SSE2)
/// Since compiler auto-vectorization is not 100% reliable, we have
/// manually vectorized the andBuffers() function for x64 CPUs.
/// This algorithm is portable since all x64 CPUs support the SSE2
/// instruction set.
///
void andBuffers(const uint8_t* __restrict buf0,
const uint8_t* __restrict buf1,
const uint8_t* __restrict buf2,
const uint8_t* __restrict buf3,
const uint8_t* __restrict buf4,
const uint8_t* __restrict buf5,
const uint8_t* __restrict buf6,
const uint8_t* __restrict buf7,
uint8_t* __restrict output,
std::size_t bytes)
{
std::size_t i = 0;
std::size_t limit = bytes - bytes % sizeof(__m128i);
// Note that I also tried vectorizing this algorithm using AVX2
// which has double the vector width compared to SSE2, but this did
// not provide any speedup. On average, this loop processes only
// 2192 bytes, hence there aren't many vector loop iterations and
// by increasing the vector width this also increases the number of
// scalar loop iterations after the vector loop finishes which
// could potentially even become a bottleneck.
for (; i < limit; i += sizeof(__m128i))
{
_mm_storeu_si128((__m128i*) &output[i],
_mm_and_si128(
_mm_and_si128(
_mm_and_si128(_mm_loadu_si128((const __m128i*) &buf0[i]), _mm_loadu_si128((const __m128i*) &buf1[i])),
_mm_and_si128(_mm_loadu_si128((const __m128i*) &buf2[i]), _mm_loadu_si128((const __m128i*) &buf3[i]))),
_mm_and_si128(
_mm_and_si128(_mm_loadu_si128((const __m128i*) &buf4[i]), _mm_loadu_si128((const __m128i*) &buf5[i])),
_mm_and_si128(_mm_loadu_si128((const __m128i*) &buf6[i]), _mm_loadu_si128((const __m128i*) &buf7[i])))));
}
for (; i < bytes; i++)
output[i] = buf0[i] & buf1[i] & buf2[i] & buf3[i] &
buf4[i] & buf5[i] & buf6[i] & buf7[i];
}
#elif defined(HAS_ARM_NEON)
/// Homebrew compiles its C/C++ packages on macOS using Clang -Os
/// (instead of -O2 or -O3) which does not auto-vectorize our simple
/// loop with Bitwise AND. If this loop is not vectorized this
/// deteriorates the performance of primesieve by up to 40%. As a
/// workaround for this Homebrew issue we have manually vectorized
/// the Bitwise AND loop using ARM NEON.
///
void andBuffers(const uint8_t* __restrict buf0,
const uint8_t* __restrict buf1,
const uint8_t* __restrict buf2,
const uint8_t* __restrict buf3,
const uint8_t* __restrict buf4,
const uint8_t* __restrict buf5,
const uint8_t* __restrict buf6,
const uint8_t* __restrict buf7,
uint8_t* __restrict output,
std::size_t bytes)
{
std::size_t i = 0;
std::size_t limit = bytes - bytes % sizeof(uint8x16_t);
for (; i < limit; i += sizeof(uint8x16_t))
{
vst1q_u8(&output[i],
vandq_u8(
vandq_u8(
vandq_u8(vld1q_u8(&buf0[i]), vld1q_u8(&buf1[i])),
vandq_u8(vld1q_u8(&buf2[i]), vld1q_u8(&buf3[i]))),
vandq_u8(
vandq_u8(vld1q_u8(&buf4[i]), vld1q_u8(&buf5[i])),
vandq_u8(vld1q_u8(&buf6[i]), vld1q_u8(&buf7[i])))));
}
for (; i < bytes; i++)
output[i] = buf0[i] & buf1[i] & buf2[i] & buf3[i] &
buf4[i] & buf5[i] & buf6[i] & buf7[i];
}
#else
void andBuffers(const uint8_t* __restrict buf0,
const uint8_t* __restrict buf1,
const uint8_t* __restrict buf2,
const uint8_t* __restrict buf3,
const uint8_t* __restrict buf4,
const uint8_t* __restrict buf5,
const uint8_t* __restrict buf6,
const uint8_t* __restrict buf7,
uint8_t* __restrict output,
std::size_t bytes)
{
// This loop will get auto-vectorized if compiled with GCC/Clang
// using -O3. Using GCC -O2 does not auto-vectorize this loop
// because -O2 uses the "very-cheap" vector cost model. To fix
// this issue we enable -ftree-vectorize -fvect-cost-model=dynamic
// if the compiler supports it in auto_vectorization.cmake.
for (std::size_t i = 0; i < bytes; i++)
output[i] = buf0[i] & buf1[i] & buf2[i] & buf3[i] &
buf4[i] & buf5[i] & buf6[i] & buf7[i];
}
#endif
} // namespace
namespace primesieve {
void PreSieve::init(uint64_t start,
uint64_t stop)
{
// Already initialized
if (!buffers_[0].empty())
return;
// To reduce the initialization overhead, we only enable
// pre-sieving if the sieving distance is at least 20x
// larger than the distance (size) of the pre-sieve buffers.
uint64_t dist = std::max(start, stop) - start;
uint64_t sqrtStop = (uint64_t) std::sqrt(stop);
dist = std::max(dist, sqrtStop);
// When sieving backwards using primesieve::iterator.prev_prime()
// the sieving distance is subdivided into smaller chunks. By
// keeping track of the total sieving distance here, we can
// figure out if the total sieving distance is large and if it
// is worth enabling pre-sieving.
totalDist_ += dist;
// For small intervals we pre-sieve using the
// static buffer_7_11_13 lookup table. In this
// case no initialization is required.
if (totalDist_ < buffersDist * 20)
return;
initBuffers();
}
/// We are sieving a large interval, we have to
/// initialize all pre-sieve buffers.
///
void PreSieve::initBuffers()
{
for (std::size_t i = 0; i < buffers_.size(); i++)
{
uint64_t product = 30;
for (uint64_t prime : bufferPrimes[i])
product *= prime;
uint64_t start = product;
uint64_t stop = start + product;
buffers_[i].resize(product / 30);
std::fill(buffers_[i].begin(), buffers_[i].end(), 0xff);
uint64_t maxPrime = *(bufferPrimes[i].end() - 1);
ASSERT(maxPrime == *std::max_element(bufferPrimes[i].begin(), bufferPrimes[i].end()));
ASSERT(start >= maxPrime * maxPrime);
maxPrime_ = std::max(maxPrime_, maxPrime);
EratSmall eratSmall;
eratSmall.init(stop, buffers_[i].size(), maxPrime);
for (uint64_t prime : bufferPrimes[i])
eratSmall.addSievingPrime(prime, start);
eratSmall.crossOff(buffers_[i]);
}
}
void PreSieve::preSieve(Vector<uint8_t>& sieve,
uint64_t segmentLow) const
{
if (buffers_[0].empty())
preSieveSmall(sieve, segmentLow);
else
preSieveLarge(sieve, segmentLow);
// Pre-sieving removes the primes < 100. We
// have to undo that work and reset these bits
// to 1 (but 49 = 7 * 7 is not a prime).
if (segmentLow < 120)
{
uint64_t i = segmentLow / 30;
uint8_t* sieveArray = sieve.data();
Array<uint8_t, 8> primeBits = { 0xff, 0xef, 0x77, 0x3f, 0xdb, 0xed, 0x9e, 0xfc };
ASSERT(sieve.capacity() >= 4);
for (std::size_t j = 0; j < 4; j++)
sieveArray[j] = primeBits[i + j];
}
}
/// Pre-sieve with the primes <= 13
void PreSieve::preSieveSmall(Vector<uint8_t>& sieve,
uint64_t segmentLow)
{
uint64_t size = buffer_7_11_13.size();
uint64_t primeProduct = size * 30;
uint64_t i = (segmentLow % primeProduct) / 30;
uint64_t sizeLeft = size - i;
auto buffer = buffer_7_11_13.data();
if (sieve.size() <= sizeLeft)
copy_n(&buffer[i], sieve.size(), sieve.begin());
else
{
// Copy the last remaining bytes of buffer
// to the beginning of the sieve array
copy_n(&buffer[i], sizeLeft, sieve.begin());
// Restart copying at the beginning of buffer
for (i = sizeLeft; i + size < sieve.size(); i += size)
copy_n(buffer, size, &sieve[i]);
// Copy the last remaining bytes
copy_n(buffer, sieve.size() - i, &sieve[i]);
}
}
/// Pre-sieve with the primes < 100
void PreSieve::preSieveLarge(Vector<uint8_t>& sieve,
uint64_t segmentLow) const
{
uint64_t offset = 0;
Array<uint64_t, 8> pos;
for (std::size_t i = 0; i < buffers_.size(); i++)
pos[i] = (segmentLow % (buffers_[i].size() * 30)) / 30;
while (offset < sieve.size()) {
uint64_t bytesToCopy = sieve.size() - offset;
for (std::size_t i = 0; i < buffers_.size(); i++) {
uint64_t left = buffers_[i].size() - pos[i];
bytesToCopy = std::min(left, bytesToCopy);
}
andBuffers(&buffers_[0][pos[0]],
&buffers_[1][pos[1]],
&buffers_[2][pos[2]],
&buffers_[3][pos[3]],
&buffers_[4][pos[4]],
&buffers_[5][pos[5]],
&buffers_[6][pos[6]],
&buffers_[7][pos[7]],
&sieve[offset],
bytesToCopy);
offset += bytesToCopy;
for (std::size_t i = 0; i < pos.size(); i++) {
pos[i] += bytesToCopy;
if (pos[i] >= buffers_[i].size())
pos[i] = 0;
}
}
}
} // namespace