-
Notifications
You must be signed in to change notification settings - Fork 756
/
dumpsubr.c
3136 lines (2718 loc) · 79.8 KB
/
dumpsubr.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2018 Joyent, Inc.
* Copyright 2018 Nexenta Systems, Inc. All rights reserved.
*/
#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/vm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/conf.h>
#include <sys/kmem.h>
#include <sys/mem.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/errno.h>
#include <sys/memlist.h>
#include <sys/dumphdr.h>
#include <sys/dumpadm.h>
#include <sys/ksyms.h>
#include <sys/compress.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/cmn_err.h>
#include <sys/bitmap.h>
#include <sys/modctl.h>
#include <sys/utsname.h>
#include <sys/systeminfo.h>
#include <sys/vmem.h>
#include <sys/log.h>
#include <sys/var.h>
#include <sys/debug.h>
#include <sys/sunddi.h>
#include <fs/fs_subr.h>
#include <sys/fs/snode.h>
#include <sys/ontrap.h>
#include <sys/panic.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/errorq.h>
#include <sys/fm/util.h>
#include <sys/fs/zfs.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <sys/clock_impl.h>
#include <sys/hold_page.h>
#include <sys/cpu.h>
#include <bzip2/bzlib.h>
#define ONE_GIG (1024 * 1024 * 1024UL)
/*
* Crash dump time is dominated by disk write time. To reduce this,
* the stronger compression method bzip2 is applied to reduce the dump
* size and hence reduce I/O time. However, bzip2 is much more
* computationally expensive than the existing lzjb algorithm, so to
* avoid increasing compression time, CPUs that are otherwise idle
* during panic are employed to parallelize the compression task.
* Many helper CPUs are needed to prevent bzip2 from being a
* bottleneck, and on systems with too few CPUs, the lzjb algorithm is
* parallelized instead. Lastly, I/O and compression are performed by
* different CPUs, and are hence overlapped in time, unlike the older
* serial code.
*
* Another important consideration is the speed of the dump
* device. Faster disks need less CPUs in order to benefit from
* parallel lzjb versus parallel bzip2. Therefore, the CPU count
 * threshold for switching from parallel lzjb to parallel bzip2 is
* elevated for faster disks. The dump device speed is adduced from
* the setting for dumpbuf.iosize, see dump_update_clevel.
*/
/*
* exported vars
*/
kmutex_t dump_lock; /* lock for dump configuration */
dumphdr_t *dumphdr; /* dump header */
int dump_conflags = DUMP_KERNEL; /* dump configuration flags */
vnode_t *dumpvp; /* dump device vnode pointer */
u_offset_t dumpvp_size; /* size of dump device, in bytes */
char *dumppath; /* pathname of dump device */
int dump_timeout = 120; /* timeout for dumping pages */
int dump_timeleft; /* portion of dump_timeout remaining */
int dump_ioerr; /* dump i/o error */
int dump_check_used; /* enable check for used pages */
char *dump_stack_scratch; /* scratch area for saving stack summary */
/*
* Tunables for dump compression and parallelism. These can be set via
* /etc/system.
*
* dump_ncpu_low number of helpers for parallel lzjb
* This is also the minimum configuration.
*
* dump_bzip2_level bzip2 compression level: 1-9
* Higher numbers give greater compression, but take more memory
* and time. Memory used per helper is ~(dump_bzip2_level * 1MB).
*
* dump_plat_mincpu the cross-over limit for using bzip2 (per platform):
* if dump_plat_mincpu == 0, then always do single threaded dump
* if ncpu >= dump_plat_mincpu then try to use bzip2
*
* dump_metrics_on if set, metrics are collected in the kernel, passed
* to savecore via the dump file, and recorded by savecore in
* METRICS.txt.
*/
uint_t dump_ncpu_low = 4; /* minimum config for parallel lzjb */
uint_t dump_bzip2_level = 1; /* bzip2 level (1-9) */
/* Use dump_plat_mincpu_default unless this variable is set by /etc/system */
#define MINCPU_NOT_SET ((uint_t)-1)
uint_t dump_plat_mincpu = MINCPU_NOT_SET;
/* tunables for pre-reserved heap */
uint_t dump_kmem_permap = 1024;
uint_t dump_kmem_pages = 0;
/* Define multiple buffers per helper to avoid stalling */
#define NCBUF_PER_HELPER 2
#define NCMAP_PER_HELPER 4
/* minimum number of helpers configured */
#define MINHELPERS (dump_ncpu_low)
#define MINCBUFS (MINHELPERS * NCBUF_PER_HELPER)
/*
* Define constant parameters.
*
* CBUF_SIZE size of an output buffer
*
* CBUF_MAPSIZE size of virtual range for mapping pages
*
* CBUF_MAPNP size of virtual range in pages
*
*/
#define DUMP_1KB ((size_t)1 << 10)
#define DUMP_1MB ((size_t)1 << 20)
#define CBUF_SIZE ((size_t)1 << 17)
#define CBUF_MAPSHIFT (22)
#define CBUF_MAPSIZE ((size_t)1 << CBUF_MAPSHIFT)
#define CBUF_MAPNP ((size_t)1 << (CBUF_MAPSHIFT - PAGESHIFT))
/*
* Compression metrics are accumulated nano-second subtotals. The
* results are normalized by the number of pages dumped. A report is
* generated when dumpsys() completes and is saved in the dump image
* after the trailing dump header.
*
* Metrics are always collected. Set the variable dump_metrics_on to
* cause metrics to be saved in the crash file, where savecore will
* save it in the file METRICS.txt.
*/
#define PERPAGES \
PERPAGE(bitmap) PERPAGE(map) PERPAGE(unmap) \
PERPAGE(copy) PERPAGE(compress) \
PERPAGE(write) \
PERPAGE(inwait) PERPAGE(outwait)
typedef struct perpage {
#define PERPAGE(x) hrtime_t x;
PERPAGES
#undef PERPAGE
} perpage_t;
/*
* This macro controls the code generation for collecting dump
* performance information. By default, the code is generated, but
* automatic saving of the information is disabled. If dump_metrics_on
* is set to 1, the timing information is passed to savecore via the
* crash file, where it is appended to the file dump-dir/METRICS.txt.
*/
#define COLLECT_METRICS
#ifdef COLLECT_METRICS
uint_t dump_metrics_on = 0; /* set to 1 to enable recording metrics */
#define HRSTART(v, m) v##ts.m = gethrtime()
#define HRSTOP(v, m) v.m += gethrtime() - v##ts.m
#define HRBEGIN(v, m, s) v##ts.m = gethrtime(); v.size += s
#define HREND(v, m) v.m += gethrtime() - v##ts.m
#define HRNORM(v, m, n) v.m /= (n)
#else
#define HRSTART(v, m)
#define HRSTOP(v, m)
#define HRBEGIN(v, m, s)
#define HREND(v, m)
#define HRNORM(v, m, n)
#endif /* COLLECT_METRICS */
/*
* Buffers for copying and compressing memory pages.
*
* cbuf_t buffer controllers: used for both input and output.
*
* The buffer state indicates how it is being used:
*
* CBUF_FREEMAP: CBUF_MAPSIZE virtual address range is available for
* mapping input pages.
*
* CBUF_INREADY: input pages are mapped and ready for compression by a
* helper.
*
* CBUF_USEDMAP: mapping has been consumed by a helper. Needs unmap.
*
* CBUF_FREEBUF: CBUF_SIZE output buffer, which is available.
*
* CBUF_WRITE: CBUF_SIZE block of compressed pages from a helper,
* ready to write out.
*
* CBUF_ERRMSG: CBUF_SIZE block of error messages from a helper
* (reports UE errors.)
*/
typedef enum cbufstate {
CBUF_FREEMAP,
CBUF_INREADY,
CBUF_USEDMAP,
CBUF_FREEBUF,
CBUF_WRITE,
CBUF_ERRMSG
} cbufstate_t;
typedef struct cbuf cbuf_t;
struct cbuf {
cbuf_t *next; /* next in list */
cbufstate_t state; /* processing state */
size_t used; /* amount used */
size_t size; /* mem size */
char *buf; /* kmem or vmem */
pgcnt_t pagenum; /* index to pfn map */
pgcnt_t bitnum; /* first set bitnum */
pfn_t pfn; /* first pfn in mapped range */
int off; /* byte offset to first pfn */
};
static char dump_osimage_uuid[36 + 1];
#define isdigit(ch) ((ch) >= '0' && (ch) <= '9')
#define isxdigit(ch) (isdigit(ch) || ((ch) >= 'a' && (ch) <= 'f') || \
((ch) >= 'A' && (ch) <= 'F'))
/*
* cqueue_t queues: a uni-directional channel for communication
* from the master to helper tasks or vice-versa using put and
* get primitives. Both mappings and data buffers are passed via
* queues. Producers close a queue when done. The number of
* active producers is reference counted so the consumer can
* detect end of data. Concurrent access is mediated by atomic
* operations for panic dump, or mutex/cv for live dump.
*
 * There are four queues, used as follows:
*
* Queue Dataflow NewState
* --------------------------------------------------
* mainq master -> master FREEMAP
* master has initialized or unmapped an input buffer
* --------------------------------------------------
* helperq master -> helper INREADY
* master has mapped input for use by helper
* --------------------------------------------------
* mainq master <- helper USEDMAP
* helper is done with input
* --------------------------------------------------
* freebufq master -> helper FREEBUF
* master has initialized or written an output buffer
* --------------------------------------------------
* mainq master <- helper WRITE
* block of compressed pages from a helper
* --------------------------------------------------
* mainq master <- helper ERRMSG
* error messages from a helper (memory error case)
* --------------------------------------------------
* writerq master <- master WRITE
* non-blocking queue of blocks to write
* --------------------------------------------------
*/
typedef struct cqueue {
cbuf_t *volatile first; /* first in list */
cbuf_t *last; /* last in list */
hrtime_t ts; /* timestamp */
hrtime_t empty; /* total time empty */
kmutex_t mutex; /* live state lock */
kcondvar_t cv; /* live wait var */
lock_t spinlock; /* panic mode spin lock */
volatile uint_t open; /* producer ref count */
} cqueue_t;
/*
* Convenience macros for using the cqueue functions
* Note that the caller must have defined "dumpsync_t *ds"
*/
#define CQ_IS_EMPTY(q) \
(ds->q.first == NULL)
#define CQ_OPEN(q) \
atomic_inc_uint(&ds->q.open)
#define CQ_CLOSE(q) \
dumpsys_close_cq(&ds->q, ds->live)
#define CQ_PUT(q, cp, st) \
dumpsys_put_cq(&ds->q, cp, st, ds->live)
#define CQ_GET(q) \
dumpsys_get_cq(&ds->q, ds->live)
/*
* Dynamic state when dumpsys() is running.
*/
typedef struct dumpsync {
pgcnt_t npages; /* subtotal of pages dumped */
pgcnt_t pages_mapped; /* subtotal of pages mapped */
pgcnt_t pages_used; /* subtotal of pages used per map */
size_t nwrite; /* subtotal of bytes written */
uint_t live; /* running live dump */
uint_t neednl; /* will need to print a newline */
uint_t percent; /* dump progress */
uint_t percent_done; /* dump progress reported */
int sec_done; /* dump progress last report time */
cqueue_t freebufq; /* free kmem bufs for writing */
cqueue_t mainq; /* input for main task */
cqueue_t helperq; /* input for helpers */
cqueue_t writerq; /* input for writer */
hrtime_t start; /* start time */
hrtime_t elapsed; /* elapsed time when completed */
hrtime_t iotime; /* time spent writing nwrite bytes */
hrtime_t iowait; /* time spent waiting for output */
hrtime_t iowaitts; /* iowait timestamp */
perpage_t perpage; /* metrics */
perpage_t perpagets;
int dumpcpu; /* master cpu */
} dumpsync_t;
static dumpsync_t dumpsync; /* synchronization vars */
/*
* helper_t helpers: contains the context for a stream. CPUs run in
* parallel at dump time; each CPU creates a single stream of
* compression data. Stream data is divided into CBUF_SIZE blocks.
* The blocks are written in order within a stream. But, blocks from
* multiple streams can be interleaved. Each stream is identified by a
* unique tag.
*/
typedef struct helper {
int helper; /* bound helper id */
int tag; /* compression stream tag */
perpage_t perpage; /* per page metrics */
perpage_t perpagets; /* per page metrics (timestamps) */
taskqid_t taskqid; /* live dump task ptr */
int in, out; /* buffer offsets */
cbuf_t *cpin, *cpout, *cperr; /* cbuf objects in process */
dumpsync_t *ds; /* pointer to sync vars */
size_t used; /* counts input consumed */
char *page; /* buffer for page copy */
char *lzbuf; /* lzjb output */
bz_stream bzstream; /* bzip2 state */
} helper_t;
#define MAINHELPER (-1) /* helper is also the main task */
#define FREEHELPER (-2) /* unbound helper */
#define DONEHELPER (-3) /* helper finished */
/*
* configuration vars for dumpsys
*/
typedef struct dumpcfg {
int threshold; /* ncpu threshold for bzip2 */
int nhelper; /* number of helpers */
int nhelper_used; /* actual number of helpers used */
int ncmap; /* number VA pages for compression */
int ncbuf; /* number of bufs for compression */
int ncbuf_used; /* number of bufs in use */
uint_t clevel; /* dump compression level */
helper_t *helper; /* array of helpers */
cbuf_t *cmap; /* array of input (map) buffers */
cbuf_t *cbuf; /* array of output buffers */
ulong_t *helpermap; /* set of dumpsys helper CPU ids */
ulong_t *bitmap; /* bitmap for marking pages to dump */
ulong_t *rbitmap; /* bitmap for used CBUF_MAPSIZE ranges */
pgcnt_t bitmapsize; /* size of bitmap */
pgcnt_t rbitmapsize; /* size of bitmap for ranges */
pgcnt_t found4m; /* number ranges allocated by dump */
pgcnt_t foundsm; /* number small pages allocated by dump */
pid_t *pids; /* list of process IDs at dump time */
size_t maxsize; /* memory size needed at dump time */
size_t maxvmsize; /* size of reserved VM */
char *maxvm; /* reserved VM for spare pages */
lock_t helper_lock; /* protect helper state */
char helpers_wanted; /* flag to enable parallelism */
} dumpcfg_t;
static dumpcfg_t dumpcfg; /* config vars */
/*
* The dump I/O buffer.
*
 * There is one I/O buffer used by dumpvp_write and dumpvp_flush. It is
* sized according to the optimum device transfer speed.
*/
typedef struct dumpbuf {
vnode_t *cdev_vp; /* VCHR open of the dump device */
len_t vp_limit; /* maximum write offset */
offset_t vp_off; /* current dump device offset */
char *cur; /* dump write pointer */
char *start; /* dump buffer address */
char *end; /* dump buffer end */
size_t size; /* size of dumpbuf in bytes */
size_t iosize; /* best transfer size for device */
} dumpbuf_t;
dumpbuf_t dumpbuf; /* I/O buffer */
/*
* For parallel dump, defines maximum time main task thread will wait
* for at least one helper to register in dumpcfg.helpermap, before
* assuming there are no helpers and falling back to serial mode.
 * The value is chosen arbitrarily and provides a *really* long wait for any
* available helper to register.
*/
#define DUMP_HELPER_MAX_WAIT 1000 /* millisec */
/*
* The dump I/O buffer must be at least one page, at most xfer_size
* bytes, and should scale with physmem in between. The transfer size
* passed in will either represent a global default (maxphys) or the
* best size for the device. The size of the dumpbuf I/O buffer is
* limited by dumpbuf_limit (8MB by default) because the dump
* performance saturates beyond a certain size. The default is to
* select 1/4096 of the memory.
*/
static int dumpbuf_fraction = 12; /* memory size scale factor */
static size_t dumpbuf_limit = 8 * DUMP_1MB; /* max I/O buf size */
static size_t
dumpbuf_iosize(size_t xfer_size)
{
	/* Start from the physmem-scaled default (1/2^dumpbuf_fraction). */
	size_t sz = ptob(physmem >> dumpbuf_fraction);

	/*
	 * Clamp into [PAGESIZE, xfer_size]; note the floor takes
	 * precedence, so a tiny xfer_size cannot drop us below a page.
	 */
	if (sz < PAGESIZE) {
		sz = PAGESIZE;
	} else if (sz > xfer_size) {
		sz = xfer_size;
	}

	/* Performance saturates beyond dumpbuf_limit; cap it there. */
	if (sz > dumpbuf_limit)
		sz = dumpbuf_limit;

	/* Always return a whole number of pages. */
	return (sz & PAGEMASK);
}
/*
* resize the I/O buffer
*/
static void
dumpbuf_resize(void)
{
	char *old_buf = dumpbuf.start;
	size_t old_size = dumpbuf.size;
	char *new_buf;
	size_t new_size;

	ASSERT(MUTEX_HELD(&dump_lock));

	/*
	 * Recompute the best buffer size for the device; only grow,
	 * never shrink — a larger existing buffer stays in place.
	 */
	new_size = dumpbuf_iosize(MAX(dumpbuf.iosize, maxphys));
	if (new_size <= old_size)
		return; /* no need to reallocate buffer */

	/*
	 * Allocate the replacement before releasing the old buffer so
	 * dumpbuf always refers to usable memory while we swap.
	 */
	new_buf = kmem_alloc(new_size, KM_SLEEP);
	dumpbuf.size = new_size;
	dumpbuf.start = new_buf;
	dumpbuf.end = new_buf + new_size;
	kmem_free(old_buf, old_size);
}
/*
* dump_update_clevel is called when dumpadm configures the dump device.
* Calculate number of helpers and buffers.
* Allocate the minimum configuration for now.
*
* When the dump file is configured we reserve a minimum amount of
* memory for use at crash time. But we reserve VA for all the memory
* we really want in order to do the fastest dump possible. The VA is
* backed by pages not being dumped, according to the bitmap. If
* there is insufficient spare memory, however, we fall back to the
* minimum.
*
* Live dump (savecore -L) always uses the minimum config.
*
* clevel 0 is single threaded lzjb
* clevel 1 is parallel lzjb
* clevel 2 is parallel bzip2
*
* The ncpu threshold is selected with dump_plat_mincpu.
* On OPL, set_platform_defaults() overrides the sun4u setting.
* The actual values are defined via DUMP_PLAT_*_MINCPU macros.
*
* Architecture Threshold Algorithm
* sun4u < 51 parallel lzjb
* sun4u >= 51 parallel bzip2(*)
* sun4u OPL < 8 parallel lzjb
* sun4u OPL >= 8 parallel bzip2(*)
* sun4v < 128 parallel lzjb
* sun4v >= 128 parallel bzip2(*)
* x86 < 11 parallel lzjb
* x86 >= 11 parallel bzip2(*)
* 32-bit N/A single-threaded lzjb
*
* (*) bzip2 is only chosen if there is sufficient available
* memory for buffers at dump time. See dumpsys_get_maxmem().
*
* Faster dump devices have larger I/O buffers. The threshold value is
* increased according to the size of the dump I/O buffer, because
* parallel lzjb performs better with faster disks. For buffers >= 1MB
* the threshold is 3X; for buffers >= 256K threshold is 2X.
*
* For parallel dumps, the number of helpers is ncpu-1. The CPU
* running panic runs the main task. For single-threaded dumps, the
* panic CPU does lzjb compression (it is tagged as MAINHELPER.)
*
* Need multiple buffers per helper so that they do not block waiting
* for the main task.
* parallel single-threaded
* Number of output buffers: nhelper*2 1
* Number of mapping buffers: nhelper*4 1
*
*/
static void
dump_update_clevel()
{
	int tag;
	size_t bz2size;
	helper_t *hp, *hpend;
	cbuf_t *cp, *cpend;
	dumpcfg_t *old = &dumpcfg;
	dumpcfg_t newcfg = *old;
	dumpcfg_t *new = &newcfg;

	ASSERT(MUTEX_HELD(&dump_lock));

	/*
	 * Free the previously allocated bufs and VM.
	 */
	if (old->helper != NULL) {

		/* helpers: per-helper lzjb and page-copy buffers */
		hpend = &old->helper[old->nhelper];
		for (hp = old->helper; hp != hpend; hp++) {
			if (hp->lzbuf != NULL)
				kmem_free(hp->lzbuf, PAGESIZE);
			if (hp->page != NULL)
				kmem_free(hp->page, PAGESIZE);
		}
		kmem_free(old->helper, old->nhelper * sizeof (helper_t));

		/* VM space for mapping pages */
		cpend = &old->cmap[old->ncmap];
		for (cp = old->cmap; cp != cpend; cp++)
			vmem_xfree(heap_arena, cp->buf, CBUF_MAPSIZE);
		kmem_free(old->cmap, old->ncmap * sizeof (cbuf_t));

		/* output bufs (only the first MINCBUFS were backed) */
		cpend = &old->cbuf[old->ncbuf];
		for (cp = old->cbuf; cp != cpend; cp++)
			if (cp->buf != NULL)
				kmem_free(cp->buf, cp->size);
		kmem_free(old->cbuf, old->ncbuf * sizeof (cbuf_t));

		/* reserved VM for dumpsys_get_maxmem */
		if (old->maxvmsize > 0)
			vmem_xfree(heap_arena, old->maxvm, old->maxvmsize);
	}

	/*
	 * Allocate memory and VM.
	 * One CPU runs dumpsys, the rest are helpers.
	 */
	new->nhelper = ncpus - 1;
	if (new->nhelper < 1)
		new->nhelper = 1;

	if (new->nhelper > DUMP_MAX_NHELPER)
		new->nhelper = DUMP_MAX_NHELPER;

	/* use platform default, unless /etc/system overrides */
	if (dump_plat_mincpu == MINCPU_NOT_SET)
		dump_plat_mincpu = dump_plat_mincpu_default;

	/*
	 * Increase threshold for faster disks: parallel lzjb keeps up
	 * with a fast device longer, so bzip2 needs more CPUs to win.
	 */
	new->threshold = dump_plat_mincpu;
	if (dumpbuf.iosize >= DUMP_1MB)
		new->threshold *= 3;
	else if (dumpbuf.iosize >= (256 * DUMP_1KB))
		new->threshold *= 2;

	/* figure compression level based upon the computed threshold. */
	if (dump_plat_mincpu == 0 || new->nhelper < 2) {
		/* forced-serial (mincpu == 0) or too few CPUs: plain lzjb */
		new->clevel = 0;
		new->nhelper = 1;
	} else if ((new->nhelper + 1) >= new->threshold) {
		new->clevel = DUMP_CLEVEL_BZIP2;
	} else {
		new->clevel = DUMP_CLEVEL_LZJB;
	}

	if (new->clevel == 0) {
		new->ncbuf = 1;
		new->ncmap = 1;
	} else {
		new->ncbuf = NCBUF_PER_HELPER * new->nhelper;
		new->ncmap = NCMAP_PER_HELPER * new->nhelper;
	}

	/*
	 * Allocate new data structures and buffers for MINHELPERS,
	 * and also figure the max desired size. Only the first
	 * MINHELPERS helpers get real buffers now; the rest are
	 * accounted in maxsize and backed with spare pages at crash
	 * time by dumpsys_get_maxmem().
	 */
	bz2size = BZ2_bzCompressInitSize(dump_bzip2_level);
	new->maxsize = 0;
	new->maxvmsize = 0;
	new->maxvm = NULL;
	tag = 1;
	new->helper = kmem_zalloc(new->nhelper * sizeof (helper_t), KM_SLEEP);
	hpend = &new->helper[new->nhelper];
	for (hp = new->helper; hp != hpend; hp++) {
		hp->tag = tag++;	/* unique, non-zero stream tag */
		if (hp < &new->helper[MINHELPERS]) {
			hp->lzbuf = kmem_alloc(PAGESIZE, KM_SLEEP);
			hp->page = kmem_alloc(PAGESIZE, KM_SLEEP);
		} else if (new->clevel < DUMP_CLEVEL_BZIP2) {
			/* lzjb helpers need both lzbuf and page */
			new->maxsize += 2 * PAGESIZE;
		} else {
			/* bzip2 helpers need only the page-copy buffer */
			new->maxsize += PAGESIZE;
		}
		if (new->clevel >= DUMP_CLEVEL_BZIP2)
			new->maxsize += bz2size;
	}

	new->cbuf = kmem_zalloc(new->ncbuf * sizeof (cbuf_t), KM_SLEEP);
	cpend = &new->cbuf[new->ncbuf];
	for (cp = new->cbuf; cp != cpend; cp++) {
		cp->state = CBUF_FREEBUF;
		cp->size = CBUF_SIZE;
		if (cp < &new->cbuf[MINCBUFS])
			cp->buf = kmem_alloc(cp->size, KM_SLEEP);
		else
			new->maxsize += cp->size;
	}

	/* input (map) buffers: VA only, mapped to spare pages at dump time */
	new->cmap = kmem_zalloc(new->ncmap * sizeof (cbuf_t), KM_SLEEP);
	cpend = &new->cmap[new->ncmap];
	for (cp = new->cmap; cp != cpend; cp++) {
		cp->state = CBUF_FREEMAP;
		cp->size = CBUF_MAPSIZE;
		cp->buf = vmem_xalloc(heap_arena, CBUF_MAPSIZE, CBUF_MAPSIZE,
		    0, 0, NULL, NULL, VM_SLEEP);
	}

	/* reserve VA to be backed with spare pages at crash time */
	if (new->maxsize > 0) {
		new->maxsize = P2ROUNDUP(new->maxsize, PAGESIZE);
		new->maxvmsize = P2ROUNDUP(new->maxsize, CBUF_MAPSIZE);
		new->maxvm = vmem_xalloc(heap_arena, new->maxvmsize,
		    CBUF_MAPSIZE, 0, 0, NULL, NULL, VM_SLEEP);
	}

	/*
	 * Reserve memory for kmem allocation calls made during crash dump. The
	 * hat layer allocates memory for each mapping created, and the I/O path
	 * allocates buffers and data structs.
	 *
	 * On larger systems, we easily exceed the lower amount, so we need some
	 * more space; the cut-over point is relatively arbitrary. If we run
	 * out, the only impact is that kmem state in the dump becomes
	 * inconsistent.
	 */
	if (dump_kmem_pages == 0) {
		if (physmem > (16 * ONE_GIG) / PAGESIZE)
			dump_kmem_pages = 20;
		else
			dump_kmem_pages = 8;
	}

	kmem_dump_init((new->ncmap * dump_kmem_permap) +
	    (dump_kmem_pages * PAGESIZE));

	/* set new config pointers */
	*old = *new;
}
/*
* Define a struct memlist walker to optimize bitnum to pfn
* lookup. The walker maintains the state of the list traversal.
*/
typedef struct dumpmlw {
struct memlist *mp; /* current memlist */
pgcnt_t basenum; /* bitnum base offset */
pgcnt_t mppages; /* current memlist size */
pgcnt_t mpleft; /* size to end of current memlist */
pfn_t mpaddr; /* first pfn in memlist */
} dumpmlw_t;
/* initialize the walker */
static inline void
dump_init_memlist_walker(dumpmlw_t *pw)
{
	/* Position the walker at the head of the installed-memory list. */
	struct memlist *head = phys_install;

	pw->mp = head;
	pw->basenum = 0;
	pw->mppages = head->ml_size >> PAGESHIFT;
	pw->mpleft = pw->mppages;
	pw->mpaddr = head->ml_address >> PAGESHIFT;
}
/*
* Lookup pfn given bitnum. The memlist can be quite long on some
* systems (e.g.: one per board). To optimize sequential lookups, the
* caller initializes and presents a memlist walker.
*/
static pfn_t
dump_bitnum_to_pfn(pgcnt_t bitnum, dumpmlw_t *pw)
{
	/* make bitnum relative to the walker's current memlist segment */
	bitnum -= pw->basenum;

	while (pw->mp != NULL) {
		if (bitnum < pw->mppages) {
			/* record pages remaining to segment end for caller */
			pw->mpleft = pw->mppages - bitnum;
			return (pw->mpaddr + bitnum);
		}
		/* advance walker state past this segment */
		bitnum -= pw->mppages;
		pw->basenum += pw->mppages;
		pw->mp = pw->mp->ml_next;
		if (pw->mp != NULL) {
			pw->mppages = pw->mp->ml_size >> PAGESHIFT;
			pw->mpleft = pw->mppages;
			pw->mpaddr = pw->mp->ml_address >> PAGESHIFT;
		}
	}
	/* bitnum is beyond the end of installed memory */
	return (PFN_INVALID);
}
/*
 * Inverse of dump_bitnum_to_pfn: linear scan of phys_install,
 * accumulating the bit offset of each segment until pfn is found.
 * Returns (pgcnt_t)-1 if pfn is not in any installed segment.
 */
static pgcnt_t
dump_pfn_to_bitnum(pfn_t pfn)
{
	pgcnt_t base = 0;
	struct memlist *ml;

	for (ml = phys_install; ml != NULL; ml = ml->ml_next) {
		pfn_t first = ml->ml_address >> PAGESHIFT;

		if (pfn >= first &&
		    pfn < ((ml->ml_address + ml->ml_size) >> PAGESHIFT))
			return (base + pfn - first);
		base += ml->ml_size >> PAGESHIFT;
	}
	return ((pgcnt_t)-1);
}
/*
* Set/test bitmap for a CBUF_MAPSIZE range which includes pfn. The
* mapping of pfn to range index is imperfect because pfn and bitnum
* do not have the same phase. To make sure a CBUF_MAPSIZE range is
* covered, call this for both ends:
* dump_set_used(base)
* dump_set_used(base+CBUF_MAPNP-1)
*
* This is used during a panic dump to mark pages allocated by
* dumpsys_get_maxmem(). The macro IS_DUMP_PAGE(pp) is used by
* page_get_mnode_freelist() to make sure pages used by dump are never
* allocated.
*/
#define CBUF_MAPP2R(pfn) ((pfn) >> (CBUF_MAPSHIFT - PAGESHIFT))
/* Mark the CBUF_MAPSIZE range containing pfn as used by dump. */
static void
dump_set_used(pfn_t pfn)
{
	pgcnt_t bit, rbit;

	bit = dump_pfn_to_bitnum(pfn);
	ASSERT(bit != (pgcnt_t)-1);
	rbit = CBUF_MAPP2R(bit);
	ASSERT(rbit < dumpcfg.rbitmapsize);
	BT_SET(dumpcfg.rbitmap, rbit);
}
/* Test whether the CBUF_MAPSIZE range containing pfn is used by dump. */
int
dump_test_used(pfn_t pfn)
{
	pgcnt_t bit, rbit;

	bit = dump_pfn_to_bitnum(pfn);
	ASSERT(bit != (pgcnt_t)-1);
	rbit = CBUF_MAPP2R(bit);
	ASSERT(rbit < dumpcfg.rbitmapsize);
	return (BT_TEST(dumpcfg.rbitmap, rbit));
}
/*
* dumpbzalloc and dumpbzfree are callbacks from the bzip2 library.
* dumpsys_get_maxmem() uses them for BZ2_bzCompressInit().
*/
static void *
dumpbzalloc(void *opaque, int items, int size)
{
	size_t *offp;
	char *buf;

	ASSERT(opaque != NULL);

	/*
	 * Bump-allocate out of the pre-reserved maxvm arena; *opaque
	 * tracks the running offset, rounded up for alignment.
	 */
	offp = opaque;
	buf = dumpcfg.maxvm + *offp;
	*offp += items * size;
	*offp = P2ROUNDUP(*offp, BZ2_BZALLOC_ALIGN);
	ASSERT(*offp <= dumpcfg.maxvmsize);

	return (buf);
}
/*ARGSUSED*/
static void
dumpbzfree(void *opaque, void *addr)
{
	/*
	 * Intentionally empty: dumpbzalloc bump-allocates from the
	 * pre-reserved dumpcfg.maxvm arena, so individual frees are
	 * meaningless; the whole arena is reclaimed at reconfigure time.
	 */
}
/*
* Perform additional checks on the page to see if we can really use
* it. The kernel (kas) pages are always set in the bitmap. However,
* boot memory pages (prom_ppages or P_BOOTPAGES) are not in the
* bitmap. So we check for them.
*/
static inline int
dump_pfn_check(pfn_t pfn)
{
	page_t *pp = page_numtopp_nolock(pfn);

	/*
	 * Reject pages that have no page_t, whose page_t is stale
	 * (p_pagenum mismatch), that belong to boot/prom memory
	 * (platform-dependent test), or that are marked toxic
	 * (retired due to memory errors).
	 */
	if (pp == NULL || pp->p_pagenum != pfn ||
#if defined(__sparc)
	    pp->p_vnode == &promvp ||
#else
	    PP_ISBOOTPAGES(pp) ||
#endif
	    pp->p_toxic != 0)
		return (0);

	return (1);
}
/*
* Check a range to see if all contained pages are available and
* return non-zero if the range can be used.
*/
/*
 * Return non-zero iff every page in [start, end) is spare: not marked
 * for dumping in the bitmap and usable per dump_pfn_check().
 */
static inline int
dump_range_check(pgcnt_t start, pgcnt_t end, pfn_t pfn)
{
	pgcnt_t bn;
	pfn_t p;

	for (bn = start, p = pfn; bn < end; bn++, p++) {
		if (BT_TEST(dumpcfg.bitmap, bn) || !dump_pfn_check(p))
			return (0);
	}
	return (1);
}
/*
* dumpsys_get_maxmem() is called during panic. Find unused ranges
* and use them for buffers. If we find enough memory switch to
* parallel bzip2, otherwise use parallel lzjb.
*
* It searches the dump bitmap in 2 passes. The first time it looks
* for CBUF_MAPSIZE ranges. On the second pass it uses small pages.
*/
static void
dumpsys_get_maxmem()
{
dumpcfg_t *cfg = &dumpcfg;
cbuf_t *endcp = &cfg->cbuf[cfg->ncbuf];
helper_t *endhp = &cfg->helper[cfg->nhelper];
pgcnt_t bitnum, end;
size_t sz, endsz, bz2size;
pfn_t pfn, off;
cbuf_t *cp;
helper_t *hp, *ohp;
dumpmlw_t mlw;
int k;
/*
* Setting dump_plat_mincpu to 0 at any time forces a serial
* dump.
*/
if (dump_plat_mincpu == 0) {
cfg->clevel = 0;
return;
}
/*
* There may be no point in looking for spare memory. If
* dumping all memory, then none is spare. If doing a serial
* dump, then already have buffers.
*/
if (cfg->maxsize == 0 || cfg->clevel < DUMP_CLEVEL_LZJB ||
(dump_conflags & DUMP_ALL) != 0) {
if (cfg->clevel > DUMP_CLEVEL_LZJB)
cfg->clevel = DUMP_CLEVEL_LZJB;
return;
}
sz = 0;
cfg->found4m = 0;
cfg->foundsm = 0;
/* bitmap of ranges used to estimate which pfns are being used */
bzero(dumpcfg.rbitmap, BT_SIZEOFMAP(dumpcfg.rbitmapsize));
/* find ranges that are not being dumped to use for buffers */
dump_init_memlist_walker(&mlw);
for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum = end) {
dump_timeleft = dump_timeout;
end = bitnum + CBUF_MAPNP;
pfn = dump_bitnum_to_pfn(bitnum, &mlw);
ASSERT(pfn != PFN_INVALID);
/* skip partial range at end of mem segment */
if (mlw.mpleft < CBUF_MAPNP) {
end = bitnum + mlw.mpleft;
continue;
}
/* skip non aligned pages */
off = P2PHASE(pfn, CBUF_MAPNP);
if (off != 0) {
end -= off;
continue;
}
if (!dump_range_check(bitnum, end, pfn))
continue;
ASSERT((sz + CBUF_MAPSIZE) <= cfg->maxvmsize);
hat_devload(kas.a_hat, cfg->maxvm + sz, CBUF_MAPSIZE, pfn,
PROT_READ | PROT_WRITE, HAT_LOAD_NOCONSIST);
sz += CBUF_MAPSIZE;
cfg->found4m++;
/* set the bitmap for both ends to be sure to cover the range */
dump_set_used(pfn);
dump_set_used(pfn + CBUF_MAPNP - 1);
if (sz >= cfg->maxsize)
goto foundmax;
}
/* Add small pages if we can't find enough large pages. */
dump_init_memlist_walker(&mlw);
for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum = end) {
dump_timeleft = dump_timeout;
end = bitnum + CBUF_MAPNP;
pfn = dump_bitnum_to_pfn(bitnum, &mlw);
ASSERT(pfn != PFN_INVALID);
/* Find any non-aligned pages at start and end of segment. */
off = P2PHASE(pfn, CBUF_MAPNP);
if (mlw.mpleft < CBUF_MAPNP) {
end = bitnum + mlw.mpleft;
} else if (off != 0) {
end -= off;
} else if (cfg->found4m && dump_test_used(pfn)) {
continue;
}
for (; bitnum < end; bitnum++, pfn++) {
dump_timeleft = dump_timeout;
if (BT_TEST(dumpcfg.bitmap, bitnum))
continue;
if (!dump_pfn_check(pfn))
continue;