/
seg_vn.c
10342 lines (9483 loc) · 280 KB
/
seg_vn.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2018 Joyent, Inc.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
/*
* University Copyright- Copyright (c) 1982, 1986, 1988
* The Regents of the University of California
* All Rights Reserved
*
* University Acknowledgment- Portions of this document are derived from
* software developed by the University of California, Berkeley, and its
* contributors.
*/
/*
* VM - shared or copy-on-write from a vnode/anonymous memory.
*/
#include <sys/types.h>
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/debug.h>
#include <sys/cred.h>
#include <sys/vmsystm.h>
#include <sys/tuneable.h>
#include <sys/bitmap.h>
#include <sys/swap.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/callb.h>
#include <sys/vm.h>
#include <sys/dumphdr.h>
#include <sys/lgrp.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_vn.h>
#include <vm/pvn.h>
#include <vm/anon.h>
#include <vm/page.h>
#include <vm/vpage.h>
#include <sys/proc.h>
#include <sys/task.h>
#include <sys/project.h>
#include <sys/zone.h>
#include <sys/shm_impl.h>
/*
* segvn_fault needs a temporary page list array. To avoid calling kmem all
* the time, it creates a small (PVN_GETPAGE_NUM entry) array and uses it if
* it can. In the rare case when this page list is not large enough, it
* goes and gets a large enough array from kmem.
*
* This small page list array covers either 8 pages or 64kB worth of pages -
* whichever is smaller.
*/
#define PVN_MAX_GETPAGE_SZ 0x10000
#define PVN_MAX_GETPAGE_NUM 0x8
#if PVN_MAX_GETPAGE_SZ > PVN_MAX_GETPAGE_NUM * PAGESIZE
#define PVN_GETPAGE_SZ ptob(PVN_MAX_GETPAGE_NUM)
#define PVN_GETPAGE_NUM PVN_MAX_GETPAGE_NUM
#else
#define PVN_GETPAGE_SZ PVN_MAX_GETPAGE_SZ
#define PVN_GETPAGE_NUM btop(PVN_MAX_GETPAGE_SZ)
#endif
/*
* Private seg op routines.
*/
static int segvn_dup(struct seg *seg, struct seg *newseg);
static int segvn_unmap(struct seg *seg, caddr_t addr, size_t len);
static void segvn_free(struct seg *seg);
static faultcode_t segvn_fault(struct hat *hat, struct seg *seg,
caddr_t addr, size_t len, enum fault_type type,
enum seg_rw rw);
static faultcode_t segvn_faulta(struct seg *seg, caddr_t addr);
static int segvn_setprot(struct seg *seg, caddr_t addr,
size_t len, uint_t prot);
static int segvn_checkprot(struct seg *seg, caddr_t addr,
size_t len, uint_t prot);
static int segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta);
static size_t segvn_swapout(struct seg *seg);
static int segvn_sync(struct seg *seg, caddr_t addr, size_t len,
int attr, uint_t flags);
static size_t segvn_incore(struct seg *seg, caddr_t addr, size_t len,
char *vec);
static int segvn_lockop(struct seg *seg, caddr_t addr, size_t len,
int attr, int op, ulong_t *lockmap, size_t pos);
static int segvn_getprot(struct seg *seg, caddr_t addr, size_t len,
uint_t *protv);
static u_offset_t segvn_getoffset(struct seg *seg, caddr_t addr);
static int segvn_gettype(struct seg *seg, caddr_t addr);
static int segvn_getvp(struct seg *seg, caddr_t addr,
struct vnode **vpp);
static int segvn_advise(struct seg *seg, caddr_t addr, size_t len,
uint_t behav);
static void segvn_dump(struct seg *seg);
static int segvn_pagelock(struct seg *seg, caddr_t addr, size_t len,
struct page ***ppp, enum lock_type type, enum seg_rw rw);
static int segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len,
uint_t szc);
static int segvn_getmemid(struct seg *seg, caddr_t addr,
memid_t *memidp);
static lgrp_mem_policy_info_t *segvn_getpolicy(struct seg *, caddr_t);
static int segvn_capable(struct seg *seg, segcapability_t capable);
static int segvn_inherit(struct seg *, caddr_t, size_t, uint_t);
/*
 * Segment-driver operations vector for vnode/anonymous segments.
 * Entries are positional and must match the declaration order of
 * struct seg_ops; each maps a generic segment operation to the
 * corresponding segvn implementation declared above.
 */
struct seg_ops segvn_ops = {
	segvn_dup,		/* dup */
	segvn_unmap,		/* unmap */
	segvn_free,		/* free */
	segvn_fault,		/* fault */
	segvn_faulta,		/* faulta (fault-ahead) */
	segvn_setprot,		/* setprot */
	segvn_checkprot,	/* checkprot */
	segvn_kluster,		/* kluster */
	segvn_swapout,		/* swapout */
	segvn_sync,		/* sync */
	segvn_incore,		/* incore */
	segvn_lockop,		/* lockop */
	segvn_getprot,		/* getprot */
	segvn_getoffset,	/* getoffset */
	segvn_gettype,		/* gettype */
	segvn_getvp,		/* getvp */
	segvn_advise,		/* advise */
	segvn_dump,		/* dump */
	segvn_pagelock,		/* pagelock */
	segvn_setpagesize,	/* setpagesize */
	segvn_getmemid,		/* getmemid */
	segvn_getpolicy,	/* getpolicy */
	segvn_capable,		/* capable */
	segvn_inherit		/* inherit */
};
/*
* Common zfod structures, provided as a shorthand for others to use.
*/
static segvn_crargs_t zfod_segvn_crargs =
SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
static segvn_crargs_t kzfod_segvn_crargs =
SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_USER,
PROT_ALL & ~PROT_USER);
static segvn_crargs_t stack_noexec_crargs =
SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_EXEC, PROT_ALL);
caddr_t zfod_argsp = (caddr_t)&zfod_segvn_crargs; /* user zfod argsp */
caddr_t kzfod_argsp = (caddr_t)&kzfod_segvn_crargs; /* kernel zfod argsp */
caddr_t stack_exec_argsp = (caddr_t)&zfod_segvn_crargs; /* executable stack */
caddr_t stack_noexec_argsp = (caddr_t)&stack_noexec_crargs; /* noexec stack */
#define vpgtob(n) ((n) * sizeof (struct vpage)) /* For brevity */
size_t segvn_comb_thrshld = UINT_MAX; /* patchable -- see 1196681 */
size_t segvn_pglock_comb_thrshld = (1UL << 16); /* 64K */
size_t segvn_pglock_comb_balign = (1UL << 16); /* 64K */
uint_t segvn_pglock_comb_bshift;
size_t segvn_pglock_comb_palign;
static int segvn_concat(struct seg *, struct seg *, int);
static int segvn_extend_prev(struct seg *, struct seg *,
struct segvn_crargs *, size_t);
static int segvn_extend_next(struct seg *, struct seg *,
struct segvn_crargs *, size_t);
static void segvn_softunlock(struct seg *, caddr_t, size_t, enum seg_rw);
static void segvn_pagelist_rele(page_t **);
static void segvn_setvnode_mpss(vnode_t *);
static void segvn_relocate_pages(page_t **, page_t *);
static int segvn_full_szcpages(page_t **, uint_t, int *, uint_t *);
static int segvn_fill_vp_pages(struct segvn_data *, vnode_t *, u_offset_t,
uint_t, page_t **, page_t **, uint_t *, int *);
static faultcode_t segvn_fault_vnodepages(struct hat *, struct seg *, caddr_t,
caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
static faultcode_t segvn_fault_anonpages(struct hat *, struct seg *, caddr_t,
caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
static faultcode_t segvn_faultpage(struct hat *, struct seg *, caddr_t,
u_offset_t, struct vpage *, page_t **, uint_t,
enum fault_type, enum seg_rw, int);
static void segvn_vpage(struct seg *);
static size_t segvn_count_swap_by_vpages(struct seg *);
static void segvn_purge(struct seg *seg);
static int segvn_reclaim(void *, caddr_t, size_t, struct page **,
enum seg_rw, int);
static int shamp_reclaim(void *, caddr_t, size_t, struct page **,
enum seg_rw, int);
static int sameprot(struct seg *, caddr_t, size_t);
static int segvn_demote_range(struct seg *, caddr_t, size_t, int, uint_t);
static int segvn_clrszc(struct seg *);
static struct seg *segvn_split_seg(struct seg *, caddr_t);
static int segvn_claim_pages(struct seg *, struct vpage *, u_offset_t,
ulong_t, uint_t);
static void segvn_hat_rgn_unload_callback(caddr_t, caddr_t, caddr_t,
size_t, void *, u_offset_t);
static struct kmem_cache *segvn_cache;
static struct kmem_cache **segvn_szc_cache;
#ifdef VM_STATS
static struct segvnvmstats_str {
ulong_t fill_vp_pages[31];
ulong_t fltvnpages[49];
ulong_t fullszcpages[10];
ulong_t relocatepages[3];
ulong_t fltanpages[17];
ulong_t pagelock[2];
ulong_t demoterange[3];
} segvnvmstats;
#endif /* VM_STATS */
#define SDR_RANGE 1 /* demote entire range */
#define SDR_END 2 /* demote non aligned ends only */
/*
 * Compute the large-page aligned region [lpgaddr, lpgeaddr) that fully
 * covers [addr, addr + len) within seg, for large pages of size pgsz
 * (pgsz must be a power of two for P2ALIGN/P2ROUNDUP to work).  When
 * len is zero the result is an empty region with lpgaddr == lpgeaddr
 * == addr.  The asserts verify the aligned region stays inside seg.
 */
#define CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr) { \
if ((len) != 0) { \
lpgaddr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz); \
ASSERT(lpgaddr >= (seg)->s_base); \
lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)((addr) + \
(len)), pgsz); \
ASSERT(lpgeaddr > lpgaddr); \
ASSERT(lpgeaddr <= (seg)->s_base + (seg)->s_size); \
} else { \
lpgeaddr = lpgaddr = (addr); \
} \
}
/*
 * kmem cache constructor for segvn_data: initialize the embedded locks
 * and clear the text-replication list linkage.  Runs once per cached
 * object; segvn_cache_destructor undoes it.
 */
/*ARGSUSED*/
static int
segvn_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct segvn_data *sdp = buf;

	rw_init(&sdp->lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&sdp->segfree_syncmtx, NULL, MUTEX_DEFAULT, NULL);
	sdp->svn_trprev = NULL;
	sdp->svn_trnext = NULL;
	return (0);
}
/*
 * kmem cache destructor for segvn_data: tear down the locks set up by
 * segvn_cache_constructor.
 */
/*ARGSUSED1*/
static void
segvn_cache_destructor(void *buf, void *cdrarg)
{
	struct segvn_data *sdp = buf;

	mutex_destroy(&sdp->segfree_syncmtx);
	rw_destroy(&sdp->lock);
}
/*ARGSUSED*/
static int
svntr_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
bzero(buf, sizeof (svntr_t));
return (0);
}
/*
* Patching this variable to non-zero allows the system to run with
* stacks marked as "not executable". It's a bit of a kludge, but is
* provided as a tweakable for platforms that export those ABIs
* (e.g. sparc V8) that have executable stacks enabled by default.
* There are also some restrictions for platforms that don't actually
* implement 'noexec' protections.
*
* Once enabled, the system is (therefore) unable to provide a fully
* ABI-compliant execution environment, though practically speaking,
* most everything works. The exceptions are generally some interpreters
* and debuggers that create executable code on the stack and jump
* into it (without explicitly mprotecting the address range to include
* PROT_EXEC).
*
* One important class of applications that are disabled are those
* that have been transformed into malicious agents using one of the
* numerous "buffer overflow" attacks. See 4007890.
*/
int noexec_user_stack = 0;
int noexec_user_stack_log = 1;
int segvn_lpg_disable = 0;
uint_t segvn_maxpgszc = 0;
ulong_t segvn_vmpss_clrszc_cnt;
ulong_t segvn_vmpss_clrszc_err;
ulong_t segvn_fltvnpages_clrszc_cnt;
ulong_t segvn_fltvnpages_clrszc_err;
ulong_t segvn_setpgsz_align_err;
ulong_t segvn_setpgsz_anon_align_err;
ulong_t segvn_setpgsz_getattr_err;
ulong_t segvn_setpgsz_eof_err;
ulong_t segvn_faultvnmpss_align_err1;
ulong_t segvn_faultvnmpss_align_err2;
ulong_t segvn_faultvnmpss_align_err3;
ulong_t segvn_faultvnmpss_align_err4;
ulong_t segvn_faultvnmpss_align_err5;
ulong_t segvn_vmpss_pageio_deadlk_err;
int segvn_use_regions = 1;
/*
 * Segvn supports a text replication optimization for NUMA platforms. Text
 * replicas are represented by anon maps (amp). There's one amp per text file
 * region per lgroup. A process chooses the amp for each of its text mappings
 * based on the lgroup assignment of its main thread (t_tid = 1). All
 * processes that want a replica on a particular lgroup for the same text file
 * mapping share the same amp. amp's are looked up in the svntr_hashtab hash
 * table with vp,off,size,szc used as a key. Text replication segments are
 * read-only MAP_PRIVATE|MAP_TEXT segments that map a vnode. Replication is
 * achieved by forcing COW faults from the vnode to the amp and mapping amp
 * pages instead of vnode pages. A replication amp is assigned to a segment
 * when it takes its first pagefault. To handle main thread lgroup rehoming,
 * segvn_trasync_thread periodically rechecks whether the process still maps
 * an amp local to the main thread. If not, the async thread forces the
 * process to remap to an amp in the new home lgroup of the main thread. The
 * current text replication implementation only benefits workloads that do
 * most of their work in the main thread of a process, or whose threads all
 * run in the same lgroup. To extend the text replication benefit to other
 * types of multithreaded workloads, further work would be needed in the hat
 * layer to allow the same virtual address in the same hat to simultaneously
 * map different physical addresses (i.e. page table replication would be
 * needed for x86).
*
* amp pages are used instead of vnode pages as long as segment has a very
* simple life cycle. It's created via segvn_create(), handles S_EXEC
* (S_READ) pagefaults and is fully unmapped. If anything more complicated
* happens such as protection is changed, real COW fault happens, pagesize is
* changed, MC_LOCK is requested or segment is partially unmapped we turn off
* text replication by converting the segment back to vnode only segment
* (unmap segment's address range and set svd->amp to NULL).
*
 * The original file can be changed after an amp is inserted into
 * svntr_hashtab. Processes that are launched after the file has been changed
 * can't use the replicas created prior to the file change. To implement this
 * functionality hash entries are timestamped. Replicas can only be used if
 * the current file modification time is the same as the timestamp saved when
 * the hash entry was created. However timestamps alone are not sufficient to
 * detect file modification via mmap(MAP_SHARED) mappings, so we deal with
 * file changes via MAP_SHARED mappings differently. When writable MAP_SHARED
 * mappings are created to vnodes marked as executable we mark all existing
 * replicas for this vnode as not usable for future text mappings. And we
 * don't create new replicas for files that currently have potentially
 * writable MAP_SHARED mappings (i.e. vn_is_mapped(V_WRITE) is true).
*/
#define SEGVN_TEXTREPL_MAXBYTES_FACTOR (20)
size_t segvn_textrepl_max_bytes_factor = SEGVN_TEXTREPL_MAXBYTES_FACTOR;
static ulong_t svntr_hashtab_sz = 512;
static svntr_bucket_t *svntr_hashtab = NULL;
static struct kmem_cache *svntr_cache;
static svntr_stats_t *segvn_textrepl_stats;
static ksema_t segvn_trasync_sem;
int segvn_disable_textrepl = 1;
size_t textrepl_size_thresh = (size_t)-1;
size_t segvn_textrepl_bytes = 0;
size_t segvn_textrepl_max_bytes = 0;
clock_t segvn_update_textrepl_interval = 0;
int segvn_update_tr_time = 10;
int segvn_disable_textrepl_update = 0;
static void segvn_textrepl(struct seg *);
static void segvn_textunrepl(struct seg *, int);
static void segvn_inval_trcache(vnode_t *);
static void segvn_trasync_thread(void);
static void segvn_trupdate_wakeup(void *);
static void segvn_trupdate(void);
static void segvn_trupdate_seg(struct seg *, segvn_data_t *, svntr_t *,
ulong_t);
/*
 * Initialize segvn data structures: the segvn_data kmem cache, per-szc
 * page-list caches for large-page faults, shared-region support, the
 * optional text replication machinery (64-bit only), and the pagelock
 * combining parameters.  Called once at VM subsystem startup.
 */
void
segvn_init(void)
{
	uint_t maxszc;
	uint_t szc;
	size_t pgsz;

	segvn_cache = kmem_cache_create("segvn_cache",
	    sizeof (struct segvn_data), 0,
	    segvn_cache_constructor, segvn_cache_destructor, NULL,
	    NULL, NULL, 0);

	if (segvn_lpg_disable == 0) {
		szc = maxszc = page_num_pagesizes() - 1;
		if (szc == 0) {
			/* only PAGESIZE is supported; no large pages */
			segvn_lpg_disable = 1;
		}
		if (page_get_pagesize(0) != PAGESIZE) {
			panic("segvn_init: bad szc 0");
			/*NOTREACHED*/
		}
		/*
		 * Sanity check each larger size code: must exceed
		 * PAGESIZE and be a power of two.
		 */
		while (szc != 0) {
			pgsz = page_get_pagesize(szc);
			if (pgsz <= PAGESIZE || !IS_P2ALIGNED(pgsz, pgsz)) {
				panic("segvn_init: bad szc %d", szc);
				/*NOTREACHED*/
			}
			szc--;
		}
		if (segvn_maxpgszc == 0 || segvn_maxpgszc > maxszc)
			segvn_maxpgszc = maxszc;
	}

	if (segvn_maxpgszc) {
		segvn_szc_cache = (struct kmem_cache **)kmem_alloc(
		    (segvn_maxpgszc + 1) * sizeof (struct kmem_cache *),
		    KM_SLEEP);
	}

	for (szc = 1; szc <= segvn_maxpgszc; szc++) {
		char str[32];

		/*
		 * Bounded snprintf (rather than sprintf) so the cache
		 * name can never overrun str[] if the format grows.
		 */
		(void) snprintf(str, sizeof (str), "segvn_szc_cache%d", szc);
		segvn_szc_cache[szc] = kmem_cache_create(str,
		    page_get_pagecnt(szc) * sizeof (page_t *), 0,
		    NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);
	}

	if (segvn_use_regions && !hat_supported(HAT_SHARED_REGIONS, NULL))
		segvn_use_regions = 0;

	/*
	 * For now shared regions and text replication segvn support
	 * are mutually exclusive. This is acceptable because
	 * currently significant benefit from text replication was
	 * only observed on AMD64 NUMA platforms (due to relatively
	 * small L2$ size) and currently we don't support shared
	 * regions on x86.
	 */
	if (segvn_use_regions && !segvn_disable_textrepl) {
		segvn_disable_textrepl = 1;
	}

#if defined(_LP64)
	if (lgrp_optimizations() && textrepl_size_thresh != (size_t)-1 &&
	    !segvn_disable_textrepl) {
		ulong_t i;
		size_t hsz = svntr_hashtab_sz * sizeof (svntr_bucket_t);

		svntr_cache = kmem_cache_create("svntr_cache",
		    sizeof (svntr_t), 0, svntr_cache_constructor, NULL,
		    NULL, NULL, NULL, 0);
		svntr_hashtab = kmem_zalloc(hsz, KM_SLEEP);
		for (i = 0; i < svntr_hashtab_sz; i++) {
			mutex_init(&svntr_hashtab[i].tr_lock, NULL,
			    MUTEX_DEFAULT, NULL);
		}
		segvn_textrepl_max_bytes = ptob(physmem) /
		    segvn_textrepl_max_bytes_factor;
		segvn_textrepl_stats = kmem_zalloc(NCPU *
		    sizeof (svntr_stats_t), KM_SLEEP);
		sema_init(&segvn_trasync_sem, 0, NULL, SEMA_DEFAULT, NULL);
		(void) thread_create(NULL, 0, segvn_trasync_thread,
		    NULL, 0, &p0, TS_RUN, minclsyspri);
	}
#endif

	if (!ISP2(segvn_pglock_comb_balign) ||
	    segvn_pglock_comb_balign < PAGESIZE) {
		segvn_pglock_comb_balign = 1UL << 16; /* 64K */
	}
	segvn_pglock_comb_bshift = highbit(segvn_pglock_comb_balign) - 1;
	segvn_pglock_comb_palign = btop(segvn_pglock_comb_balign);
}
#define SEGVN_PAGEIO ((void *)0x1)
#define SEGVN_NOPAGEIO ((void *)0x2)
/*
 * Decide, once per vnode lifetime, whether this vnode can use pageio
 * for MPSS (multiple page size support) and record the verdict in
 * vp->v_mpssdata (SEGVN_PAGEIO or SEGVN_NOPAGEIO).  A null-page
 * VOP_PAGEIO probe returning EINVAL indicates pageio is usable.
 */
static void
segvn_setvnode_mpss(vnode_t *vp)
{
	int err;

	ASSERT(vp->v_mpssdata == NULL ||
	    vp->v_mpssdata == SEGVN_PAGEIO ||
	    vp->v_mpssdata == SEGVN_NOPAGEIO);

	if (vp->v_mpssdata != NULL)
		return;

	if (vn_vmpss_usepageio(vp)) {
		err = VOP_PAGEIO(vp, (page_t *)NULL,
		    (u_offset_t)0, 0, 0, CRED(), NULL);
	} else {
		err = ENOSYS;
	}

	/*
	 * Recheck under v_lock in case another thread raced us here;
	 * v_mpssdata is set at most once so it never changes.
	 */
	mutex_enter(&vp->v_lock);
	if (vp->v_mpssdata == NULL) {
		vp->v_mpssdata = (err == EINVAL) ?
		    SEGVN_PAGEIO : SEGVN_NOPAGEIO;
	}
	mutex_exit(&vp->v_lock);
}
int
segvn_create(struct seg **segpp, void *argsp)
{
struct seg *seg = *segpp;
extern lgrp_mem_policy_t lgrp_mem_default_policy;
struct segvn_crargs *a = (struct segvn_crargs *)argsp;
struct segvn_data *svd;
size_t swresv = 0;
struct cred *cred;
struct anon_map *amp;
int error = 0;
size_t pgsz;
lgrp_mem_policy_t mpolicy = lgrp_mem_default_policy;
int use_rgn = 0;
int trok = 0;
ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
if (a->type != MAP_PRIVATE && a->type != MAP_SHARED) {
panic("segvn_create type");
/*NOTREACHED*/
}
/*
* Check arguments. If a shared anon structure is given then
* it is illegal to also specify a vp.
*/
if (a->amp != NULL && a->vp != NULL) {
panic("segvn_create anon_map");
/*NOTREACHED*/
}
if (a->type == MAP_PRIVATE && (a->flags & MAP_TEXT) &&
a->vp != NULL && a->prot == (PROT_USER | PROT_READ | PROT_EXEC) &&
segvn_use_regions) {
use_rgn = 1;
}
/* MAP_NORESERVE on a MAP_SHARED segment is meaningless. */
if (a->type == MAP_SHARED)
a->flags &= ~MAP_NORESERVE;
if (a->szc != 0) {
if (segvn_lpg_disable != 0 || (a->szc == AS_MAP_NO_LPOOB) ||
(a->amp != NULL && a->type == MAP_PRIVATE) ||
(a->flags & MAP_NORESERVE) || seg->s_as == &kas) {
a->szc = 0;
} else {
if (a->szc > segvn_maxpgszc)
a->szc = segvn_maxpgszc;
pgsz = page_get_pagesize(a->szc);
if (!IS_P2ALIGNED(seg->s_base, pgsz) ||
!IS_P2ALIGNED(seg->s_size, pgsz)) {
a->szc = 0;
} else if (a->vp != NULL) {
if (IS_SWAPFSVP(a->vp) || VN_ISKAS(a->vp)) {
/*
* paranoid check.
* hat_page_demote() is not supported
* on swapfs pages.
*/
a->szc = 0;
} else if (map_addr_vacalign_check(seg->s_base,
a->offset & PAGEMASK)) {
a->szc = 0;
}
} else if (a->amp != NULL) {
pgcnt_t anum = btopr(a->offset);
pgcnt_t pgcnt = page_get_pagecnt(a->szc);
if (!IS_P2ALIGNED(anum, pgcnt)) {
a->szc = 0;
}
}
}
}
/*
* If segment may need private pages, reserve them now.
*/
if (!(a->flags & MAP_NORESERVE) && ((a->vp == NULL && a->amp == NULL) ||
(a->type == MAP_PRIVATE && (a->prot & PROT_WRITE)))) {
if (anon_resv_zone(seg->s_size,
seg->s_as->a_proc->p_zone) == 0)
return (EAGAIN);
swresv = seg->s_size;
TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
seg, swresv, 1);
}
/*
* Reserve any mapping structures that may be required.
*
* Don't do it for segments that may use regions. It's currently a
* noop in the hat implementations anyway.
*/
if (!use_rgn) {
hat_map(seg->s_as->a_hat, seg->s_base, seg->s_size, HAT_MAP);
}
if (a->cred) {
cred = a->cred;
crhold(cred);
} else {
crhold(cred = CRED());
}
/* Inform the vnode of the new mapping */
if (a->vp != NULL) {
error = VOP_ADDMAP(a->vp, a->offset & PAGEMASK,
seg->s_as, seg->s_base, seg->s_size, a->prot,
a->maxprot, a->type, cred, NULL);
if (error) {
if (swresv != 0) {
anon_unresv_zone(swresv,
seg->s_as->a_proc->p_zone);
TRACE_3(TR_FAC_VM, TR_ANON_PROC,
"anon proc:%p %lu %u", seg, swresv, 0);
}
crfree(cred);
if (!use_rgn) {
hat_unload(seg->s_as->a_hat, seg->s_base,
seg->s_size, HAT_UNLOAD_UNMAP);
}
return (error);
}
/*
* svntr_hashtab will be NULL if we support shared regions.
*/
trok = ((a->flags & MAP_TEXT) &&
(seg->s_size > textrepl_size_thresh ||
(a->flags & _MAP_TEXTREPL)) &&
lgrp_optimizations() && svntr_hashtab != NULL &&
a->type == MAP_PRIVATE && swresv == 0 &&
!(a->flags & MAP_NORESERVE) &&
seg->s_as != &kas && a->vp->v_type == VREG);
ASSERT(!trok || !use_rgn);
}
/*
* MAP_NORESERVE mappings don't count towards the VSZ of a process
* until we fault the pages in.
*/
if ((a->vp == NULL || a->vp->v_type != VREG) &&
a->flags & MAP_NORESERVE) {
seg->s_as->a_resvsize -= seg->s_size;
}
/*
* If more than one segment in the address space, and they're adjacent
* virtually, try to concatenate them. Don't concatenate if an
* explicit anon_map structure was supplied (e.g., SystemV shared
* memory) or if we'll use text replication for this segment.
*/
if (a->amp == NULL && !use_rgn && !trok) {
struct seg *pseg, *nseg;
struct segvn_data *psvd, *nsvd;
lgrp_mem_policy_t ppolicy, npolicy;
uint_t lgrp_mem_policy_flags = 0;
/*
* Memory policy flags (lgrp_mem_policy_flags) is valid when
* extending stack/heap segments.
*/
if ((a->vp == NULL) && (a->type == MAP_PRIVATE) &&
!(a->flags & MAP_NORESERVE) && (seg->s_as != &kas)) {
lgrp_mem_policy_flags = a->lgrp_mem_policy_flags;
} else {
/*
* Get policy when not extending it from another segment
*/
mpolicy = lgrp_mem_policy_default(seg->s_size, a->type);
}
/*
* First, try to concatenate the previous and new segments
*/
pseg = AS_SEGPREV(seg->s_as, seg);
if (pseg != NULL &&
pseg->s_base + pseg->s_size == seg->s_base &&
pseg->s_ops == &segvn_ops) {
/*
* Get memory allocation policy from previous segment.
* When extension is specified (e.g. for heap) apply
* this policy to the new segment regardless of the
* outcome of segment concatenation. Extension occurs
* for non-default policy otherwise default policy is
* used and is based on extended segment size.
*/
psvd = (struct segvn_data *)pseg->s_data;
ppolicy = psvd->policy_info.mem_policy;
if (lgrp_mem_policy_flags ==
LGRP_MP_FLAG_EXTEND_UP) {
if (ppolicy != lgrp_mem_default_policy) {
mpolicy = ppolicy;
} else {
mpolicy = lgrp_mem_policy_default(
pseg->s_size + seg->s_size,
a->type);
}
}
if (mpolicy == ppolicy &&
(pseg->s_size + seg->s_size <=
segvn_comb_thrshld || psvd->amp == NULL) &&
segvn_extend_prev(pseg, seg, a, swresv) == 0) {
/*
* success! now try to concatenate
* with following seg
*/
crfree(cred);
nseg = AS_SEGNEXT(pseg->s_as, pseg);
if (nseg != NULL &&
nseg != pseg &&
nseg->s_ops == &segvn_ops &&
pseg->s_base + pseg->s_size ==
nseg->s_base)
(void) segvn_concat(pseg, nseg, 0);
ASSERT(pseg->s_szc == 0 ||
(a->szc == pseg->s_szc &&
IS_P2ALIGNED(pseg->s_base, pgsz) &&
IS_P2ALIGNED(pseg->s_size, pgsz)));
/*
* Communicate out the newly concatenated
* segment as part of the result.
*/
*segpp = pseg;
return (0);
}
}
/*
* Failed, so try to concatenate with following seg
*/
nseg = AS_SEGNEXT(seg->s_as, seg);
if (nseg != NULL &&
seg->s_base + seg->s_size == nseg->s_base &&
nseg->s_ops == &segvn_ops) {
/*
* Get memory allocation policy from next segment.
* When extension is specified (e.g. for stack) apply
* this policy to the new segment regardless of the
* outcome of segment concatenation. Extension occurs
* for non-default policy otherwise default policy is
* used and is based on extended segment size.
*/
nsvd = (struct segvn_data *)nseg->s_data;
npolicy = nsvd->policy_info.mem_policy;
if (lgrp_mem_policy_flags ==
LGRP_MP_FLAG_EXTEND_DOWN) {
if (npolicy != lgrp_mem_default_policy) {
mpolicy = npolicy;
} else {
mpolicy = lgrp_mem_policy_default(
nseg->s_size + seg->s_size,
a->type);
}
}
if (mpolicy == npolicy &&
segvn_extend_next(seg, nseg, a, swresv) == 0) {
crfree(cred);
ASSERT(nseg->s_szc == 0 ||
(a->szc == nseg->s_szc &&
IS_P2ALIGNED(nseg->s_base, pgsz) &&
IS_P2ALIGNED(nseg->s_size, pgsz)));
/*
* Communicate out the newly concatenated
* segment as part of the result.
*/
*segpp = nseg;
return (0);
}
}
}
if (a->vp != NULL) {
VN_HOLD(a->vp);
if (a->type == MAP_SHARED)
lgrp_shm_policy_init(NULL, a->vp);
}
svd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
seg->s_ops = &segvn_ops;
seg->s_data = (void *)svd;
seg->s_szc = a->szc;
svd->seg = seg;
svd->vp = a->vp;
/*
* Anonymous mappings have no backing file so the offset is meaningless.
*/
svd->offset = a->vp ? (a->offset & PAGEMASK) : 0;
svd->prot = a->prot;
svd->maxprot = a->maxprot;
svd->pageprot = 0;
svd->type = a->type;
svd->vpage = NULL;
svd->cred = cred;
svd->advice = MADV_NORMAL;
svd->pageadvice = 0;
svd->flags = (ushort_t)a->flags;
svd->softlockcnt = 0;
svd->softlockcnt_sbase = 0;
svd->softlockcnt_send = 0;
svd->svn_inz = 0;
svd->rcookie = HAT_INVALID_REGION_COOKIE;
svd->pageswap = 0;
if (a->szc != 0 && a->vp != NULL) {
segvn_setvnode_mpss(a->vp);
}
if (svd->type == MAP_SHARED && svd->vp != NULL &&
(svd->vp->v_flag & VVMEXEC) && (svd->prot & PROT_WRITE)) {
ASSERT(vn_is_mapped(svd->vp, V_WRITE));
segvn_inval_trcache(svd->vp);
}
amp = a->amp;
if ((svd->amp = amp) == NULL) {
svd->anon_index = 0;
if (svd->type == MAP_SHARED) {
svd->swresv = 0;
/*
* Shared mappings to a vp need no other setup.
* If we have a shared mapping to an anon_map object
* which hasn't been allocated yet, allocate the
* struct now so that it will be properly shared
* by remembering the swap reservation there.
*/
if (a->vp == NULL) {
svd->amp = anonmap_alloc(seg->s_size, swresv,
ANON_SLEEP);
svd->amp->a_szc = seg->s_szc;
}
} else {
/*
* Private mapping (with or without a vp).
* Allocate anon_map when needed.
*/
svd->swresv = swresv;
}
} else {
pgcnt_t anon_num;
/*
* Mapping to an existing anon_map structure without a vp.
* For now we will insure that the segment size isn't larger
* than the size - offset gives us. Later on we may wish to
* have the anon array dynamically allocated itself so that
* we don't always have to allocate all the anon pointer slots.
* This of course involves adding extra code to check that we
* aren't trying to use an anon pointer slot beyond the end
* of the currently allocated anon array.
*/
if ((amp->size - a->offset) < seg->s_size) {
panic("segvn_create anon_map size");
/*NOTREACHED*/
}
anon_num = btopr(a->offset);
if (a->type == MAP_SHARED) {
/*
* SHARED mapping to a given anon_map.
*/
ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER);
amp->refcnt++;
if (a->szc > amp->a_szc) {
amp->a_szc = a->szc;
}
ANON_LOCK_EXIT(&->a_rwlock);
svd->anon_index = anon_num;
svd->swresv = 0;
} else {
/*
* PRIVATE mapping to a given anon_map.
* Make sure that all the needed anon
* structures are created (so that we will
* share the underlying pages if nothing
* is written by this mapping) and then
* duplicate the anon array as is done
* when a privately mapped segment is dup'ed.
*/
struct anon *ap;
caddr_t addr;
caddr_t eaddr;
ulong_t anon_idx;
int hat_flag = HAT_LOAD;
if (svd->flags & MAP_TEXT) {
hat_flag |= HAT_LOAD_TEXT;
}
svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP);
svd->amp->a_szc = seg->s_szc;
svd->anon_index = 0;
svd->swresv = swresv;
/*
* Prevent 2 threads from allocating anon
* slots simultaneously.
*/
ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER);
eaddr = seg->s_base + seg->s_size;
for (anon_idx = anon_num, addr = seg->s_base;
addr < eaddr; addr += PAGESIZE, anon_idx++) {
page_t *pp;
if ((ap = anon_get_ptr(amp->ahp,
anon_idx)) != NULL)
continue;
/*
* Allocate the anon struct now.
* Might as well load up translation
* to the page while we're at it...
*/
pp = anon_zero(seg, addr, &ap, cred);
if (ap == NULL || pp == NULL) {
panic("segvn_create anon_zero");
/*NOTREACHED*/
}
/*
* Re-acquire the anon_map lock and
* initialize the anon array entry.
*/
ASSERT(anon_get_ptr(amp->ahp,
anon_idx) == NULL);
(void) anon_set_ptr(amp->ahp, anon_idx, ap,
ANON_SLEEP);
ASSERT(seg->s_szc == 0);
ASSERT(!IS_VMODSORT(pp->p_vnode));
ASSERT(use_rgn == 0);
hat_memload(seg->s_as->a_hat, addr, pp,
svd->prot & ~PROT_WRITE, hat_flag);
page_unlock(pp);
}
ASSERT(seg->s_szc == 0);
anon_dup(amp->ahp, anon_num, svd->amp->ahp,
0, seg->s_size);
ANON_LOCK_EXIT(&->a_rwlock);
}
}
/*
* Set default memory allocation policy for segment
*
* Always set policy for private memory at least for initialization
* even if this is a shared memory segment
*/
(void) lgrp_privm_policy_set(mpolicy, &svd->policy_info, seg->s_size);
if (svd->type == MAP_SHARED)
(void) lgrp_shm_policy_set(mpolicy, svd->amp, svd->anon_index,
svd->vp, svd->offset, seg->s_size);