/
lofi.c
3897 lines (3397 loc) · 99.8 KB
/
lofi.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
*
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2016 Andrey Sokolov
* Copyright 2019 Joyent, Inc.
* Copyright 2019 OmniOS Community Edition (OmniOSce) Association.
* Copyright 2021 Toomas Soome <tsoome@me.com>
* Copyright 2023 Oxide Computer Company
*/
/*
* lofi (loopback file) driver - allows you to attach a file to a device,
* which can then be accessed through that device. The simple model is that
* you tell lofi to open a file, and then use the block device you get as
* you would any block device. lofi translates access to the block device
* into I/O on the underlying file. This is mostly useful for
* mounting images of filesystems.
*
* lofi is controlled through /dev/lofictl - this is the only device exported
* during attach, and is instance number 0. lofiadm communicates with lofi
* through ioctls on this device. When a file is attached to lofi, block and
* character devices are exported in /dev/lofi and /dev/rlofi. These devices
* are identified by lofi instance number, and the instance number is also used
* as the name in /dev/lofi.
*
* Virtual disks, or, labeled lofi, implements virtual disk support to
* support partition table and related tools. Such mappings will cause
* block and character devices to be exported in /dev/dsk and /dev/rdsk
* directories.
*
* To support virtual disks, the instance number space is divided into two
* parts: the upper part for the instance number and the lower part for the
* minor number space used to identify partitions and slices. Virtual disk
* support is implemented by stacking the cmlb module. For virtual disks, the
* partition-related ioctl calls are routed to the cmlb module. Compression
* and encryption are not supported for virtual disks.
*
* Mapped devices are tracked with state structures handled with
* ddi_soft_state(9F) for simplicity.
*
* A file attached to lofi is opened when attached and not closed until
* explicitly detached from lofi. This seems more sensible than deferring
* the open until the /dev/lofi device is opened, for a number of reasons.
* One is that any failure is likely to be noticed by the person (or script)
* running lofiadm. Another is that it would be a security problem if the
* file was replaced by another one after being added but before being opened.
*
* The only hard part about lofi is the ioctls. In order to support things
* like 'newfs' on a lofi device, it needs to support certain disk ioctls.
* So it has to fake disk geometry and partition information. More may need
* to be faked if your favorite utility doesn't work and you think it should
* (fdformat doesn't work because it really wants to know the type of floppy
* controller to talk to, and that didn't seem easy to fake. Or possibly even
* necessary, since we have mkfs_pcfs now).
*
* Normally, a lofi device cannot be detached if it is open (i.e. busy). To
* support simulation of hotplug events, an optional force flag is provided.
* If a lofi device is open when a force detach is requested, then the
* underlying file is closed and any subsequent operations return EIO. When the
* device is closed for the last time, it will be cleaned up at that time. In
* addition, the DKIOCSTATE ioctl will return DKIO_DEV_GONE when the device is
* detached but not removed.
*
* If detach was requested and lofi device is not open, we will perform
* unmap and remove the lofi instance.
*
* If the lofi device is open and the li_cleanup is set on ioctl request,
* we set ls_cleanup flag to notify the cleanup is requested, and the
* last lofi_close will perform the unmapping and this lofi instance will be
* removed.
*
* If the lofi device is open and li_force is set on the ioctl request,
* we set the ls_cleanup flag to note that cleanup is requested, and
* we also set ls_vp_closereq to tell IO tasks to return EIO on new
* IO requests; we then wait for the in-process IO count to become 0,
* indicating there are no more IO requests. Since ls_cleanup is set, the
* last lofi_close will perform the unmap and this lofi instance will be
* removed. See also lofi_unmap_file() for details.
*
* Once ls_cleanup is set for the instance, we do not allow lofi_open()
* calls to succeed, so that the last lofi_close() can remove the instance.
*
* Known problems:
*
* UFS logging. Mounting a UFS filesystem image "logging"
* works for basic copy testing but wedges during a build of ON through
* that image. Some deadlock in lufs holding the log mutex and then
* getting stuck on a buf. So for now, don't do that.
*
* Direct I/O. Since the filesystem data is being cached in the buffer
* cache, _and_ again in the underlying filesystem, it's tempting to
* enable direct I/O on the underlying file. Don't, because that deadlocks.
* I think to fix the cache-twice problem we might need filesystem support.
*
* Interesting things to do:
*
* Allow multiple files for each device. A poor-man's metadisk, basically.
*
* Pass-through ioctls on block devices. You can (though it's not
* documented), give lofi a block device as a file name. Then we shouldn't
* need to fake a geometry, however, it may be relevant if you're replacing
* metadisk, or using lofi to get crypto.
* It makes sense to do lofiadm -c aes -a /dev/dsk/c0t0d0s4 /dev/lofi/1
* and then in /etc/vfstab have an entry for /dev/lofi/1 as /export/home.
* In fact this even makes sense if you have lofi "above" metadisk.
*
* Encryption:
* Each lofi device can have its own symmetric key and cipher.
* They are passed to us by lofiadm(8) in the correct format for use
* with the misc/kcf crypto_* routines.
*
* Each block has its own IV, that is calculated in lofi_blk_mech(), based
* on the "master" key held in the lsp and the block number of the buffer.
*/
#include <sys/types.h>
#include <netinet/in.h>
#include <sys/sysmacros.h>
#include <sys/uio.h>
#include <sys/kmem.h>
#include <sys/cred.h>
#include <sys/mman.h>
#include <sys/errno.h>
#include <sys/aio_req.h>
#include <sys/stat.h>
#include <sys/file.h>
#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/debug.h>
#include <sys/vnode.h>
#include <sys/lofi.h>
#include <sys/lofi_impl.h> /* for cache structure */
#include <sys/fcntl.h>
#include <sys/pathname.h>
#include <sys/filio.h>
#include <sys/fdio.h>
#include <sys/open.h>
#include <sys/disp.h>
#include <vm/seg_map.h>
#include <sys/ddi.h>
#include <sys/dkioc_free_util.h>
#include <sys/sunddi.h>
#include <sys/zmod.h>
#include <sys/id_space.h>
#include <sys/mkdev.h>
#include <sys/crypto/common.h>
#include <sys/crypto/api.h>
#include <sys/rctl.h>
#include <sys/vtoc.h>
#include <sys/scsi/scsi.h> /* for DTYPE_DIRECT */
#include <sys/scsi/impl/uscsi.h>
#include <sys/sysevent/dev.h>
#include <sys/efi_partition.h>
#include <LzmaDec.h>
/*
* Property name strings used by this driver.
* NOTE(review): the code that creates or reads these properties is outside
* this excerpt; presumably they are per-device properties published when a
* mapping is created -- confirm at the use sites.
*/
#define NBLOCKS_PROP_NAME "Nblocks"
#define SIZE_PROP_NAME "Size"
#define ZONE_PROP_NAME "zone"
/*
 * Initialise the crypto_data_t "cd" so that it describes the raw buffer
 * "buf" of "len" bytes, starting at offset 0.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement and is safe to use as the body of an unbraced
 * if/else.  Invoke it with a trailing semicolon.  "cd" and "len" are
 * evaluated more than once, so do not pass expressions with side effects.
 */
#define	SETUP_C_DATA(cd, buf, len)			\
	do {						\
		(cd).cd_format = CRYPTO_DATA_RAW;	\
		(cd).cd_offset = 0;			\
		(cd).cd_miscdata = NULL;		\
		(cd).cd_length = (len);			\
		(cd).cd_raw.iov_base = (buf);		\
		(cd).cd_raw.iov_len = (len);		\
	} while (0)
/*
 * Make the *calling* function return EINVAL when the uio's offset or its
 * residual byte count is not a multiple of DEV_BSIZE; otherwise fall
 * through.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement: the
 * bare "if { }" form could capture a following "else" (dangling else) when
 * used inside an unbraced if.  Invoke it with a trailing semicolon.
 */
#define	UIO_CHECK(uio)						\
	do {							\
		if (((uio)->uio_loffset % DEV_BSIZE) != 0 ||	\
		    ((uio)->uio_resid % DEV_BSIZE) != 0) {	\
			return (EINVAL);			\
		}						\
	} while (0)
/*
* Default value of the lofi_timeout tunable below.
* NOTE(review): the unit is not stated in this part of the file --
* presumably seconds; confirm at the use site.
*/
#define LOFI_TIMEOUT 120
int lofi_timeout = LOFI_TIMEOUT;
/* Soft-state anchor: per-instance lofi_state is looked up with ddi_get_soft_state(). */
static void *lofi_statep;
static kmutex_t lofi_lock; /* state lock */
static id_space_t *lofi_id; /* lofi ID values */
/* List of lofi_state structures; walked and modified with lofi_lock held. */
static list_t lofi_list;
/* NOTE(review): presumably the zone key lofi_zone_shutdown() is registered under; confirm. */
static zone_key_t lofi_zone_key;
/*
* Because lofi_taskq_nthreads limits the actual swamping of the device, the
* maxalloc parameter (lofi_taskq_maxalloc) should be tuned conservatively
* high. If we want to be assured that the underlying device is always busy,
* we must be sure that the number of bytes enqueued when the number of
* enqueued tasks exceeds maxalloc is sufficient to keep the device busy for
* the duration of the sleep time in taskq_ent_alloc(). That is, lofi should
* set maxalloc to be the maximum throughput (in bytes per second) of the
* underlying device divided by the minimum I/O size. We assume a realistic
* maximum throughput of one hundred megabytes per second; we set maxalloc on
* the lofi task queue to be 104857600 divided by DEV_BSIZE.
*/
static int lofi_taskq_maxalloc = 104857600 / DEV_BSIZE;
/* Also the number of ls_comp_bufs entries that lofi_destroy() walks and frees. */
static int lofi_taskq_nthreads = 4; /* # of taskq threads per device */
/*
* Initialised from LOFI_CRYPTO_MAGIC.
* NOTE(review): presumably the signature that identifies an encrypted lofi
* image; the code that uses it is outside this excerpt -- confirm.
*/
const char lofi_crypto_magic[6] = LOFI_CRYPTO_MAGIC;
/*
* To avoid decompressing data in a compressed segment multiple times
* when accessing small parts of a segment's data, we cache and reuse
* the uncompressed segment's data.
*
* A single cached segment is sufficient to avoid lots of duplicate
* segment decompress operations. A small cache size also reduces the
* memory footprint.
*
* lofi_max_comp_cache is the maximum number of decompressed data segments
* cached for each compressed lofi image. It can be set to 0 to disable
* caching.
*/
uint32_t lofi_max_comp_cache = 1;
/*
* Decompression routines referenced by lofi_compress_table below; their
* definitions are elsewhere in this file.
*/
static int gzip_decompress(void *src, size_t srclen, void *dst,
size_t *destlen, int level);
static int lzma_decompress(void *src, size_t srclen, void *dst,
size_t *dstlen, int level);
/*
* Table of supported compression algorithms (LOFI_COMPRESS_FUNCTIONS
* entries). Each initializer supplies, in order: the decompress routine,
* a second pointer that is NULL for every entry here, a level, and the
* algorithm name.
* NOTE(review): the second member is presumably the compress routine, which
* the kernel does not need -- confirm against lofi_compress_info_t.
*/
lofi_compress_info_t lofi_compress_table[LOFI_COMPRESS_FUNCTIONS] = {
{gzip_decompress, NULL, 6, "gzip"}, /* default */
{gzip_decompress, NULL, 6, "gzip-6"},
{gzip_decompress, NULL, 9, "gzip-9"},
{lzma_decompress, NULL, 0, "lzma"}
};
/* Forward declarations for routines referenced before their definitions. */
static void lofi_strategy_task(void *);
static int lofi_tg_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t,
size_t, void *);
static int lofi_tg_getinfo(dev_info_t *, int, void *, void *);
/*
* cmlb target operations vector: the version tag followed by the read/write
* and get-info callbacks defined in this file.
*/
struct cmlb_tg_ops lofi_tg_ops = {
TG_DK_OPS_VERSION_1,
lofi_tg_rdwr,
lofi_tg_getinfo
};
/*
* Transfer method selector passed to lofi_rdwr() as its "method" argument
* (alongside a "bcopy_locn" address).
* NOTE(review): the handling of each value lives in lofi_rdwr(), which is not
* fully visible here; presumably RDWR_RAW does I/O on the backing vnode and
* RDWR_BCOPY copies to/from bcopy_locn -- confirm.
*/
typedef enum {
RDWR_RAW,
RDWR_BCOPY
} lofi_rdrw_method_t;
/*
 * Allocation hook stored in g_Alloc: returns "size" bytes obtained with a
 * sleeping kernel allocation.  The first argument is ignored.
 */
static void *
SzAlloc(void *p __unused, size_t size)
{
	void *mem = kmem_alloc(size, KM_SLEEP);

	return (mem);
}

/*
 * Release hook stored in g_Alloc: hands "address" (of "size" bytes) back
 * to the kernel allocator.  The first argument is ignored.
 */
static void
SzFree(void *p __unused, void *address, size_t size)
{
	kmem_free(address, size);
}

/*
 * Allocator vtable built from the two hooks above.
 * NOTE(review): presumably handed to the LZMA decoder (LzmaDec.h); the use
 * site is outside this excerpt -- confirm.
 */
static ISzAlloc g_Alloc = { SzAlloc, SzFree };
/*
 * Drain lsp->ls_comp_cache: for every cached uncompressed segment, free
 * its data buffer (ls_uncomp_seg_sz bytes) and the list entry itself, and
 * drop ls_comp_cache_count accordingly.  The count must be zero once the
 * list is empty.
 */
static void
lofi_free_comp_cache(struct lofi_state *lsp)
{
	struct lofi_comp_cache *entry;

	for (entry = list_remove_head(&lsp->ls_comp_cache); entry != NULL;
	    entry = list_remove_head(&lsp->ls_comp_cache)) {
		kmem_free(entry->lc_data, lsp->ls_uncomp_seg_sz);
		kmem_free(entry, sizeof (*entry));
		lsp->ls_comp_cache_count--;
	}

	ASSERT(lsp->ls_comp_cache_count == 0);
}
/*
 * Report whether the mapping still has any open reference.
 *
 * Returns 1 if any layered open count (ls_open_lyr[]) or any regular open
 * bitmap (ls_open_reg[]) is non-zero, 0 otherwise.  The caller must hold
 * lofi_lock.
 */
static int
is_opened(struct lofi_state *lsp)
{
	int idx;

	ASSERT(MUTEX_HELD(&lofi_lock));

	/* layered opens are counted per partition */
	for (idx = 0; idx < LOFI_PART_MAX; idx++) {
		if (lsp->ls_open_lyr[idx])
			return (1);
	}

	/* regular opens are kept as one bitmap per open type */
	for (idx = 0; idx < OTYP_LYR; idx++) {
		if (lsp->ls_open_reg[idx])
			return (1);
	}

	return (0);
}
/*
* Mark the mapping for deferred removal. Once ls_cleanup is set, lofi_open()
* fails with ENXIO and the last lofi_close() tears the instance down.
* Waiters on ls_vp_cv are woken so they can observe the change.
* The caller must hold lofi_lock.
*/
static void
lofi_set_cleanup(struct lofi_state *lsp)
{
ASSERT(MUTEX_HELD(&lofi_lock));
lsp->ls_cleanup = B_TRUE;
/* wake up any threads waiting on dkiocstate */
cv_broadcast(&lsp->ls_vp_cv);
}
static void
lofi_free_crypto(struct lofi_state *lsp)
{
ASSERT(MUTEX_HELD(&lofi_lock));
if (lsp->ls_crypto_enabled) {
/*
* Clean up the crypto state so that it doesn't hang around
* in memory after we are done with it.
*/
if (lsp->ls_key.ck_data != NULL) {
bzero(lsp->ls_key.ck_data,
CRYPTO_BITS2BYTES(lsp->ls_key.ck_length));
kmem_free(lsp->ls_key.ck_data,
CRYPTO_BITS2BYTES(lsp->ls_key.ck_length));
lsp->ls_key.ck_data = NULL;
lsp->ls_key.ck_length = 0;
}
if (lsp->ls_mech.cm_param != NULL) {
kmem_free(lsp->ls_mech.cm_param,
lsp->ls_mech.cm_param_len);
lsp->ls_mech.cm_param = NULL;
lsp->ls_mech.cm_param_len = 0;
}
if (lsp->ls_iv_mech.cm_param != NULL) {
kmem_free(lsp->ls_iv_mech.cm_param,
lsp->ls_iv_mech.cm_param_len);
lsp->ls_iv_mech.cm_param = NULL;
lsp->ls_iv_mech.cm_param_len = 0;
}
mutex_destroy(&lsp->ls_crypto_lock);
}
}
/*
 * cmlb read/write callback (see lofi_tg_ops).
 *
 * Builds a raw buf describing "length" bytes at block "start" in
 * "bufaddr", hands it to lofi_strategy_task() on the mapping's taskq and
 * waits for completion.
 *
 * Returns 0 on success; ENXIO for the control instance (0) or an unknown
 * instance; EINVAL for a command other than TG_READ/TG_WRITE or a length
 * that is not a whole number of logical blocks; EROFS for a write to a
 * read-only mapping; otherwise the error recorded in the buf.
 */
static int
lofi_tg_rdwr(dev_info_t *dip, uchar_t cmd, void *bufaddr, diskaddr_t start,
    size_t length, void *tg_cookie __unused)
{
	struct lofi_state *lsp;
	buf_t *rbuf;
	int inst;
	int error = 0;

	inst = ddi_get_instance(dip);
	if (inst == 0)	/* control node does not have disk */
		return (ENXIO);

	if ((lsp = ddi_get_soft_state(lofi_statep, inst)) == NULL)
		return (ENXIO);

	if (!(cmd == TG_READ || cmd == TG_WRITE))
		return (EINVAL);

	/*
	 * Make sure the mapping is set up by checking lsp->ls_vp_ready.
	 */
	mutex_enter(&lsp->ls_vp_lock);
	while (!lsp->ls_vp_ready)
		cv_wait(&lsp->ls_vp_cv, &lsp->ls_vp_lock);
	mutex_exit(&lsp->ls_vp_lock);

	/* We can only transfer whole blocks at a time! */
	if (P2PHASE(length, (1U << lsp->ls_lbshift)) != 0)
		return (EINVAL);

	rbuf = getrbuf(KM_SLEEP);

	if (cmd != TG_READ) {
		if (lsp->ls_readonly == B_TRUE) {
			freerbuf(rbuf);
			return (EROFS);
		}
		rbuf->b_flags = B_WRITE;
	} else {
		rbuf->b_flags = B_READ;
	}

	rbuf->b_un.b_addr = bufaddr;
	rbuf->b_bcount = length;
	rbuf->b_lblkno = start;
	rbuf->b_private = NULL;
	rbuf->b_edev = lsp->ls_dev;

	/* account for the request entering the wait queue */
	if (lsp->ls_kstat != NULL) {
		mutex_enter(lsp->ls_kstat->ks_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(lsp->ls_kstat));
		mutex_exit(lsp->ls_kstat->ks_lock);
	}

	(void) taskq_dispatch(lsp->ls_taskq, lofi_strategy_task, rbuf,
	    KM_SLEEP);
	(void) biowait(rbuf);
	error = geterror(rbuf);
	freerbuf(rbuf);

	return (error);
}
/*
 * Get device geometry info for cmlb.
 *
 * We have mapped disk image as virtual block device and have to report
 * physical/virtual geometry to cmlb.
 *
 * So we have two principal cases:
 * 1. Uninitialised image without any existing labels,
 *    for this case we fabricate the data based on mapped image.
 * 2. Image with existing label information.
 *    Since we have no information how the image was created (it may be
 *    dump from some physical device), we need to rely on label information
 *    from image, or we get "corrupted label" errors.
 *    NOTE: label can be MBR, MBR+SMI, GPT
 *
 * Arguments:
 *   dip        devinfo node of the lofi instance being queried
 *   cmd        TG_GETPHYGEOM, TG_GETCAPACITY, TG_GETBLOCKSIZE or TG_GETATTR
 *   arg        output buffer; its type depends on cmd (cmlb_geom_t,
 *              diskaddr_t, uint32_t or tg_attribute_t respectively)
 *   tg_cookie  when non-NULL and the mapping is not ready yet, the call
 *              fails with ENXIO instead of blocking (see below)
 *
 * Returns 0 on success, ENXIO for the control instance, an unknown
 * instance or a not-yet-ready mapping queried with a cookie, and EINVAL
 * for an unrecognised cmd.
 *
 * tg_cookie is read by this function, so it is deliberately not annotated
 * __unused.
 */
static int
lofi_tg_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
{
	struct lofi_state *lsp;
	int instance;
	int ashift;

	instance = ddi_get_instance(dip);
	if (instance == 0)		/* control device has no storage */
		return (ENXIO);

	lsp = ddi_get_soft_state(lofi_statep, instance);
	if (lsp == NULL)
		return (ENXIO);

	/*
	 * Make sure the mapping is set up by checking lsp->ls_vp_ready.
	 *
	 * When mapping is created, new lofi instance is created and
	 * lofi_attach() will call cmlb_attach() as part of the procedure
	 * to set the mapping up. This chain of events will happen in
	 * the same thread.
	 * Since cmlb_attach() will call lofi_tg_getinfo to get
	 * capacity, we return error on that call if cookie is set,
	 * otherwise lofi_attach will be stuck as the mapping is not yet
	 * finalized and lofi is not yet ready.
	 * Note, such error is not fatal for cmlb, as the label setup
	 * will be finalized when cmlb_validate() is called.
	 */
	mutex_enter(&lsp->ls_vp_lock);
	if (tg_cookie != NULL && lsp->ls_vp_ready == B_FALSE) {
		mutex_exit(&lsp->ls_vp_lock);
		return (ENXIO);
	}
	while (lsp->ls_vp_ready == B_FALSE)
		cv_wait(&lsp->ls_vp_cv, &lsp->ls_vp_lock);
	mutex_exit(&lsp->ls_vp_lock);

	/* log2 of the logical block size */
	ashift = lsp->ls_lbshift;

	switch (cmd) {
	case TG_GETPHYGEOM: {
		cmlb_geom_t *geomp = arg;

		/* capacity in logical blocks, excluding the crypto offset */
		geomp->g_capacity =
		    (lsp->ls_vp_size - lsp->ls_crypto_offset) >> ashift;
		geomp->g_nsect = lsp->ls_dkg.dkg_nsect;
		geomp->g_nhead = lsp->ls_dkg.dkg_nhead;
		geomp->g_acyl = lsp->ls_dkg.dkg_acyl;
		geomp->g_ncyl = lsp->ls_dkg.dkg_ncyl;
		geomp->g_secsize = (1U << ashift);
		geomp->g_intrlv = lsp->ls_dkg.dkg_intrlv;
		geomp->g_rpm = lsp->ls_dkg.dkg_rpm;
		return (0);
	}

	case TG_GETCAPACITY:
		*(diskaddr_t *)arg =
		    (lsp->ls_vp_size - lsp->ls_crypto_offset) >> ashift;
		return (0);

	case TG_GETBLOCKSIZE:
		*(uint32_t *)arg = (1U << ashift);
		return (0);

	case TG_GETATTR: {
		tg_attribute_t *tgattr = arg;

		tgattr->media_is_writable = !lsp->ls_readonly;
		tgattr->media_is_solid_state = B_FALSE;
		tgattr->media_is_rotational = B_FALSE;
		return (0);
	}

	default:
		return (EINVAL);
	}
}
/*
 * Taskq callback dispatched by lofi_destroy(): offline and remove the
 * device instance behind "arg" (a struct lofi_state *), then give its ID
 * back to the lofi_id space.
 *
 * ndi_devi_offline() is retried until it succeeds; between attempts
 * lofi_lock is dropped and the thread sleeps for one second.
 */
static void
lofi_teardown_task(void *arg)
{
	struct lofi_state *lsp = arg;
	int id = LOFI_MINOR2ID(getminor(lsp->ls_dev));

	mutex_enter(&lofi_lock);
	for (;;) {
		if (ndi_devi_offline(lsp->ls_dip, NDI_DEVI_REMOVE) ==
		    NDI_SUCCESS)
			break;

		/* do a sleeping wait for one second, without the lock */
		mutex_exit(&lofi_lock);
		delay(drv_usectohz(MICROSEC));
		mutex_enter(&lofi_lock);
	}
	id_free(lofi_id, id);
	mutex_exit(&lofi_lock);
}
/*
* Release everything a mapping holds and schedule removal of its device
* instance.
*
* In order: destroy the taskq, unlink lsp from lofi_list, free crypto state,
* free the per-thread compression buffers, flush/close/release the backing
* vnode(s), delete the kstat, free the decompression cache and index data,
* drop the zone rctl count and zone reference, destroy the locks and cv, and
* finally dispatch lofi_teardown_task() to offline the instance.
*
* lsp: mapping to destroy. credp: credentials passed to VOP_PUTPAGE() and
* VOP_CLOSE(). The caller must hold lofi_lock.
*/
static void
lofi_destroy(struct lofi_state *lsp, cred_t *credp)
{
/* id is only consulted by the ASSERT near the end of this function */
int id = LOFI_MINOR2ID(getminor(lsp->ls_dev));
int i;
ASSERT(MUTEX_HELD(&lofi_lock));
/*
* Before we can start to release the other resources,
* make sure we have all tasks completed and taskq removed.
*/
if (lsp->ls_taskq != NULL) {
taskq_destroy(lsp->ls_taskq);
lsp->ls_taskq = NULL;
}
list_remove(&lofi_list, lsp);
lofi_free_crypto(lsp);
/*
* Free pre-allocated compressed buffers
*/
if (lsp->ls_comp_bufs != NULL) {
/* one buffer slot per taskq thread; only slots with bufsize > 0 own memory */
for (i = 0; i < lofi_taskq_nthreads; i++) {
if (lsp->ls_comp_bufs[i].bufsize > 0)
kmem_free(lsp->ls_comp_bufs[i].buf,
lsp->ls_comp_bufs[i].bufsize);
}
kmem_free(lsp->ls_comp_bufs,
sizeof (struct compbuf) * lofi_taskq_nthreads);
}
/* ls_vp may already be NULL here; lofi_open()/lofi_close() also test for that */
if (lsp->ls_vp != NULL) {
(void) VOP_PUTPAGE(lsp->ls_vp, 0, 0, B_FREE, credp, NULL);
(void) VOP_CLOSE(lsp->ls_vp, lsp->ls_openflag,
1, 0, credp, NULL);
VN_RELE(lsp->ls_vp);
}
/* the stacked vnode carries its own hold only when it differs from ls_vp */
if (lsp->ls_stacked_vp != lsp->ls_vp)
VN_RELE(lsp->ls_stacked_vp);
lsp->ls_vp = lsp->ls_stacked_vp = NULL;
if (lsp->ls_kstat != NULL) {
kstat_delete(lsp->ls_kstat);
lsp->ls_kstat = NULL;
}
/*
* Free cached decompressed segment data
*/
lofi_free_comp_cache(lsp);
list_destroy(&lsp->ls_comp_cache);
/* ls_uncomp_seg_sz must stay intact until the cache above has been freed */
if (lsp->ls_uncomp_seg_sz > 0) {
kmem_free(lsp->ls_comp_index_data, lsp->ls_comp_index_data_sz);
lsp->ls_uncomp_seg_sz = 0;
}
rctl_decr_lofi(lsp->ls_zone.zref_zone, 1);
zone_rele_ref(&lsp->ls_zone, ZONE_REF_LOFI);
mutex_destroy(&lsp->ls_comp_cache_lock);
mutex_destroy(&lsp->ls_comp_bufs_lock);
mutex_destroy(&lsp->ls_kstat_lock);
mutex_destroy(&lsp->ls_vp_lock);
cv_destroy(&lsp->ls_vp_cv);
lsp->ls_vp_ready = B_FALSE;
lsp->ls_vp_closereq = B_FALSE;
ASSERT(ddi_get_soft_state(lofi_statep, id) == lsp);
/*
* Instance state is allocated in lofi_attach() and freed in
* lofi_detach(). New instance is created when we create new mapping.
* Instance removal is performed by unmap ioctl on lofi control
* instance (0).
*
* If the unmap is performed with instance which is still in use,
* we either cancel unmap with error or we can perform delayed unmap
* by blocking all IO, waiting the consumers to close access to this
* instance and once there are no more consumers, complete the unmap.
*
* Delayed unmap will trigger instance removal on last lofi_close(),
* but we can not remove device instance while the instance is still
* in use due to lofi_close() is running.
* Spawn task to complete device instance offlining in separate thread.
*/
(void) taskq_dispatch(system_taskq, lofi_teardown_task, lsp, KM_SLEEP);
}
/*
* Undo the device-visible parts of a mapping: invalidate, detach and free
* the cmlb handle if one exists, then remove the dip's properties and its
* minor nodes (a NULL name is passed to ddi_remove_minor_node(); per
* ddi_remove_minor_node(9F) that removes every minor node of the dip).
* The caller must hold lofi_lock.
*/
static void
lofi_free_dev(struct lofi_state *lsp)
{
ASSERT(MUTEX_HELD(&lofi_lock));
if (lsp->ls_cmlbhandle != NULL) {
cmlb_invalidate(lsp->ls_cmlbhandle, 0);
cmlb_detach(lsp->ls_cmlbhandle, 0);
cmlb_free_handle(&lsp->ls_cmlbhandle);
lsp->ls_cmlbhandle = NULL;
}
(void) ddi_prop_remove_all(lsp->ls_dip);
ddi_remove_minor_node(lsp->ls_dip, NULL);
}
/*
 * Zone shutdown callback: dispose of every mapping that belongs to the
 * zone identified by "zoneid".
 *
 * Mappings that are not open are torn down immediately with kcred.
 * Mappings that are still open only get ls_cleanup set, so that the last
 * close destroys them.
 */
static void
lofi_zone_shutdown(zoneid_t zoneid, void *arg __unused)
{
	struct lofi_state *cur;
	struct lofi_state *following;

	mutex_enter(&lofi_lock);

	cur = list_head(&lofi_list);
	while (cur != NULL) {
		/*
		 * lofi_destroy() unlinks the entry from lofi_list, so pick
		 * up the successor before touching the current one.
		 */
		following = list_next(&lofi_list, cur);

		if (cur->ls_zone.zref_zone->zone_id == zoneid) {
			/*
			 * No in-zone processes are running, but something
			 * may still have this open: either a global zone
			 * process or a lofi mount.  In that case we set
			 * ls_cleanup so the last user destroys the device.
			 */
			if (!is_opened(cur)) {
				lofi_free_dev(cur);
				lofi_destroy(cur, kcred);
			} else {
				lofi_set_cleanup(cur);
			}
		}

		cur = following;
	}

	mutex_exit(&lofi_lock);
}
/*
 * open(9E) entry point.
 *
 * devp   device being opened; its minor number encodes the lofi ID and the
 *        partition
 * flag   open flags; FNDELAY/FNONBLOCK, FWRITE and FEXCL are examined
 * otyp   open type; must be in the range [0, OTYPCNT)
 * credp  unused
 *
 * Returns 0 on success (always for the control device, ID 0), or:
 *   EINVAL  bad otyp, no such mapping, or the caller already holds
 *           lofi_lock
 *   ENXIO   the mapping is being cleaned up, has no backing vnode, or its
 *           label/partition is not usable for this kind of open
 *   EROFS   write open of a read-only mapping
 *   EBUSY   conflict with an existing or requested exclusive open
 *
 * The partition bitmask is built with a 64-bit shift: "mask" and the
 * bitmaps it is applied to are 64 bits wide, and shifting a 32-bit 1U by
 * 32 or more would be undefined behaviour.
 */
static int
lofi_open(dev_t *devp, int flag, int otyp, struct cred *credp __unused)
{
	int id;
	int error = 0;
	minor_t part;
	uint64_t mask;
	diskaddr_t nblks;
	diskaddr_t lba;
	boolean_t ndelay;
	struct lofi_state *lsp;

	/* otyp indexes ls_open_reg[] below, so reject both ends of the range */
	if (otyp < 0 || otyp >= OTYPCNT)
		return (EINVAL);

	ndelay = (flag & (FNDELAY | FNONBLOCK)) ? B_TRUE : B_FALSE;

	/*
	 * lofiadm -a /dev/lofi/1 gets us here.
	 */
	if (mutex_owner(&lofi_lock) == curthread)
		return (EINVAL);

	mutex_enter(&lofi_lock);

	id = LOFI_MINOR2ID(getminor(*devp));
	part = LOFI_PART(getminor(*devp));
	mask = (uint64_t)1 << part;

	/* master control device */
	if (id == 0)
		goto out;

	/* otherwise, the mapping should already exist */
	lsp = ddi_get_soft_state(lofi_statep, id);
	if (lsp == NULL) {
		error = EINVAL;
		goto out;
	}

	/* no new opens once cleanup was requested or the file is gone */
	if (lsp->ls_cleanup == B_TRUE || lsp->ls_vp == NULL) {
		error = ENXIO;
		goto out;
	}

	if (lsp->ls_readonly && (flag & FWRITE)) {
		error = EROFS;
		goto out;
	}

	/* somebody else holds this partition exclusively */
	if ((lsp->ls_open_excl & mask) != 0) {
		error = EBUSY;
		goto out;
	}

	/* an exclusive open requires that nobody has the partition open */
	if (flag & FEXCL) {
		if (lsp->ls_open_lyr[part]) {
			error = EBUSY;
			goto out;
		}
		for (int i = 0; i < OTYP_LYR; i++) {
			if (lsp->ls_open_reg[i] & mask) {
				error = EBUSY;
				goto out;
			}
		}
	}

	if (lsp->ls_cmlbhandle != NULL) {
		if (cmlb_validate(lsp->ls_cmlbhandle, 0, 0) != 0) {
			/*
			 * non-blocking opens are allowed to succeed to
			 * support format and fdisk to create partitioning.
			 */
			if (!ndelay) {
				error = ENXIO;
				goto out;
			}
		} else if (cmlb_partinfo(lsp->ls_cmlbhandle, part, &nblks,
		    &lba, NULL, NULL, 0) == 0) {
			/*
			 * An empty partition may only be opened as a
			 * non-blocking character device.
			 */
			if ((!nblks) && ((!ndelay) || (otyp != OTYP_CHR))) {
				error = ENXIO;
				goto out;
			}
		} else if (!ndelay) {
			error = ENXIO;
			goto out;
		}
	}

	/* record the open: layered opens are counted, others are a bitmap */
	if (otyp == OTYP_LYR) {
		lsp->ls_open_lyr[part]++;
	} else {
		lsp->ls_open_reg[otyp] |= mask;
	}
	if (flag & FEXCL) {
		lsp->ls_open_excl |= mask;
	}

out:
	mutex_exit(&lofi_lock);
	return (error);
}
/*
 * close(9E) entry point.
 *
 * dev    device being closed; its minor number encodes the lofi ID and the
 *        partition
 * flag   unused
 * otyp   open type the device was opened with
 * credp  credentials handed to lofi_destroy() if this close triggers the
 *        teardown
 *
 * Returns 0 on success, EINVAL if there is no state for the ID or otyp is
 * out of range.
 *
 * The partition bitmask is built with a 64-bit shift: "mask" and the
 * bitmaps it is applied to are 64 bits wide, and shifting a 32-bit 1U by
 * 32 or more would be undefined behaviour.
 */
static int
lofi_close(dev_t dev, int flag __unused, int otyp, struct cred *credp)
{
	minor_t part;
	int id;
	uint64_t mask;
	struct lofi_state *lsp;

	id = LOFI_MINOR2ID(getminor(dev));
	part = LOFI_PART(getminor(dev));
	mask = (uint64_t)1 << part;

	mutex_enter(&lofi_lock);
	lsp = ddi_get_soft_state(lofi_statep, id);
	if (lsp == NULL) {
		mutex_exit(&lofi_lock);
		return (EINVAL);
	}

	/* the control device keeps no open bookkeeping */
	if (id == 0) {
		mutex_exit(&lofi_lock);
		return (0);
	}

	/*
	 * otyp indexes ls_open_reg[] below; apply the same range check as
	 * lofi_open() rather than writing outside the array.
	 */
	if (otyp < 0 || otyp >= OTYPCNT) {
		mutex_exit(&lofi_lock);
		return (EINVAL);
	}

	/* drop any exclusive claim on this partition */
	lsp->ls_open_excl &= ~mask;

	if (otyp == OTYP_LYR) {
		lsp->ls_open_lyr[part]--;
	} else {
		lsp->ls_open_reg[otyp] &= ~mask;
	}

	/*
	 * If we forcibly closed the underlying device (li_force), or
	 * asked for cleanup (li_cleanup), finish up if we're the last
	 * out of the door.
	 */
	if (!is_opened(lsp) &&
	    (lsp->ls_cleanup == B_TRUE || lsp->ls_vp == NULL)) {
		lofi_free_dev(lsp);
		lofi_destroy(lsp, credp);
	}

	mutex_exit(&lofi_lock);
	return (0);
}
/*
 * Sets the mechanism's initialization vector (IV) if one is needed.
 * The IV is computed from the data block number. lsp->ls_mech is
 * altered so that:
 *	lsp->ls_mech.cm_param_len is set to the IV len.
 *	lsp->ls_mech.cm_param is set to the IV.
 *
 * lsp     mapping whose ls_mech is updated; ls_crypto_lock must be held
 * lblkno  block number the IV is derived from
 *
 * Returns CRYPTO_SUCCESS on success (including when the IV type is
 * IVM_NONE and nothing needs to be done), CRYPTO_DEVICE_ERROR when lsp is
 * NULL, or the error returned by crypto_encrypt() while generating the IV;
 * in the latter case the previous IV in ls_mech is left in place.
 */
static int
lofi_blk_mech(struct lofi_state *lsp, longlong_t lblkno)
{
	int ret;
	crypto_data_t cdata;
	char *iv;
	size_t iv_len;
	size_t min;
	void *data;
	size_t datasz;

	/* check lsp first: the ASSERT below dereferences it */
	if (lsp == NULL)
		return (CRYPTO_DEVICE_ERROR);

	ASSERT(MUTEX_HELD(&lsp->ls_crypto_lock));

	/* lsp->ls_mech.cm_param{_len} has already been set for static iv */
	if (lsp->ls_iv_type == IVM_NONE) {
		return (CRYPTO_SUCCESS);
	}

	/*
	 * if kmem already alloced from previous call and it's the same size
	 * we need now, just recycle it; allocate new kmem only if we have to
	 */
	if (lsp->ls_mech.cm_param == NULL ||
	    lsp->ls_mech.cm_param_len != lsp->ls_iv_len) {
		iv_len = lsp->ls_iv_len;
		iv = kmem_zalloc(iv_len, KM_SLEEP);
	} else {
		iv_len = lsp->ls_mech.cm_param_len;
		iv = lsp->ls_mech.cm_param;
		bzero(iv, iv_len);
	}

	switch (lsp->ls_iv_type) {
	case IVM_ENC_BLKNO:
		/* iv is not static, lblkno changes each time */
		data = &lblkno;
		datasz = sizeof (lblkno);
		break;
	default:
		data = NULL;
		datasz = 0;
		break;
	}

	/*
	 * write blkno into the iv buffer padded on the left in case
	 * blkno ever grows bigger than its current longlong_t size
	 * or a variation other than blkno is used for the iv data
	 */
	min = MIN(datasz, iv_len);
	if (min != 0)
		bcopy(data, iv + (iv_len - min), min);

	/* encrypt the data in-place to get the IV */
	SETUP_C_DATA(cdata, iv, iv_len);

	ret = crypto_encrypt(&lsp->ls_iv_mech, &cdata, &lsp->ls_key,
	    NULL, NULL, NULL);
	if (ret != CRYPTO_SUCCESS) {
		cmn_err(CE_WARN, "failed to create iv for block %lld: (0x%x)",
		    lblkno, ret);
		/* only free the buffer if it was allocated by this call */
		if (lsp->ls_mech.cm_param != iv)
			kmem_free(iv, iv_len);

		return (ret);
	}

	/* clean up the iv from the last computation */
	if (lsp->ls_mech.cm_param != NULL && lsp->ls_mech.cm_param != iv)
		kmem_free(lsp->ls_mech.cm_param, lsp->ls_mech.cm_param_len);

	lsp->ls_mech.cm_param_len = iv_len;
	lsp->ls_mech.cm_param = iv;

	return (CRYPTO_SUCCESS);
}
/*
 * Performs encryption and decryption of a chunk of data of size "len",
 * one DEV_BSIZE block at a time. "len" is assumed to be a multiple of
 * DEV_BSIZE.
 *
 * lsp         mapping supplying the key, mechanism and ls_crypto_lock
 * bp          buf whose b_lblkno is the block number of the first block
 * plaintext   input buffer (also the output for in-place operation)
 * ciphertext  output buffer, or NULL for in-place operation
 * len         number of bytes to process
 * op_encrypt  B_TRUE to encrypt, B_FALSE to decrypt
 *
 * Returns CRYPTO_SUCCESS, or the first error from lofi_blk_mech(),
 * crypto_encrypt() or crypto_decrypt().  On error a warning is logged that
 * names the block that failed: the block number is only advanced after a
 * block has been processed successfully.
 */
static int
lofi_crypto(struct lofi_state *lsp, struct buf *bp, caddr_t plaintext,
    caddr_t ciphertext, size_t len, boolean_t op_encrypt)
{
	crypto_data_t cdata;
	crypto_data_t wdata;
	crypto_data_t *outp = NULL;
	int ret;
	longlong_t lblkno = bp->b_lblkno;

	mutex_enter(&lsp->ls_crypto_lock);

	/*
	 * though we could encrypt/decrypt entire "len" chunk of data, we need
	 * to break it into DEV_BSIZE pieces to capture blkno incrementing
	 */
	SETUP_C_DATA(cdata, plaintext, len);
	cdata.cd_length = DEV_BSIZE;
	if (ciphertext != NULL) {		/* not in-place crypto */
		SETUP_C_DATA(wdata, ciphertext, len);
		wdata.cd_length = DEV_BSIZE;
		outp = &wdata;
	}

	/* at least one block is always processed, as before */
	for (;;) {
		/* refresh the per-block IV in lsp->ls_mech */
		ret = lofi_blk_mech(lsp, lblkno);
		if (ret != CRYPTO_SUCCESS)
			break;

		if (op_encrypt) {
			ret = crypto_encrypt(&lsp->ls_mech, &cdata,
			    &lsp->ls_key, NULL, outp, NULL);
		} else {
			ret = crypto_decrypt(&lsp->ls_mech, &cdata,
			    &lsp->ls_key, NULL, outp, NULL);
		}
		/* leave lblkno at the failing block for the warning below */
		if (ret != CRYPTO_SUCCESS)
			break;

		cdata.cd_offset += DEV_BSIZE;
		if (outp != NULL)
			wdata.cd_offset += DEV_BSIZE;
		lblkno++;

		if (cdata.cd_offset >= len)
			break;
	}

	mutex_exit(&lsp->ls_crypto_lock);

	if (ret != CRYPTO_SUCCESS) {
		cmn_err(CE_WARN, "%s failed for block %lld: (0x%x)",
		    op_encrypt ? "crypto_encrypt()" : "crypto_decrypt()",
		    lblkno, ret);
	}

	return (ret);
}
static int
lofi_rdwr(caddr_t bufaddr, offset_t offset, struct buf *bp,
struct lofi_state *lsp, size_t len, lofi_rdrw_method_t method,
caddr_t bcopy_locn)
{
ssize_t resid;
int isread;
int error;
/*
* Handles reads/writes for both plain and encrypted lofi
* Note: offset is already shifted by lsp->ls_crypto_offset
* when it gets here.
*/