diff --git a/Documentation/btrfs-property.rst b/Documentation/btrfs-property.rst index 5896faa2b2..e29f135eb3 100644 --- a/Documentation/btrfs-property.rst +++ b/Documentation/btrfs-property.rst @@ -47,7 +47,11 @@ get [-t ] [] device as object. For a mounted filesystem, specify a mount point. compression compression algorithm set for an inode, possible values: *lzo*, *zlib*, *zstd*. - To disable compression use "" (empty string), *no* or *none*. + To disable compression use *no* or *none*. Empty value resets the + property and sets a default value. + .. note:: + This has changed in version 5.18 of btrfs-progs and + requires kernel 5.14 or newer to work. list [-t ] Lists available properties with their descriptions for the given object. diff --git a/Documentation/ch-seeding-device.rst b/Documentation/ch-seeding-device.rst index 78451e5815..340f0b76b4 100644 --- a/Documentation/ch-seeding-device.rst +++ b/Documentation/ch-seeding-device.rst @@ -18,11 +18,14 @@ UUID that is normally attached to a device is automatically changed to a random UUID on each mount. Once the seeding device is mounted, it needs the writable device. After adding -it, something like **remount -o remount,rw /path** makes the filesystem at -*/path* ready for use. The simplest use case is to throw away all changes by -unmounting the filesystem when convenient. - -Alternatively, deleting the seeding device from the filesystem can turn it into +it, unmounting and mounting with **umount /path; mount /path** or +remounting read-write with **remount -o remount,rw** makes the filesystem at +*/path* ready for use. N.B., there is a known bug with using remount to make +the mount writeable: remount will leave the filesystem in a state where it is +unable to clean deleted snapshots, so it will leak space until it is unmounted +and mounted properly. 
+ +Furthermore, deleting the seeding device from the filesystem can turn it into a normal filesystem, provided that the writable device can also contain all the data from the seeding device. @@ -42,8 +45,9 @@ Example how to create and use one seeding device: # umount /mnt/mnt1 # btrfstune -S 1 /dev/sda # mount /dev/sda /mnt/mnt1 - # btrfs device add /dev/sdb /mnt - # mount -o remount,rw /mnt/mnt1 + # btrfs device add /dev/sdb /mnt/mnt1 + # umount /mnt/mnt1 + # mount /dev/sdb /mnt/mnt1 # ... /mnt/mnt1 is now writable Now */mnt/mnt1* can be used normally. The device */dev/sda* can be mounted @@ -53,7 +57,8 @@ again with a another writable device: # mount /dev/sda /mnt/mnt2 # btrfs device add /dev/sdc /mnt/mnt2 - # mount -o remount,rw /mnt/mnt2 + # umount /mnt/mnt2 + # mount /dev/sdc /mnt/mnt2 ... /mnt/mnt2 is now writable The writable device (*/dev/sdb*) can be decoupled from the seeding device and @@ -74,3 +79,5 @@ A few things to note: * block group profiles *single* and *dup* support the use cases above * the label is copied from the seeding device and can be changed by **btrfs filesystem label** * each new mount of the seeding device gets a new random UUID +* **umount /path; mount wr-dev /path** can be replaced with **mount -o remount,rw /path** + but it has bugs in kernels older than diff --git a/btrfs-corrupt-block.c b/btrfs-corrupt-block.c index fb1f15f0c7..e961255d5d 100644 --- a/btrfs-corrupt-block.c +++ b/btrfs-corrupt-block.c @@ -40,53 +40,34 @@ static int debug_corrupt_block(struct extent_buffer *eb, struct btrfs_root *root, u64 bytenr, u32 blocksize, u64 copy) { int ret; - u64 length; - struct btrfs_multi_bio *multi = NULL; - struct btrfs_device *device; int num_copies; int mirror_num = 1; - length = blocksize; while (1) { - ret = btrfs_map_block(root->fs_info, READ, eb->start, &length, - &multi, mirror_num, NULL); - if (ret) { - error("cannot map block %llu length %llu mirror %d: %d", - (unsigned long long)eb->start, - (unsigned long long)length, - 
mirror_num, ret); - return ret; - } - device = multi->stripes[0].dev; - eb->fd = device->fd; - device->total_ios++; - eb->dev_bytenr = multi->stripes[0].physical; - - fprintf(stdout, - "mirror %d logical %llu physical %llu device %s\n", - mirror_num, (unsigned long long)bytenr, - (unsigned long long)eb->dev_bytenr, device->name); - free(multi); - if (!copy || mirror_num == copy) { - ret = read_extent_from_disk(eb, 0, eb->len); + u64 read_len = eb->len; + + ret = read_data_from_disk(eb->fs_info, eb->data, + eb->start, &read_len, + mirror_num); + if (read_len < eb->len) + ret = -EIO; if (ret < 0) { errno = -ret; error("cannot read eb bytenr %llu: %m", - (unsigned long long)eb->dev_bytenr); + (unsigned long long)eb->start); return ret; } printf("corrupting %llu copy %d\n", eb->start, mirror_num); memset(eb->data, 0, eb->len); - ret = write_extent_to_disk(eb); + ret = write_and_map_eb(eb->fs_info, eb); if (ret < 0) { errno = -ret; error("cannot write eb bytenr %llu: %m", - (unsigned long long)eb->dev_bytenr); + (unsigned long long)eb->start); return ret; } - fsync(eb->fd); } num_copies = btrfs_num_copies(root->fs_info, eb->start, @@ -162,7 +143,7 @@ static void corrupt_keys(struct btrfs_trans_handle *trans, u16 csum_type = fs_info->csum_type; csum_tree_block_size(eb, csum_size, 0, csum_type); - write_extent_to_disk(eb); + write_and_map_eb(eb->fs_info, eb); } } diff --git a/btrfs-map-logical.c b/btrfs-map-logical.c index b3a7526b22..860c196d6d 100644 --- a/btrfs-map-logical.c +++ b/btrfs-map-logical.c @@ -173,8 +173,9 @@ static int write_extent_content(struct btrfs_fs_info *fs_info, int out_fd, while (cur_offset < length) { cur_len = min_t(u64, length - cur_offset, BUFFER_SIZE); - ret = read_extent_data(fs_info, buffer, - logical + cur_offset, &cur_len, mirror); + ret = read_data_from_disk(fs_info, buffer, + logical + cur_offset, &cur_len, + mirror); if (ret < 0) { errno = -ret; fprintf(stderr, diff --git a/btrfstune.c b/btrfstune.c index 33c83bf162..c9a92349a4 100644 
--- a/btrfstune.c +++ b/btrfstune.c @@ -333,7 +333,8 @@ static int populate_csum(struct btrfs_trans_handle *trans, while (offset < len) { sectorsize = fs_info->sectorsize; - ret = read_extent_data(fs_info, buf, start + offset, &sectorsize, 0); + ret = read_data_from_disk(fs_info, buf, start + offset, + &sectorsize, 0); if (ret) break; ret = btrfs_csum_file_block(trans, start + len, start + offset, diff --git a/check/main.c b/check/main.c index dbf0a6b005..e6e85784d5 100644 --- a/check/main.c +++ b/check/main.c @@ -5791,7 +5791,7 @@ static int check_extent_csums(struct btrfs_root *root, u64 bytenr, for (mirror = 1; mirror <= num_copies; mirror++) { read_len = num_bytes - offset; /* read as much space once a time */ - ret = read_extent_data(gfs_info, (char *)data + offset, + ret = read_data_from_disk(gfs_info, (char *)data + offset, bytenr + offset, &read_len, mirror); if (ret) goto out; diff --git a/check/mode-common.c b/check/mode-common.c index 26e2c0c98b..2a3018428f 100644 --- a/check/mode-common.c +++ b/check/mode-common.c @@ -1203,8 +1203,8 @@ static int populate_csum(struct btrfs_trans_handle *trans, while (offset < len) { sectorsize = gfs_info->sectorsize; - ret = read_extent_data(gfs_info, buf, start + offset, - &sectorsize, 0); + ret = read_data_from_disk(gfs_info, buf, start + offset, + &sectorsize, 0); if (ret) break; ret = btrfs_csum_file_block(trans, start + len, start + offset, diff --git a/cmds/property.c b/cmds/property.c index b3ccc0ff69..ec1b408899 100644 --- a/cmds/property.c +++ b/cmds/property.c @@ -190,8 +190,6 @@ static int prop_compression(enum prop_object_type type, xattr_name[XATTR_BTRFS_PREFIX_LEN + strlen(name)] = '\0'; if (value) { - if (strcmp(value, "no") == 0 || strcmp(value, "none") == 0) - value = ""; sret = fsetxattr(fd, xattr_name, value, strlen(value), 0); } else { sret = fgetxattr(fd, xattr_name, NULL, 0); diff --git a/cmds/restore.c b/cmds/restore.c index 81ca6cd57c..5923d571c9 100644 --- a/cmds/restore.c +++ b/cmds/restore.c @@ -407,8 +407,8 
@@ static int copy_one_extent(struct btrfs_root *root, int fd, cur = bytenr; while (cur < bytenr + size_left) { length = bytenr + size_left - cur; - ret = read_extent_data(root->fs_info, inbuf + cur - bytenr, cur, - &length, mirror_num); + ret = read_data_from_disk(root->fs_info, inbuf + cur - bytenr, cur, + &length, mirror_num); if (ret < 0) { mirror_num++; if (mirror_num > num_copies) { diff --git a/image/main.c b/image/main.c index c6af0cc2f5..5bcd10f021 100644 --- a/image/main.c +++ b/image/main.c @@ -615,7 +615,7 @@ static int read_data_extent(struct metadump_struct *md, for (cur_mirror = 1; cur_mirror <= num_copies; cur_mirror++) { while (bytes_left) { read_len = bytes_left; - ret = read_extent_data(fs_info, + ret = read_data_from_disk(fs_info, (char *)(async->buffer + offset), logical, &read_len, cur_mirror); if (ret < 0) diff --git a/kernel-shared/ctree.c b/kernel-shared/ctree.c index 758a388207..2707e0e64f 100644 --- a/kernel-shared/ctree.c +++ b/kernel-shared/ctree.c @@ -590,10 +590,9 @@ static void generic_err(const struct extent_buffer *buf, int slot, { va_list args; - fprintf(stderr, "corrupt %s: root=%lld block=%llu physical=%llu slot=%d, ", + fprintf(stderr, "corrupt %s: root=%lld block=%llu slot=%d, ", btrfs_header_level(buf) == 0 ? 
"leaf": "node", - btrfs_header_owner(buf), btrfs_header_bytenr(buf), - buf->dev_bytenr, slot); + btrfs_header_owner(buf), btrfs_header_bytenr(buf), slot); va_start(args, fmt); vfprintf(stderr, fmt, args); va_end(args); diff --git a/kernel-shared/disk-io.c b/kernel-shared/disk-io.c index 4964cd3827..4aae7a3576 100644 --- a/kernel-shared/disk-io.c +++ b/kernel-shared/disk-io.c @@ -291,59 +291,48 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, } +static int read_on_restore(struct extent_buffer *eb) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + struct btrfs_device *device; + int ret; + + /* + * For on_restoring mode, there should be only one device, and logical + * address is mapped 1:1 to device physical offset. + */ + list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { + if (device->devid == 1) + break; + } + device->total_ios++; + + ret = btrfs_pread(device->fd, eb->data, eb->len, eb->start, + eb->fs_info->zoned); + if (ret != eb->len) + ret = -EIO; + else + ret = 0; + return ret; +} int read_whole_eb(struct btrfs_fs_info *info, struct extent_buffer *eb, int mirror) { unsigned long offset = 0; - struct btrfs_multi_bio *multi = NULL; - struct btrfs_device *device; int ret = 0; - u64 read_len; unsigned long bytes_left = eb->len; while (bytes_left) { - read_len = bytes_left; - device = NULL; - - if (!info->on_restoring && - eb->start != BTRFS_SUPER_INFO_OFFSET) { - ret = btrfs_map_block(info, READ, eb->start + offset, - &read_len, &multi, mirror, NULL); - if (ret) { - printk("Couldn't map the block %llu\n", eb->start + offset); - kfree(multi); - return -EIO; - } - device = multi->stripes[0].dev; + u64 read_len = bytes_left; - if (device->fd <= 0) { - kfree(multi); - return -EIO; - } + if (info->on_restoring) + return read_on_restore(eb); - eb->fd = device->fd; - device->total_ios++; - eb->dev_bytenr = multi->stripes[0].physical; - kfree(multi); - multi = NULL; - } else { - /* special case for restore metadump */ - 
list_for_each_entry(device, &info->fs_devices->devices, dev_list) { - if (device->devid == 1) - break; - } - - eb->fd = device->fd; - eb->dev_bytenr = eb->start; - device->total_ios++; - } - - if (read_len > bytes_left) - read_len = bytes_left; - - ret = read_extent_from_disk(eb, offset, read_len); - if (ret) - return -EIO; + ret = read_data_from_disk(info, eb->data + offset, + eb->start + offset, &read_len, + mirror); + if (ret < 0) + return ret; offset += read_len; bytes_left -= read_len; } @@ -460,51 +449,15 @@ struct extent_buffer* read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, return ERR_PTR(ret); } -int read_extent_data(struct btrfs_fs_info *fs_info, char *data, u64 logical, - u64 *len, int mirror) -{ - u64 offset = 0; - struct btrfs_multi_bio *multi = NULL; - struct btrfs_device *device; - int ret = 0; - u64 max_len = *len; - - ret = btrfs_map_block(fs_info, READ, logical, len, &multi, mirror, - NULL); - if (ret) { - fprintf(stderr, "Couldn't map the block %llu\n", - logical + offset); - goto err; - } - device = multi->stripes[0].dev; - - if (*len > max_len) - *len = max_len; - if (device->fd < 0) { - ret = -EIO; - goto err; - } - - ret = btrfs_pread(device->fd, data, *len, multi->stripes[0].physical, - fs_info->zoned); - if (ret != *len) - ret = -EIO; - else - ret = 0; -err: - kfree(multi); - return ret; -} - int write_and_map_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) { int ret; - int dev_nr; + int mirror_num; + int max_mirror; u64 length; u64 *raid_map = NULL; struct btrfs_multi_bio *multi = NULL; - dev_nr = 0; length = eb->len; ret = btrfs_map_block(fs_info, WRITE, eb->start, &length, &multi, 0, &raid_map); @@ -515,6 +468,7 @@ int write_and_map_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) goto out; } + /* RAID56 write back need RMW */ if (raid_map) { ret = write_raid56_with_parity(fs_info, eb, multi, length, raid_map); @@ -523,28 +477,28 @@ int write_and_map_eb(struct btrfs_fs_info *fs_info, struct 
extent_buffer *eb) error( "failed to write raid56 stripe for bytenr %llu length %llu: %m", eb->start, length); - goto out; } - } else while (dev_nr < multi->num_stripes) { - eb->fd = multi->stripes[dev_nr].dev->fd; - eb->dev_bytenr = multi->stripes[dev_nr].physical; - multi->stripes[dev_nr].dev->total_ios++; - dev_nr++; - ret = write_extent_to_disk(eb); + goto out; + } + + /* For non-RAID56, we just writeback data to each mirror */ + max_mirror = btrfs_num_copies(fs_info, eb->start, eb->len); + for (mirror_num = 1; mirror_num <= max_mirror; mirror_num++) { + ret = write_data_to_disk(fs_info, eb->data, eb->start, eb->len, + mirror_num); if (ret < 0) { errno = -ret; error( -"failed to write bytenr %llu length %u devid %llu dev_bytenr %llu: %m", - eb->start, eb->len, - multi->stripes[dev_nr].dev->devid, - eb->dev_bytenr); + "failed to write bytenr %llu length %u to mirror %d: %m", + eb->start, eb->len, mirror_num); goto out; } } + out: kfree(raid_map); kfree(multi); - return 0; + return ret; } int write_tree_block(struct btrfs_trans_handle *trans, diff --git a/kernel-shared/disk-io.h b/kernel-shared/disk-io.h index e07141a959..bba97fc1a8 100644 --- a/kernel-shared/disk-io.h +++ b/kernel-shared/disk-io.h @@ -141,8 +141,6 @@ int read_whole_eb(struct btrfs_fs_info *info, struct extent_buffer *eb, int mirr struct extent_buffer* read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 parent_transid); -int read_extent_data(struct btrfs_fs_info *fs_info, char *data, u64 logical, - u64 *len, int mirror); void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 parent_transid); struct extent_buffer* btrfs_find_create_tree_block( diff --git a/kernel-shared/extent_io.c b/kernel-shared/extent_io.c index af09ade402..d6326ab2dc 100644 --- a/kernel-shared/extent_io.c +++ b/kernel-shared/extent_io.c @@ -26,8 +26,10 @@ #include "kerncompat.h" #include "kernel-shared/extent_io.h" #include "kernel-lib/list.h" +#include "kernel-lib/raid56.h" #include 
"kernel-shared/ctree.h" #include "kernel-shared/volumes.h" +#include "kernel-shared/disk-io.h" #include "common/utils.h" #include "common/device-utils.h" #include "common/internal.h" @@ -615,8 +617,6 @@ static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *info, eb->len = blocksize; eb->refs = 1; eb->flags = 0; - eb->fd = -1; - eb->dev_bytenr = (u64)-1; eb->cache_node.start = bytenr; eb->cache_node.size = blocksize; eb->fs_info = info; @@ -789,86 +789,151 @@ struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, return ret; } -int read_extent_from_disk(struct extent_buffer *eb, - unsigned long offset, unsigned long len) +static int read_raid56(struct btrfs_fs_info *fs_info, void *buf, u64 logical, + u64 len, int mirror, struct btrfs_multi_bio *multi, + u64 *raid_map) { + const int num_stripes = multi->num_stripes; + const u64 full_stripe_start = raid_map[0]; + void **pointers = NULL; + int failed_a = -1; + int failed_b = -1; + int i; int ret; - ret = btrfs_pread(eb->fd, eb->data + offset, len, eb->dev_bytenr, - eb->fs_info->zoned); - if (ret < 0) { - ret = -errno; + + /* Only read repair should go this path */ + ASSERT(mirror > 1); + ASSERT(raid_map); + + /* The read length should be inside one stripe */ + ASSERT(len <= BTRFS_STRIPE_LEN); + + pointers = calloc(num_stripes, sizeof(void *)); + if (!pointers) { + ret = -ENOMEM; goto out; } - if (ret != len) { - ret = -EIO; - goto out; + /* Allocate memory for the full stripe */ + for (i = 0; i < num_stripes; i++) { + pointers[i] = malloc(BTRFS_STRIPE_LEN); + if (!pointers[i]) { + ret = -ENOMEM; + goto out; + } } - ret = 0; -out: - return ret; -} -int write_extent_to_disk(struct extent_buffer *eb) -{ - int ret; - ret = btrfs_pwrite(eb->fd, eb->data, eb->len, eb->dev_bytenr, - eb->fs_info->zoned); - if (ret < 0) - goto out; - if (ret != eb->len) { - ret = -EIO; - goto out; + /* + * Read the full stripe. 
+ * + * The stripes in @multi is not rotated, thus can be used to read from + * disk directly. + */ + for (i = 0; i < num_stripes; i++) { + ret = btrfs_pread(multi->stripes[i].dev->fd, pointers[i], + BTRFS_STRIPE_LEN, multi->stripes[i].physical, + fs_info->zoned); + if (ret < BTRFS_STRIPE_LEN) { + ret = -EIO; + goto out; + } } + + /* + * Get the failed index. + * + * Since we're reading using mirror_num > 1 already, it means the data + * stripe where @logical lies in is definitely corrupted. + */ + failed_a = (logical - full_stripe_start) / BTRFS_STRIPE_LEN; + + /* + * For RAID6, we don't have good way to exhaust all the combinations, + * so here we can only go through the map to see if we have missing devices. + */ + if (multi->type & BTRFS_BLOCK_GROUP_RAID6) { + for (i = 0; i < num_stripes; i++) { + /* Skip failed_a, as it's already marked failed */ + if (i == failed_a) + continue; + /* Missing dev */ + if (multi->stripes[i].dev->fd == -1) { + failed_b = i; + break; + } + } + /* + * No missing device, we have no better idea, default to P + * corruption + */ + if (failed_b < 0) + failed_b = num_stripes - 2; + } + + /* Rebuild the full stripe */ + ret = raid56_recov(num_stripes, BTRFS_STRIPE_LEN, multi->type, + failed_a, failed_b, pointers); + ASSERT(ret == 0); + + /* Now copy the data back to original buf */ + memcpy(buf, pointers[failed_a] + (logical - full_stripe_start) % + BTRFS_STRIPE_LEN, len); ret = 0; out: + for (i = 0; i < num_stripes; i++) + free(pointers[i]); + free(pointers); return ret; } -int read_data_from_disk(struct btrfs_fs_info *info, void *buf, u64 offset, - u64 bytes, int mirror) +int read_data_from_disk(struct btrfs_fs_info *info, void *buf, u64 logical, + u64 *len, int mirror) { struct btrfs_multi_bio *multi = NULL; struct btrfs_device *device; - u64 bytes_left = bytes; - u64 read_len; - u64 total_read = 0; + u64 read_len = *len; + u64 *raid_map = NULL; int ret; - while (bytes_left) { - read_len = bytes_left; - ret = btrfs_map_block(info, 
READ, offset, &read_len, &multi, - mirror, NULL); - if (ret) { - fprintf(stderr, "Couldn't map the block %llu\n", - offset); - return -EIO; - } - device = multi->stripes[0].dev; - - read_len = min(bytes_left, read_len); - if (device->fd <= 0) { - kfree(multi); - return -EIO; - } + ret = btrfs_map_block(info, READ, logical, &read_len, &multi, mirror, + &raid_map); + if (ret) { + fprintf(stderr, "Couldn't map the block %llu\n", logical); + return -EIO; + } + read_len = min(*len, read_len); + + /* We need to rebuild from P/Q */ + if (mirror > 1 && multi->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + ret = read_raid56(info, buf, logical, read_len, mirror, multi, + raid_map); + free(multi); + free(raid_map); + *len = read_len; + return ret; + } + free(raid_map); + device = multi->stripes[0].dev; - ret = btrfs_pread(device->fd, buf + total_read, read_len, - multi->stripes[0].physical, info->zoned); + if (device->fd <= 0) { kfree(multi); - if (ret < 0) { - fprintf(stderr, "Error reading %llu, %d\n", offset, - ret); - return ret; - } - if (ret != read_len) { - fprintf(stderr, "Short read for %llu, read %d, " - "read_len %llu\n", offset, ret, read_len); - return -EIO; - } + return -EIO; + } - bytes_left -= read_len; - offset += read_len; - total_read += read_len; + ret = btrfs_pread(device->fd, buf, read_len, + multi->stripes[0].physical, info->zoned); + kfree(multi); + if (ret < 0) { + fprintf(stderr, "Error reading %llu, %d\n", logical, + ret); + return ret; + } + if (ret != read_len) { + fprintf(stderr, + "Short read for %llu, read %d, read_len %llu\n", + logical, ret, read_len); + return -EIO; } + *len = read_len; return 0; } @@ -934,6 +999,7 @@ int write_data_to_disk(struct btrfs_fs_info *info, void *buf, u64 offset, dev_bytenr = multi->stripes[dev_nr].physical; this_len = min(this_len, bytes_left); dev_nr++; + device->total_ios++; ret = btrfs_pwrite(device->fd, buf + total_write, this_len, dev_bytenr, info->zoned); diff --git a/kernel-shared/extent_io.h 
b/kernel-shared/extent_io.h index a4c21360a9..aa4f34e187 100644 --- a/kernel-shared/extent_io.h +++ b/kernel-shared/extent_io.h @@ -88,13 +88,11 @@ struct extent_state { struct extent_buffer { struct cache_extent cache_node; u64 start; - u64 dev_bytenr; struct list_head lru; struct list_head recow; u32 len; int refs; u32 flags; - int fd; struct btrfs_fs_info *fs_info; char data[] __attribute__((aligned(8))); }; @@ -150,9 +148,6 @@ struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, u64 bytenr, u32 blocksize); void free_extent_buffer(struct extent_buffer *eb); void free_extent_buffer_nocache(struct extent_buffer *eb); -int read_extent_from_disk(struct extent_buffer *eb, - unsigned long offset, unsigned long len); -int write_extent_to_disk(struct extent_buffer *eb); int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, unsigned long start, unsigned long len); void read_extent_buffer(const struct extent_buffer *eb, void *dst, @@ -170,8 +165,8 @@ int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start, unsigned long nr); int set_extent_buffer_dirty(struct extent_buffer *eb); int clear_extent_buffer_dirty(struct extent_buffer *eb); -int read_data_from_disk(struct btrfs_fs_info *info, void *buf, u64 offset, - u64 bytes, int mirror); +int read_data_from_disk(struct btrfs_fs_info *info, void *buf, u64 logical, + u64 *len, int mirror); int write_data_to_disk(struct btrfs_fs_info *info, void *buf, u64 offset, u64 bytes, int mirror); void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start, diff --git a/kernel-shared/file.c b/kernel-shared/file.c index a31728102a..59d82a1dd5 100644 --- a/kernel-shared/file.c +++ b/kernel-shared/file.c @@ -225,11 +225,11 @@ int btrfs_read_file(struct btrfs_root *root, u64 ino, u64 start, int len, memset(dest, 0, len); while (1) { struct btrfs_file_extent_item *fi; + u64 offset = 0; u64 extent_start; u64 extent_len; u64 read_start; u64 read_len; - u64 
read_len_ret; u64 disk_bytenr; leaf = path.nodes[0]; @@ -282,14 +282,16 @@ int btrfs_read_file(struct btrfs_root *root, u64 ino, u64 start, int len, disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi) + btrfs_file_extent_offset(leaf, fi); - read_len_ret = read_len; - ret = read_extent_data(fs_info, dest + read_start - start, disk_bytenr, - &read_len_ret, 0); - if (ret < 0) - break; - /* Short read, something went wrong */ - if (read_len_ret != read_len) - return -EIO; + while (offset < read_len) { + u64 read_len_ret = read_len - offset; + + ret = read_data_from_disk(fs_info, + dest + read_start - start + offset, + disk_bytenr + offset, &read_len_ret, 0); + if (ret < 0) + goto out; + offset += read_len_ret; + } read += read_len; next: ret = btrfs_next_item(root, &path); diff --git a/kernel-shared/free-space-cache.c b/kernel-shared/free-space-cache.c index e74a61e44d..83897f105f 100644 --- a/kernel-shared/free-space-cache.c +++ b/kernel-shared/free-space-cache.c @@ -118,6 +118,8 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct btrfs_root *root, } while (total_read < io_ctl->total_size) { + u64 offset = 0; + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ret = btrfs_next_leaf(root, path); if (ret) { @@ -150,11 +152,19 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct btrfs_root *root, bytenr = btrfs_file_extent_disk_bytenr(leaf, fi) + btrfs_file_extent_offset(leaf, fi); len = btrfs_file_extent_num_bytes(leaf, fi); - ret = read_data_from_disk(root->fs_info, - io_ctl->buffer + key.offset, bytenr, - len, 0); - if (ret) - break; + while (offset < len) { + u64 read_len = len - offset; + + ret = read_data_from_disk(root->fs_info, + io_ctl->buffer + key.offset + offset, + bytenr + offset, + &read_len, 0); + if (ret < 0) { + btrfs_release_path(path); + return ret; + } + offset += read_len; + } total_read += len; path->slots[0]++; } diff --git a/kernel-shared/volumes.c b/kernel-shared/volumes.c index 598ac55344..c745639428 100644 
--- a/kernel-shared/volumes.c +++ b/kernel-shared/volumes.c @@ -30,6 +30,7 @@ #include "kernel-shared/volumes.h" #include "zoned.h" #include "common/utils.h" +#include "common/device-utils.h" #include "kernel-lib/raid56.h" const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { @@ -1795,6 +1796,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, int stripes_required = 1; int stripe_index; int i; + bool need_raid_map = false; struct btrfs_multi_bio *multi = NULL; if (multi_ret && rw == READ) { @@ -1832,17 +1834,18 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && multi_ret && ((rw & WRITE) || mirror_num > 1) && raid_map_ret) { - /* RAID[56] write or recovery. Return all stripes */ - stripes_required = map->num_stripes; - - /* Only allocate the map if we've already got a large enough multi_ret */ - if (stripes_allocated >= stripes_required) { - raid_map = kmalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); - if (!raid_map) { - kfree(multi); - return -ENOMEM; - } - } + need_raid_map = true; + /* RAID[56] write or recovery. 
Return all stripes */ + stripes_required = map->num_stripes; + + /* Only allocate the map if we've already got a large enough multi_ret */ + if (stripes_allocated >= stripes_required) { + raid_map = kmalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); + if (!raid_map) { + kfree(multi); + return -ENOMEM; + } + } } /* if our multi bio struct is too small, back off and try again */ @@ -1880,6 +1883,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, goto out; multi->num_stripes = 1; + multi->type = map->type; stripe_index = 0; if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { if (rw == WRITE) @@ -1906,7 +1910,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, else if (mirror_num) stripe_index = mirror_num - 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - if (raid_map) { + if (need_raid_map && raid_map) { int rot; u64 tmp; u64 raid56_full_stripe_start; @@ -2610,8 +2614,6 @@ static int split_eb_for_raid56(struct btrfs_fs_info *info, eb->len = stripe_len; eb->refs = 1; eb->flags = 0; - eb->fd = -1; - eb->dev_bytenr = (u64)-1; eb->fs_info = info; this_eb_start = raid_map[i]; @@ -2666,9 +2668,6 @@ int write_raid56_with_parity(struct btrfs_fs_info *info, for (i = 0; i < multi->num_stripes; i++) { struct extent_buffer *new_eb; if (raid_map[i] < BTRFS_RAID5_P_STRIPE) { - ebs[i]->dev_bytenr = multi->stripes[i].physical; - ebs[i]->fd = multi->stripes[i].dev->fd; - multi->stripes[i].dev->total_ios++; if (ebs[i]->start != raid_map[i]) { ret = -EINVAL; goto out_free_split; @@ -2680,8 +2679,6 @@ int write_raid56_with_parity(struct btrfs_fs_info *info, ret = -ENOMEM; goto out_free_split; } - new_eb->dev_bytenr = multi->stripes[i].physical; - new_eb->fd = multi->stripes[i].dev->fd; multi->stripes[i].dev->total_ios++; new_eb->len = stripe_len; new_eb->fs_info = info; @@ -2710,7 +2707,9 @@ int write_raid56_with_parity(struct btrfs_fs_info *info, } for (i = 0; i < multi->num_stripes; i++) { - ret = write_extent_to_disk(ebs[i]); + 
multi->stripes[i].dev->total_ios++; + ret = btrfs_pwrite(multi->stripes[i].dev->fd, ebs[i]->data, ebs[i]->len, + multi->stripes[i].physical, info->zoned); if (ret < 0) goto out_free_split; } diff --git a/kernel-shared/volumes.h b/kernel-shared/volumes.h index 5cfe7e39f6..d90065b98a 100644 --- a/kernel-shared/volumes.h +++ b/kernel-shared/volumes.h @@ -106,6 +106,7 @@ struct btrfs_bio_stripe { }; struct btrfs_multi_bio { + u64 type; int error; int num_stripes; struct btrfs_bio_stripe stripes[]; diff --git a/mkfs/main.c b/mkfs/main.c index a603ec5896..4e0a46a77a 100644 --- a/mkfs/main.c +++ b/mkfs/main.c @@ -1261,8 +1261,6 @@ int BOX_MAIN(mkfs)(int argc, char **argv) data_profile = tmp; } } else { - u32 best_nodesize = max_t(u32, sysconf(_SC_PAGESIZE), sectorsize); - if (metadata_profile_opt || data_profile_opt) { if (metadata_profile != data_profile) { error( @@ -1272,7 +1270,7 @@ int BOX_MAIN(mkfs)(int argc, char **argv) } if (!nodesize_forced) - nodesize = best_nodesize; + nodesize = sectorsize; } /* diff --git a/tests/fsck-tests/056-raid56-false-alerts/test.sh b/tests/fsck-tests/056-raid56-false-alerts/test.sh new file mode 100755 index 0000000000..b82e999c77 --- /dev/null +++ b/tests/fsck-tests/056-raid56-false-alerts/test.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# +# Make sure "btrfs check --check-data-csum" won't report false alerts on RAID56 +# data. +# + +source "$TEST_TOP/common" + +check_prereq btrfs +check_prereq mkfs.btrfs +check_global_prereq losetup + +setup_loopdevs 3 +prepare_loopdevs +dev1=${loopdevs[1]} +TEST_DEV=$dev1 + +setup_root_helper + +run_check $SUDO_HELPERS "$TOP/mkfs.btrfs" -f -m raid1 -d raid5 ${loopdevs[@]} +run_check_mount_test_dev + +run_check $SUDO_HELPER dd if=/dev/urandom of="$TEST_MNT/file" bs=16K count=1 \ + status=noxfer > /dev/null 2>&1 + +run_check_umount_test_dev + +# Check data csum should not report false alerts +run_check "$TOP/btrfs" check --check-data-csum "$dev1" + +cleanup_loopdevs