From f69934653e8baecd20d9cbff520e0dbb097d1194 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Mon, 10 Nov 2025 21:52:44 +0900 Subject: [PATCH 01/26] adding ci files --- .github/workflows/run-blktests.yml | 181 +++++++++++++++ CODE_OF_CONDUCT.md | 134 ++++++++++++ LICENSE | 339 +++++++++++++++++++++++++++++ LICENSES/GPL-2.0-or-later.txt | 339 +++++++++++++++++++++++++++++ README.md | 11 + 5 files changed, 1004 insertions(+) create mode 100644 .github/workflows/run-blktests.yml create mode 100644 CODE_OF_CONDUCT.md create mode 100644 LICENSE create mode 100644 LICENSES/GPL-2.0-or-later.txt create mode 100644 README.md diff --git a/.github/workflows/run-blktests.yml b/.github/workflows/run-blktests.yml new file mode 100644 index 0000000000000..c1c05141b9abe --- /dev/null +++ b/.github/workflows/run-blktests.yml @@ -0,0 +1,181 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# Copyright (c) 2025 Western Digital Corporation or its affiliates. +# +# Authors: Dennis Maisenbacher (dennis.maisenbacher@wdc.com) + +name: Run blktests + +on: + pull_request: + +concurrency: + group: ci-test-${{ github.ref_name }} + +env: + KERNEL_REF: "${{ github.event.pull_request.head.sha }}" + KERNEL_TREE: "https://github.com/${{ github.repository }}" + +#This workflow requires an actions-runner-controllers (ARC) to be active. +#The k8s cluster of this ARC needs KubeVirt to be installed. +jobs: + build-and-test-kernel: + #This step runs in a container in the k8s cluster + runs-on: arc-vm-linux-blktests + steps: + - name: Checkout blktests-ci + uses: actions/checkout@v4 + with: + repository: linux-blktests/blktests-ci + path: blktests-ci + + - name: Build kernel and package it into a containerimage + run: | + cd blktests-ci/playbooks/roles/kernel-builder-k8s-job/templates + docker build \ + --build-arg KERNEL_TREE=${KERNEL_TREE} \ + --build-arg KERNEL_REF=${KERNEL_REF} \ + -t linux-kernel-containerdisk \ + -f Dockerfile.linux-kernel-containerdisk . 2>&1 | tee build.log + #Setting KERNEL_VERSION var which is latern needed for notifying the VM what kernel to pick up + cat build.log | grep KERNEL_VERSION | awk '{print $3}' | grep KERNEL_VERSION >> $GITHUB_ENV + + - name: Push the new Fedora containerimage with the freshly build kernel + run: | + docker tag linux-kernel-containerdisk registry-service.docker-registry.svc.cluster.local/linux-kernel-containerdisk:${KERNEL_VERSION} + docker push registry-service.docker-registry.svc.cluster.local/linux-kernel-containerdisk:${KERNEL_VERSION} + + - name: Run in VM + uses: ./blktests-ci/.github/actions/kubevirt-action + with: + kernel_version: ${{ env.KERNEL_VERSION }} + vm_artifact_upload_dir: blktests/results + run_cmds: | + #Print VM debug info + uname -a + cat /etc/os-release + lsblk + + #Install build dependencies for blktests + sudo dnf install -y gcc \ + clang \ + make \ + util-linux \ + llvm \ + gawk \ + fio \ + udev \ + kmod \ + coreutils \ + gcc \ + gzip \ + e2fsprogs \ + xfsprogs \ + f2fs-tools \ + btrfs-progs \ + device-mapper-multipath \ + blktrace \ + kernel-headers \ + liburing \ + liburing-devel \ + nbd \ + device-mapper \ + ktls-utils \ + dosfstools \ + bc \ + libnl3-cli \ + cryptsetup \ + sg3_utils \ + pciutils \ + unzip \ + jq \ + nvme-cli \ + git \ + wget \ + pkgconf \ + libudev-devel + + git clone https://git.kernel.org/pub/scm/utils/mdadm/mdadm.git + cd mdadm + git checkout d764c4829947923142a83251296d04edaee7d2f7 + make -j$(nproc) + sudo make install + + cd - + git clone https://github.com/linux-blktests/blktests.git + + cd blktests + git checkout ci + make + + #ATTENTION! This section formats all available NVMe devices. Be careful when changing the `runs-on` tag! + #This step runs in a VM with the previously compiled kernel in the k8s cluster + + failed=0 + rm -f all_failures + + # Run blktests block group + cat > config << EOF + TEST_DEVS=(${BDEV0}) + EOF + export RUN_ZONED_TESTS=1 + sudo ./check block || failed=1 + test -f results/failures && cat results/failures >> all_failures + + # Run blktests nvme group + cat > config << EOF + TEST_DEVS=(${BDEV0}) + NVMET_TRTYPES="loop rdma" + EXCLUDE=(nvme/014 nvme/038 nvme/057 nvme/058) + EOF + sudo ./check nvme || failed=1 + test -f results/failures && cat results/failures >> all_failures + + # Run blktests scsi group + cat > config << EOF + TEST_DEVS=() + export RUN_ZONED_TESTS=1 + EOF + sudo ./check scsi || failed=1 + test -f results/failures && cat results/failures >> all_failures + + # Run blktests misc groups + cat > config << EOF + TEST_DEVS=() + EOF + sudo ./check dm loop md throtl ublk || failed=1 + test -f results/failures && cat results/failures >> all_failures + + # Run ZBD blktests without TEST_DEVS + cat > config << EOF + TEST_DEVS=() + EXCLUDE=(zbd/009) + EOF + export RUN_ZONED_TESTS=1 + sudo ./check zbd || failed=1 + test -f results/failures && cat results/failures >> all_failures + + # Run ZBD blktests with TEST_DEVS + cat > config << EOF + TEST_DEVS=(${ZBD0}) + DEVICE_ONLY=1 + EOF + + export RUN_ZONED_TESTS=1 + sudo ./check zbd || failed=1 + test -f results/failures && cat results/failures >> all_failures + + # Run blktests srp group + cat > config << EOF + TEST_DEVS=() + EOF + sudo ./check srp || failed=1 + test -f results/failures && cat results/failures >> all_failures + + # Mark blktests completion for KPD report + echo "KPD: blktests completed" + + # Output failures for KPD report + test -f all_failures && echo "KPD: Failures:" && cat all_failures + + exit $failed diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000000..4edb70c22d479 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,134 @@ + +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, caste, color, religion, or sexual +identity and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the overall + community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or advances of + any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email address, + without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official email address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +opensource@wdc.com. +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series of +actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or permanent +ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within the +community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.1, available at +[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. + +Community Impact Guidelines were inspired by +[Mozilla's code of conduct enforcement ladder][Mozilla CoC]. + +For answers to common questions about this code of conduct, see the FAQ at +[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at +[https://www.contributor-covenant.org/translations][translations]. + +[homepage]: https://www.contributor-covenant.org +[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html +[Mozilla CoC]: https://github.com/mozilla/diversity +[FAQ]: https://www.contributor-covenant.org/faq +[translations]: https://www.contributor-covenant.org/translations + diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000..d159169d10508 --- /dev/null +++ b/LICENSE @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. diff --git a/LICENSES/GPL-2.0-or-later.txt b/LICENSES/GPL-2.0-or-later.txt new file mode 100644 index 0000000000000..d159169d10508 --- /dev/null +++ b/LICENSES/GPL-2.0-or-later.txt @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. diff --git a/README.md b/README.md new file mode 100644 index 0000000000000..339ce7da25ed4 --- /dev/null +++ b/README.md @@ -0,0 +1,11 @@ + + +# blktests-kpd-ci +This is a collection of GitHub actions workflow scripts for Continuous +Integration (CI) of Linux kernel using blktests. The scripts are intended to be +triggered by [Kernel Patches Daemon](https://github.com/kernel-patches/kernel-patches-daemon). +To use the scripts, specify this repository and the main branch as "ci_repo" and +"ci_branch" parameters respectively in the Kernel Patches Daemon config file. From c2062cf729f45a61d761feacf5015a791f4cb2e9 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 23 Oct 2025 23:32:06 +0800 Subject: [PATCH 02/26] ublk: add parameter `struct io_uring_cmd *` to ublk_prep_auto_buf_reg() Add parameter `struct io_uring_cmd *` to ublk_prep_auto_buf_reg() and prepare for reusing this helper for the coming UBLK_BATCH_IO feature, which can fetch & commit one batch of io commands via single uring_cmd. Reviewed-by: Caleb Sander Mateos Signed-off-by: Ming Lei --- drivers/block/ublk_drv.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 0c74a41a67530..f92717ef3ede1 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1242,11 +1242,12 @@ ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, struct ublk_io *io) } static bool ublk_auto_buf_reg(const struct ublk_queue *ubq, struct request *req, - struct ublk_io *io, unsigned int issue_flags) + struct ublk_io *io, struct io_uring_cmd *cmd, + unsigned int issue_flags) { int ret; - ret = io_buffer_register_bvec(io->cmd, req, ublk_io_release, + ret = io_buffer_register_bvec(cmd, req, ublk_io_release, io->buf.index, issue_flags); if (ret) { if (io->buf.flags & UBLK_AUTO_BUF_REG_FALLBACK) { @@ -1258,18 +1259,19 @@ static bool ublk_auto_buf_reg(const struct ublk_queue *ubq, struct request *req, } io->task_registered_buffers = 1; - io->buf_ctx_handle = io_uring_cmd_ctx_handle(io->cmd); + io->buf_ctx_handle = io_uring_cmd_ctx_handle(cmd); io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG; return true; } static bool ublk_prep_auto_buf_reg(struct ublk_queue *ubq, struct request *req, struct ublk_io *io, + struct io_uring_cmd *cmd, unsigned int issue_flags) { ublk_init_req_ref(ubq, io); if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) - return ublk_auto_buf_reg(ubq, req, io, issue_flags); + return ublk_auto_buf_reg(ubq, req, io, cmd, issue_flags); return true; } @@ -1344,7 +1346,7 @@ static void ublk_dispatch_req(struct ublk_queue *ubq, if (!ublk_start_io(ubq, req, io)) return; - if (ublk_prep_auto_buf_reg(ubq, req, io, issue_flags)) + if (ublk_prep_auto_buf_reg(ubq, req, io, io->cmd, issue_flags)) ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags); } From 706e4e0897549cee628d42f48ea6b85a0046ea22 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 23 Oct 2025 23:32:07 +0800 Subject: [PATCH 03/26] ublk: add `union ublk_io_buf` with improved naming Add `union ublk_io_buf` for naming the anonymous union of struct ublk_io's addr and buf fields, meantime apply it to `struct ublk_io` for storing either ublk auto buffer register data or ublk server io buffer address. The union uses clear field names: - `addr`: for regular ublk server io buffer addresses - `auto_reg`: for ublk auto buffer registration data This eliminates confusing access patterns and improves code readability. Reviewed-by: Caleb Sander Mateos Signed-off-by: Ming Lei --- drivers/block/ublk_drv.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index f92717ef3ede1..b89acfee46c3e 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -155,12 +155,13 @@ struct ublk_uring_cmd_pdu { */ #define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2) +union ublk_io_buf { + __u64 addr; + struct ublk_auto_buf_reg auto_reg; +}; + struct ublk_io { - /* userspace buffer address from io cmd */ - union { - __u64 addr; - struct ublk_auto_buf_reg buf; - }; + union ublk_io_buf buf; unsigned int flags; int res; @@ -499,7 +500,7 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, iod->op_flags = ublk_op | ublk_req_build_flags(req); iod->nr_sectors = blk_rq_sectors(req); iod->start_sector = blk_rq_pos(req); - iod->addr = io->addr; + iod->addr = io->buf.addr; return BLK_STS_OK; } @@ -1047,7 +1048,7 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, struct iov_iter iter; const int dir = ITER_DEST; - import_ubuf(dir, u64_to_user_ptr(io->addr), rq_bytes, &iter); + import_ubuf(dir, u64_to_user_ptr(io->buf.addr), rq_bytes, &iter); return ublk_copy_user_pages(req, 0, &iter, dir); } return rq_bytes; @@ -1068,7 +1069,7 @@ static int ublk_unmap_io(bool need_map, WARN_ON_ONCE(io->res > rq_bytes); - import_ubuf(dir, u64_to_user_ptr(io->addr), io->res, &iter); + import_ubuf(dir, u64_to_user_ptr(io->buf.addr), io->res, &iter); return ublk_copy_user_pages(req, 0, &iter, dir); } return rq_bytes; @@ -1134,7 +1135,7 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req) iod->op_flags = ublk_op | ublk_req_build_flags(req); iod->nr_sectors = blk_rq_sectors(req); iod->start_sector = blk_rq_pos(req); - iod->addr = io->addr; + iod->addr = io->buf.addr; return BLK_STS_OK; } @@ -1248,9 +1249,9 @@ static bool ublk_auto_buf_reg(const struct ublk_queue *ubq, struct request *req, int ret; ret = io_buffer_register_bvec(cmd, req, ublk_io_release, - io->buf.index, issue_flags); + io->buf.auto_reg.index, issue_flags); if (ret) { - if (io->buf.flags & UBLK_AUTO_BUF_REG_FALLBACK) { + if (io->buf.auto_reg.flags & UBLK_AUTO_BUF_REG_FALLBACK) { ublk_auto_buf_reg_fallback(ubq, io); return true; } @@ -1539,7 +1540,7 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) */ io->flags &= UBLK_IO_FLAG_CANCELED; io->cmd = NULL; - io->addr = 0; + io->buf.addr = 0; /* * old task is PF_EXITING, put it now @@ -2100,13 +2101,16 @@ static inline int ublk_check_cmd_op(u32 cmd_op) static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd *cmd) { - io->buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr)); + struct ublk_auto_buf_reg buf; + + buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr)); - if (io->buf.reserved0 || io->buf.reserved1) + if (buf.reserved0 || buf.reserved1) return -EINVAL; - if (io->buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK) + if (buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK) return -EINVAL; + io->buf.auto_reg = buf; return 0; } @@ -2128,7 +2132,7 @@ static int ublk_handle_auto_buf_reg(struct ublk_io *io, * this ublk request gets stuck. */ if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd)) - *buf_idx = io->buf.index; + *buf_idx = io->buf.auto_reg.index; } return ublk_set_auto_buf_reg(io, cmd); @@ -2156,7 +2160,7 @@ ublk_config_io_buf(const struct ublk_device *ub, struct ublk_io *io, if (ublk_dev_support_auto_buf_reg(ub)) return ublk_handle_auto_buf_reg(io, cmd, buf_idx); - io->addr = buf_addr; + io->buf.addr = buf_addr; return 0; } @@ -2353,7 +2357,7 @@ static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io, */ io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA; /* update iod->addr because ublksrv may have passed a new io buffer */ - ublk_get_iod(ubq, req->tag)->addr = io->addr; + ublk_get_iod(ubq, req->tag)->addr = io->buf.addr; pr_devel("%s: update iod->addr: qid %d tag %d io_flags %x addr %llx\n", __func__, ubq->q_id, req->tag, io->flags, ublk_get_iod(ubq, req->tag)->addr); From c777f9441a854161648fce93fc64d41d850b81a7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 23 Oct 2025 23:32:08 +0800 Subject: [PATCH 04/26] ublk: refactor auto buffer register in ublk_dispatch_req() Refactor auto buffer register code and prepare for supporting batch IO feature, and the main motivation is to put 'ublk_io' operation code together, so that per-io lock can be applied for the code block. The key changes are: - Rename ublk_auto_buf_reg() as ublk_do_auto_buf_reg() - Introduce an enum `auto_buf_reg_res` to represent the result of the buffer registration attempt (FAIL, FALLBACK, OK). - Split the existing `ublk_do_auto_buf_reg` function into two: - `__ublk_do_auto_buf_reg`: Performs the actual buffer registration and returns the `auto_buf_reg_res` status. - `ublk_do_auto_buf_reg`: A wrapper that calls the internal function and handles the I/O preparation based on the result. - Introduce `ublk_prep_auto_buf_reg_io` to encapsulate the logic for preparing the I/O for completion after buffer registration. - Pass the `tag` directly to `ublk_auto_buf_reg_fallback` to avoid recalculating it. This refactoring makes the control flow clearer and isolates the different stages of the auto buffer registration process. Signed-off-by: Ming Lei --- drivers/block/ublk_drv.c | 64 +++++++++++++++++++++++++++------------- 1 file changed, 43 insertions(+), 21 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index b89acfee46c3e..a11ccf3d96d6c 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1234,17 +1234,37 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq, } static void -ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, struct ublk_io *io) +ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, unsigned tag) { - unsigned tag = io - ubq->ios; struct ublksrv_io_desc *iod = ublk_get_iod(ubq, tag); iod->op_flags |= UBLK_IO_F_NEED_REG_BUF; } -static bool ublk_auto_buf_reg(const struct ublk_queue *ubq, struct request *req, - struct ublk_io *io, struct io_uring_cmd *cmd, - unsigned int issue_flags) +enum auto_buf_reg_res { + AUTO_BUF_REG_FAIL, + AUTO_BUF_REG_FALLBACK, + AUTO_BUF_REG_OK, +}; + +static void ublk_prep_auto_buf_reg_io(const struct ublk_queue *ubq, + struct request *req, struct ublk_io *io, + struct io_uring_cmd *cmd, + enum auto_buf_reg_res res) +{ + if (res == AUTO_BUF_REG_OK) { + io->task_registered_buffers = 1; + io->buf_ctx_handle = io_uring_cmd_ctx_handle(cmd); + io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG; + } + ublk_init_req_ref(ubq, io); + __ublk_prep_compl_io_cmd(io, req); +} + +static enum auto_buf_reg_res +__ublk_do_auto_buf_reg(const struct ublk_queue *ubq, struct request *req, + struct ublk_io *io, struct io_uring_cmd *cmd, + unsigned int issue_flags) { int ret; @@ -1252,29 +1272,27 @@ static bool ublk_auto_buf_reg(const struct ublk_queue *ubq, struct request *req, io->buf.auto_reg.index, issue_flags); if (ret) { if (io->buf.auto_reg.flags & UBLK_AUTO_BUF_REG_FALLBACK) { - ublk_auto_buf_reg_fallback(ubq, io); - return true; + ublk_auto_buf_reg_fallback(ubq, req->tag); + return AUTO_BUF_REG_FALLBACK; } blk_mq_end_request(req, BLK_STS_IOERR); - return false; + return AUTO_BUF_REG_FAIL; } - io->task_registered_buffers = 1; - io->buf_ctx_handle = io_uring_cmd_ctx_handle(cmd); - io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG; - return true; + return AUTO_BUF_REG_OK; } -static bool ublk_prep_auto_buf_reg(struct ublk_queue *ubq, - struct request *req, struct ublk_io *io, - struct io_uring_cmd *cmd, - unsigned int issue_flags) +static void ublk_do_auto_buf_reg(const struct ublk_queue *ubq, struct request *req, + struct ublk_io *io, struct io_uring_cmd *cmd, + unsigned int issue_flags) { - ublk_init_req_ref(ubq, io); - if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) - return ublk_auto_buf_reg(ubq, req, io, cmd, issue_flags); + enum auto_buf_reg_res res = __ublk_do_auto_buf_reg(ubq, req, io, cmd, + issue_flags); - return true; + if (res != AUTO_BUF_REG_FAIL) { + ublk_prep_auto_buf_reg_io(ubq, req, io, cmd, res); + io_uring_cmd_done(cmd, UBLK_IO_RES_OK, issue_flags); + } } static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req, @@ -1347,8 +1365,12 @@ static void ublk_dispatch_req(struct ublk_queue *ubq, if (!ublk_start_io(ubq, req, io)) return; - if (ublk_prep_auto_buf_reg(ubq, req, io, io->cmd, issue_flags)) + if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) { + ublk_do_auto_buf_reg(ubq, req, io, io->cmd, issue_flags); + } else { + ublk_init_req_ref(ubq, io); ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags); + } } static void ublk_cmd_tw_cb(struct io_uring_cmd *cmd, From 830917b582c8c2dc5ddfcc483d51b78e06db6b54 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 23 Oct 2025 23:32:09 +0800 Subject: [PATCH 05/26] ublk: add helper of __ublk_fetch() Add helper __ublk_fetch() for the coming batch io feature. Meantime move ublk_config_io_buf() out of __ublk_fetch() because batch io has new interface for configuring buffer. Signed-off-by: Ming Lei --- drivers/block/ublk_drv.c | 46 +++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index a11ccf3d96d6c..a44e40d1118b0 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2300,39 +2300,41 @@ static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr) return 0; } -static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub, - struct ublk_io *io, __u64 buf_addr) +static int __ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub, + struct ublk_io *io) { - int ret = 0; - - /* - * When handling FETCH command for setting up ublk uring queue, - * ub->mutex is the innermost lock, and we won't block for handling - * FETCH, so it is fine even for IO_URING_F_NONBLOCK. - */ - mutex_lock(&ub->mutex); /* UBLK_IO_FETCH_REQ is only allowed before dev is setup */ - if (ublk_dev_ready(ub)) { - ret = -EBUSY; - goto out; - } + if (ublk_dev_ready(ub)) + return -EBUSY; /* allow each command to be FETCHed at most once */ - if (io->flags & UBLK_IO_FLAG_ACTIVE) { - ret = -EINVAL; - goto out; - } + if (io->flags & UBLK_IO_FLAG_ACTIVE) + return -EINVAL; WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV); ublk_fill_io_cmd(io, cmd); - ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL); - if (ret) - goto out; WRITE_ONCE(io->task, get_task_struct(current)); ublk_mark_io_ready(ub); -out: + + return 0; +} + +static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub, + struct ublk_io *io, __u64 buf_addr) +{ + int ret; + + /* + * When handling FETCH command for setting up ublk uring queue, + * ub->mutex is the innermost lock, and we won't block for handling + * FETCH, so it is fine even for IO_URING_F_NONBLOCK. + */ + mutex_lock(&ub->mutex); + ret = __ublk_fetch(cmd, ub, io); + if (!ret) + ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL); mutex_unlock(&ub->mutex); return ret; } From e611748d3584831d1ac73b8952de66ec7059de93 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 23 Oct 2025 23:32:10 +0800 Subject: [PATCH 06/26] ublk: define ublk_ch_batch_io_fops for the coming feature F_BATCH_IO Introduces the basic structure for a batched I/O feature in the ublk driver. It adds placeholder functions and a new file operations structure, ublk_ch_batch_io_fops, which will be used for fetching and committing I/O commands in batches. Currently, the feature is disabled. Reviewed-by: Caleb Sander Mateos Signed-off-by: Ming Lei --- drivers/block/ublk_drv.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index a44e40d1118b0..6b6e82ec7e45a 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -255,6 +255,11 @@ static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, u16 q_id, u16 tag, struct ublk_io *io, size_t offset); static inline unsigned int ublk_req_build_flags(struct request *req); +static inline bool ublk_dev_support_batch_io(const struct ublk_device *ub) +{ + return false; +} + static inline struct ublksrv_io_desc * ublk_get_iod(const struct ublk_queue *ubq, unsigned tag) { @@ -2578,6 +2583,12 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) return ublk_ch_uring_cmd_local(cmd, issue_flags); } +static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + return -EOPNOTSUPP; +} + static inline bool ublk_check_ubuf_dir(const struct request *req, int ubuf_dir) { @@ -2690,6 +2701,16 @@ static const struct file_operations ublk_ch_fops = { .mmap = ublk_ch_mmap, }; +static const struct file_operations ublk_ch_batch_io_fops = { + .owner = THIS_MODULE, + .open = ublk_ch_open, + .release = ublk_ch_release, + .read_iter = ublk_ch_read_iter, + .write_iter = ublk_ch_write_iter, + .uring_cmd = ublk_ch_batch_io_uring_cmd, + .mmap = ublk_ch_mmap, +}; + static void ublk_deinit_queue(struct ublk_device *ub, int q_id) { int size = ublk_queue_cmd_buf_size(ub); @@ -2827,7 +2848,10 @@ static int ublk_add_chdev(struct ublk_device *ub) if (ret) goto fail; - cdev_init(&ub->cdev, &ublk_ch_fops); + if (ublk_dev_support_batch_io(ub)) + cdev_init(&ub->cdev, &ublk_ch_batch_io_fops); + else + cdev_init(&ub->cdev, &ublk_ch_fops); ret = cdev_device_add(&ub->cdev, dev); if (ret) goto fail; From e57b7eb09f2f0fff5619d5274488f218bfd8b81b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 23 Oct 2025 23:32:11 +0800 Subject: [PATCH 07/26] ublk: prepare for not tracking task context for command batch batch io is designed to be independent of task context, and we will not track task context for batch io feature. So warn on non-batch-io code paths. Signed-off-by: Ming Lei --- drivers/block/ublk_drv.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 6b6e82ec7e45a..7f02d9233a62a 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2320,7 +2320,10 @@ static int __ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub, ublk_fill_io_cmd(io, cmd); - WRITE_ONCE(io->task, get_task_struct(current)); + if (ublk_dev_support_batch_io(ub)) + WRITE_ONCE(io->task, NULL); + else + WRITE_ONCE(io->task, get_task_struct(current)); ublk_mark_io_ready(ub); return 0; From dbfea85a95a522b2fbfe1987a58392b5e7bdcf5b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 23 Oct 2025 23:32:12 +0800 Subject: [PATCH 08/26] ublk: pass const pointer to ublk_queue_is_zoned() Pass const pointer to ublk_queue_is_zoned() because it is readonly. Signed-off-by: Ming Lei --- drivers/block/ublk_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 7f02d9233a62a..09f6cc49b6273 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -271,7 +271,7 @@ static inline bool ublk_dev_is_zoned(const struct ublk_device *ub) return ub->dev_info.flags & UBLK_F_ZONED; } -static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq) +static inline bool ublk_queue_is_zoned(const struct ublk_queue *ubq) { return ubq->flags & UBLK_F_ZONED; } From 81b7fffd809b357aad7c03cce905a4e4227fa0a6 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 23 Oct 2025 23:32:13 +0800 Subject: [PATCH 09/26] ublk: add new batch command UBLK_U_IO_PREP_IO_CMDS & UBLK_U_IO_COMMIT_IO_CMDS Add new command UBLK_U_IO_PREP_IO_CMDS, which is the batch version of UBLK_IO_FETCH_REQ. Add new command UBLK_U_IO_COMMIT_IO_CMDS, which is for committing io command result only, still the batch version. The new command header type is `struct ublk_batch_io`, and fixed buffer is required for these two uring_cmd. This patch doesn't actually implement these commands yet, just validates the SQE fields. Signed-off-by: Ming Lei --- drivers/block/ublk_drv.c | 107 +++++++++++++++++++++++++++++++++- include/uapi/linux/ublk_cmd.h | 49 ++++++++++++++++ 2 files changed, 155 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 09f6cc49b6273..c0406cc6b8ae5 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -85,6 +85,11 @@ UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \ UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT) +#define UBLK_BATCH_F_ALL \ + (UBLK_BATCH_F_HAS_ZONE_LBA | \ + UBLK_BATCH_F_HAS_BUF_ADDR | \ + UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) + struct ublk_uring_cmd_pdu { /* * Store requests in same batch temporarily for queuing them to @@ -108,6 +113,12 @@ struct ublk_uring_cmd_pdu { u16 tag; }; +struct ublk_batch_io_data { + struct ublk_device *ub; + struct io_uring_cmd *cmd; + struct ublk_batch_io header; +}; + /* * io command is active: sqe cmd is received, and its cqe isn't done * @@ -2586,10 +2597,104 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) return ublk_ch_uring_cmd_local(cmd, issue_flags); } +static int ublk_check_batch_cmd_flags(const struct ublk_batch_io *uc) +{ + const u16 mask = UBLK_BATCH_F_HAS_BUF_ADDR | UBLK_BATCH_F_HAS_ZONE_LBA; + const unsigned header_len = sizeof(struct ublk_elem_header); + + if (uc->flags & ~UBLK_BATCH_F_ALL) + return -EINVAL; + + /* UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK requires buffer index */ + if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) && + (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR)) + return -EINVAL; + + switch (uc->flags & mask) { + case 0: + if (uc->elem_bytes != header_len) + return -EINVAL; + break; + case UBLK_BATCH_F_HAS_ZONE_LBA: + case UBLK_BATCH_F_HAS_BUF_ADDR: + if (uc->elem_bytes != header_len + sizeof(u64)) + return -EINVAL; + break; + case UBLK_BATCH_F_HAS_ZONE_LBA | UBLK_BATCH_F_HAS_BUF_ADDR: + if (uc->elem_bytes != header_len + sizeof(u64) + sizeof(u64)) + return -EINVAL; + break; + } + + return 0; +} + +static int ublk_check_batch_cmd(const struct ublk_batch_io_data *data) +{ + + const struct ublk_batch_io *uc = &data->header; + + if (!(data->cmd->flags & IORING_URING_CMD_FIXED)) + return -EINVAL; + + if (uc->nr_elem * uc->elem_bytes > data->cmd->sqe->len) + return -E2BIG; + + if (uc->nr_elem > data->ub->dev_info.queue_depth) + return -E2BIG; + + if ((uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA) && + !ublk_dev_is_zoned(data->ub)) + return -EINVAL; + + if ((uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR) && + !ublk_dev_need_map_io(data->ub)) + return -EINVAL; + + if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) && + !ublk_dev_support_auto_buf_reg(data->ub)) + return -EINVAL; + + if (uc->reserved || uc->reserved2) + return -EINVAL; + + return ublk_check_batch_cmd_flags(uc); +} + static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { - return -EOPNOTSUPP; + const struct ublk_batch_io *uc = io_uring_sqe_cmd(cmd->sqe); + struct ublk_device *ub = cmd->file->private_data; + struct ublk_batch_io_data data = { + .ub = ub, + .cmd = cmd, + .header = (struct ublk_batch_io) { + .q_id = READ_ONCE(uc->q_id), + .flags = READ_ONCE(uc->flags), + .nr_elem = READ_ONCE(uc->nr_elem), + .elem_bytes = READ_ONCE(uc->elem_bytes), + }, + }; + u32 cmd_op = cmd->cmd_op; + int ret = -EINVAL; + + if (data.header.q_id >= ub->dev_info.nr_hw_queues) + goto out; + + switch (cmd_op) { + case UBLK_U_IO_PREP_IO_CMDS: + case UBLK_U_IO_COMMIT_IO_CMDS: + ret = ublk_check_batch_cmd(&data); + if (ret) + goto out; + ret = -EOPNOTSUPP; + break; + default: + ret = -EOPNOTSUPP; + } +out: + return ret; } static inline bool ublk_check_ubuf_dir(const struct request *req, diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index ec77dabba45b2..2ce5a496b622c 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -102,6 +102,10 @@ _IOWR('u', 0x23, struct ublksrv_io_cmd) #define UBLK_U_IO_UNREGISTER_IO_BUF \ _IOWR('u', 0x24, struct ublksrv_io_cmd) +#define UBLK_U_IO_PREP_IO_CMDS \ + _IOWR('u', 0x25, struct ublk_batch_io) +#define UBLK_U_IO_COMMIT_IO_CMDS \ + _IOWR('u', 0x26, struct ublk_batch_io) /* only ABORT means that no re-fetch */ #define UBLK_IO_RES_OK 0 @@ -525,6 +529,51 @@ struct ublksrv_io_cmd { }; }; +struct ublk_elem_header { + __u16 tag; /* IO tag */ + + /* + * Buffer index for incoming io command, only valid iff + * UBLK_F_AUTO_BUF_REG is set + */ + __u16 buf_index; + __s32 result; /* I/O completion result (commit only) */ +}; + +/* + * uring_cmd buffer structure for batch commands + * + * buffer includes multiple elements, which number is specified by + * `nr_elem`. Each element buffer is organized in the following order: + * + * struct ublk_elem_buffer { + * // Mandatory fields (8 bytes) + * struct ublk_elem_header header; + * + * // Optional fields (8 bytes each, included based on flags) + * + * // Buffer address (if UBLK_BATCH_F_HAS_BUF_ADDR) for copying data + * // between ublk request and ublk server buffer + * __u64 buf_addr; + * + * // returned Zone append LBA (if UBLK_BATCH_F_HAS_ZONE_LBA) + * __u64 zone_lba; + * } + * + * Used for `UBLK_U_IO_PREP_IO_CMDS` and `UBLK_U_IO_COMMIT_IO_CMDS` + */ +struct ublk_batch_io { + __u16 q_id; +#define UBLK_BATCH_F_HAS_ZONE_LBA (1 << 0) +#define UBLK_BATCH_F_HAS_BUF_ADDR (1 << 1) +#define UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK (1 << 2) + __u16 flags; + __u16 nr_elem; + __u8 elem_bytes; + __u8 reserved; + __u64 reserved2; +}; + struct ublk_param_basic { #define UBLK_ATTR_READ_ONLY (1 << 0) #define UBLK_ATTR_ROTATIONAL (1 << 1) From d114b6aedbad2baa350b56b8899f440235e725ee Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 23 Oct 2025 23:32:14 +0800 Subject: [PATCH 10/26] ublk: handle UBLK_U_IO_PREP_IO_CMDS This commit implements the handling of the UBLK_U_IO_PREP_IO_CMDS command, which allows userspace to prepare a batch of I/O requests. The core of this change is the `ublk_walk_cmd_buf` function, which iterates over the elements in the uring_cmd fixed buffer. For each element, it parses the I/O details, finds the corresponding `ublk_io` structure, and prepares it for future dispatch. Add per-io lock for protecting concurrent delivery and committing. Signed-off-by: Ming Lei --- drivers/block/ublk_drv.c | 205 +++++++++++++++++++++++++++++++++- include/uapi/linux/ublk_cmd.h | 5 + 2 files changed, 209 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index c0406cc6b8ae5..b3d92b0ed42e9 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -117,6 +117,7 @@ struct ublk_batch_io_data { struct ublk_device *ub; struct io_uring_cmd *cmd; struct ublk_batch_io header; + unsigned int issue_flags; }; /* @@ -201,6 +202,7 @@ struct ublk_io { unsigned task_registered_buffers; void *buf_ctx_handle; + spinlock_t lock; } ____cacheline_aligned_in_smp; struct ublk_queue { @@ -271,6 +273,16 @@ static inline bool ublk_dev_support_batch_io(const struct ublk_device *ub) return false; } +static inline void ublk_io_lock(struct ublk_io *io) +{ + spin_lock(&io->lock); +} + +static inline void ublk_io_unlock(struct ublk_io *io) +{ + spin_unlock(&io->lock); +} + static inline struct ublksrv_io_desc * ublk_get_iod(const struct ublk_queue *ubq, unsigned tag) { @@ -2597,6 +2609,183 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) return ublk_ch_uring_cmd_local(cmd, issue_flags); } +static inline __u64 ublk_batch_buf_addr(const struct ublk_batch_io *uc, + const struct ublk_elem_header *elem) +{ + const void *buf = elem; + + if (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR) + return *(__u64 *)(buf + sizeof(*elem)); + return 0; +} + +static struct ublk_auto_buf_reg +ublk_batch_auto_buf_reg(const struct ublk_batch_io *uc, + const struct ublk_elem_header *elem) +{ + struct ublk_auto_buf_reg reg = { + .index = elem->buf_index, + .flags = (uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) ? + UBLK_AUTO_BUF_REG_FALLBACK : 0, + }; + + return reg; +} + +/* + * 48 can hold any type of buffer element(8, 16 and 24 bytes) because + * it is the least common multiple(LCM) of 8, 16 and 24 + */ +#define UBLK_CMD_BATCH_TMP_BUF_SZ (48 * 10) +struct ublk_batch_io_iter { + /* copy to this buffer from iterator first */ + unsigned char buf[UBLK_CMD_BATCH_TMP_BUF_SZ]; + struct iov_iter iter; + unsigned done, total; + unsigned char elem_bytes; +}; + +static inline int +__ublk_walk_cmd_buf(struct ublk_queue *ubq, + struct ublk_batch_io_iter *iter, + const struct ublk_batch_io_data *data, + unsigned bytes, + int (*cb)(struct ublk_queue *q, + const struct ublk_batch_io_data *data, + const struct ublk_elem_header *elem)) +{ + int i, ret = 0; + + for (i = 0; i < bytes; i += iter->elem_bytes) { + const struct ublk_elem_header *elem = + (const struct ublk_elem_header *)&iter->buf[i]; + + if (unlikely(elem->tag >= data->ub->dev_info.queue_depth)) { + ret = -EINVAL; + break; + } + + ret = cb(ubq, data, elem); + if (unlikely(ret)) + break; + } + + /* revert unhandled bytes in case of failure */ + if (ret) + iov_iter_revert(&iter->iter, bytes - i); + + iter->done += i; + return ret; +} + +static int ublk_walk_cmd_buf(struct ublk_batch_io_iter *iter, + const struct ublk_batch_io_data *data, + int (*cb)(struct ublk_queue *q, + const struct ublk_batch_io_data *data, + const struct ublk_elem_header *elem)) +{ + struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id); + int ret = 0; + + while (iter->done < iter->total) { + unsigned int len = min(sizeof(iter->buf), iter->total - iter->done); + + ret = copy_from_iter(iter->buf, len, &iter->iter); + if (ret != len) { + pr_warn("ublk%d: read batch cmd buffer failed %u/%u\n", + data->ub->dev_info.dev_id, ret, len); + ret = -EINVAL; + break; + } + + ret = __ublk_walk_cmd_buf(ubq, iter, data, len, cb); + if (ret) + break; + } + return ret; +} + +static int ublk_batch_unprep_io(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + const struct ublk_elem_header *elem) +{ + struct ublk_io *io = &ubq->ios[elem->tag]; + + data->ub->nr_io_ready--; + ublk_io_lock(io); + io->flags = 0; + ublk_io_unlock(io); + return 0; +} + +static void ublk_batch_revert_prep_cmd(struct ublk_batch_io_iter *iter, + const struct ublk_batch_io_data *data) +{ + int ret; + + if (!iter->done) + return; + + iov_iter_revert(&iter->iter, iter->done); + iter->total = iter->done; + iter->done = 0; + + ret = ublk_walk_cmd_buf(iter, data, ublk_batch_unprep_io); + WARN_ON_ONCE(ret); +} + +static int ublk_batch_prep_io(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + const struct ublk_elem_header *elem) +{ + struct ublk_io *io = &ubq->ios[elem->tag]; + const struct ublk_batch_io *uc = &data->header; + union ublk_io_buf buf = { 0 }; + int ret; + + if (ublk_dev_support_auto_buf_reg(data->ub)) + buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem); + else if (ublk_dev_need_map_io(data->ub)) { + buf.addr = ublk_batch_buf_addr(uc, elem); + + ret = ublk_check_fetch_buf(data->ub, buf.addr); + if (ret) + return ret; + } + + ublk_io_lock(io); + ret = __ublk_fetch(data->cmd, data->ub, io); + if (!ret) + io->buf = buf; + ublk_io_unlock(io); + + return ret; +} + +static int ublk_handle_batch_prep_cmd(const struct ublk_batch_io_data *data) +{ + const struct ublk_batch_io *uc = &data->header; + struct io_uring_cmd *cmd = data->cmd; + struct ublk_batch_io_iter iter = { + .total = uc->nr_elem * uc->elem_bytes, + .elem_bytes = uc->elem_bytes, + }; + int ret; + + ret = io_uring_cmd_import_fixed(cmd->sqe->addr, iter.total, + WRITE, &iter.iter, cmd, data->issue_flags); + if (ret) + return ret; + + mutex_lock(&data->ub->mutex); + ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_prep_io); + + if (ret && iter.done) + ublk_batch_revert_prep_cmd(&iter, data); + mutex_unlock(&data->ub->mutex); + return ret; +} + static int ublk_check_batch_cmd_flags(const struct ublk_batch_io *uc) { const u16 mask = UBLK_BATCH_F_HAS_BUF_ADDR | UBLK_BATCH_F_HAS_ZONE_LBA; @@ -2675,6 +2864,7 @@ static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd, .nr_elem = READ_ONCE(uc->nr_elem), .elem_bytes = READ_ONCE(uc->elem_bytes), }, + .issue_flags = issue_flags, }; u32 cmd_op = cmd->cmd_op; int ret = -EINVAL; @@ -2684,6 +2874,11 @@ static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd, switch (cmd_op) { case UBLK_U_IO_PREP_IO_CMDS: + ret = ublk_check_batch_cmd(&data); + if (ret) + goto out; + ret = ublk_handle_batch_prep_cmd(&data); + break; case UBLK_U_IO_COMMIT_IO_CMDS: ret = ublk_check_batch_cmd(&data); if (ret) @@ -2842,7 +3037,7 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id) struct ublk_queue *ubq = ublk_get_queue(ub, q_id); gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO; void *ptr; - int size; + int size, i; spin_lock_init(&ubq->cancel_lock); ubq->flags = ub->dev_info.flags; @@ -2854,6 +3049,9 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id) if (!ptr) return -ENOMEM; + for (i = 0; i < ubq->q_depth; i++) + spin_lock_init(&ubq->ios[i].lock); + ubq->io_cmd_buf = ptr; ubq->dev = ub; return 0; @@ -3092,6 +3290,11 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, return -EINVAL; mutex_lock(&ub->mutex); + /* device may become not ready in case of F_BATCH */ + if (!ublk_dev_ready(ub)) { + ret = -EINVAL; + goto out_unlock; + } if (ub->dev_info.state == UBLK_S_DEV_LIVE || test_bit(UB_STATE_USED, &ub->state)) { ret = -EEXIST; diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 2ce5a496b622c..c96c299057c38 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -102,6 +102,11 @@ _IOWR('u', 0x23, struct ublksrv_io_cmd) #define UBLK_U_IO_UNREGISTER_IO_BUF \ _IOWR('u', 0x24, struct ublksrv_io_cmd) + +/* + * return 0 if the command is run successfully, otherwise failure code + * is returned + */ #define UBLK_U_IO_PREP_IO_CMDS \ _IOWR('u', 0x25, struct ublk_batch_io) #define UBLK_U_IO_COMMIT_IO_CMDS \ From 185cb778d5c780fcc1917c1f2f7ddb2c27aebaa6 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 23 Oct 2025 23:32:15 +0800 Subject: [PATCH 11/26] ublk: handle UBLK_U_IO_COMMIT_IO_CMDS Handle UBLK_U_IO_COMMIT_IO_CMDS by walking the uring_cmd fixed buffer: - read each element into one temp buffer in batch style - parse and apply each element for committing io result Signed-off-by: Ming Lei --- drivers/block/ublk_drv.c | 121 ++++++++++++++++++++++++++++++++-- include/uapi/linux/ublk_cmd.h | 8 +++ 2 files changed, 125 insertions(+), 4 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index b3d92b0ed42e9..3066bfa19f990 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2164,9 +2164,9 @@ static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd return 0; } -static int ublk_handle_auto_buf_reg(struct ublk_io *io, - struct io_uring_cmd *cmd, - u16 *buf_idx) +static void __ublk_handle_auto_buf_reg(struct ublk_io *io, + struct io_uring_cmd *cmd, + u16 *buf_idx) { if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) { io->flags &= ~UBLK_IO_FLAG_AUTO_BUF_REG; @@ -2184,7 +2184,13 @@ static int ublk_handle_auto_buf_reg(struct ublk_io *io, if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd)) *buf_idx = io->buf.auto_reg.index; } +} +static int ublk_handle_auto_buf_reg(struct ublk_io *io, + struct io_uring_cmd *cmd, + u16 *buf_idx) +{ + __ublk_handle_auto_buf_reg(io, cmd, buf_idx); return ublk_set_auto_buf_reg(io, cmd); } @@ -2619,6 +2625,17 @@ static inline __u64 ublk_batch_buf_addr(const struct ublk_batch_io *uc, return 0; } +static inline __u64 ublk_batch_zone_lba(const struct ublk_batch_io *uc, + const struct ublk_elem_header *elem) +{ + const void *buf = (const void *)elem; + + if (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA) + return *(__u64 *)(buf + sizeof(*elem) + + 8 * !!(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR)); + return -1; +} + static struct ublk_auto_buf_reg ublk_batch_auto_buf_reg(const struct ublk_batch_io *uc, const struct ublk_elem_header *elem) @@ -2786,6 +2803,102 @@ static int ublk_handle_batch_prep_cmd(const struct ublk_batch_io_data *data) return ret; } +static int ublk_batch_commit_io_check(const struct ublk_queue *ubq, + struct ublk_io *io, + union ublk_io_buf *buf) +{ + struct request *req = io->req; + + if (!req) + return -EINVAL; + + if (io->flags & UBLK_IO_FLAG_ACTIVE) + return -EBUSY; + + if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) + return -EINVAL; + + if (ublk_need_map_io(ubq)) { + /* + * COMMIT_AND_FETCH_REQ has to provide IO buffer if + * NEED GET DATA is not enabled or it is Read IO. + */ + if (!buf->addr && (!ublk_need_get_data(ubq) || + req_op(req) == REQ_OP_READ)) + return -EINVAL; + } + return 0; +} + +static int ublk_batch_commit_io(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + const struct ublk_elem_header *elem) +{ + struct ublk_io *io = &ubq->ios[elem->tag]; + const struct ublk_batch_io *uc = &data->header; + u16 buf_idx = UBLK_INVALID_BUF_IDX; + union ublk_io_buf buf = { 0 }; + struct request *req = NULL; + bool auto_reg = false; + bool compl = false; + int ret; + + if (ublk_dev_support_auto_buf_reg(data->ub)) { + buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem); + auto_reg = true; + } else if (ublk_dev_need_map_io(data->ub)) + buf.addr = ublk_batch_buf_addr(uc, elem); + + ublk_io_lock(io); + ret = ublk_batch_commit_io_check(ubq, io, &buf); + if (!ret) { + io->res = elem->result; + io->buf = buf; + req = ublk_fill_io_cmd(io, data->cmd); + + if (auto_reg) + __ublk_handle_auto_buf_reg(io, data->cmd, &buf_idx); + compl = ublk_need_complete_req(data->ub, io); + } + ublk_io_unlock(io); + + if (unlikely(ret)) { + pr_warn("%s: dev %u queue %u io %u: commit failure %d\n", + __func__, data->ub->dev_info.dev_id, ubq->q_id, + elem->tag, ret); + return ret; + } + + /* can't touch 'ublk_io' any more */ + if (buf_idx != UBLK_INVALID_BUF_IDX) + io_buffer_unregister_bvec(data->cmd, buf_idx, data->issue_flags); + if (req_op(req) == REQ_OP_ZONE_APPEND) + req->__sector = ublk_batch_zone_lba(uc, elem); + if (compl) + __ublk_complete_rq(req, io, ublk_dev_need_map_io(data->ub)); + return 0; +} + +static int ublk_handle_batch_commit_cmd(const struct ublk_batch_io_data *data) +{ + const struct ublk_batch_io *uc = &data->header; + struct io_uring_cmd *cmd = data->cmd; + struct ublk_batch_io_iter iter = { + .total = uc->nr_elem * uc->elem_bytes, + .elem_bytes = uc->elem_bytes, + }; + int ret; + + ret = io_uring_cmd_import_fixed(cmd->sqe->addr, iter.total, + WRITE, &iter.iter, cmd, data->issue_flags); + if (ret) + return ret; + + ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_commit_io); + + return iter.done == 0 ? ret : iter.done; +} + static int ublk_check_batch_cmd_flags(const struct ublk_batch_io *uc) { const u16 mask = UBLK_BATCH_F_HAS_BUF_ADDR | UBLK_BATCH_F_HAS_ZONE_LBA; @@ -2883,7 +2996,7 @@ static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd, ret = ublk_check_batch_cmd(&data); if (ret) goto out; - ret = -EOPNOTSUPP; + ret = ublk_handle_batch_commit_cmd(&data); break; default: ret = -EOPNOTSUPP; diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index c96c299057c38..295ec8f341731 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -109,6 +109,14 @@ */ #define UBLK_U_IO_PREP_IO_CMDS \ _IOWR('u', 0x25, struct ublk_batch_io) +/* + * If failure code is returned, nothing in the command buffer is handled. + * Otherwise, the returned value means how many bytes in command buffer + * are handled actually, then number of handled IOs can be calculated with + * `elem_bytes` for each IO. IOs in the remained bytes are not committed, + * userspace has to check return value for dealing with partial committing + * correctly. + */ #define UBLK_U_IO_COMMIT_IO_CMDS \ _IOWR('u', 0x26, struct ublk_batch_io) From 3e179a0ecf8e1a57df21943dbdc26aa6af13caa8 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 23 Oct 2025 23:32:16 +0800 Subject: [PATCH 12/26] ublk: add io events fifo structure Add ublk io events fifo structure and prepare for supporting command batch, which will use io_uring multishot uring_cmd for fetching one batch of io commands each time. One nice feature of kfifo is to allow multiple producer vs single consumer. We just need lock the producer side, meantime the single consumer can be lockless. The producer is actually from ublk_queue_rq() or ublk_queue_rqs(), so lock contention can be eased by setting proper blk-mq nr_queues. Signed-off-by: Ming Lei --- drivers/block/ublk_drv.c | 55 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 3066bfa19f990..a17056f592bcb 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #define UBLK_MINORS (1U << MINORBITS) @@ -217,6 +218,22 @@ struct ublk_queue { bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */ spinlock_t cancel_lock; struct ublk_device *dev; + + /* + * Inflight ublk request tag is saved in this fifo + * + * There are multiple writer from ublk_queue_rq() or ublk_queue_rqs(), + * so lock is required for storing request tag to fifo + * + * Make sure just one reader for fetching request from task work + * function to ublk server, so no need to grab the lock in reader + * side. + */ + struct { + DECLARE_KFIFO_PTR(evts_fifo, unsigned short); + spinlock_t evts_lock; + }____cacheline_aligned_in_smp; + struct ublk_io ios[]; }; @@ -283,6 +300,31 @@ static inline void ublk_io_unlock(struct ublk_io *io) spin_unlock(&io->lock); } +/* Initialize the queue */ +static inline int ublk_io_evts_init(struct ublk_queue *q, unsigned int size) +{ + spin_lock_init(&q->evts_lock); + return kfifo_alloc(&q->evts_fifo, size, GFP_KERNEL); +} + +/* Check if queue is empty */ +static inline bool ublk_io_evts_empty(const struct ublk_queue *q) +{ + return kfifo_is_empty(&q->evts_fifo); +} + +/* Check if queue is full */ +static inline bool ublk_io_evts_full(const struct ublk_queue *q) +{ + return kfifo_is_full(&q->evts_fifo); +} + +static inline void ublk_io_evts_deinit(struct ublk_queue *q) +{ + WARN_ON_ONCE(!kfifo_is_empty(&q->evts_fifo)); + kfifo_free(&q->evts_fifo); +} + static inline struct ublksrv_io_desc * ublk_get_iod(const struct ublk_queue *ubq, unsigned tag) { @@ -3143,6 +3185,8 @@ static void ublk_deinit_queue(struct ublk_device *ub, int q_id) if (ubq->io_cmd_buf) free_pages((unsigned long)ubq->io_cmd_buf, get_order(size)); + if (ublk_dev_support_batch_io(ub)) + ublk_io_evts_deinit(ubq); } static int ublk_init_queue(struct ublk_device *ub, int q_id) @@ -3150,7 +3194,7 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id) struct ublk_queue *ubq = ublk_get_queue(ub, q_id); gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO; void *ptr; - int size, i; + int size, i, ret = 0; spin_lock_init(&ubq->cancel_lock); ubq->flags = ub->dev_info.flags; @@ -3167,7 +3211,16 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id) ubq->io_cmd_buf = ptr; ubq->dev = ub; + + if (ublk_dev_support_batch_io(ub)) { + ret = ublk_io_evts_init(ubq, ubq->q_depth); + if (ret) + goto fail; + } return 0; +fail: + ublk_deinit_queue(ub, q_id); + return ret; } static void ublk_deinit_queues(struct ublk_device *ub) From 7ef63767e26281802ebb3c30748c12d43a5bca1f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 23 Oct 2025 23:32:17 +0800 Subject: [PATCH 13/26] ublk: add batch I/O dispatch infrastructure Add infrastructure for delivering I/O commands to ublk server in batches, preparing for the upcoming UBLK_U_IO_FETCH_IO_CMDS feature. Key components: - struct ublk_batch_fcmd: Represents a batch fetch uring_cmd that will receive multiple I/O tags in a single operation, using io_uring's multishot command for efficient ublk IO delivery. - ublk_batch_dispatch(): Batch version of ublk_dispatch_req() that: * Pulls multiple request tags from the events FIFO (lock-free reader) * Prepares each I/O for delivery (including auto buffer registration) * Delivers tags to userspace via single uring_cmd notification * Handles partial failures by restoring undelivered tags to FIFO The batch approach significantly reduces notification overhead by aggregating multiple I/O completions into single uring_cmd, while maintaining the same I/O processing semantics as individual operations. Error handling ensures system consistency: if buffer selection or CQE posting fails, undelivered tags are restored to the FIFO for retry. This runs in task work context, scheduled via io_uring_cmd_complete_in_task() or called directly from ->uring_cmd(), enabling efficient batch processing without blocking the I/O submission path. Signed-off-by: Ming Lei --- drivers/block/ublk_drv.c | 126 ++++++++++++++++++++++++++++++++++ include/uapi/linux/ublk_cmd.h | 6 ++ 2 files changed, 132 insertions(+) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index a17056f592bcb..412cf1f6ecbc7 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -91,6 +91,12 @@ UBLK_BATCH_F_HAS_BUF_ADDR | \ UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) +/* ublk batch fetch uring_cmd */ +struct ublk_batch_fcmd { + struct io_uring_cmd *cmd; + unsigned short buf_group; +}; + struct ublk_uring_cmd_pdu { /* * Store requests in same batch temporarily for queuing them to @@ -616,6 +622,32 @@ static wait_queue_head_t ublk_idr_wq; /* wait until one idr is freed */ static DEFINE_MUTEX(ublk_ctl_mutex); +static void ublk_batch_deinit_fetch_buf(const struct ublk_batch_io_data *data, + struct ublk_batch_fcmd *fcmd, + int res) +{ + io_uring_cmd_done(fcmd->cmd, res, data->issue_flags); + fcmd->cmd = NULL; +} + +static int ublk_batch_fetch_post_cqe(struct ublk_batch_fcmd *fcmd, + struct io_br_sel *sel, + unsigned int issue_flags) +{ + if (io_uring_mshot_cmd_post_cqe(fcmd->cmd, sel, issue_flags)) + return -ENOBUFS; + return 0; +} + +static ssize_t ublk_batch_copy_io_tags(struct ublk_batch_fcmd *fcmd, + void __user *buf, const u16 *tag_buf, + unsigned int len) +{ + if (copy_to_user(buf, tag_buf, len)) + return -EFAULT; + return len; +} + #define UBLK_MAX_UBLKS UBLK_MINORS /* @@ -1443,6 +1475,100 @@ static void ublk_dispatch_req(struct ublk_queue *ubq, } } +static bool __ublk_batch_prep_dispatch(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + unsigned short tag) +{ + struct ublk_device *ub = data->ub; + struct ublk_io *io = &ubq->ios[tag]; + struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag); + enum auto_buf_reg_res res = AUTO_BUF_REG_FALLBACK; + struct io_uring_cmd *cmd = data->cmd; + + if (!ublk_start_io(ubq, req, io)) + return false; + + if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) + res = __ublk_do_auto_buf_reg(ubq, req, io, cmd, + data->issue_flags); + + ublk_io_lock(io); + ublk_prep_auto_buf_reg_io(ubq, req, io, cmd, res); + ublk_io_unlock(io); + + return res != AUTO_BUF_REG_FAIL; +} + +static void ublk_batch_prep_dispatch(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + unsigned short *tag_buf, + unsigned int len) +{ + int i; + + for (i = 0; i < len; i += 1) { + unsigned short tag = tag_buf[i]; + + if (!__ublk_batch_prep_dispatch(ubq, data, tag)) + tag_buf[i] = UBLK_BATCH_IO_UNUSED_TAG; + } +} + +#define MAX_NR_TAG 128 +static int __ublk_batch_dispatch(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + struct ublk_batch_fcmd *fcmd) +{ + unsigned short tag_buf[MAX_NR_TAG]; + struct io_br_sel sel; + size_t len = 0; + int ret; + + sel = io_uring_cmd_buffer_select(fcmd->cmd, fcmd->buf_group, &len, + data->issue_flags); + if (sel.val < 0) + return sel.val; + if (!sel.addr) + return -ENOBUFS; + + /* single reader needn't lock and sizeof(kfifo element) is 2 bytes */ + len = min(len, sizeof(tag_buf)) / 2; + len = kfifo_out(&ubq->evts_fifo, tag_buf, len); + + ublk_batch_prep_dispatch(ubq, data, tag_buf, len); + + sel.val = ublk_batch_copy_io_tags(fcmd, sel.addr, tag_buf, len * 2); + ret = ublk_batch_fetch_post_cqe(fcmd, &sel, data->issue_flags); + if (unlikely(ret < 0)) { + int res = kfifo_in_spinlocked_noirqsave(&ubq->evts_fifo, + tag_buf, len, &ubq->evts_lock); + + pr_warn("%s: copy tags or post CQE failure, move back " + "tags(%d %lu) ret %d\n", __func__, res, len, + ret); + } + return ret; +} + +static __maybe_unused int +ublk_batch_dispatch(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + struct ublk_batch_fcmd *fcmd) +{ + int ret = 0; + + while (!ublk_io_evts_empty(ubq)) { + ret = __ublk_batch_dispatch(ubq, data, fcmd); + if (ret <= 0) + break; + } + + if (ret < 0) + ublk_batch_deinit_fetch_buf(data, fcmd, ret); + + return ret; +} + static void ublk_cmd_tw_cb(struct io_uring_cmd *cmd, unsigned int issue_flags) { diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 295ec8f341731..a56a50bbb00a4 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -553,6 +553,12 @@ struct ublk_elem_header { __s32 result; /* I/O completion result (commit only) */ }; +/* + * If this tag value is observed from buffer of `UBLK_U_IO_FETCH_IO_CMDS` + * ublk server can simply ignore it + */ +#define UBLK_BATCH_IO_UNUSED_TAG (__u16)(-1) + /* * uring_cmd buffer structure for batch commands * From 68fac3b100c95f664183612f17b1b0774caca87d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 23 Oct 2025 23:32:18 +0800 Subject: [PATCH 14/26] ublk: add UBLK_U_IO_FETCH_IO_CMDS for batch I/O processing Add UBLK_U_IO_FETCH_IO_CMDS command to enable efficient batch processing of I/O requests. This multishot uring_cmd allows the ublk server to fetch multiple I/O commands in a single operation, significantly reducing submission overhead compared to individual FETCH_REQ* commands. Key Design Features: 1. Multishot Operation: One UBLK_U_IO_FETCH_IO_CMDS can fetch many I/O commands, with the batch size limited by the provided buffer length. 2. Dynamic Load Balancing: Multiple fetch commands can be submitted simultaneously, but only one is active at any time. This enables efficient load distribution across multiple server task contexts. 3. Implicit State Management: The implementation uses three key variables to track state: - evts_fifo: Queue of request tags awaiting processing - fcmd_head: List of available fetch commands - active_fcmd: Currently active fetch command (NULL = none active) States are derived implicitly: - IDLE: No fetch commands available - READY: Fetch commands available, none active - ACTIVE: One fetch command processing events 4. Lockless Reader Optimization: The active fetch command can read from evts_fifo without locking (single reader guarantee), while writers (ublk_queue_rq/ublk_queue_rqs) use evts_lock protection. Implementation Details: - ublk_queue_rq() and ublk_queue_rqs() save request tags to evts_fifo - __ublk_pick_active_fcmd() selects an available fetch command when events arrive and no command is currently active - ublk_batch_dispatch() moves tags from evts_fifo to the fetch command's buffer and posts completion via io_uring_mshot_cmd_post_cqe() - State transitions are coordinated via evts_lock to maintain consistency Signed-off-by: Ming Lei --- drivers/block/ublk_drv.c | 394 +++++++++++++++++++++++++++++++--- include/uapi/linux/ublk_cmd.h | 7 + 2 files changed, 370 insertions(+), 31 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 412cf1f6ecbc7..6b9b47b1b4cac 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -93,6 +93,7 @@ /* ublk batch fetch uring_cmd */ struct ublk_batch_fcmd { + struct list_head node; struct io_uring_cmd *cmd; unsigned short buf_group; }; @@ -117,7 +118,10 @@ struct ublk_uring_cmd_pdu { */ struct ublk_queue *ubq; - u16 tag; + union { + u16 tag; + struct ublk_batch_fcmd *fcmd; /* batch io only */ + }; }; struct ublk_batch_io_data { @@ -226,18 +230,36 @@ struct ublk_queue { struct ublk_device *dev; /* - * Inflight ublk request tag is saved in this fifo + * Batch I/O State Management: + * + * The batch I/O system uses implicit state management based on the + * combination of three key variables below. + * + * - IDLE: list_empty(&fcmd_head) && !active_fcmd + * No fetch commands available, events queue in evts_fifo + * + * - READY: !list_empty(&fcmd_head) && !active_fcmd + * Fetch commands available but none processing events * - * There are multiple writer from ublk_queue_rq() or ublk_queue_rqs(), - * so lock is required for storing request tag to fifo + * - ACTIVE: active_fcmd + * One fetch command actively processing events from evts_fifo * - * Make sure just one reader for fetching request from task work - * function to ublk server, so no need to grab the lock in reader - * side. + * Key Invariants: + * - At most one active_fcmd at any time (single reader) + * - active_fcmd is always from fcmd_head list when non-NULL + * - evts_fifo can be read locklessly by the single active reader + * - All state transitions require evts_lock protection + * - Multiple writers to evts_fifo require lock protection */ struct { DECLARE_KFIFO_PTR(evts_fifo, unsigned short); spinlock_t evts_lock; + + /* List of fetch commands available to process events */ + struct list_head fcmd_head; + + /* Currently active fetch command (NULL = none active) */ + struct ublk_batch_fcmd *active_fcmd; }____cacheline_aligned_in_smp; struct ublk_io ios[]; @@ -290,12 +312,22 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq); static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, u16 q_id, u16 tag, struct ublk_io *io, size_t offset); static inline unsigned int ublk_req_build_flags(struct request *req); +static void ublk_batch_dispatch(struct ublk_queue *ubq, + struct ublk_batch_io_data *data, + struct ublk_batch_fcmd *fcmd); +static struct ublk_batch_fcmd *__ublk_pick_active_fcmd( + struct ublk_queue *ubq); static inline bool ublk_dev_support_batch_io(const struct ublk_device *ub) { return false; } +static inline bool ublk_support_batch_io(const struct ublk_queue *ubq) +{ + return false; +} + static inline void ublk_io_lock(struct ublk_io *io) { spin_lock(&io->lock); @@ -621,13 +653,40 @@ static wait_queue_head_t ublk_idr_wq; /* wait until one idr is freed */ static DEFINE_MUTEX(ublk_ctl_mutex); +static struct ublk_batch_fcmd * +ublk_batch_alloc_fcmd(struct io_uring_cmd *cmd) +{ + struct ublk_batch_fcmd *fcmd = kzalloc(sizeof(*fcmd), GFP_NOIO); + + if (fcmd) { + fcmd->cmd = cmd; + fcmd->buf_group = READ_ONCE(cmd->sqe->buf_index); + } + return fcmd; +} -static void ublk_batch_deinit_fetch_buf(const struct ublk_batch_io_data *data, +static void ublk_batch_free_fcmd(struct ublk_batch_fcmd *fcmd) +{ + kfree(fcmd); +} + +/* + * Nothing can move on, so clear ->active_fcmd, and the caller should stop + * dispatching + */ +static void ublk_batch_deinit_fetch_buf(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, struct ublk_batch_fcmd *fcmd, int res) { + spin_lock(&ubq->evts_lock); + list_del(&fcmd->node); + WARN_ON_ONCE(fcmd != ubq->active_fcmd); + ubq->active_fcmd = NULL; + spin_unlock(&ubq->evts_lock); + io_uring_cmd_done(fcmd->cmd, res, data->issue_flags); - fcmd->cmd = NULL; + ublk_batch_free_fcmd(fcmd); } static int ublk_batch_fetch_post_cqe(struct ublk_batch_fcmd *fcmd, @@ -1524,6 +1583,8 @@ static int __ublk_batch_dispatch(struct ublk_queue *ubq, size_t len = 0; int ret; + WARN_ON_ONCE(data->cmd != fcmd->cmd); + sel = io_uring_cmd_buffer_select(fcmd->cmd, fcmd->buf_group, &len, data->issue_flags); if (sel.val < 0) @@ -1550,23 +1611,79 @@ static int __ublk_batch_dispatch(struct ublk_queue *ubq, return ret; } -static __maybe_unused int -ublk_batch_dispatch(struct ublk_queue *ubq, - const struct ublk_batch_io_data *data, - struct ublk_batch_fcmd *fcmd) +static struct ublk_batch_fcmd *__ublk_pick_active_fcmd( + struct ublk_queue *ubq) { + struct ublk_batch_fcmd *fcmd; + + lockdep_assert_held(&ubq->evts_lock); + + if (!ublk_io_evts_empty(ubq) && !ubq->active_fcmd) { + smp_mb(); + fcmd = ubq->active_fcmd = list_first_entry_or_null( + &ubq->fcmd_head, struct ublk_batch_fcmd, node); + } else { + fcmd = NULL; + } + return fcmd; +} + +static void ublk_batch_tw_cb(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + struct ublk_batch_fcmd *fcmd = pdu->fcmd; + struct ublk_batch_io_data data = { + .ub = pdu->ubq->dev, + .cmd = fcmd->cmd, + .issue_flags = issue_flags, + }; + + WARN_ON_ONCE(pdu->ubq->active_fcmd != fcmd); + + ublk_batch_dispatch(pdu->ubq, &data, fcmd); +} + +static void ublk_batch_dispatch(struct ublk_queue *ubq, + struct ublk_batch_io_data *data, + struct ublk_batch_fcmd *fcmd) +{ + struct ublk_batch_fcmd *new_fcmd; + void *handle; + bool empty; int ret = 0; +again: while (!ublk_io_evts_empty(ubq)) { ret = __ublk_batch_dispatch(ubq, data, fcmd); if (ret <= 0) break; } - if (ret < 0) - ublk_batch_deinit_fetch_buf(data, fcmd, ret); + if (ret < 0) { + ublk_batch_deinit_fetch_buf(ubq, data, fcmd, ret); + return; + } - return ret; + handle = io_uring_cmd_ctx_handle(fcmd->cmd); + ubq->active_fcmd = NULL; + smp_mb(); + empty = ublk_io_evts_empty(ubq); + if (likely(empty)) + return; + + spin_lock(&ubq->evts_lock); + new_fcmd = __ublk_pick_active_fcmd(ubq); + spin_unlock(&ubq->evts_lock); + + if (!new_fcmd) + return; + if (handle == io_uring_cmd_ctx_handle(new_fcmd->cmd)) { + data->cmd = new_fcmd->cmd; + fcmd = new_fcmd; + goto again; + } + io_uring_cmd_complete_in_task(new_fcmd->cmd, ublk_batch_tw_cb); } static void ublk_cmd_tw_cb(struct io_uring_cmd *cmd, @@ -1578,13 +1695,27 @@ static void ublk_cmd_tw_cb(struct io_uring_cmd *cmd, ublk_dispatch_req(ubq, pdu->req, issue_flags); } -static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq) +static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq, bool last) { - struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd; - struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + if (ublk_support_batch_io(ubq)) { + unsigned short tag = rq->tag; + struct ublk_batch_fcmd *fcmd = NULL; + + spin_lock(&ubq->evts_lock); + kfifo_put(&ubq->evts_fifo, tag); + if (last) + fcmd = __ublk_pick_active_fcmd(ubq); + spin_unlock(&ubq->evts_lock); + + if (fcmd) + io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb); + } else { + struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd; + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); - pdu->req = rq; - io_uring_cmd_complete_in_task(cmd, ublk_cmd_tw_cb); + pdu->req = rq; + io_uring_cmd_complete_in_task(cmd, ublk_cmd_tw_cb); + } } static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd, @@ -1602,14 +1733,44 @@ static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd, } while (rq); } -static void ublk_queue_cmd_list(struct ublk_io *io, struct rq_list *l) +static void ublk_batch_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l) { - struct io_uring_cmd *cmd = io->cmd; - struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + unsigned short tags[MAX_NR_TAG]; + struct ublk_batch_fcmd *fcmd; + struct request *rq; + unsigned cnt = 0; + + spin_lock(&ubq->evts_lock); + rq_list_for_each(l, rq) { + tags[cnt++] = (unsigned short)rq->tag; + if (cnt >= MAX_NR_TAG) { + kfifo_in(&ubq->evts_fifo, tags, cnt); + cnt = 0; + } + } + if (cnt) + kfifo_in(&ubq->evts_fifo, tags, cnt); + fcmd = __ublk_pick_active_fcmd(ubq); + spin_unlock(&ubq->evts_lock); - pdu->req_list = rq_list_peek(l); rq_list_init(l); - io_uring_cmd_complete_in_task(cmd, ublk_cmd_list_tw_cb); + if (fcmd) + io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb); +} + +static void ublk_queue_cmd_list(struct ublk_queue *ubq, struct ublk_io *io, + struct rq_list *l, bool batch) +{ + if (batch) { + ublk_batch_queue_cmd_list(ubq, l); + } else { + struct io_uring_cmd *cmd = io->cmd; + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + + pdu->req_list = rq_list_peek(l); + rq_list_init(l); + io_uring_cmd_complete_in_task(cmd, ublk_cmd_list_tw_cb); + } } static enum blk_eh_timer_return ublk_timeout(struct request *rq) @@ -1688,7 +1849,7 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } - ublk_queue_cmd(ubq, rq); + ublk_queue_cmd(ubq, rq, bd->last); return BLK_STS_OK; } @@ -1700,11 +1861,25 @@ static inline bool ublk_belong_to_same_batch(const struct ublk_io *io, (io->task == io2->task); } -static void ublk_queue_rqs(struct rq_list *rqlist) +static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx) +{ + struct ublk_queue *ubq = hctx->driver_data; + struct ublk_batch_fcmd *fcmd; + + spin_lock(&ubq->evts_lock); + fcmd = __ublk_pick_active_fcmd(ubq); + spin_unlock(&ubq->evts_lock); + + if (fcmd) + io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb); +} + +static void __ublk_queue_rqs(struct rq_list *rqlist, bool batch) { struct rq_list requeue_list = { }; struct rq_list submit_list = { }; struct ublk_io *io = NULL; + struct ublk_queue *ubq = NULL; struct request *req; while ((req = rq_list_pop(rqlist))) { @@ -1718,16 +1893,27 @@ static void ublk_queue_rqs(struct rq_list *rqlist) if (io && !ublk_belong_to_same_batch(io, this_io) && !rq_list_empty(&submit_list)) - ublk_queue_cmd_list(io, &submit_list); + ublk_queue_cmd_list(ubq, io, &submit_list, batch); io = this_io; + ubq = this_q; rq_list_add_tail(&submit_list, req); } if (!rq_list_empty(&submit_list)) - ublk_queue_cmd_list(io, &submit_list); + ublk_queue_cmd_list(ubq, io, &submit_list, batch); *rqlist = requeue_list; } +static void ublk_queue_rqs(struct rq_list *rqlist) +{ + __ublk_queue_rqs(rqlist, false); +} + +static void ublk_batch_queue_rqs(struct rq_list *rqlist) +{ + __ublk_queue_rqs(rqlist, true); +} + static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, unsigned int hctx_idx) { @@ -1745,6 +1931,14 @@ static const struct blk_mq_ops ublk_mq_ops = { .timeout = ublk_timeout, }; +static const struct blk_mq_ops ublk_batch_mq_ops = { + .commit_rqs = ublk_commit_rqs, + .queue_rq = ublk_queue_rq, + .queue_rqs = ublk_batch_queue_rqs, + .init_hctx = ublk_init_hctx, + .timeout = ublk_timeout, +}; + static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) { int i; @@ -2122,6 +2316,56 @@ static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag, io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, issue_flags); } +static void ublk_batch_cancel_cmd(struct ublk_queue *ubq, + struct ublk_batch_fcmd *fcmd, + unsigned int issue_flags) +{ + bool done; + + spin_lock(&ubq->evts_lock); + done = (ubq->active_fcmd != fcmd); + if (done) + list_del(&fcmd->node); + spin_unlock(&ubq->evts_lock); + + if (done) { + io_uring_cmd_done(fcmd->cmd, UBLK_IO_RES_ABORT, issue_flags); + ublk_batch_free_fcmd(fcmd); + } +} + +static void ublk_batch_cancel_queue(struct ublk_queue *ubq) +{ + LIST_HEAD(fcmd_list); + + spin_lock(&ubq->evts_lock); + ubq->force_abort = true; + list_splice_init(&ubq->fcmd_head, &fcmd_list); + if (ubq->active_fcmd) + list_move(&ubq->active_fcmd->node, &ubq->fcmd_head); + spin_unlock(&ubq->evts_lock); + + while (!list_empty(&fcmd_list)) { + struct ublk_batch_fcmd *fcmd = list_first_entry(&fcmd_list, + struct ublk_batch_fcmd, node); + + ublk_batch_cancel_cmd(ubq, fcmd, IO_URING_F_UNLOCKED); + } +} + +static void ublk_batch_cancel_fn(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + struct ublk_batch_fcmd *fcmd = pdu->fcmd; + struct ublk_queue *ubq = pdu->ubq; + + if (!ubq->canceling) + ublk_start_cancel(ubq->dev); + + ublk_batch_cancel_cmd(ubq, fcmd, issue_flags); +} + /* * The ublk char device won't be closed when calling cancel fn, so both * ublk device and queue are guaranteed to be live @@ -2173,6 +2417,11 @@ static void ublk_cancel_queue(struct ublk_queue *ubq) { int i; + if (ublk_support_batch_io(ubq)) { + ublk_batch_cancel_queue(ubq); + return; + } + for (i = 0; i < ubq->q_depth; i++) ublk_cancel_cmd(ubq, i, IO_URING_F_UNLOCKED); } @@ -3131,6 +3380,74 @@ static int ublk_check_batch_cmd(const struct ublk_batch_io_data *data) return ublk_check_batch_cmd_flags(uc); } +static int ublk_batch_attach(struct ublk_queue *ubq, + struct ublk_batch_io_data *data, + struct ublk_batch_fcmd *fcmd) +{ + struct ublk_batch_fcmd *new_fcmd = NULL; + bool free = false; + + spin_lock(&ubq->evts_lock); + if (unlikely(ubq->force_abort || ubq->canceling)) { + free = true; + } else { + list_add_tail(&fcmd->node, &ubq->fcmd_head); + new_fcmd = __ublk_pick_active_fcmd(ubq); + } + spin_unlock(&ubq->evts_lock); + + /* + * If the two fetch commands are originated from same io_ring_ctx, + * run batch dispatch directly. Otherwise, schedule task work for + * doing it. + */ + if (new_fcmd && io_uring_cmd_ctx_handle(new_fcmd->cmd) == + io_uring_cmd_ctx_handle(fcmd->cmd)) { + data->cmd = new_fcmd->cmd; + ublk_batch_dispatch(ubq, data, new_fcmd); + } else if (new_fcmd) { + io_uring_cmd_complete_in_task(new_fcmd->cmd, + ublk_batch_tw_cb); + } + + if (free) { + ublk_batch_free_fcmd(fcmd); + return -ENODEV; + } + return -EIOCBQUEUED; +} + +static int ublk_handle_batch_fetch_cmd(struct ublk_batch_io_data *data) +{ + struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id); + struct ublk_batch_fcmd *fcmd = ublk_batch_alloc_fcmd(data->cmd); + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(data->cmd); + + if (!fcmd) + return -ENOMEM; + + pdu->ubq = ubq; + pdu->fcmd = fcmd; + io_uring_cmd_mark_cancelable(data->cmd, data->issue_flags); + + return ublk_batch_attach(ubq, data, fcmd); +} + +static int ublk_validate_batch_fetch_cmd(struct ublk_batch_io_data *data, + const struct ublk_batch_io *uc) +{ + if (!(data->cmd->flags & IORING_URING_CMD_MULTISHOT)) + return -EINVAL; + + if (uc->elem_bytes != sizeof(__u16)) + return -EINVAL; + + if (uc->flags != 0) + return -E2BIG; + + return 0; +} + static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { @@ -3153,6 +3470,11 @@ static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd, if (data.header.q_id >= ub->dev_info.nr_hw_queues) goto out; + if (unlikely(issue_flags & IO_URING_F_CANCEL)) { + ublk_batch_cancel_fn(cmd, issue_flags); + return 0; + } + switch (cmd_op) { case UBLK_U_IO_PREP_IO_CMDS: ret = ublk_check_batch_cmd(&data); @@ -3166,6 +3488,12 @@ static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd, goto out; ret = ublk_handle_batch_commit_cmd(&data); break; + case UBLK_U_IO_FETCH_IO_CMDS: + ret = ublk_validate_batch_fetch_cmd(&data, uc); + if (ret) + goto out; + ret = ublk_handle_batch_fetch_cmd(&data); + break; default: ret = -EOPNOTSUPP; } @@ -3342,6 +3670,7 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id) ret = ublk_io_evts_init(ubq, ubq->q_depth); if (ret) goto fail; + INIT_LIST_HEAD(&ubq->fcmd_head); } return 0; fail: @@ -3473,7 +3802,10 @@ static void ublk_align_max_io_size(struct ublk_device *ub) static int ublk_add_tag_set(struct ublk_device *ub) { - ub->tag_set.ops = &ublk_mq_ops; + if (ublk_dev_support_batch_io(ub)) + ub->tag_set.ops = &ublk_batch_mq_ops; + else + ub->tag_set.ops = &ublk_mq_ops; ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues; ub->tag_set.queue_depth = ub->dev_info.queue_depth; ub->tag_set.numa_node = NUMA_NO_NODE; diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index a56a50bbb00a4..8a06eebdedaf6 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -120,6 +120,13 @@ #define UBLK_U_IO_COMMIT_IO_CMDS \ _IOWR('u', 0x26, struct ublk_batch_io) +/* + * Fetch io commands to provided buffer in multishot style, + * `IORING_URING_CMD_MULTISHOT` is required for this command. + */ +#define UBLK_U_IO_FETCH_IO_CMDS \ + _IOWR('u', 0x27, struct ublk_batch_io) + /* only ABORT means that no re-fetch */ #define UBLK_IO_RES_OK 0 #define UBLK_IO_RES_NEED_GET_DATA 1 From 143f122c02e0bc9d1c4d276d8323b9030ce53a99 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 23 Oct 2025 23:32:19 +0800 Subject: [PATCH 15/26] ublk: abort requests filled in event kfifo In case of BATCH_IO, any request filled in event kfifo, they don't get chance to be dispatched any more when releasing ublk char device, so we have to abort them too. Add ublk_abort_batch_queue() for aborting this kind of requests. Signed-off-by: Ming Lei --- drivers/block/ublk_drv.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 6b9b47b1b4cac..da89086afce4e 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2235,6 +2235,26 @@ static void __ublk_fail_req(struct ublk_device *ub, struct ublk_io *io, } } +/* + * Request tag may just be filled to event kfifo, not get chance to + * dispatch, abort these requests too + */ +static void ublk_abort_batch_queue(struct ublk_device *ub, + struct ublk_queue *ubq) +{ + while (true) { + struct request *req; + short tag; + + if (!kfifo_out(&ubq->evts_fifo, &tag, 1)) + break; + + req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag); + if (req && blk_mq_request_started(req)) + __ublk_fail_req(ub, &ubq->ios[tag], req); + } +} + /* * Called from ublk char device release handler, when any uring_cmd is * done, meantime request queue is "quiesced" since all inflight requests @@ -2253,6 +2273,9 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq) if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV) __ublk_fail_req(ub, io, io->req); } + + if (ublk_support_batch_io(ubq)) + ublk_abort_batch_queue(ub, ubq); } static void ublk_start_cancel(struct ublk_device *ub) From 8beb44b177ea871f4fa4cdac54502c1ad5cfeb8d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 23 Oct 2025 23:32:20 +0800 Subject: [PATCH 16/26] ublk: add new feature UBLK_F_BATCH_IO Add new feature UBLK_F_BATCH_IO which replaces the following two per-io commands: - UBLK_U_IO_FETCH_REQ - UBLK_U_IO_COMMIT_AND_FETCH_REQ with three per-queue batch io uring_cmd: - UBLK_U_IO_PREP_IO_CMDS - UBLK_U_IO_COMMIT_IO_CMDS - UBLK_U_IO_FETCH_IO_CMDS Then ublk can deliver batch io commands to ublk server in single multishort uring_cmd, also allows to prepare & commit multiple commands in batch style via single uring_cmd, communication cost is reduced a lot. This feature also doesn't limit task context any more for all supported commands, so any allowed uring_cmd can be issued in any task context. ublk server implementation becomes much easier. Meantime load balance becomes much easier to support with this feature. The command `UBLK_U_IO_FETCH_IO_CMDS` can be issued from multiple task contexts, so each task can adjust this command's buffer length or number of inflight commands for controlling how much load is handled by current task. Later, priority parameter will be added to command `UBLK_U_IO_FETCH_IO_CMDS` for improving load balance support. UBLK_U_IO_GET_DATA isn't supported in batch io yet, but it may be enabled in future via its batch pair. Signed-off-by: Ming Lei --- drivers/block/ublk_drv.c | 58 ++++++++++++++++++++++++++++++++--- include/uapi/linux/ublk_cmd.h | 16 ++++++++++ 2 files changed, 69 insertions(+), 5 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index da89086afce4e..237ad1b2e3bb3 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -74,7 +74,8 @@ | UBLK_F_AUTO_BUF_REG \ | UBLK_F_QUIESCE \ | UBLK_F_PER_IO_DAEMON \ - | UBLK_F_BUF_REG_OFF_DAEMON) + | UBLK_F_BUF_REG_OFF_DAEMON \ + | UBLK_F_BATCH_IO) #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \ | UBLK_F_USER_RECOVERY_REISSUE \ @@ -320,12 +321,12 @@ static struct ublk_batch_fcmd *__ublk_pick_active_fcmd( static inline bool ublk_dev_support_batch_io(const struct ublk_device *ub) { - return false; + return ub->dev_info.flags & UBLK_F_BATCH_IO; } static inline bool ublk_support_batch_io(const struct ublk_queue *ubq) { - return false; + return ubq->flags & UBLK_F_BATCH_IO; } static inline void ublk_io_lock(struct ublk_io *io) @@ -3471,6 +3472,41 @@ static int ublk_validate_batch_fetch_cmd(struct ublk_batch_io_data *data, return 0; } +static int ublk_handle_non_batch_cmd(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + const struct ublksrv_io_cmd *ub_cmd = io_uring_sqe_cmd(cmd->sqe); + struct ublk_device *ub = cmd->file->private_data; + unsigned tag = READ_ONCE(ub_cmd->tag); + unsigned q_id = READ_ONCE(ub_cmd->q_id); + unsigned index = READ_ONCE(ub_cmd->addr); + struct ublk_queue *ubq; + struct ublk_io *io; + int ret = -EINVAL; + + if (!ub) + return ret; + + if (q_id >= ub->dev_info.nr_hw_queues) + return ret; + + ubq = ublk_get_queue(ub, q_id); + if (tag >= ubq->q_depth) + return ret; + + io = &ubq->ios[tag]; + + switch (cmd->cmd_op) { + case UBLK_U_IO_REGISTER_IO_BUF: + return ublk_register_io_buf(cmd, ub, q_id, tag, io, index, + issue_flags); + case UBLK_U_IO_UNREGISTER_IO_BUF: + return ublk_unregister_io_buf(cmd, ub, index, issue_flags); + default: + return -EOPNOTSUPP; + } +} + static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { @@ -3518,7 +3554,8 @@ static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd, ret = ublk_handle_batch_fetch_cmd(&data); break; default: - ret = -EOPNOTSUPP; + ret = ublk_handle_non_batch_cmd(cmd, issue_flags); + break; } out: return ret; @@ -4166,9 +4203,13 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE | UBLK_F_URING_CMD_COMP_IN_TASK | - UBLK_F_PER_IO_DAEMON | + (ublk_dev_support_batch_io(ub) ? 0 : UBLK_F_PER_IO_DAEMON) | UBLK_F_BUF_REG_OFF_DAEMON; + /* So far, UBLK_F_PER_IO_DAEMON won't be exposed for BATCH_IO */ + if (ublk_dev_support_batch_io(ub)) + ub->dev_info.flags &= ~UBLK_F_PER_IO_DAEMON; + /* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */ if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_AUTO_BUF_REG)) @@ -4521,6 +4562,13 @@ static int ublk_wait_for_idle_io(struct ublk_device *ub, unsigned int elapsed = 0; int ret; + /* + * For UBLK_F_BATCH_IO ublk server can get notified with existing + * or new fetch command, so needn't wait any more + */ + if (ublk_dev_support_batch_io(ub)) + return 0; + while (elapsed < timeout_ms && !signal_pending(current)) { unsigned int queues_cancelable = 0; int i; diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 8a06eebdedaf6..650886f359276 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -335,6 +335,22 @@ */ #define UBLK_F_BUF_REG_OFF_DAEMON (1ULL << 14) + +/* + * Support the following commands for delivering & committing io command + * in batch. + * + * - UBLK_U_IO_PREP_IO_CMDS + * - UBLK_U_IO_COMMIT_IO_CMDS + * - UBLK_U_IO_FETCH_IO_CMDS + * - UBLK_U_IO_REGISTER_IO_BUF + * - UBLK_U_IO_UNREGISTER_IO_BUF + * + * The existing UBLK_U_IO_FETCH_REQ, UBLK_U_IO_COMMIT_AND_FETCH_REQ and + * UBLK_U_IO_GET_DATA uring_cmd are not supported for this feature. + */ +#define UBLK_F_BATCH_IO (1ULL << 15) + /* device state */ #define UBLK_S_DEV_DEAD 0 #define UBLK_S_DEV_LIVE 1 From 76a035b1b2f9e7ffa041788d32a4ec5f9b691617 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 23 Oct 2025 23:32:21 +0800 Subject: [PATCH 17/26] ublk: document feature UBLK_F_BATCH_IO Document feature UBLK_F_BATCH_IO. Signed-off-by: Ming Lei --- Documentation/block/ublk.rst | 60 +++++++++++++++++++++++++++++++++--- 1 file changed, 56 insertions(+), 4 deletions(-) diff --git a/Documentation/block/ublk.rst b/Documentation/block/ublk.rst index 8c4030bcabb63..09a5604f8e10f 100644 --- a/Documentation/block/ublk.rst +++ b/Documentation/block/ublk.rst @@ -260,9 +260,12 @@ The following IO commands are communicated via io_uring passthrough command, and each command is only for forwarding the IO and committing the result with specified IO tag in the command data: -- ``UBLK_IO_FETCH_REQ`` +Traditional Per-I/O Commands +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Sent from the server IO pthread for fetching future incoming IO requests +- ``UBLK_U_IO_FETCH_REQ`` + + Sent from the server I/O pthread for fetching future incoming I/O requests destined to ``/dev/ublkb*``. This command is sent only once from the server IO pthread for ublk driver to setup IO forward environment. @@ -278,7 +281,7 @@ with specified IO tag in the command data: supported by the driver, daemons must be per-queue instead - i.e. all I/Os associated to a single qid must be handled by the same task. -- ``UBLK_IO_COMMIT_AND_FETCH_REQ`` +- ``UBLK_U_IO_COMMIT_AND_FETCH_REQ`` When an IO request is destined to ``/dev/ublkb*``, the driver stores the IO's ``ublksrv_io_desc`` to the specified mapped area; then the @@ -293,7 +296,7 @@ with specified IO tag in the command data: requests with the same IO tag. That is, ``UBLK_IO_COMMIT_AND_FETCH_REQ`` is reused for both fetching request and committing back IO result. -- ``UBLK_IO_NEED_GET_DATA`` +- ``UBLK_U_IO_NEED_GET_DATA`` With ``UBLK_F_NEED_GET_DATA`` enabled, the WRITE request will be firstly issued to ublk server without data copy. Then, IO backend of ublk server @@ -322,6 +325,55 @@ with specified IO tag in the command data: ``UBLK_IO_COMMIT_AND_FETCH_REQ`` to the server, ublkdrv needs to copy the server buffer (pages) read to the IO request pages. +Batch I/O Commands (UBLK_F_BATCH_IO) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``UBLK_F_BATCH_IO`` feature provides an alternative high-performance +I/O handling model that replaces the traditional per-I/O commands with +per-queue batch commands. This significantly reduces communication overhead +and enables better load balancing across multiple server tasks. + +Key differences from traditional mode: + +- **Per-queue vs Per-I/O**: Commands operate on queues rather than individual I/Os +- **Batch processing**: Multiple I/Os are handled in single operations +- **Multishot commands**: Use io_uring multishot for reduced submission overhead +- **Flexible task assignment**: Any task can handle any I/O (no per-I/O daemons) +- **Better load balancing**: Tasks can adjust their workload dynamically + +Batch I/O Commands: + +- ``UBLK_U_IO_PREP_IO_CMDS`` + + Prepares multiple I/O commands in batch. The server provides a buffer + containing multiple I/O descriptors that will be processed together. + This reduces the number of individual command submissions required. + +- ``UBLK_U_IO_COMMIT_IO_CMDS`` + + Commits results for multiple I/O operations in batch. The server provides + a buffer containing the results of multiple completed I/Os, allowing + efficient bulk completion of requests. + +- ``UBLK_U_IO_FETCH_IO_CMDS`` + + **Multishot command** for fetching I/O commands in batch. This is the key + command that enables high-performance batch processing: + + * Uses io_uring multishot capability for reduced submission overhead + * Single command can fetch multiple I/O requests over time + * Buffer size determines maximum batch size per operation + * Multiple fetch commands can be submitted for load balancing + * Only one fetch command is active at any time per queue + * Supports dynamic load balancing across multiple server tasks + + Each task can submit ``UBLK_U_IO_FETCH_IO_CMDS`` with different buffer + sizes to control how much work it handles. This enables sophisticated + load balancing strategies in multi-threaded servers. + +Migration: Applications using traditional commands (``UBLK_U_IO_FETCH_REQ``, +``UBLK_U_IO_COMMIT_AND_FETCH_REQ``) cannot use batch mode simultaneously. + Zero copy --------- From b4c74e27dc2121c0dd2f30825e194d0423c42d1c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 23 Oct 2025 23:32:22 +0800 Subject: [PATCH 18/26] selftests: ublk: fix user_data truncation for tgt_data >= 256 The build_user_data() function packs multiple fields into a __u64 value using bit shifts. Without explicit __u64 casts before shifting, the shift operations are performed on 32-bit unsigned integers before being promoted to 64-bit, causing data loss. Specifically, when tgt_data >= 256, the expression (tgt_data << 24) shifts on a 32-bit value, truncating the upper 8 bits before promotion to __u64. Since tgt_data can be up to 16 bits (assertion allows up to 65535), values >= 256 would have their high byte lost. Add explicit __u64 casts to both op and tgt_data before shifting to ensure the shift operations happen in 64-bit space, preserving all bits of the input values. user_data_to_tgt_data() is only used by stripe.c, in which the max supported member disks are 4, so won't trigger this issue. Signed-off-by: Ming Lei --- tools/testing/selftests/ublk/kublk.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 5e55484fb0aa2..87bed2ed65210 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -223,7 +223,7 @@ static inline __u64 build_user_data(unsigned tag, unsigned op, _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7); assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16) && !(q_id >> 7)); - return tag | (op << 16) | (tgt_data << 24) | + return tag | ((__u64)op << 16) | ((__u64)tgt_data << 24) | (__u64)q_id << 56 | (__u64)is_target_io << 63; } From 3561b5d0f29ebd9e874200a78a6e9930697439a2 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 23 Oct 2025 23:32:23 +0800 Subject: [PATCH 19/26] selftests: ublk: replace assert() with ublk_assert() Replace assert() with ublk_assert() since it is often triggered in daemon, and we may get nothing shown in terminal. Add ublk_assert(), so we can log something to syslog when assert() is triggered. Signed-off-by: Ming Lei --- tools/testing/selftests/ublk/common.c | 2 +- tools/testing/selftests/ublk/file_backed.c | 2 +- tools/testing/selftests/ublk/kublk.c | 2 +- tools/testing/selftests/ublk/kublk.h | 2 +- tools/testing/selftests/ublk/stripe.c | 10 +++++----- tools/testing/selftests/ublk/utils.h | 10 ++++++++++ 6 files changed, 19 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/ublk/common.c b/tools/testing/selftests/ublk/common.c index 01580a6f85196..4c07bc37eb6d2 100644 --- a/tools/testing/selftests/ublk/common.c +++ b/tools/testing/selftests/ublk/common.c @@ -16,7 +16,7 @@ int backing_file_tgt_init(struct ublk_dev *dev) { int fd, i; - assert(dev->nr_fds == 1); + ublk_assert(dev->nr_fds == 1); for (i = 0; i < dev->tgt.nr_backing_files; i++) { char *file = dev->tgt.backing_file[i]; diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c index cd9fe69ecce20..9e7dd3859ea90 100644 --- a/tools/testing/selftests/ublk/file_backed.c +++ b/tools/testing/selftests/ublk/file_backed.c @@ -10,7 +10,7 @@ static enum io_uring_op ublk_to_uring_op(const struct ublksrv_io_desc *iod, int return zc ? IORING_OP_READ_FIXED : IORING_OP_READ; else if (ublk_op == UBLK_IO_OP_WRITE) return zc ? IORING_OP_WRITE_FIXED : IORING_OP_WRITE; - assert(0); + ublk_assert(0); } static int loop_queue_flush_io(struct ublk_thread *t, struct ublk_queue *q, diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 6b8123c12a7ae..f226d66a6eb2e 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -750,7 +750,7 @@ static void ublk_handle_uring_cmd(struct ublk_thread *t, } if (cqe->res == UBLK_IO_RES_OK) { - assert(tag < q->q_depth); + ublk_assert(tag < q->q_depth); if (q->tgt_ops->queue_io) q->tgt_ops->queue_io(t, q, tag); } else if (cqe->res == UBLK_IO_RES_NEED_GET_DATA) { diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 87bed2ed65210..bfd1d7a890f50 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -221,7 +221,7 @@ static inline __u64 build_user_data(unsigned tag, unsigned op, { /* we only have 7 bits to encode q_id */ _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7); - assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16) && !(q_id >> 7)); + ublk_assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16) && !(q_id >> 7)); return tag | ((__u64)op << 16) | ((__u64)tgt_data << 24) | (__u64)q_id << 56 | (__u64)is_target_io << 63; diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c index 791fa8dc16510..50874858a8290 100644 --- a/tools/testing/selftests/ublk/stripe.c +++ b/tools/testing/selftests/ublk/stripe.c @@ -96,12 +96,12 @@ static void calculate_stripe_array(const struct stripe_conf *conf, this->seq = seq; s->nr += 1; } else { - assert(seq == this->seq); - assert(this->start + this->nr_sects == stripe_off); + ublk_assert(seq == this->seq); + ublk_assert(this->start + this->nr_sects == stripe_off); this->nr_sects += nr_sects; } - assert(this->nr_vec < this->cap); + ublk_assert(this->nr_vec < this->cap); this->vec[this->nr_vec].iov_base = (void *)(base + done); this->vec[this->nr_vec++].iov_len = nr_sects << 9; @@ -120,7 +120,7 @@ static inline enum io_uring_op stripe_to_uring_op( return zc ? IORING_OP_READV_FIXED : IORING_OP_READV; else if (ublk_op == UBLK_IO_OP_WRITE) return zc ? IORING_OP_WRITEV_FIXED : IORING_OP_WRITEV; - assert(0); + ublk_assert(0); } static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, @@ -318,7 +318,7 @@ static int ublk_stripe_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) if (!dev->tgt.nr_backing_files || dev->tgt.nr_backing_files > NR_STRIPE) return -EINVAL; - assert(dev->nr_fds == dev->tgt.nr_backing_files + 1); + ublk_assert(dev->nr_fds == dev->tgt.nr_backing_files + 1); for (i = 0; i < dev->tgt.nr_backing_files; i++) dev->tgt.backing_file_size[i] &= ~((1 << chunk_shift) - 1); diff --git a/tools/testing/selftests/ublk/utils.h b/tools/testing/selftests/ublk/utils.h index a852e0b7153e2..17eefed736908 100644 --- a/tools/testing/selftests/ublk/utils.h +++ b/tools/testing/selftests/ublk/utils.h @@ -43,6 +43,7 @@ static inline void ublk_err(const char *fmt, ...) va_start(ap, fmt); vfprintf(stderr, fmt, ap); + va_end(ap); } static inline void ublk_log(const char *fmt, ...) @@ -52,6 +53,7 @@ static inline void ublk_log(const char *fmt, ...) va_start(ap, fmt); vfprintf(stdout, fmt, ap); + va_end(ap); } } @@ -62,7 +64,15 @@ static inline void ublk_dbg(int level, const char *fmt, ...) va_start(ap, fmt); vfprintf(stdout, fmt, ap); + va_end(ap); } } +#define ublk_assert(x) do { \ + if (!(x)) { \ + ublk_err("%s %d: assert!\n", __func__, __LINE__); \ + assert(x); \ + } \ +} while (0) + #endif From 08055b733569b29f075a69759c0bf8893cee0ed2 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 23 Oct 2025 23:32:24 +0800 Subject: [PATCH 20/26] selftests: ublk: add ublk_io_buf_idx() for returning io buffer index Since UBLK_F_PER_IO_DAEMON is added, io buffer index may depend on current thread because the common way is to use per-pthread io_ring_ctx for issuing ublk uring_cmd. Add one helper for returning io buffer index, so we can hide the buffer index implementation details for target code. Signed-off-by: Ming Lei --- tools/testing/selftests/ublk/file_backed.c | 9 +++++---- tools/testing/selftests/ublk/kublk.c | 9 +++++---- tools/testing/selftests/ublk/kublk.h | 10 +++++++++- tools/testing/selftests/ublk/null.c | 18 ++++++++++-------- tools/testing/selftests/ublk/stripe.c | 7 ++++--- 5 files changed, 33 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c index 9e7dd3859ea90..58ac59528b748 100644 --- a/tools/testing/selftests/ublk/file_backed.c +++ b/tools/testing/selftests/ublk/file_backed.c @@ -36,6 +36,7 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, enum io_uring_op op = ublk_to_uring_op(iod, zc | auto_zc); struct io_uring_sqe *sqe[3]; void *addr = (zc | auto_zc) ? NULL : (void *)iod->addr; + unsigned short buf_idx = ublk_io_buf_idx(t, q, tag); if (!zc || auto_zc) { ublk_io_alloc_sqes(t, sqe, 1); @@ -47,7 +48,7 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, iod->nr_sectors << 9, iod->start_sector << 9); if (auto_zc) - sqe[0]->buf_index = tag; + sqe[0]->buf_index = buf_idx; io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE); /* bit63 marks us as tgt io */ sqe[0]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1); @@ -56,7 +57,7 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, ublk_io_alloc_sqes(t, sqe, 3); - io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, ublk_get_io(q, tag)->buf_index); + io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, buf_idx); sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; sqe[0]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1); @@ -64,11 +65,11 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, io_uring_prep_rw(op, sqe[1], ublk_get_registered_fd(q, 1) /*fds[1]*/, 0, iod->nr_sectors << 9, iod->start_sector << 9); - sqe[1]->buf_index = tag; + sqe[1]->buf_index = buf_idx; sqe[1]->flags |= IOSQE_FIXED_FILE | IOSQE_IO_HARDLINK; sqe[1]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1); - io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, ublk_get_io(q, tag)->buf_index); + io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, buf_idx); sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1); return 2; diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index f226d66a6eb2e..388e85a8d07ec 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -579,16 +579,17 @@ static void ublk_dev_unprep(struct ublk_dev *dev) close(dev->fds[0]); } -static void ublk_set_auto_buf_reg(const struct ublk_queue *q, +static void ublk_set_auto_buf_reg(const struct ublk_thread *t, + const struct ublk_queue *q, struct io_uring_sqe *sqe, unsigned short tag) { struct ublk_auto_buf_reg buf = {}; if (q->tgt_ops->buf_index) - buf.index = q->tgt_ops->buf_index(q, tag); + buf.index = q->tgt_ops->buf_index(t, q, tag); else - buf.index = q->ios[tag].buf_index; + buf.index = ublk_io_buf_idx(t, q, tag); if (ublk_queue_auto_zc_fallback(q)) buf.flags = UBLK_AUTO_BUF_REG_FALLBACK; @@ -655,7 +656,7 @@ int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io) cmd->addr = 0; if (ublk_queue_use_auto_zc(q)) - ublk_set_auto_buf_reg(q, sqe[0], io->tag); + ublk_set_auto_buf_reg(t, q, sqe[0], io->tag); user_data = build_user_data(io->tag, _IOC_NR(cmd_op), 0, q->q_id, 0); io_uring_sqe_set_data64(sqe[0], user_data); diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index bfd1d7a890f50..c3da64d3b7b05 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -143,7 +143,8 @@ struct ublk_tgt_ops { void (*usage)(const struct ublk_tgt_ops *ops); /* return buffer index for UBLK_F_AUTO_BUF_REG */ - unsigned short (*buf_index)(const struct ublk_queue *, int tag); + unsigned short (*buf_index)(const struct ublk_thread *t, + const struct ublk_queue *, int tag); }; struct ublk_tgt { @@ -354,6 +355,13 @@ static inline void ublk_set_sqe_cmd_op(struct io_uring_sqe *sqe, __u32 cmd_op) addr[1] = 0; } +static inline unsigned short ublk_io_buf_idx(const struct ublk_thread *t, + const struct ublk_queue *q, + unsigned tag) +{ + return q->ios[tag].buf_index; +} + static inline struct ublk_io *ublk_get_io(struct ublk_queue *q, unsigned tag) { return &q->ios[tag]; diff --git a/tools/testing/selftests/ublk/null.c b/tools/testing/selftests/ublk/null.c index 280043f6b6896..819f72ac2da98 100644 --- a/tools/testing/selftests/ublk/null.c +++ b/tools/testing/selftests/ublk/null.c @@ -43,12 +43,12 @@ static int ublk_null_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) } static void __setup_nop_io(int tag, const struct ublksrv_io_desc *iod, - struct io_uring_sqe *sqe, int q_id) + struct io_uring_sqe *sqe, int q_id, unsigned buf_idx) { unsigned ublk_op = ublksrv_get_op(iod); io_uring_prep_nop(sqe); - sqe->buf_index = tag; + sqe->buf_index = buf_idx; sqe->flags |= IOSQE_FIXED_FILE; sqe->rw_flags = IORING_NOP_FIXED_BUFFER | IORING_NOP_INJECT_RESULT; sqe->len = iod->nr_sectors << 9; /* injected result */ @@ -60,18 +60,19 @@ static int null_queue_zc_io(struct ublk_thread *t, struct ublk_queue *q, { const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); struct io_uring_sqe *sqe[3]; + unsigned short buf_idx = ublk_io_buf_idx(t, q, tag); ublk_io_alloc_sqes(t, sqe, 3); - io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, ublk_get_io(q, tag)->buf_index); + io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, buf_idx); sqe[0]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1); sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; - __setup_nop_io(tag, iod, sqe[1], q->q_id); + __setup_nop_io(tag, iod, sqe[1], q->q_id, buf_idx); sqe[1]->flags |= IOSQE_IO_HARDLINK; - io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, ublk_get_io(q, tag)->buf_index); + io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, buf_idx); sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1); // buf register is marked as IOSQE_CQE_SKIP_SUCCESS @@ -85,7 +86,7 @@ static int null_queue_auto_zc_io(struct ublk_thread *t, struct ublk_queue *q, struct io_uring_sqe *sqe[1]; ublk_io_alloc_sqes(t, sqe, 1); - __setup_nop_io(tag, iod, sqe[0], q->q_id); + __setup_nop_io(tag, iod, sqe[0], q->q_id, ublk_io_buf_idx(t, q, tag)); return 1; } @@ -136,11 +137,12 @@ static int ublk_null_queue_io(struct ublk_thread *t, struct ublk_queue *q, * return invalid buffer index for triggering auto buffer register failure, * then UBLK_IO_RES_NEED_REG_BUF handling is covered */ -static unsigned short ublk_null_buf_index(const struct ublk_queue *q, int tag) +static unsigned short ublk_null_buf_index(const struct ublk_thread *t, + const struct ublk_queue *q, int tag) { if (ublk_queue_auto_zc_fallback(q)) return (unsigned short)-1; - return q->ios[tag].buf_index; + return ublk_io_buf_idx(t, q, tag); } const struct ublk_tgt_ops null_tgt_ops = { diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c index 50874858a8290..db281a879877b 100644 --- a/tools/testing/selftests/ublk/stripe.c +++ b/tools/testing/selftests/ublk/stripe.c @@ -135,6 +135,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, struct ublk_io *io = ublk_get_io(q, tag); int i, extra = zc ? 2 : 0; void *base = (zc | auto_zc) ? NULL : (void *)iod->addr; + unsigned short buf_idx = ublk_io_buf_idx(t, q, tag); io->private_data = s; calculate_stripe_array(conf, iod, s, base); @@ -142,7 +143,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, ublk_io_alloc_sqes(t, sqe, s->nr + extra); if (zc) { - io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, io->buf_index); + io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, buf_idx); sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; sqe[0]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1); @@ -158,7 +159,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, t->start << 9); io_uring_sqe_set_flags(sqe[i], IOSQE_FIXED_FILE); if (auto_zc || zc) { - sqe[i]->buf_index = tag; + sqe[i]->buf_index = buf_idx; if (zc) sqe[i]->flags |= IOSQE_IO_HARDLINK; } @@ -168,7 +169,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, if (zc) { struct io_uring_sqe *unreg = sqe[s->nr + 1]; - io_uring_prep_buf_unregister(unreg, q, tag, q->q_id, io->buf_index); + io_uring_prep_buf_unregister(unreg, q, tag, q->q_id, buf_idx); unreg->user_data = build_user_data( tag, ublk_cmd_op_nr(unreg->cmd_op), 0, q->q_id, 1); } From 8091f0faf8282c97adc7595b3505ba158e44ee31 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 23 Oct 2025 23:32:25 +0800 Subject: [PATCH 21/26] selftests: ublk: add batch buffer management infrastructure Add the foundational infrastructure for UBLK_F_BATCH_IO buffer management including: - Allocator utility functions for small sized per-thread allocation - Batch buffer allocation and deallocation functions - Buffer index management for commit buffers - Thread state management for batch I/O mode - Buffer size calculation based on device features This prepares the groundwork for handling batch I/O commands by establishing the buffer management layer needed for UBLK_U_IO_PREP_IO_CMDS and UBLK_U_IO_COMMIT_IO_CMDS operations. The allocator uses CPU sets for efficient per-thread buffer tracking, and commit buffers are pre-allocated with 2 buffers per thread to handle overlapping command operations. Signed-off-by: Ming Lei --- tools/testing/selftests/ublk/Makefile | 2 +- tools/testing/selftests/ublk/batch.c | 153 ++++++++++++++++++++++++++ tools/testing/selftests/ublk/kublk.c | 26 ++++- tools/testing/selftests/ublk/kublk.h | 52 +++++++++ tools/testing/selftests/ublk/utils.h | 54 +++++++++ 5 files changed, 283 insertions(+), 4 deletions(-) create mode 100644 tools/testing/selftests/ublk/batch.c diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 770269efe42ab..a724276622d06 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -44,7 +44,7 @@ TEST_GEN_PROGS_EXTENDED = kublk include ../lib.mk -$(TEST_GEN_PROGS_EXTENDED): kublk.c null.c file_backed.c common.c stripe.c \ +$(TEST_GEN_PROGS_EXTENDED): kublk.c batch.c null.c file_backed.c common.c stripe.c \ fault_inject.c check: diff --git a/tools/testing/selftests/ublk/batch.c b/tools/testing/selftests/ublk/batch.c new file mode 100644 index 0000000000000..b6bcdd529df9f --- /dev/null +++ b/tools/testing/selftests/ublk/batch.c @@ -0,0 +1,153 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Description: UBLK_F_BATCH_IO buffer management + */ + +#include "kublk.h" + +static inline void *ublk_get_commit_buf(struct ublk_thread *t, + unsigned short buf_idx) +{ + unsigned idx; + + if (buf_idx < t->commit_buf_start || + buf_idx >= t->commit_buf_start + t->nr_commit_buf) + return NULL; + idx = buf_idx - t->commit_buf_start; + return t->commit_buf + idx * t->commit_buf_size; +} + +/* + * Allocate one buffer for UBLK_U_IO_PREP_IO_CMDS or UBLK_U_IO_COMMIT_IO_CMDS + * + * Buffer index is returned. + */ +static inline unsigned short ublk_alloc_commit_buf(struct ublk_thread *t) +{ + int idx = allocator_get(&t->commit_buf_alloc); + + if (idx >= 0) + return idx + t->commit_buf_start; + return UBLKS_T_COMMIT_BUF_INV_IDX; +} + +/* + * Free one commit buffer which is used by UBLK_U_IO_PREP_IO_CMDS or + * UBLK_U_IO_COMMIT_IO_CMDS + */ +static inline void ublk_free_commit_buf(struct ublk_thread *t, + unsigned short i) +{ + unsigned short idx = i - t->commit_buf_start; + + ublk_assert(idx < t->nr_commit_buf); + ublk_assert(allocator_get_val(&t->commit_buf_alloc, idx) != 0); + + allocator_put(&t->commit_buf_alloc, idx); +} + +static unsigned char ublk_commit_elem_buf_size(struct ublk_dev *dev) +{ + if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_USER_COPY | + UBLK_F_AUTO_BUF_REG)) + return 8; + + /* one extra 8bytes for carrying buffer address */ + return 16; +} + +static unsigned ublk_commit_buf_size(struct ublk_thread *t) +{ + struct ublk_dev *dev = t->dev; + unsigned elem_size = ublk_commit_elem_buf_size(dev); + unsigned int total = elem_size * dev->dev_info.queue_depth; + unsigned int page_sz = getpagesize(); + + return round_up(total, page_sz); +} + +static void free_batch_commit_buf(struct ublk_thread *t) +{ + free(t->commit_buf); + allocator_deinit(&t->commit_buf_alloc); +} + +static int alloc_batch_commit_buf(struct ublk_thread *t) +{ + unsigned buf_size = ublk_commit_buf_size(t); + unsigned int total = buf_size * t->nr_commit_buf; + struct iovec iov[t->nr_commit_buf]; + unsigned int page_sz = getpagesize(); + void *buf = NULL; + int i, ret; + + allocator_init(&t->commit_buf_alloc, t->nr_commit_buf); + + t->commit_buf = NULL; + ret = posix_memalign(&buf, page_sz, total); + if (ret || !buf) + goto fail; + + t->commit_buf = buf; + for (i = 0; i < t->nr_commit_buf; i++) { + iov[i].iov_base = buf; + iov[i].iov_len = buf_size; + buf += buf_size; + } + + ret = io_uring_register_buffers_update_tag(&t->ring, + t->commit_buf_start, iov, NULL, + t->nr_commit_buf); + if (ret == t->nr_commit_buf) + return 0; + + ublk_err("%s: io_uring_register_buffers_update_tag failed ret %d\n", + __func__, ret); +fail: + free_batch_commit_buf(t); + return ret; +} + +void ublk_batch_prepare(struct ublk_thread *t) +{ + /* + * We only handle single device in this thread context. + * + * All queues have same feature flags, so use queue 0's for + * calculate uring_cmd flags. + * + * This way looks not elegant, but it works so far. + */ + struct ublk_queue *q = &t->dev->q[0]; + + t->commit_buf_elem_size = ublk_commit_elem_buf_size(t->dev); + t->commit_buf_size = ublk_commit_buf_size(t); + t->commit_buf_start = t->nr_bufs; + t->nr_commit_buf = 2; + t->nr_bufs += t->nr_commit_buf; + + t->cmd_flags = 0; + if (ublk_queue_use_auto_zc(q)) { + if (ublk_queue_auto_zc_fallback(q)) + t->cmd_flags |= UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK; + } else if (!ublk_queue_no_buf(q)) + t->cmd_flags |= UBLK_BATCH_F_HAS_BUF_ADDR; + + t->state |= UBLKS_T_BATCH_IO; + + ublk_log("%s: thread %d commit(nr_bufs %u, buf_size %u, start %u)\n", + __func__, t->idx, + t->nr_commit_buf, t->commit_buf_size, + t->nr_bufs); +} + +int ublk_batch_alloc_buf(struct ublk_thread *t) +{ + ublk_assert(t->nr_commit_buf < 16); + return alloc_batch_commit_buf(t); +} + +void ublk_batch_free_buf(struct ublk_thread *t) +{ + free_batch_commit_buf(t); +} diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 388e85a8d07ec..0880581141369 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -423,6 +423,8 @@ static void ublk_thread_deinit(struct ublk_thread *t) { io_uring_unregister_buffers(&t->ring); + ublk_batch_free_buf(t); + io_uring_unregister_ring_fd(&t->ring); if (t->ring.ring_fd > 0) { @@ -505,15 +507,33 @@ static int ublk_thread_init(struct ublk_thread *t, unsigned long long extra_flag unsigned nr_ios = dev->dev_info.queue_depth * dev->dev_info.nr_hw_queues; unsigned max_nr_ios_per_thread = nr_ios / dev->nthreads; max_nr_ios_per_thread += !!(nr_ios % dev->nthreads); - ret = io_uring_register_buffers_sparse( - &t->ring, max_nr_ios_per_thread); + + t->nr_bufs = max_nr_ios_per_thread; + } else { + t->nr_bufs = 0; + } + + if (ublk_dev_batch_io(dev)) + ublk_batch_prepare(t); + + if (t->nr_bufs) { + ret = io_uring_register_buffers_sparse(&t->ring, t->nr_bufs); if (ret) { - ublk_err("ublk dev %d thread %d register spare buffers failed %d", + ublk_err("ublk dev %d thread %d register spare buffers failed %d\n", dev->dev_info.dev_id, t->idx, ret); goto fail; } } + if (ublk_dev_batch_io(dev)) { + ret = ublk_batch_alloc_buf(t); + if (ret) { + ublk_err("ublk dev %d thread %d alloc batch buf failed %d\n", + dev->dev_info.dev_id, t->idx, ret); + goto fail; + } + } + io_uring_register_ring_fd(&t->ring); if (flags & UBLKS_Q_NO_UBLK_FIXED_FD) { diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index c3da64d3b7b05..4dd9d30533259 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -174,6 +174,14 @@ struct ublk_queue { struct ublk_io ios[UBLK_QUEUE_DEPTH]; }; +/* align with `ublk_elem_header` */ +struct ublk_batch_elem { + __u16 tag; + __u16 buf_index; + __s32 result; + __u64 buf_addr; +}; + struct ublk_thread { struct ublk_dev *dev; struct io_uring ring; @@ -185,7 +193,23 @@ struct ublk_thread { #define UBLKS_T_STOPPING (1U << 0) #define UBLKS_T_IDLE (1U << 1) +#define UBLKS_T_BATCH_IO (1U << 31) /* readonly */ unsigned state; + + unsigned short nr_bufs; + + /* followings are for BATCH_IO */ + unsigned short commit_buf_start; + unsigned char commit_buf_elem_size; + /* + * We just support single device, so pre-calculate commit/prep flags + */ + unsigned short cmd_flags; + unsigned int nr_commit_buf; + unsigned int commit_buf_size; + void *commit_buf; +#define UBLKS_T_COMMIT_BUF_INV_IDX ((unsigned short)-1) + struct allocator commit_buf_alloc; }; struct ublk_dev { @@ -206,6 +230,27 @@ struct ublk_dev { extern int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io); +static inline int __ublk_use_batch_io(__u64 flags) +{ + return flags & UBLK_F_BATCH_IO; +} + +static inline int ublk_queue_batch_io(const struct ublk_queue *q) +{ + return __ublk_use_batch_io(q->flags); +} + +static inline int ublk_dev_batch_io(const struct ublk_dev *dev) +{ + return __ublk_use_batch_io(dev->dev_info.flags); +} + +/* only work for handle single device in this pthread context */ +static inline int ublk_thread_batch_io(const struct ublk_thread *t) +{ + return t->state & UBLKS_T_BATCH_IO; +} + static inline int ublk_io_auto_zc_fallback(const struct ublksrv_io_desc *iod) { @@ -421,6 +466,13 @@ static inline int ublk_queue_no_buf(const struct ublk_queue *q) return ublk_queue_use_zc(q) || ublk_queue_use_auto_zc(q); } +/* Initialize batch I/O state and calculate buffer parameters */ +void ublk_batch_prepare(struct ublk_thread *t); +/* Allocate and register commit buffers for batch operations */ +int ublk_batch_alloc_buf(struct ublk_thread *t); +/* Free commit buffers and cleanup batch allocator */ +void ublk_batch_free_buf(struct ublk_thread *t); + extern const struct ublk_tgt_ops null_tgt_ops; extern const struct ublk_tgt_ops loop_tgt_ops; extern const struct ublk_tgt_ops stripe_tgt_ops; diff --git a/tools/testing/selftests/ublk/utils.h b/tools/testing/selftests/ublk/utils.h index 17eefed736908..aab522f261675 100644 --- a/tools/testing/selftests/ublk/utils.h +++ b/tools/testing/selftests/ublk/utils.h @@ -21,6 +21,60 @@ #define round_up(val, rnd) \ (((val) + ((rnd) - 1)) & ~((rnd) - 1)) +/* small sized & per-thread allocator */ +struct allocator { + unsigned int size; + cpu_set_t *set; +}; + +static inline int allocator_init(struct allocator *a, unsigned size) +{ + a->set = CPU_ALLOC(size); + a->size = size; + + if (a->set) + return 0; + return -ENOMEM; +} + +static inline void allocator_deinit(struct allocator *a) +{ + CPU_FREE(a->set); + a->set = NULL; + a->size = 0; +} + +static inline int allocator_get(struct allocator *a) +{ + int i; + + for (i = 0; i < a->size; i += 1) { + size_t set_size = CPU_ALLOC_SIZE(a->size); + + if (!CPU_ISSET_S(i, set_size, a->set)) { + CPU_SET_S(i, set_size, a->set); + return i; + } + } + + return -1; +} + +static inline void allocator_put(struct allocator *a, int i) +{ + size_t set_size = CPU_ALLOC_SIZE(a->size); + + if (i >= 0 && i < a->size) + CPU_CLR_S(i, set_size, a->set); +} + +static inline int allocator_get_val(struct allocator *a, int i) +{ + size_t set_size = CPU_ALLOC_SIZE(a->size); + + return CPU_ISSET_S(i, set_size, a->set); +} + static inline unsigned int ilog2(unsigned int x) { if (x == 0) From 6894acc5ccdc4358f8941b3d0c51c31e88f6bf77 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 23 Oct 2025 23:32:26 +0800 Subject: [PATCH 22/26] selftests: ublk: handle UBLK_U_IO_PREP_IO_CMDS Implement support for UBLK_U_IO_PREP_IO_CMDS in the batch I/O framework: - Add batch command initialization and setup functions - Implement prep command queueing with proper buffer management - Add command completion handling for prep and commit commands - Integrate batch I/O setup into thread initialization - Update CQE handling to support batch commands The implementation uses the previously established buffer management infrastructure to queue UBLK_U_IO_PREP_IO_CMDS commands. Commands are prepared in the first thread context and use commit buffers for efficient command batching. Key changes: - ublk_batch_queue_prep_io_cmds() prepares I/O command batches - ublk_batch_compl_cmd() handles batch command completions - Modified thread setup to use batch operations when enabled - Enhanced buffer index calculation for batch mode Signed-off-by: Ming Lei --- tools/testing/selftests/ublk/batch.c | 114 +++++++++++++++++++++++++++ tools/testing/selftests/ublk/kublk.c | 46 ++++++++--- tools/testing/selftests/ublk/kublk.h | 22 ++++++ 3 files changed, 172 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/ublk/batch.c b/tools/testing/selftests/ublk/batch.c index b6bcdd529df9f..ca0ecc33e6a1a 100644 --- a/tools/testing/selftests/ublk/batch.c +++ b/tools/testing/selftests/ublk/batch.c @@ -151,3 +151,117 @@ void ublk_batch_free_buf(struct ublk_thread *t) { free_batch_commit_buf(t); } + +static void ublk_init_batch_cmd(struct ublk_thread *t, __u16 q_id, + struct io_uring_sqe *sqe, unsigned op, + unsigned short elem_bytes, + unsigned short nr_elem, + unsigned short buf_idx) +{ + struct ublk_batch_io *cmd; + __u64 user_data; + + cmd = (struct ublk_batch_io *)ublk_get_sqe_cmd(sqe); + + ublk_set_sqe_cmd_op(sqe, op); + + sqe->fd = 0; /* dev->fds[0] */ + sqe->opcode = IORING_OP_URING_CMD; + sqe->flags = IOSQE_FIXED_FILE; + + cmd->q_id = q_id; + cmd->flags = 0; + cmd->reserved = 0; + cmd->elem_bytes = elem_bytes; + cmd->nr_elem = nr_elem; + + user_data = build_user_data(buf_idx, _IOC_NR(op), 0, q_id, 0); + io_uring_sqe_set_data64(sqe, user_data); + + t->cmd_inflight += 1; + + ublk_dbg(UBLK_DBG_IO_CMD, "%s: thread %u qid %d cmd_op %x data %lx " + "nr_elem %u elem_bytes %u buf_size %u buf_idx %d " + "cmd_inflight %u\n", + __func__, t->idx, q_id, op, user_data, + cmd->nr_elem, cmd->elem_bytes, + nr_elem * elem_bytes, buf_idx, t->cmd_inflight); +} + +static void ublk_setup_commit_sqe(struct ublk_thread *t, + struct io_uring_sqe *sqe, + unsigned short buf_idx) +{ + struct ublk_batch_io *cmd; + + cmd = (struct ublk_batch_io *)ublk_get_sqe_cmd(sqe); + + sqe->rw_flags= IORING_URING_CMD_FIXED; + sqe->buf_index = buf_idx; + cmd->flags |= t->cmd_flags; +} + +int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q) +{ + unsigned short nr_elem = q->q_depth; + unsigned short buf_idx = ublk_alloc_commit_buf(t); + struct io_uring_sqe *sqe; + void *buf; + int i; + + ublk_assert(buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX); + + ublk_io_alloc_sqes(t, &sqe, 1); + + ublk_assert(nr_elem == q->q_depth); + buf = ublk_get_commit_buf(t, buf_idx); + for (i = 0; i < nr_elem; i++) { + struct ublk_batch_elem *elem = (struct ublk_batch_elem *)( + buf + i * t->commit_buf_elem_size); + struct ublk_io *io = &q->ios[i]; + + elem->tag = i; + elem->result = 0; + + if (ublk_queue_use_auto_zc(q)) + elem->buf_index = ublk_batch_io_buf_idx(t, q, i); + else if (!ublk_queue_no_buf(q)) + elem->buf_addr = (__u64)io->buf_addr; + } + + sqe->addr = (__u64)buf; + sqe->len = t->commit_buf_elem_size * nr_elem; + + ublk_init_batch_cmd(t, q->q_id, sqe, UBLK_U_IO_PREP_IO_CMDS, + t->commit_buf_elem_size, nr_elem, buf_idx); + ublk_setup_commit_sqe(t, sqe, buf_idx); + return 0; +} + +static void ublk_batch_compl_commit_cmd(struct ublk_thread *t, + const struct io_uring_cqe *cqe, + unsigned op) +{ + unsigned short buf_idx = user_data_to_tag(cqe->user_data); + + if (op == _IOC_NR(UBLK_U_IO_PREP_IO_CMDS)) + ublk_assert(cqe->res == 0); + else if (op == _IOC_NR(UBLK_U_IO_COMMIT_IO_CMDS)) + ;//assert(cqe->res == t->commit_buf_size); + else + ublk_assert(0); + + ublk_free_commit_buf(t, buf_idx); +} + +void ublk_batch_compl_cmd(struct ublk_thread *t, + const struct io_uring_cqe *cqe) +{ + unsigned op = user_data_to_op(cqe->user_data); + + if (op == _IOC_NR(UBLK_U_IO_PREP_IO_CMDS) || + op == _IOC_NR(UBLK_U_IO_COMMIT_IO_CMDS)) { + ublk_batch_compl_commit_cmd(t, cqe, op); + return; + } +} diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 0880581141369..2de422cf3f1c6 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -795,28 +795,32 @@ static void ublk_handle_cqe(struct ublk_thread *t, { struct ublk_dev *dev = t->dev; unsigned q_id = user_data_to_q_id(cqe->user_data); - struct ublk_queue *q = &dev->q[q_id]; unsigned cmd_op = user_data_to_op(cqe->user_data); if (cqe->res < 0 && cqe->res != -ENODEV) - ublk_err("%s: res %d userdata %llx queue state %x\n", __func__, - cqe->res, cqe->user_data, q->flags); + ublk_err("%s: res %d userdata %llx thread state %x\n", __func__, + cqe->res, cqe->user_data, t->state); - ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (qid %d tag %u cmd_op %u target %d/%d) stopping %d\n", - __func__, cqe->res, q->q_id, user_data_to_tag(cqe->user_data), - cmd_op, is_target_io(cqe->user_data), + ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (thread %d qid %d tag %u cmd_op %x " + "data %lx target %d/%d) stopping %d\n", + __func__, cqe->res, t->idx, q_id, + user_data_to_tag(cqe->user_data), + cmd_op, cqe->user_data, is_target_io(cqe->user_data), user_data_to_tgt_data(cqe->user_data), (t->state & UBLKS_T_STOPPING)); /* Don't retrieve io in case of target io */ if (is_target_io(cqe->user_data)) { - ublksrv_handle_tgt_cqe(t, q, cqe); + ublksrv_handle_tgt_cqe(t, &dev->q[q_id], cqe); return; } t->cmd_inflight--; - ublk_handle_uring_cmd(t, q, cqe); + if (ublk_thread_batch_io(t)) + ublk_batch_compl_cmd(t, cqe); + else + ublk_handle_uring_cmd(t, &dev->q[q_id], cqe); } static int ublk_reap_events_uring(struct ublk_thread *t) @@ -873,6 +877,22 @@ struct ublk_thread_info { unsigned long long extra_flags; }; +static void ublk_batch_setup_queues(struct ublk_thread *t) +{ + int i; + + /* setup all queues in the 1st thread */ + for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) { + struct ublk_queue *q = &t->dev->q[i]; + int ret; + + ret = ublk_batch_queue_prep_io_cmds(t, q); + ublk_assert(ret == 0); + ret = ublk_process_io(t); + ublk_assert(ret >= 0); + } +} + static void *ublk_io_handler_fn(void *data) { struct ublk_thread_info *info = data; @@ -897,8 +917,14 @@ static void *ublk_io_handler_fn(void *data) ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %u started\n", gettid(), dev_id, t->idx); - /* submit all io commands to ublk driver */ - ublk_submit_fetch_commands(t); + if (!ublk_thread_batch_io(t)) { + /* submit all io commands to ublk driver */ + ublk_submit_fetch_commands(t); + } else if (!t->idx) { + /* prepare all io commands in the 1st thread context */ + ublk_batch_setup_queues(t); + } + do { if (ublk_process_io(t) < 0) break; diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 4dd9d30533259..0cecde526a2c1 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -400,10 +400,16 @@ static inline void ublk_set_sqe_cmd_op(struct io_uring_sqe *sqe, __u32 cmd_op) addr[1] = 0; } +static inline unsigned short ublk_batch_io_buf_idx( + const struct ublk_thread *t, const struct ublk_queue *q, + unsigned tag); + static inline unsigned short ublk_io_buf_idx(const struct ublk_thread *t, const struct ublk_queue *q, unsigned tag) { + if (ublk_queue_batch_io(q)) + return ublk_batch_io_buf_idx(t, q, tag); return q->ios[tag].buf_index; } @@ -466,6 +472,22 @@ static inline int ublk_queue_no_buf(const struct ublk_queue *q) return ublk_queue_use_zc(q) || ublk_queue_use_auto_zc(q); } +/* + * Each IO's buffer index has to be calculated by this helper for + * UBLKS_T_BATCH_IO + */ +static inline unsigned short ublk_batch_io_buf_idx( + const struct ublk_thread *t, const struct ublk_queue *q, + unsigned tag) +{ + return tag; +} + +/* Queue UBLK_U_IO_PREP_IO_CMDS for a specific queue with batch elements */ +int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q); +/* Handle completion of batch I/O commands (prep/commit) */ +void ublk_batch_compl_cmd(struct ublk_thread *t, + const struct io_uring_cqe *cqe); /* Initialize batch I/O state and calculate buffer parameters */ void ublk_batch_prepare(struct ublk_thread *t); /* Allocate and register commit buffers for batch operations */ From 97760a0b278bdcf77c23322ffb540dfe6c367cce Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 23 Oct 2025 23:32:27 +0800 Subject: [PATCH 23/26] selftests: ublk: handle UBLK_U_IO_COMMIT_IO_CMDS Implement UBLK_U_IO_COMMIT_IO_CMDS to enable efficient batched completion of I/O operations in the batch I/O framework. This completes the batch I/O infrastructure by adding the commit phase that notifies the kernel about completed I/O operations: Key features: - Batch multiple I/O completions into single UBLK_U_IO_COMMIT_IO_CMDS - Dynamic commit buffer allocation and management per thread - Automatic commit buffer preparation before processing events - Commit buffer submission after processing completed I/Os - Integration with existing completion workflows Implementation details: - ublk_batch_prep_commit() allocates and initializes commit buffers - ublk_batch_complete_io() adds completed I/Os to current batch - ublk_batch_commit_io_cmds() submits batched completions to kernel - Modified ublk_process_io() to handle batch commit lifecycle - Enhanced ublk_complete_io() to route to batch or legacy completion The commit buffer stores completion information (tag, result, buffer details) for multiple I/Os, then submits them all at once, significantly reducing syscall overhead compared to individual I/O completions. Signed-off-by: Ming Lei --- tools/testing/selftests/ublk/batch.c | 74 ++++++++++++++++++++++++++-- tools/testing/selftests/ublk/kublk.c | 8 ++- tools/testing/selftests/ublk/kublk.h | 69 +++++++++++++++++--------- 3 files changed, 122 insertions(+), 29 deletions(-) diff --git a/tools/testing/selftests/ublk/batch.c b/tools/testing/selftests/ublk/batch.c index ca0ecc33e6a1a..83c9cad715917 100644 --- a/tools/testing/selftests/ublk/batch.c +++ b/tools/testing/selftests/ublk/batch.c @@ -175,7 +175,7 @@ static void ublk_init_batch_cmd(struct ublk_thread *t, __u16 q_id, cmd->elem_bytes = elem_bytes; cmd->nr_elem = nr_elem; - user_data = build_user_data(buf_idx, _IOC_NR(op), 0, q_id, 0); + user_data = build_user_data(buf_idx, _IOC_NR(op), nr_elem, q_id, 0); io_uring_sqe_set_data64(sqe, user_data); t->cmd_inflight += 1; @@ -246,9 +246,11 @@ static void ublk_batch_compl_commit_cmd(struct ublk_thread *t, if (op == _IOC_NR(UBLK_U_IO_PREP_IO_CMDS)) ublk_assert(cqe->res == 0); - else if (op == _IOC_NR(UBLK_U_IO_COMMIT_IO_CMDS)) - ;//assert(cqe->res == t->commit_buf_size); - else + else if (op == _IOC_NR(UBLK_U_IO_COMMIT_IO_CMDS)) { + int nr_elem = user_data_to_tgt_data(cqe->user_data); + + ublk_assert(cqe->res == t->commit_buf_elem_size * nr_elem); + } else ublk_assert(0); ublk_free_commit_buf(t, buf_idx); @@ -265,3 +267,67 @@ void ublk_batch_compl_cmd(struct ublk_thread *t, return; } } + +void ublk_batch_commit_io_cmds(struct ublk_thread *t) +{ + struct io_uring_sqe *sqe; + unsigned short buf_idx; + unsigned short nr_elem = t->commit.done; + + /* nothing to commit */ + if (!nr_elem) { + ublk_free_commit_buf(t, t->commit.buf_idx); + return; + } + + ublk_io_alloc_sqes(t, &sqe, 1); + buf_idx = t->commit.buf_idx; + sqe->addr = (__u64)t->commit.elem; + sqe->len = nr_elem * t->commit_buf_elem_size; + + /* commit isn't per-queue command */ + ublk_init_batch_cmd(t, t->commit.q_id, sqe, UBLK_U_IO_COMMIT_IO_CMDS, + t->commit_buf_elem_size, nr_elem, buf_idx); + ublk_setup_commit_sqe(t, sqe, buf_idx); +} + +static void ublk_batch_init_commit(struct ublk_thread *t, + unsigned short buf_idx) +{ + /* so far only support 1:1 queue/thread mapping */ + t->commit.q_id = t->idx; + t->commit.buf_idx = buf_idx; + t->commit.elem = ublk_get_commit_buf(t, buf_idx); + t->commit.done = 0; + t->commit.count = t->commit_buf_size / + t->commit_buf_elem_size; +} + +void ublk_batch_prep_commit(struct ublk_thread *t) +{ + unsigned short buf_idx = ublk_alloc_commit_buf(t); + + ublk_assert(buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX); + ublk_batch_init_commit(t, buf_idx); +} + +void ublk_batch_complete_io(struct ublk_thread *t, struct ublk_queue *q, + unsigned tag, int res) +{ + struct batch_commit_buf *cb = &t->commit; + struct ublk_batch_elem *elem = (struct ublk_batch_elem *)(cb->elem + + cb->done * t->commit_buf_elem_size); + struct ublk_io *io = &q->ios[tag]; + + ublk_assert(q->q_id == t->commit.q_id); + + elem->tag = tag; + elem->buf_index = ublk_batch_io_buf_idx(t, q, tag); + elem->result = res; + + if (!ublk_queue_no_buf(q)) + elem->buf_addr = (__u64) (uintptr_t) io->buf_addr; + + cb->done += 1; + ublk_assert(cb->done <= cb->count); +} diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 2de422cf3f1c6..62bee39db52b5 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -852,7 +852,13 @@ static int ublk_process_io(struct ublk_thread *t) return -ENODEV; ret = io_uring_submit_and_wait(&t->ring, 1); - reapped = ublk_reap_events_uring(t); + if (ublk_thread_batch_io(t)) { + ublk_batch_prep_commit(t); + reapped = ublk_reap_events_uring(t); + ublk_batch_commit_io_cmds(t); + } else { + reapped = ublk_reap_events_uring(t); + } ublk_dbg(UBLK_DBG_THREAD, "submit result %d, reapped %d stop %d idle %d\n", ret, reapped, (t->state & UBLKS_T_STOPPING), diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 0cecde526a2c1..5a90b99495e81 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -182,6 +182,14 @@ struct ublk_batch_elem { __u64 buf_addr; }; +struct batch_commit_buf { + unsigned short q_id; + unsigned short buf_idx; + void *elem; + unsigned short done; + unsigned short count; +}; + struct ublk_thread { struct ublk_dev *dev; struct io_uring ring; @@ -210,6 +218,7 @@ struct ublk_thread { void *commit_buf; #define UBLKS_T_COMMIT_BUF_INV_IDX ((unsigned short)-1) struct allocator commit_buf_alloc; + struct batch_commit_buf commit; }; struct ublk_dev { @@ -418,30 +427,6 @@ static inline struct ublk_io *ublk_get_io(struct ublk_queue *q, unsigned tag) return &q->ios[tag]; } -static inline int ublk_complete_io(struct ublk_thread *t, struct ublk_queue *q, - unsigned tag, int res) -{ - struct ublk_io *io = &q->ios[tag]; - - ublk_mark_io_done(io, res); - - return ublk_queue_io_cmd(t, io); -} - -static inline void ublk_queued_tgt_io(struct ublk_thread *t, struct ublk_queue *q, - unsigned tag, int queued) -{ - if (queued < 0) - ublk_complete_io(t, q, tag, queued); - else { - struct ublk_io *io = ublk_get_io(q, tag); - - t->io_inflight += queued; - io->tgt_ios = queued; - io->result = 0; - } -} - static inline int ublk_completed_tgt_io(struct ublk_thread *t, struct ublk_queue *q, unsigned tag) { @@ -495,6 +480,42 @@ int ublk_batch_alloc_buf(struct ublk_thread *t); /* Free commit buffers and cleanup batch allocator */ void ublk_batch_free_buf(struct ublk_thread *t); +/* Prepare a new commit buffer for batching completed I/O operations */ +void ublk_batch_prep_commit(struct ublk_thread *t); +/* Submit UBLK_U_IO_COMMIT_IO_CMDS with batched completed I/O operations */ +void ublk_batch_commit_io_cmds(struct ublk_thread *t); +/* Add a completed I/O operation to the current batch commit buffer */ +void ublk_batch_complete_io(struct ublk_thread *t, struct ublk_queue *q, + unsigned tag, int res); + +static inline int ublk_complete_io(struct ublk_thread *t, struct ublk_queue *q, + unsigned tag, int res) +{ + if (ublk_queue_batch_io(q)) { + ublk_batch_complete_io(t, q, tag, res); + return 0; + } else { + struct ublk_io *io = &q->ios[tag]; + + ublk_mark_io_done(io, res); + return ublk_queue_io_cmd(t, io); + } +} + +static inline void ublk_queued_tgt_io(struct ublk_thread *t, struct ublk_queue *q, + unsigned tag, int queued) +{ + if (queued < 0) + ublk_complete_io(t, q, tag, queued); + else { + struct ublk_io *io = ublk_get_io(q, tag); + + t->io_inflight += queued; + io->tgt_ios = queued; + io->result = 0; + } +} + extern const struct ublk_tgt_ops null_tgt_ops; extern const struct ublk_tgt_ops loop_tgt_ops; extern const struct ublk_tgt_ops stripe_tgt_ops; From af6f318baf7e8096409acb377997f36de6e2843f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 23 Oct 2025 23:32:28 +0800 Subject: [PATCH 24/26] selftests: ublk: handle UBLK_U_IO_FETCH_IO_CMDS Add support for UBLK_U_IO_FETCH_IO_CMDS to enable efficient batch fetching of I/O commands using multishot io_uring operations. Key improvements: - Implement multishot UBLK_U_IO_FETCH_IO_CMDS for continuous command fetching - Add fetch buffer management with page-aligned, mlocked buffers - Process fetched I/O command tags from kernel-provided buffers - Integrate fetch operations with existing batch I/O infrastructure - Significantly reduce uring_cmd issuing overhead through batching The implementation uses two fetch buffers per thread with automatic requeuing to maintain continuous I/O command flow. Each fetch operation retrieves multiple command tags in a single syscall, dramatically improving performance compared to individual command fetching. Technical details: - Fetch buffers are page-aligned and mlocked for optimal performance - Uses IORING_URING_CMD_MULTISHOT for continuous operation - Automatic buffer management and requeuing on completion - Enhanced CQE handling for fetch command completions Signed-off-by: Ming Lei --- tools/testing/selftests/ublk/batch.c | 137 ++++++++++++++++++++++++++- tools/testing/selftests/ublk/kublk.c | 14 ++- tools/testing/selftests/ublk/kublk.h | 14 +++ 3 files changed, 161 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/ublk/batch.c b/tools/testing/selftests/ublk/batch.c index 83c9cad715917..69fb417ebf93f 100644 --- a/tools/testing/selftests/ublk/batch.c +++ b/tools/testing/selftests/ublk/batch.c @@ -141,15 +141,63 @@ void ublk_batch_prepare(struct ublk_thread *t) t->nr_bufs); } +static void free_batch_fetch_buf(struct ublk_thread *t) +{ + int i; + + for (i = 0; i < UBLKS_T_NR_FETCH_BUF; i++) { + io_uring_free_buf_ring(&t->ring, t->fetch[i].br, 1, i); + munlock(t->fetch[i].fetch_buf, t->fetch[i].fetch_buf_size); + free(t->fetch[i].fetch_buf); + } +} + +static int alloc_batch_fetch_buf(struct ublk_thread *t) +{ + /* page aligned fetch buffer, and it is mlocked for speedup delivery */ + unsigned pg_sz = getpagesize(); + unsigned buf_size = round_up(t->dev->dev_info.queue_depth * 2, pg_sz); + int ret; + int i = 0; + + for (i = 0; i < UBLKS_T_NR_FETCH_BUF; i++) { + t->fetch[i].fetch_buf_size = buf_size; + + if (posix_memalign((void **)&t->fetch[i].fetch_buf, pg_sz, + t->fetch[i].fetch_buf_size)) + return -ENOMEM; + + /* lock fetch buffer page for fast fetching */ + if (mlock(t->fetch[i].fetch_buf, t->fetch[i].fetch_buf_size)) + ublk_err("%s: can't lock fetch buffer %s\n", __func__, + strerror(errno)); + t->fetch[i].br = io_uring_setup_buf_ring(&t->ring, 1, + i, IOU_PBUF_RING_INC, &ret); + if (!t->fetch[i].br) { + ublk_err("Buffer ring register failed %d\n", ret); + return ret; + } + } + + return 0; +} + int ublk_batch_alloc_buf(struct ublk_thread *t) { + int ret; + ublk_assert(t->nr_commit_buf < 16); - return alloc_batch_commit_buf(t); + + ret = alloc_batch_commit_buf(t); + if (ret) + return ret; + return alloc_batch_fetch_buf(t); } void ublk_batch_free_buf(struct ublk_thread *t) { free_batch_commit_buf(t); + free_batch_fetch_buf(t); } static void ublk_init_batch_cmd(struct ublk_thread *t, __u16 q_id, @@ -201,6 +249,79 @@ static void ublk_setup_commit_sqe(struct ublk_thread *t, cmd->flags |= t->cmd_flags; } +static void ublk_batch_queue_fetch(struct ublk_thread *t, + struct ublk_queue *q, + unsigned short buf_idx) +{ + unsigned short nr_elem = t->fetch[buf_idx].fetch_buf_size / 2; + struct io_uring_sqe *sqe; + + io_uring_buf_ring_add(t->fetch[buf_idx].br, t->fetch[buf_idx].fetch_buf, + t->fetch[buf_idx].fetch_buf_size, + 0, 0, 0); + io_uring_buf_ring_advance(t->fetch[buf_idx].br, 1); + + ublk_io_alloc_sqes(t, &sqe, 1); + + ublk_init_batch_cmd(t, q->q_id, sqe, UBLK_U_IO_FETCH_IO_CMDS, 2, nr_elem, + buf_idx); + + sqe->rw_flags= IORING_URING_CMD_MULTISHOT; + sqe->buf_group = buf_idx; + sqe->flags |= IOSQE_BUFFER_SELECT; + + t->fetch[buf_idx].fetch_buf_off = 0; +} + +void ublk_batch_start_fetch(struct ublk_thread *t, + struct ublk_queue *q) +{ + int i; + + for (i = 0; i < UBLKS_T_NR_FETCH_BUF; i++) + ublk_batch_queue_fetch(t, q, i); +} + +static unsigned short ublk_compl_batch_fetch(struct ublk_thread *t, + struct ublk_queue *q, + const struct io_uring_cqe *cqe) +{ + unsigned short buf_idx = user_data_to_tag(cqe->user_data); + unsigned start = t->fetch[buf_idx].fetch_buf_off; + unsigned end = start + cqe->res; + void *buf = t->fetch[buf_idx].fetch_buf; + int i; + + if (cqe->res < 0) + return buf_idx; + + if ((end - start) / 2 > q->q_depth) { + ublk_err("%s: fetch duplicated ios offset %u count %u\n", __func__, start, cqe->res); + + for (i = start; i < end; i += 2) { + unsigned short tag = *(unsigned short *)(buf + i); + + ublk_err("%u ", tag); + } + ublk_err("\n"); + } + + for (i = start; i < end; i += 2) { + unsigned short tag = *(unsigned short *)(buf + i); + + if (tag == UBLK_BATCH_IO_UNUSED_TAG) + continue; + + if (tag >= q->q_depth) + ublk_err("%s: bad tag %u\n", __func__, tag); + + if (q->tgt_ops->queue_io) + q->tgt_ops->queue_io(t, q, tag); + } + t->fetch[buf_idx].fetch_buf_off = end; + return buf_idx; +} + int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q) { unsigned short nr_elem = q->q_depth; @@ -260,12 +381,26 @@ void ublk_batch_compl_cmd(struct ublk_thread *t, const struct io_uring_cqe *cqe) { unsigned op = user_data_to_op(cqe->user_data); + struct ublk_queue *q; + unsigned buf_idx; + unsigned q_id; if (op == _IOC_NR(UBLK_U_IO_PREP_IO_CMDS) || op == _IOC_NR(UBLK_U_IO_COMMIT_IO_CMDS)) { ublk_batch_compl_commit_cmd(t, cqe, op); return; } + + /* FETCH command is per queue */ + q_id = user_data_to_q_id(cqe->user_data); + q = &t->dev->q[q_id]; + buf_idx = ublk_compl_batch_fetch(t, q, cqe); + + if (cqe->res < 0 && cqe->res != -ENOBUFS) { + t->state |= UBLKS_T_STOPPING; + } else if (!(cqe->flags & IORING_CQE_F_MORE) || cqe->res == -ENOBUFS) { + ublk_batch_queue_fetch(t, q, buf_idx); + } } void ublk_batch_commit_io_cmds(struct ublk_thread *t) diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 62bee39db52b5..f6ed18b11541b 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -493,6 +493,10 @@ static int ublk_thread_init(struct ublk_thread *t, unsigned long long extra_flag int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth; int ret; + /* FETCH_IO_CMDS is multishot, so increase cq depth for BATCH_IO */ + if (ublk_dev_batch_io(dev)) + cq_depth += dev->dev_info.queue_depth; + ret = ublk_setup_ring(&t->ring, ring_depth, cq_depth, IORING_SETUP_COOP_TASKRUN | IORING_SETUP_SINGLE_ISSUER | @@ -797,7 +801,7 @@ static void ublk_handle_cqe(struct ublk_thread *t, unsigned q_id = user_data_to_q_id(cqe->user_data); unsigned cmd_op = user_data_to_op(cqe->user_data); - if (cqe->res < 0 && cqe->res != -ENODEV) + if (cqe->res < 0 && cqe->res != -ENODEV && cqe->res != -ENOBUFS) ublk_err("%s: res %d userdata %llx thread state %x\n", __func__, cqe->res, cqe->user_data, t->state); @@ -926,9 +930,13 @@ static void *ublk_io_handler_fn(void *data) if (!ublk_thread_batch_io(t)) { /* submit all io commands to ublk driver */ ublk_submit_fetch_commands(t); - } else if (!t->idx) { + } else { + struct ublk_queue *q = &t->dev->q[t->idx]; + /* prepare all io commands in the 1st thread context */ - ublk_batch_setup_queues(t); + if (!t->idx) + ublk_batch_setup_queues(t); + ublk_batch_start_fetch(t, q); } do { diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 5a90b99495e81..d647281622c74 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -190,6 +190,13 @@ struct batch_commit_buf { unsigned short count; }; +struct batch_fetch_buf { + struct io_uring_buf_ring *br; + void *fetch_buf; + unsigned int fetch_buf_size; + unsigned int fetch_buf_off; +}; + struct ublk_thread { struct ublk_dev *dev; struct io_uring ring; @@ -219,6 +226,10 @@ struct ublk_thread { #define UBLKS_T_COMMIT_BUF_INV_IDX ((unsigned short)-1) struct allocator commit_buf_alloc; struct batch_commit_buf commit; + + /* FETCH_IO_CMDS buffer */ +#define UBLKS_T_NR_FETCH_BUF 2 + struct batch_fetch_buf fetch[UBLKS_T_NR_FETCH_BUF]; }; struct ublk_dev { @@ -470,6 +481,9 @@ static inline unsigned short ublk_batch_io_buf_idx( /* Queue UBLK_U_IO_PREP_IO_CMDS for a specific queue with batch elements */ int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q); +/* Start fetching I/O commands using multishot UBLK_U_IO_FETCH_IO_CMDS */ +void ublk_batch_start_fetch(struct ublk_thread *t, + struct ublk_queue *q); /* Handle completion of batch I/O commands (prep/commit) */ void ublk_batch_compl_cmd(struct ublk_thread *t, const struct io_uring_cqe *cqe); From cf2d0cc2a7e0c68f2b1c21a1ebef6ca0fa8737c1 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 23 Oct 2025 23:32:29 +0800 Subject: [PATCH 25/26] selftests: ublk: add --batch/-b for enabling F_BATCH_IO Add --batch/-b for enabling F_BATCH_IO. Add generic_14 for covering its basic function. Add stress_06 and stress_07 for covering stress test. Signed-off-by: Ming Lei --- tools/testing/selftests/ublk/Makefile | 3 ++ tools/testing/selftests/ublk/kublk.c | 13 +++++- .../testing/selftests/ublk/test_generic_14.sh | 32 +++++++++++++ .../testing/selftests/ublk/test_stress_06.sh | 45 +++++++++++++++++++ .../testing/selftests/ublk/test_stress_07.sh | 44 ++++++++++++++++++ 5 files changed, 136 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/ublk/test_generic_14.sh create mode 100755 tools/testing/selftests/ublk/test_stress_06.sh create mode 100755 tools/testing/selftests/ublk/test_stress_07.sh diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index a724276622d06..cbf57113b1a6a 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -21,6 +21,7 @@ TEST_PROGS += test_generic_10.sh TEST_PROGS += test_generic_11.sh TEST_PROGS += test_generic_12.sh TEST_PROGS += test_generic_13.sh +TEST_PROGS += test_generic_14.sh TEST_PROGS += test_null_01.sh TEST_PROGS += test_null_02.sh @@ -39,6 +40,8 @@ TEST_PROGS += test_stress_02.sh TEST_PROGS += test_stress_03.sh TEST_PROGS += test_stress_04.sh TEST_PROGS += test_stress_05.sh +TEST_PROGS += test_stress_06.sh +TEST_PROGS += test_stress_07.sh TEST_GEN_PROGS_EXTENDED = kublk diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index f6ed18b11541b..c15cb1cfec063 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -1462,6 +1462,7 @@ static int cmd_dev_get_features(void) FEAT_NAME(UBLK_F_QUIESCE), FEAT_NAME(UBLK_F_PER_IO_DAEMON), FEAT_NAME(UBLK_F_BUF_REG_OFF_DAEMON), + FEAT_NAME(UBLK_F_BATCH_IO), }; struct ublk_dev *dev; __u64 features = 0; @@ -1557,6 +1558,7 @@ static void __cmd_create_help(char *exe, bool recovery) printf("\t[--foreground] [--quiet] [-z] [--auto_zc] [--auto_zc_fallback] [--debug_mask mask] [-r 0|1 ] [-g]\n"); printf("\t[-e 0|1 ] [-i 0|1] [--no_ublk_fixed_fd]\n"); printf("\t[--nthreads threads] [--per_io_tasks]\n"); + printf("\t[--batch|-b]\n"); printf("\t[target options] [backfile1] [backfile2] ...\n"); printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n"); printf("\tdefault: nthreads=nr_queues"); @@ -1619,6 +1621,7 @@ int main(int argc, char *argv[]) { "nthreads", 1, NULL, 0 }, { "per_io_tasks", 0, NULL, 0 }, { "no_ublk_fixed_fd", 0, NULL, 0 }, + { "batch", 0, NULL, 'b'}, { 0, 0, 0, 0 } }; const struct ublk_tgt_ops *ops = NULL; @@ -1640,9 +1643,12 @@ int main(int argc, char *argv[]) opterr = 0; optind = 2; - while ((opt = getopt_long(argc, argv, "t:n:d:q:r:e:i:s:gaz", + while ((opt = getopt_long(argc, argv, "t:n:d:q:r:e:i:s:gazb", longopts, &option_idx)) != -1) { switch (opt) { + case 'b': + ctx.flags |= UBLK_F_BATCH_IO; + break; case 'a': ctx.all = 1; break; @@ -1723,6 +1729,11 @@ int main(int argc, char *argv[]) } } + if (ctx.per_io_tasks && (ctx.flags & UBLK_F_BATCH_IO)) { + ublk_err("per_io_task and F_BATCH_IO conflict\n"); + return -EINVAL; + } + /* auto_zc_fallback depends on F_AUTO_BUF_REG & F_SUPPORT_ZERO_COPY */ if (ctx.auto_zc_fallback && !((ctx.flags & UBLK_F_AUTO_BUF_REG) && diff --git a/tools/testing/selftests/ublk/test_generic_14.sh b/tools/testing/selftests/ublk/test_generic_14.sh new file mode 100755 index 0000000000000..ac457b45f4391 --- /dev/null +++ b/tools/testing/selftests/ublk/test_generic_14.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="generic_13" +ERR_CODE=0 + +if ! _have_feature "BATCH_IO"; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "generic" "test basic function of UBLK_F_BATCH_IO" + +_create_backfile 0 256M +_create_backfile 1 256M + +dev_id=$(_add_ublk_dev -t loop -q 2 -b "${UBLK_BACKFILES[0]}") +_check_add_dev $TID $? + +if ! _mkfs_mount_test /dev/ublkb"${dev_id}"; then + _cleanup_test "generic" + _show_result $TID 255 +fi + +dev_id=$(_add_ublk_dev -t stripe -b --auto_zc "${UBLK_BACKFILES[0]}" "${UBLK_BACKFILES[1]}") +_check_add_dev $TID $? +_mkfs_mount_test /dev/ublkb"${dev_id}" +ERR_CODE=$? + +_cleanup_test "generic" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_stress_06.sh b/tools/testing/selftests/ublk/test_stress_06.sh new file mode 100755 index 0000000000000..190db0b4f2ade --- /dev/null +++ b/tools/testing/selftests/ublk/test_stress_06.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh +TID="stress_06" +ERR_CODE=0 + +ublk_io_and_remove() +{ + run_io_and_remove "$@" + ERR_CODE=$? + if [ ${ERR_CODE} -ne 0 ]; then + echo "$TID failure: $*" + _show_result $TID $ERR_CODE + fi +} + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + +if ! _have_feature "ZERO_COPY"; then + exit "$UBLK_SKIP_CODE" +fi +if ! _have_feature "AUTO_BUF_REG"; then + exit "$UBLK_SKIP_CODE" +fi +if ! _have_feature "BATCH_IO"; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "stress" "run IO and remove device(zero copy)" + +_create_backfile 0 256M +_create_backfile 1 128M +_create_backfile 2 128M + +ublk_io_and_remove 8G -t null -q 4 -b & +ublk_io_and_remove 256M -t loop -q 4 --auto_zc -b "${UBLK_BACKFILES[0]}" & +ublk_io_and_remove 256M -t stripe -q 4 --auto_zc -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +ublk_io_and_remove 8G -t null -q 4 -z --auto_zc --auto_zc_fallback -b & +wait + +_cleanup_test "stress" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_stress_07.sh b/tools/testing/selftests/ublk/test_stress_07.sh new file mode 100755 index 0000000000000..1b6bdb31da031 --- /dev/null +++ b/tools/testing/selftests/ublk/test_stress_07.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh +TID="stress_07" +ERR_CODE=0 + +ublk_io_and_kill_daemon() +{ + run_io_and_kill_daemon "$@" + ERR_CODE=$? + if [ ${ERR_CODE} -ne 0 ]; then + echo "$TID failure: $*" + _show_result $TID $ERR_CODE + fi +} + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi +if ! _have_feature "ZERO_COPY"; then + exit "$UBLK_SKIP_CODE" +fi +if ! _have_feature "AUTO_BUF_REG"; then + exit "$UBLK_SKIP_CODE" +fi +if ! _have_feature "BATCH_IO"; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "stress" "run IO and kill ublk server(zero copy)" + +_create_backfile 0 256M +_create_backfile 1 128M +_create_backfile 2 128M + +ublk_io_and_kill_daemon 8G -t null -q 4 -z -b & +ublk_io_and_kill_daemon 256M -t loop -q 4 --auto_zc -b "${UBLK_BACKFILES[0]}" & +ublk_io_and_kill_daemon 256M -t stripe -q 4 -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +ublk_io_and_kill_daemon 8G -t null -q 4 -z --auto_zc --auto_zc_fallback -b & +wait + +_cleanup_test "stress" +_show_result $TID $ERR_CODE From 3eda0c2bc2a5b98d8b793b2f6187bb987281a46c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 23 Oct 2025 23:32:30 +0800 Subject: [PATCH 26/26] selftests: ublk: support arbitrary threads/queues combination Enable flexible thread-to-queue mapping in batch I/O mode to support arbitrary combinations of threads and queues, improving resource utilization and scalability. Key improvements: - Support N:M thread-to-queue mapping (previously limited to 1:1) - Dynamic buffer allocation based on actual queue assignment per thread - Thread-safe queue preparation with spinlock protection - Intelligent buffer index calculation for multi-queue scenarios - Enhanced validation for thread/queue combination constraints Implementation details: - Add q_thread_map matrix to track queue-to-thread assignments - Dynamic allocation of commit and fetch buffers per thread - Round-robin queue assignment algorithm for load balancing - Per-queue spinlock to prevent race conditions during prep - Updated buffer index calculation using queue position within thread This enables efficient configurations like: - Any other N:M combinations for optimal resource matching Testing: - Added test_generic_14.sh: 4 threads vs 1 queue - Added test_generic_15.sh: 1 thread vs 4 queues - Validates correctness across different mapping scenarios Signed-off-by: Ming Lei --- tools/testing/selftests/ublk/Makefile | 2 + tools/testing/selftests/ublk/batch.c | 200 +++++++++++++++--- tools/testing/selftests/ublk/kublk.c | 34 ++- tools/testing/selftests/ublk/kublk.h | 37 +++- .../testing/selftests/ublk/test_generic_15.sh | 30 +++ .../testing/selftests/ublk/test_generic_16.sh | 30 +++ 6 files changed, 285 insertions(+), 48 deletions(-) create mode 100755 tools/testing/selftests/ublk/test_generic_15.sh create mode 100755 tools/testing/selftests/ublk/test_generic_16.sh diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index cbf57113b1a6a..3dbd9a8577165 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -22,6 +22,8 @@ TEST_PROGS += test_generic_11.sh TEST_PROGS += test_generic_12.sh TEST_PROGS += test_generic_13.sh TEST_PROGS += test_generic_14.sh +TEST_PROGS += test_generic_15.sh +TEST_PROGS += test_generic_16.sh TEST_PROGS += test_null_01.sh TEST_PROGS += test_null_02.sh diff --git a/tools/testing/selftests/ublk/batch.c b/tools/testing/selftests/ublk/batch.c index 69fb417ebf93f..b640cf325472c 100644 --- a/tools/testing/selftests/ublk/batch.c +++ b/tools/testing/selftests/ublk/batch.c @@ -70,6 +70,7 @@ static void free_batch_commit_buf(struct ublk_thread *t) { free(t->commit_buf); allocator_deinit(&t->commit_buf_alloc); + free(t->commit); } static int alloc_batch_commit_buf(struct ublk_thread *t) @@ -79,7 +80,13 @@ static int alloc_batch_commit_buf(struct ublk_thread *t) struct iovec iov[t->nr_commit_buf]; unsigned int page_sz = getpagesize(); void *buf = NULL; - int i, ret; + int i, ret, j = 0; + + t->commit = calloc(t->nr_queues, sizeof(*t->commit)); + for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) { + if (t->dev->q_thread_map[t->idx][i]) + t->commit[j++].q_id = i; + } allocator_init(&t->commit_buf_alloc, t->nr_commit_buf); @@ -108,6 +115,17 @@ static int alloc_batch_commit_buf(struct ublk_thread *t) return ret; } +static unsigned int ublk_thread_nr_queues(const struct ublk_thread *t) +{ + int i; + int ret = 0; + + for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) + ret += !!t->dev->q_thread_map[t->idx][i]; + + return ret; +} + void ublk_batch_prepare(struct ublk_thread *t) { /* @@ -120,10 +138,13 @@ void ublk_batch_prepare(struct ublk_thread *t) */ struct ublk_queue *q = &t->dev->q[0]; + /* cache nr_queues because we don't support dynamic load-balance yet */ + t->nr_queues = ublk_thread_nr_queues(t); + t->commit_buf_elem_size = ublk_commit_elem_buf_size(t->dev); t->commit_buf_size = ublk_commit_buf_size(t); t->commit_buf_start = t->nr_bufs; - t->nr_commit_buf = 2; + t->nr_commit_buf = 2 * t->nr_queues; t->nr_bufs += t->nr_commit_buf; t->cmd_flags = 0; @@ -145,11 +166,12 @@ static void free_batch_fetch_buf(struct ublk_thread *t) { int i; - for (i = 0; i < UBLKS_T_NR_FETCH_BUF; i++) { + for (i = 0; i < t->nr_fetch_bufs; i++) { io_uring_free_buf_ring(&t->ring, t->fetch[i].br, 1, i); munlock(t->fetch[i].fetch_buf, t->fetch[i].fetch_buf_size); free(t->fetch[i].fetch_buf); } + free(t->fetch); } static int alloc_batch_fetch_buf(struct ublk_thread *t) @@ -160,7 +182,12 @@ static int alloc_batch_fetch_buf(struct ublk_thread *t) int ret; int i = 0; - for (i = 0; i < UBLKS_T_NR_FETCH_BUF; i++) { + /* double fetch buffer for each queue */ + t->nr_fetch_bufs = t->nr_queues * 2; + t->fetch = calloc(t->nr_fetch_bufs, sizeof(*t->fetch)); + + /* allocate one buffer for each queue */ + for (i = 0; i < t->nr_fetch_bufs; i++) { t->fetch[i].fetch_buf_size = buf_size; if (posix_memalign((void **)&t->fetch[i].fetch_buf, pg_sz, @@ -186,7 +213,7 @@ int ublk_batch_alloc_buf(struct ublk_thread *t) { int ret; - ublk_assert(t->nr_commit_buf < 16); + ublk_assert(t->nr_commit_buf < 2 * UBLK_MAX_QUEUES); ret = alloc_batch_commit_buf(t); if (ret) @@ -273,13 +300,20 @@ static void ublk_batch_queue_fetch(struct ublk_thread *t, t->fetch[buf_idx].fetch_buf_off = 0; } -void ublk_batch_start_fetch(struct ublk_thread *t, - struct ublk_queue *q) +void ublk_batch_start_fetch(struct ublk_thread *t) { int i; + int j = 0; - for (i = 0; i < UBLKS_T_NR_FETCH_BUF; i++) - ublk_batch_queue_fetch(t, q, i); + for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) { + if (t->dev->q_thread_map[t->idx][i]) { + struct ublk_queue *q = &t->dev->q[i]; + + /* submit two fetch commands for each queue */ + ublk_batch_queue_fetch(t, q, j++); + ublk_batch_queue_fetch(t, q, j++); + } + } } static unsigned short ublk_compl_batch_fetch(struct ublk_thread *t, @@ -322,7 +356,7 @@ static unsigned short ublk_compl_batch_fetch(struct ublk_thread *t, return buf_idx; } -int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q) +static int __ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q) { unsigned short nr_elem = q->q_depth; unsigned short buf_idx = ublk_alloc_commit_buf(t); @@ -359,6 +393,22 @@ int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q) return 0; } +int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q) +{ + int ret = 0; + + pthread_spin_lock(&q->lock); + if (q->flags & UBLKS_Q_PREPARED) + goto unlock; + ret = __ublk_batch_queue_prep_io_cmds(t, q); + if (!ret) + q->flags |= UBLKS_Q_PREPARED; +unlock: + pthread_spin_unlock(&q->lock); + + return ret; +} + static void ublk_batch_compl_commit_cmd(struct ublk_thread *t, const struct io_uring_cqe *cqe, unsigned op) @@ -403,59 +453,89 @@ void ublk_batch_compl_cmd(struct ublk_thread *t, } } -void ublk_batch_commit_io_cmds(struct ublk_thread *t) +static void __ublk_batch_commit_io_cmds(struct ublk_thread *t, + struct batch_commit_buf *cb) { struct io_uring_sqe *sqe; unsigned short buf_idx; - unsigned short nr_elem = t->commit.done; + unsigned short nr_elem = cb->done; /* nothing to commit */ if (!nr_elem) { - ublk_free_commit_buf(t, t->commit.buf_idx); + ublk_free_commit_buf(t, cb->buf_idx); return; } ublk_io_alloc_sqes(t, &sqe, 1); - buf_idx = t->commit.buf_idx; - sqe->addr = (__u64)t->commit.elem; + buf_idx = cb->buf_idx; + sqe->addr = (__u64)cb->elem; sqe->len = nr_elem * t->commit_buf_elem_size; /* commit isn't per-queue command */ - ublk_init_batch_cmd(t, t->commit.q_id, sqe, UBLK_U_IO_COMMIT_IO_CMDS, + ublk_init_batch_cmd(t, cb->q_id, sqe, UBLK_U_IO_COMMIT_IO_CMDS, t->commit_buf_elem_size, nr_elem, buf_idx); ublk_setup_commit_sqe(t, sqe, buf_idx); } -static void ublk_batch_init_commit(struct ublk_thread *t, - unsigned short buf_idx) +void ublk_batch_commit_io_cmds(struct ublk_thread *t) +{ + int i; + + for (i = 0; i < t->nr_queues; i++) { + struct batch_commit_buf *cb = &t->commit[i]; + + if (cb->buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX) + __ublk_batch_commit_io_cmds(t, cb); + } + +} + +static void __ublk_batch_init_commit(struct ublk_thread *t, + struct batch_commit_buf *cb, + unsigned short buf_idx) { /* so far only support 1:1 queue/thread mapping */ - t->commit.q_id = t->idx; - t->commit.buf_idx = buf_idx; - t->commit.elem = ublk_get_commit_buf(t, buf_idx); - t->commit.done = 0; - t->commit.count = t->commit_buf_size / + cb->buf_idx = buf_idx; + cb->elem = ublk_get_commit_buf(t, buf_idx); + cb->done = 0; + cb->count = t->commit_buf_size / t->commit_buf_elem_size; } -void ublk_batch_prep_commit(struct ublk_thread *t) +/* COMMIT_IO_CMDS is per-queue command, so use its own commit buffer */ +static void ublk_batch_init_commit(struct ublk_thread *t, + struct batch_commit_buf *cb) { unsigned short buf_idx = ublk_alloc_commit_buf(t); ublk_assert(buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX); - ublk_batch_init_commit(t, buf_idx); + ublk_assert(!ublk_batch_commit_prepared(cb)); + + __ublk_batch_init_commit(t, cb, buf_idx); +} + +void ublk_batch_prep_commit(struct ublk_thread *t) +{ + int i; + + for (i = 0; i < t->nr_queues; i++) + t->commit[i].buf_idx = UBLKS_T_COMMIT_BUF_INV_IDX; } void ublk_batch_complete_io(struct ublk_thread *t, struct ublk_queue *q, unsigned tag, int res) { - struct batch_commit_buf *cb = &t->commit; - struct ublk_batch_elem *elem = (struct ublk_batch_elem *)(cb->elem + - cb->done * t->commit_buf_elem_size); + unsigned q_t_idx = ublk_queue_idx_in_thread(t, q); + struct batch_commit_buf *cb = &t->commit[q_t_idx]; + struct ublk_batch_elem *elem; struct ublk_io *io = &q->ios[tag]; - ublk_assert(q->q_id == t->commit.q_id); + if (!ublk_batch_commit_prepared(cb)) + ublk_batch_init_commit(t, cb); + ublk_assert(q->q_id == cb->q_id); + + elem = (struct ublk_batch_elem *)(cb->elem + cb->done * t->commit_buf_elem_size); elem->tag = tag; elem->buf_index = ublk_batch_io_buf_idx(t, q, tag); elem->result = res; @@ -466,3 +546,65 @@ void ublk_batch_complete_io(struct ublk_thread *t, struct ublk_queue *q, cb->done += 1; ublk_assert(cb->done <= cb->count); } + +void ublk_batch_setup_map(struct ublk_dev *dev) +{ + int i, j; + int nthreads = dev->nthreads; + int queues = dev->dev_info.nr_hw_queues; + + /* + * Setup round-robin queue-to-thread mapping for arbitrary N:M combinations. + * + * This algorithm distributes queues across threads (and threads across queues) + * in a balanced round-robin fashion to ensure even load distribution. + * + * Examples: + * - 2 threads, 4 queues: T0=[Q0,Q2], T1=[Q1,Q3] + * - 4 threads, 2 queues: T0=[Q0], T1=[Q1], T2=[Q0], T3=[Q1] + * - 3 threads, 3 queues: T0=[Q0], T1=[Q1], T2=[Q2] (1:1 mapping) + * + * Phase 1: Mark which queues each thread handles (boolean mapping) + */ + for (i = 0, j = 0; i < queues || j < nthreads; i++, j++) { + dev->q_thread_map[j % nthreads][i % queues] = 1; + } + + /* + * Phase 2: Convert boolean mapping to sequential indices within each thread. + * + * Transform from: q_thread_map[thread][queue] = 1 (handles queue) + * To: q_thread_map[thread][queue] = N (queue index within thread) + * + * This allows each thread to know the local index of each queue it handles, + * which is essential for buffer allocation and management. For example: + * - Thread 0 handling queues [0,2] becomes: q_thread_map[0][0]=1, q_thread_map[0][2]=2 + * - Thread 1 handling queues [1,3] becomes: q_thread_map[1][1]=1, q_thread_map[1][3]=2 + */ + for (j = 0; j < nthreads; j++) { + unsigned char seq = 1; + + for (i = 0; i < queues; i++) { + if (dev->q_thread_map[j][i]) + dev->q_thread_map[j][i] = seq++; + } + } + +#if 0 + for (j = 0; j < dev->nthreads; j++) { + printf("thread %0d: ", j); + for (i = 0; i < dev->dev_info.nr_hw_queues; i++) { + if (dev->q_thread_map[j][i]) + printf("%03u ", i); + } + printf("\n"); + } + printf("\n"); + for (j = 0; j < dev->nthreads; j++) { + for (i = 0; i < dev->dev_info.nr_hw_queues; i++) { + printf("%03u ", dev->q_thread_map[j][i]); + } + printf("\n"); + } +#endif +} diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index c15cb1cfec063..e54d03bb69894 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -442,6 +442,7 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags) int cmd_buf_size, io_buf_size; unsigned long off; + pthread_spin_init(&q->lock, PTHREAD_PROCESS_PRIVATE); q->tgt_ops = dev->tgt.ops; q->flags = 0; q->q_depth = depth; @@ -495,7 +496,7 @@ static int ublk_thread_init(struct ublk_thread *t, unsigned long long extra_flag /* FETCH_IO_CMDS is multishot, so increase cq depth for BATCH_IO */ if (ublk_dev_batch_io(dev)) - cq_depth += dev->dev_info.queue_depth; + cq_depth += dev->dev_info.queue_depth * 2; ret = ublk_setup_ring(&t->ring, ring_depth, cq_depth, IORING_SETUP_COOP_TASKRUN | @@ -588,6 +589,9 @@ static int ublk_dev_prep(const struct dev_ctx *ctx, struct ublk_dev *dev) return -1; } + if (ublk_dev_batch_io(dev)) + ublk_batch_setup_map(dev); + dev->fds[0] = fd; if (dev->tgt.ops->init_tgt) ret = dev->tgt.ops->init_tgt(ctx, dev); @@ -891,14 +895,18 @@ static void ublk_batch_setup_queues(struct ublk_thread *t) { int i; - /* setup all queues in the 1st thread */ for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) { struct ublk_queue *q = &t->dev->q[i]; int ret; + /* + * Only prepare io commands in the mapped thread context, + * otherwise io command buffer index may not work as expected + */ + if (t->dev->q_thread_map[t->idx][i] == 0) + continue; + ret = ublk_batch_queue_prep_io_cmds(t, q); - ublk_assert(ret == 0); - ret = ublk_process_io(t); ublk_assert(ret >= 0); } } @@ -931,12 +939,8 @@ static void *ublk_io_handler_fn(void *data) /* submit all io commands to ublk driver */ ublk_submit_fetch_commands(t); } else { - struct ublk_queue *q = &t->dev->q[t->idx]; - - /* prepare all io commands in the 1st thread context */ - if (!t->idx) - ublk_batch_setup_queues(t); - ublk_batch_start_fetch(t, q); + ublk_batch_setup_queues(t); + ublk_batch_start_fetch(t); } do { @@ -1220,7 +1224,8 @@ static int __cmd_dev_add(const struct dev_ctx *ctx) goto fail; } - if (nthreads != nr_queues && !ctx->per_io_tasks) { + if (nthreads != nr_queues && (!ctx->per_io_tasks && + !(ctx->flags & UBLK_F_BATCH_IO))) { ublk_err("%s: threads %u must be same as queues %u if " "not using per_io_tasks\n", __func__, nthreads, nr_queues); @@ -1744,6 +1749,13 @@ int main(int argc, char *argv[]) return -EINVAL; } + if ((ctx.flags & UBLK_F_AUTO_BUF_REG) && + (ctx.flags & UBLK_F_BATCH_IO) && + (ctx.nthreads > ctx.nr_hw_queues)) { + ublk_err("too many threads for F_AUTO_BUF_REG & F_BATCH_IO\n"); + return -EINVAL; + } + i = optind; while (i < argc && ctx.nr_files < MAX_BACK_FILES) { ctx.files[ctx.nr_files++] = argv[i++]; diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index d647281622c74..97cf6ddbec5d1 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -166,12 +166,16 @@ struct ublk_queue { const struct ublk_tgt_ops *tgt_ops; struct ublksrv_io_desc *io_cmd_buf; -/* borrow one bit of ublk uapi flags, which may never be used */ +/* borrow three bit of ublk uapi flags, which may never be used */ #define UBLKS_Q_AUTO_BUF_REG_FALLBACK (1ULL << 63) #define UBLKS_Q_NO_UBLK_FIXED_FD (1ULL << 62) +#define UBLKS_Q_PREPARED (1ULL << 61) __u64 flags; int ublk_fd; /* cached ublk char device fd */ struct ublk_io ios[UBLK_QUEUE_DEPTH]; + + /* used for prep io commands */ + pthread_spinlock_t lock; }; /* align with `ublk_elem_header` */ @@ -204,7 +208,8 @@ struct ublk_thread { unsigned int io_inflight; pthread_t thread; - unsigned idx; + unsigned short idx; + unsigned short nr_queues; #define UBLKS_T_STOPPING (1U << 0) #define UBLKS_T_IDLE (1U << 1) @@ -225,11 +230,11 @@ struct ublk_thread { void *commit_buf; #define UBLKS_T_COMMIT_BUF_INV_IDX ((unsigned short)-1) struct allocator commit_buf_alloc; - struct batch_commit_buf commit; + struct batch_commit_buf *commit; /* FETCH_IO_CMDS buffer */ -#define UBLKS_T_NR_FETCH_BUF 2 - struct batch_fetch_buf fetch[UBLKS_T_NR_FETCH_BUF]; + unsigned short nr_fetch_bufs; + struct batch_fetch_buf *fetch; }; struct ublk_dev { @@ -239,6 +244,7 @@ struct ublk_dev { struct ublk_thread threads[UBLK_MAX_THREADS]; unsigned nthreads; unsigned per_io_tasks; + unsigned char q_thread_map[UBLK_MAX_THREADS][UBLK_MAX_QUEUES]; int fds[MAX_BACK_FILES + 1]; /* fds[0] points to /dev/ublkcN */ int nr_fds; @@ -468,6 +474,21 @@ static inline int ublk_queue_no_buf(const struct ublk_queue *q) return ublk_queue_use_zc(q) || ublk_queue_use_auto_zc(q); } +static inline int ublk_batch_commit_prepared(struct batch_commit_buf *cb) +{ + return cb->buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX; +} + +static inline unsigned ublk_queue_idx_in_thread(const struct ublk_thread *t, + const struct ublk_queue *q) +{ + unsigned char idx; + + idx = t->dev->q_thread_map[t->idx][q->q_id]; + ublk_assert(idx != 0); + return idx - 1; +} + /* * Each IO's buffer index has to be calculated by this helper for * UBLKS_T_BATCH_IO @@ -476,14 +497,13 @@ static inline unsigned short ublk_batch_io_buf_idx( const struct ublk_thread *t, const struct ublk_queue *q, unsigned tag) { - return tag; + return ublk_queue_idx_in_thread(t, q) * q->q_depth + tag; } /* Queue UBLK_U_IO_PREP_IO_CMDS for a specific queue with batch elements */ int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q); /* Start fetching I/O commands using multishot UBLK_U_IO_FETCH_IO_CMDS */ -void ublk_batch_start_fetch(struct ublk_thread *t, - struct ublk_queue *q); +void ublk_batch_start_fetch(struct ublk_thread *t); /* Handle completion of batch I/O commands (prep/commit) */ void ublk_batch_compl_cmd(struct ublk_thread *t, const struct io_uring_cqe *cqe); @@ -501,6 +521,7 @@ void ublk_batch_commit_io_cmds(struct ublk_thread *t); /* Add a completed I/O operation to the current batch commit buffer */ void ublk_batch_complete_io(struct ublk_thread *t, struct ublk_queue *q, unsigned tag, int res); +void ublk_batch_setup_map(struct ublk_dev *dev); static inline int ublk_complete_io(struct ublk_thread *t, struct ublk_queue *q, unsigned tag, int res) diff --git a/tools/testing/selftests/ublk/test_generic_15.sh b/tools/testing/selftests/ublk/test_generic_15.sh new file mode 100755 index 0000000000000..16a41fd164281 --- /dev/null +++ b/tools/testing/selftests/ublk/test_generic_15.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="generic_14" +ERR_CODE=0 + +if ! _have_feature "BATCH_IO"; then + exit "$UBLK_SKIP_CODE" +fi + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "generic" "test UBLK_F_BATCH_IO with 4_threads vs. 1_queues" + +_create_backfile 0 512M + +dev_id=$(_add_ublk_dev -t loop -q 1 --nthreads 4 -b "${UBLK_BACKFILES[0]}") +_check_add_dev $TID $? + +# run fio over the ublk disk +fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio --rw=readwrite \ + --iodepth=32 --size=100M --numjobs=4 > /dev/null 2>&1 +ERR_CODE=$? + +_cleanup_test "generic" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_generic_16.sh b/tools/testing/selftests/ublk/test_generic_16.sh new file mode 100755 index 0000000000000..6b7000b34a9d6 --- /dev/null +++ b/tools/testing/selftests/ublk/test_generic_16.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="generic_15" +ERR_CODE=0 + +if ! _have_feature "BATCH_IO"; then + exit "$UBLK_SKIP_CODE" +fi + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "generic" "test UBLK_F_BATCH_IO with 1_threads vs. 4_queues" + +_create_backfile 0 512M + +dev_id=$(_add_ublk_dev -t loop -q 4 --nthreads 1 -b "${UBLK_BACKFILES[0]}") +_check_add_dev $TID $? + +# run fio over the ublk disk +fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio --rw=readwrite \ + --iodepth=32 --size=100M --numjobs=4 > /dev/null 2>&1 +ERR_CODE=$? + +_cleanup_test "generic" +_show_result $TID $ERR_CODE