Skip to content

Commit

Permalink
x86: Add new memset implementation
Browse files Browse the repository at this point in the history
Follows much of the structure of memcpy.S, but implements memset (using rep stosb for large fills).

Signed-off-by: Pedro Falcato <pedro.falcato@gmail.com>
  • Loading branch information
heatd committed May 5, 2023
1 parent 7bbfb9d commit 70c7b12
Show file tree
Hide file tree
Showing 5 changed files with 196 additions and 26 deletions.
25 changes: 0 additions & 25 deletions kernel/arch/x86_64/copy.S
Original file line number Diff line number Diff line change
Expand Up @@ -84,28 +84,3 @@ __set_non_temporal:
xor %rax, %rax

RET

.global __memset
.type __memset, @function
/* void *__memset(void *dst, int c, size_t len) — SysV AMD64 ABI.
 * Fills len bytes at dst with the low byte of c via rep stosb.
 * In:  %rdi = dst, %rsi = c, %rdx = len.  Out: %rax = dst.
 */
__memset:
push %rbp
mov %rsp, %rbp

/* rep stosb takes the count in %rcx and the fill byte in %al, and it
 * advances %rdi as it stores — so stash the destination pointer in a
 * caller-saved scratch register (%r8) to serve as the return value.
 */
mov %rdi, %r8
mov %rsi, %rax
mov %rdx, %rcx

rep stosb

/* memset returns the original destination pointer */
mov %r8, %rax

pop %rbp
RET

.size __memset, . - __memset

.weak memset
.set memset, __memset
2 changes: 1 addition & 1 deletion kernel/arch/x86_64/make.config
Original file line number Diff line number Diff line change
Expand Up @@ -41,4 +41,4 @@ endif

KERNEL_ARCH_LDFLAGS:=-z max-page-size=0x1000
LIBK_EXTRA_OBJ:=arch/x86_64/crti.o arch/x86_64/crtn.o
LIBK_ARCH_OBJS:= arch/x86_64/memcpy.o
LIBK_ARCH_OBJS:= arch/x86_64/memcpy.o arch/x86_64/memset.o
20 changes: 20 additions & 0 deletions kernel/lib/libk/arch/x86_64/memset.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
/*
* Copyright (c) 2023 Pedro Falcato
* This file is part of Onyx, and is released under the terms of the MIT License
* check LICENSE at the root directory for more information
*
* SPDX-License-Identifier: MIT
*/
#include "memset_impl.S"

ALIGN_TEXT
.global __memset
.type __memset, @function
/* void *__memset(void *dst, int c, size_t len) — SysV AMD64 ABI.
 * In:  %rdi = dst, %rsi = c (fill byte), %rdx = len.
 * Out: %rax = dst.
 * The actual fill logic lives in the memset_like macro (memset_impl.S);
 * it expects %rax to already hold the return value on entry.
 */
__memset:
/* Set up the return value */
mov %rdi, %rax
memset_like _memset
.size __memset, . - __memset

/* Weak alias so a stronger memset definition can still override this one */
.weak memset
.set memset, __memset
108 changes: 108 additions & 0 deletions kernel/lib/libk/arch/x86_64/memset_impl.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
/*
* Copyright (c) 2023 Pedro Falcato
* This file is part of Onyx, and is released under the terms of the MIT License
* check LICENSE at the root directory for more information
*
* SPDX-License-Identifier: MIT
*/
#define RET ret
#define ALIGN_TEXT .p2align 4, 0x90

#ifndef L
#define L(label) .L##label##\suffix
#endif

/* memset_like \suffix: emit the body of a memset-style routine.
 * Contract on entry: %rdi = dst, %rsi = fill byte, %rdx = len, and
 * %rax already holds the caller's intended return value (preserved
 * throughout, including across the rep stosb path).
 * Clobbers: %rcx, %rsi, %rdi, %rdx, %r8, flags.
 * \suffix keeps the local L() labels unique per expansion site.
 */
.macro memset_like suffix
/* Test for 0 — nothing to store, %rax is already the return value */
test %rdx, %rdx
jz L(out)

/* Expand the fill byte into all 8 byte lanes of %rsi:
 * (c & 0xff) * 0x0101010101010101 replicates the byte */
and $0xff, %rsi
mov $0x0101010101010101, %rcx
imul %rcx, %rsi

/* Deal with [0..16], [16..32], [32..512] and [512..] separately */
cmp $16, %rdx
jbe L(0_to_16_bytes)

cmp $32, %rdx
jbe L(0_to_32_bytes)

/* Heuristic tested on Kabylake R */
/* The limit is likely much lower on FSRM but TODO */
cmp $512, %rdx
jae L(erms)

/* Fallthrough to the 32 byte set */
ALIGN_TEXT
L(32_byte_set):
movq %rsi, (%rdi)
movq %rsi, 8(%rdi)
movq %rsi, 16(%rdi)
movq %rsi, 24(%rdi)
/* We use both lea and arithmetic insns as to fully utilize execution units */
lea 32(%rdi), %rdi
sub $32, %rdx
jz L(out)
cmp $32, %rdx
jae L(32_byte_set)

/* Fallthrough to the 0..32 memset */
ALIGN_TEXT
/* This whole code (the part that handles the "tail") is based on being able to
 * do unaligned, overlapping stores. So something like (e.g. a 2-3 byte store):
 * movw %sil, (%rdi)
 * movw %sil, -2(%rdi, %rdx)
 * where rdi is dest, rsi is val, rdx is len. This is much cheaper than having a lot more branching
 * down with some duff's device-like thing.
 */
L(0_to_32_bytes):
/* Reachable with %rdx in [1..32]: directly for len 17..32, or as the
 * loop tail above with 1..31 bytes remaining */
cmp $16, %rdx
jbe L(0_to_16_bytes)
movq %rsi, (%rdi)
movq %rsi, 8(%rdi)
movq %rsi, -16(%rdi, %rdx)
movq %rsi, -8(%rdi, %rdx)
RET

ALIGN_TEXT
L(0_to_16_bytes):
/* len in [1..16]; two overlapping 8-byte stores cover [8..16] */
cmp $8, %rdx
jb L(4_to_7_bytes)
movq %rsi, (%rdi)
movq %rsi, -8(%rdi, %rdx)
RET

ALIGN_TEXT
L(4_to_7_bytes):
/* Two overlapping 4-byte stores cover [4..7] */
cmp $4, %rdx
jb L(1_to_3_bytes)
movl %esi, (%rdi)
movl %esi, -4(%rdi, %rdx)
RET

ALIGN_TEXT
L(1_to_3_bytes):
/* len 0 was filtered at the top, so only [1..3] remain here */
cmp $1, %rdx
je L(1_byte)
movw %si, (%rdi)
movw %si, -2(%rdi, %rdx)
RET

L(1_byte):
movb %sil, (%rdi)
RET

ALIGN_TEXT
L(erms):
/* Note: We save rax temporarily in r8 since it's likely to be set up with a ret val */
mov %rax, %r8
mov %rsi, %rax
mov %rdx, %rcx
rep stosb
mov %r8, %rax
L(out):
RET

.endm
67 changes: 67 additions & 0 deletions kernel/lib/libk/tests/host_folly_memset.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// clang-format off
#include <stdlib.h>

#include <cstddef>
#include <cstdint>

#include <gtest/gtest.h>

// Compile using:
// g++ -O2 kernel/lib/libk/tests/host_folly_memset.cpp kernel/lib/libk/arch/x86_64/memset.S
// -lgtest -lgtest_main -I kernel/include/

constexpr size_t kPageSize = 4096;
constexpr size_t kMaxSize = 2 * kPageSize;
constexpr uint8_t kBufEnd = 0xDB;

extern "C" void *__memset(void*, int, size_t);
// memset implementation test with 0xFF pattern.
// Exhaustively checks every length in [0, maxLen): the region must be
// fully filled, the return value must be the destination pointer, and
// a sentinel byte placed just past the region must survive.
// buf must have at least maxLen + 1 bytes so the sentinel fits.
void testMemsetImpl(uint8_t* buf, size_t maxLen) {
  for (size_t len = 0; len < maxLen; len++) {
    // Clear the whole working area so stale 0xFF bytes from a previous
    // iteration cannot mask an under-write.
    for (size_t i = 0; i < maxLen; i++) {
      buf[i] = 0x0;
    }
    // Sentinel directly past the requested region; detects overruns.
    buf[len] = kBufEnd;
    auto* p = __memset(buf, 0xFF, len);
    // memset must return its destination argument.
    EXPECT_EQ(buf, reinterpret_cast<uint8_t*>(p));
    for (size_t i = 0; i < len; i++) {
      EXPECT_EQ(buf[i], 0xFF) << "buf[" << i << "]\n";
    }

    EXPECT_EQ(buf[len], kBufEnd);
  }
}

TEST(MemsetAsmTest, alignedBuffer) {
  // Extra two pages so the largest offset plus the sentinel byte stays
  // within the allocation.
  uint8_t* buf = reinterpret_cast<uint8_t*>(
      aligned_alloc(kPageSize, kMaxSize + 2 * kPageSize));
  // aligned_alloc may fail; bail out instead of dereferencing null.
  ASSERT_NE(buf, nullptr);
  // Get buffer aligned power of 2 from 16 all the way up to a page size.
  // Note: alignment == kPageSize wraps to offset 0 (page-aligned start).
  for (size_t alignment = 16; alignment <= kPageSize; alignment <<= 1) {
    testMemsetImpl(buf + (alignment % kPageSize), kMaxSize);
  }
  free(buf);
}

TEST(MemsetAsmTest, unalignedBuffer) {
  // Extra two pages so offset 192 plus the sentinel byte stays in bounds.
  uint8_t* buf = reinterpret_cast<uint8_t*>(
      aligned_alloc(kPageSize, kMaxSize + 2 * kPageSize));
  // aligned_alloc may fail; bail out instead of dereferencing null.
  ASSERT_NE(buf, nullptr);
  // Exercise every misalignment from 1 to 192 bytes off a page boundary.
  for (size_t alignment = 1; alignment <= 192; alignment++) {
    testMemsetImpl(buf + alignment, kMaxSize);
  }
  free(buf);
}

0 comments on commit 70c7b12

Please sign in to comment.