-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Follows much of the baseline of memcpy.S, but with memset and rep stosb. Signed-off-by: Pedro Falcato <pedro.falcato@gmail.com>
- Loading branch information
Showing
5 changed files
with
196 additions
and
26 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
/* | ||
* Copyright (c) 2023 Pedro Falcato | ||
* This file is part of Onyx, and is released under the terms of the MIT License | ||
* check LICENSE at the root directory for more information | ||
* | ||
* SPDX-License-Identifier: MIT | ||
*/ | ||
#include "memset_impl.S" | ||
|
||
/*
 * void *__memset(void *dst, int c, size_t n)
 *
 * In:  %rdi = dst, %rsi = c, %rdx = n
 * Out: %rax = dst
 *
 * The fill logic itself is expanded from the memset_like macro, which also
 * emits the return. `memset` is exported as a weak alias of this symbol.
 */
    ALIGN_TEXT
    .globl __memset
    .type __memset, @function
__memset:
    /* The return value is the original destination pointer */
    movq %rdi, %rax
    memset_like _memset
    .size __memset, . - __memset

    .weak memset
    .set memset, __memset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
/* | ||
* Copyright (c) 2023 Pedro Falcato | ||
* This file is part of Onyx, and is released under the terms of the MIT License | ||
* check LICENSE at the root directory for more information | ||
* | ||
* SPDX-License-Identifier: MIT | ||
*/ | ||
/* Return sequence used by every exit path; may be pre-defined by the includer. */
#ifndef RET
#define RET ret
#endif

/* Align the next instruction to 16 bytes, padding with 0x90 (NOP). */
#ifndef ALIGN_TEXT
#define ALIGN_TEXT .p2align 4, 0x90
#endif

/*
 * Build a local (.L-prefixed) label name with the macro's suffix argument
 * appended. The suffix is substituted by the assembler, so L() is only usable
 * inside a .macro that takes a parameter named `suffix`.
 */
#ifndef L
#define L(label) .L##label##\suffix
#endif
|
||
/*
 * memset_like suffix
 *
 * Expands to the body of a memset-style routine, including the final return.
 * The suffix argument is appended to every local label.
 *
 * In:    %rdi = destination, %sil = fill byte, %rdx = length in bytes
 * Out:   [%rdi, %rdi + %rdx) filled with the byte; %rax is returned exactly
 *        as the caller set it up (it is only borrowed on the rep stosb path)
 * Clobb: %rcx, %rdx, %rsi, %rdi, %r8, flags
 * Note:  the large-size path uses rep stosb and therefore relies on the
 *        direction flag being clear on entry.
 */
.macro memset_like suffix
    /* Nothing to do for a zero length */
    test %rdx, %rdx
    jz L(out)

    /* Replicate the fill byte into all eight bytes of %rsi */
    movzbl %sil, %esi
    mov $0x0101010101010101, %rcx
    imul %rcx, %rsi

    /* Deal with [1..16], [17..32], [33..511] and [512..] separately */
    cmp $16, %rdx
    jbe L(0_to_16_bytes)

    cmp $32, %rdx
    jbe L(0_to_32_bytes)

    /* Heuristic tested on Kabylake R */
    /* The limit is likely much lower on FSRM but TODO */
    cmp $512, %rdx
    jae L(erms)

    /* Fallthrough to the 32 byte set; %rdx is in [33..511] here */
    ALIGN_TEXT
L(32_byte_set):
    movq %rsi, (%rdi)
    movq %rsi, 8(%rdi)
    movq %rsi, 16(%rdi)
    movq %rsi, 24(%rdi)
    /* We use both lea and arithmetic insns as to fully utilize execution units */
    lea 32(%rdi), %rdi
    sub $32, %rdx
    jz L(out)
    cmp $32, %rdx
    jae L(32_byte_set)

    /* Fallthrough to the 0..32 memset with the remaining 1..31 bytes */
    ALIGN_TEXT
    /* This whole code (the part that handles the "tail") is based on being able to
     * do unaligned, overlapping stores. So something like (i.e. a 2-3 byte store):
     *     movw %si, (%rdi)
     *     movw %si, -2(%rdi, %rdx)
     * where rdi is dest, rsi is val, rdx is len. This is much cheaper than having a
     * lot more branching down with some duff's device-like thing.
     */
L(0_to_32_bytes):
    cmp $16, %rdx
    jbe L(0_to_16_bytes)
    /* 17..32 bytes: first 16 and last 16, overlapping in the middle */
    movq %rsi, (%rdi)
    movq %rsi, 8(%rdi)
    movq %rsi, -16(%rdi, %rdx)
    movq %rsi, -8(%rdi, %rdx)
    RET

    ALIGN_TEXT
L(0_to_16_bytes):
    cmp $8, %rdx
    jb L(4_to_7_bytes)
    /* 8..16 bytes: first 8 and last 8 */
    movq %rsi, (%rdi)
    movq %rsi, -8(%rdi, %rdx)
    RET

    ALIGN_TEXT
L(4_to_7_bytes):
    cmp $4, %rdx
    jb L(1_to_3_bytes)
    /* 4..7 bytes: first 4 and last 4 */
    movl %esi, (%rdi)
    movl %esi, -4(%rdi, %rdx)
    RET

    ALIGN_TEXT
L(1_to_3_bytes):
    cmp $1, %rdx
    je L(1_byte)
    /* 2..3 bytes: first 2 and last 2 */
    movw %si, (%rdi)
    movw %si, -2(%rdi, %rdx)
    RET

L(1_byte):
    movb %sil, (%rdi)
    RET

    ALIGN_TEXT
L(erms):
    /* Note: We save rax temporarily in r8 since it's likely to be set up with a ret val */
    mov %rax, %r8
    /* rep stosb stores %al to (%rdi), %rcx times */
    mov %rsi, %rax
    mov %rdx, %rcx
    rep stosb
    mov %r8, %rax
L(out):
    RET

.endm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
/* | ||
* Copyright (c) Meta Platforms, Inc. and affiliates. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
// clang-format off | ||
#include <stdlib.h>
#include <cstddef>
#include <cstdint>

#include <gtest/gtest.h>
|
||
// Compile using: | ||
// g++ -O2 kernel/lib/libk/tests/host_folly_memset.cpp kernel/lib/libk/arch/x86_64/memset.S | ||
// -lgtest -lgtest_main -I kernel/include/ | ||
|
||
// Alignment of the test allocation and the unit its size is computed from.
constexpr size_t kPageSize{4096};
// Exclusive upper bound on the lengths passed to the routine under test.
constexpr size_t kMaxSize{kPageSize * 2};
// Sentinel written just past the range being filled, to detect overruns.
constexpr uint8_t kBufEnd{0xDB};

// Routine under test; linked in from the assembly source.
extern "C" void* __memset(void*, int, size_t);
// memset implementation test with 0xFF pattern | ||
// buf must have an extra byte to be filled with magic constant | ||
void testMemsetImpl(uint8_t* buf, size_t maxLen) { | ||
for (size_t len = 0; len < maxLen; len++) { | ||
for (size_t i = 0; i < maxLen; i++) { | ||
buf[i] = 0x0; | ||
} | ||
buf[len] = kBufEnd; | ||
auto* p = __memset(buf, 0xFF, len); | ||
EXPECT_EQ(buf, reinterpret_cast<uint8_t*>(p)); | ||
bool isEq = true; | ||
for (size_t i = 0; i < len; i++) { | ||
EXPECT_EQ(buf[i], 0xFF) << "buf[" << i << "]\n"; | ||
} | ||
|
||
EXPECT_EQ(buf[len], kBufEnd); | ||
} | ||
} | ||
|
||
// Runs the full length sweep with the destination placed at power-of-two
// offsets from a page-aligned allocation.
TEST(MemsetAsmTest, alignedBuffer) {
  uint8_t* buf = reinterpret_cast<uint8_t*>(
      aligned_alloc(kPageSize, kMaxSize + 2 * kPageSize));
  // Stop here rather than writing through a null pointer if allocation failed.
  ASSERT_NE(buf, nullptr);
  // Get buffer aligned power of 2 from 16 all the way up to a page size
  // (the final iteration wraps to offset 0 via the modulo).
  for (size_t alignment = 16; alignment <= kPageSize; alignment <<= 1) {
    testMemsetImpl(buf + (alignment % kPageSize), kMaxSize);
  }
  free(buf);
}
|
||
// Runs the full length sweep with the destination placed at every byte offset
// from 1 to 192 past a page-aligned allocation.
TEST(MemsetAsmTest, unalignedBuffer) {
  uint8_t* buf = reinterpret_cast<uint8_t*>(
      aligned_alloc(kPageSize, kMaxSize + 2 * kPageSize));
  // Stop here rather than writing through a null pointer if allocation failed.
  ASSERT_NE(buf, nullptr);
  for (size_t alignment = 1; alignment <= 192; alignment++) {
    testMemsetImpl(buf + alignment, kMaxSize);
  }
  free(buf);
}