Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .jules/bolt.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
## 2024-06-25 - Replace complex Regex with custom parsers for performance
**Learning:** Replaced several `Regex` instances in `HintExtractor.kt` that were used on a hot path to split and match strings on various patterns (prefixes, camelCase boundaries, ID matching, etc.). Even precompiled `Regex` properties are computationally expensive there, because each sequential `.replace` call allocates an intermediate string and runs the regex state machine. A manual, single-pass iteration that appends to an inline `StringBuilder` while checking string lengths and individual character types (`isLowerCase`/`isUpperCase`/`isWhitespace`) performs orders of magnitude faster.
**Action:** In tight hot paths, and especially for basic string transformations, prefer a manual single-pass iteration with `StringBuilder` over multi-pass `Regex` matching, even when the regular expressions are precompiled class properties.
Original file line number Diff line number Diff line change
Expand Up @@ -10,42 +10,80 @@ package halogen.engine
*/
internal object HintExtractor {

    /**
     * Prefixes removed from the start of a key. At most one is stripped, and the
     * first match wins, so the specific routes must stay ahead of the bare "/".
     */
    private val STRIPPED_PREFIXES = arrayOf("/r/", "/category/", "/topic/", "/", "#")

    /** Minimum number of hex characters for a value to be rejected as an opaque ID. */
    private const val MIN_HEX_ID_LENGTH = 8

    /**
     * Turns a route-, tag- or identifier-like [key] into a lowercase, space-separated hint.
     *
     * Steps, in order:
     * 1. Trim surrounding whitespace and strip at most one of [STRIPPED_PREFIXES].
     * 2. Trim leading/trailing slashes and keep only the text after the last remaining `/`.
     * 3. Split camelCase, snake_case and kebab-case into words, collapse whitespace runs
     *    into single spaces, and lowercase every character.
     *
     * @param key the raw key, e.g. `"/r/kotlinLang"` or `"#machine-learning"`.
     * @return the hint (e.g. `"kotlin lang"`), or `null` when [key] is blank, nothing
     *   remains after cleaning, or the remainder (ignoring spaces) is all digits or is
     *   a hex string of at least [MIN_HEX_ID_LENGTH] characters.
     */
    fun extract(key: String): String? {
        if (key.isBlank()) return null

        val trimmed = key.trim()

        // Strip one common prefix, drop surrounding slashes, then keep the last path segment.
        // substringAfterLast returns the whole string when no '/' is left.
        val segment = stripPrefix(trimmed).trim('/').substringAfterLast('/')
        if (segment.isBlank()) return null

        val words = splitIntoWords(segment)

        // Empty when the segment consisted solely of separators such as "___".
        if (words.isEmpty()) return null

        // Reject things that look like IDs
        if (looksLikeId(words)) return null

        return words
    }

    /** Returns [value] without the first entry of [STRIPPED_PREFIXES] it starts with, if any. */
    private fun stripPrefix(value: String): String {
        for (prefix in STRIPPED_PREFIXES) {
            if (value.startsWith(prefix)) return value.substring(prefix.length)
        }
        return value
    }

    /**
     * Single pass over [segment] that lowercases every character and emits exactly one
     * space at each word boundary: a run of `_`, `-` or whitespace, or a lowercase
     * character followed by an uppercase one. Leading and trailing separators produce
     * no space.
     */
    private fun splitIntoWords(segment: String): String = buildString(segment.length + 5) {
        var previous = '\u0000' // last non-separator character seen
        var separatorPending = false

        for (c in segment) {
            if (c == '_' || c == '-' || c.isWhitespace()) {
                // Defer the space so runs of separators collapse and trailing ones vanish.
                separatorPending = true
                continue
            }

            val camelBoundary = c.isUpperCase() && previous.isLowerCase()
            if ((separatorPending || camelBoundary) && isNotEmpty()) append(' ')

            append(c.lowercaseChar())
            separatorPending = false
            previous = c
        }
    }

    /**
     * Reports whether [words] (already lowercased), ignoring spaces, is made up only of
     * digits, or only of hex characters with at least [MIN_HEX_ID_LENGTH] of them.
     */
    private fun looksLikeId(words: String): Boolean {
        var significant = 0
        var allDigits = true

        for (c in words) {
            if (c == ' ') continue
            significant++
            if (c !in '0'..'9') {
                allDigits = false
                // A character outside 0-9 / a-f rules out both ID shapes.
                if (c !in 'a'..'f') return false
            }
        }

        // Every significant character was a hex digit at this point.
        return significant > 0 && (allDigits || significant >= MIN_HEX_ID_LENGTH)
    }
}
Loading