Skip to content
Permalink
Browse files
utf8: big improvements to case-insensitive UTF-8 string compare.
- Dramatically reduce RAM usage: uses between 8 and 11 kilobytes less static
memory for its internal case-folding tables.
- Actually works now. It would fail unconditionally if a codepoint folded
into multiple codepoints, even if the compared string contained those exact
codepoints.
- Now a public API!
- Removed __PHYSFS_utf8strnicmp(): nothing was using it, it was incorrect
anyhow, and what does 'n' represent when either string might case-fold to
something larger in-flight, anyhow?
  • Loading branch information
icculus committed Aug 11, 2017
1 parent 587ec88 commit d1f2637ca874866c0a7f005d5e557a7e9222e656
Showing with 2,695 additions and 2,258 deletions.
  1. +209 −18 extras/makecasefoldhashtable.pl
  2. +5 −5 src/physfs.c
  3. +24 −0 src/physfs.h
  4. +2,348 −2,129 src/physfs_casefolding.h
  5. +0 −21 src/physfs_internal.h
  6. +1 −1 src/physfs_platform_os2.c
  7. +108 −84 src/physfs_unicode.c
@@ -3,6 +3,11 @@
use warnings;
use strict;

my $HASHBUCKETS1_16 = 256;
my $HASHBUCKETS1_32 = 16;
my $HASHBUCKETS2_16 = 16;
my $HASHBUCKETS3_16 = 4;

print <<__EOF__;
/*
* This file is part of PhysicsFS (https://icculus.org/physfs/)
@@ -13,17 +18,97 @@
* Please see the file LICENSE.txt in the source's root directory.
*/
#ifndef _INCLUDE_PHYSFS_CASEFOLDING_H_
#define _INCLUDE_PHYSFS_CASEFOLDING_H_
#ifndef __PHYSICSFS_INTERNAL__
#error Do not include this header from your applications.
#endif
/* We build three simple hashmaps here: one that maps Unicode codepoints to
a one, two, or three lowercase codepoints. To retrieve this info: look at
case_fold_hashX, where X is 1, 2, or 3. Most foldable codepoints fold to one,
a few dozen fold to two, and a handful fold to three. If the codepoint isn't
in any of these hashes, it doesn't fold (no separate upper and lowercase).
Almost all these codepoints fit into 16 bits, so we hash them as such to save
memory. If a codepoint is > 0xFFFF, we have separate hashes for them,
since there are (currently) only about 120 of them and (currently) all of them
map to a single lowercase codepoint. */
typedef struct CaseFoldMapping1_32
{
PHYSFS_uint32 from;
PHYSFS_uint32 to0;
} CaseFoldMapping1_32;
typedef struct CaseFoldMapping1_16
{
PHYSFS_uint16 from;
PHYSFS_uint16 to0;
} CaseFoldMapping1_16;
typedef struct CaseFoldMapping2_16
{
PHYSFS_uint16 from;
PHYSFS_uint16 to0;
PHYSFS_uint16 to1;
} CaseFoldMapping2_16;
typedef struct CaseFoldMapping3_16
{
PHYSFS_uint16 from;
PHYSFS_uint16 to0;
PHYSFS_uint16 to1;
PHYSFS_uint16 to2;
} CaseFoldMapping3_16;
typedef struct CaseFoldHashBucket1_16
{
const CaseFoldMapping1_16 *list;
const PHYSFS_uint8 count;
} CaseFoldHashBucket1_16;
typedef struct CaseFoldHashBucket1_32
{
const CaseFoldMapping1_32 *list;
const PHYSFS_uint8 count;
} CaseFoldHashBucket1_32;
typedef struct CaseFoldHashBucket2_16
{
const CaseFoldMapping2_16 *list;
const PHYSFS_uint8 count;
} CaseFoldHashBucket2_16;
typedef struct CaseFoldHashBucket3_16
{
const CaseFoldMapping3_16 *list;
const PHYSFS_uint8 count;
} CaseFoldHashBucket3_16;
__EOF__


my @foldPairs;
my @foldPairs1_16;
my @foldPairs2_16;
my @foldPairs3_16;
my @foldPairs1_32;

for (my $i = 0; $i < $HASHBUCKETS1_16; $i++) {
$foldPairs1_16[$i] = '';
}

for (my $i = 0; $i < $HASHBUCKETS1_32; $i++) {
$foldPairs1_32[$i] = '';
}

for (my $i = 0; $i < $HASHBUCKETS2_16; $i++) {
$foldPairs2_16[$i] = '';
}

for (my $i = 0; $i < 256; $i++) {
$foldPairs[$i] = '';
for (my $i = 0; $i < $HASHBUCKETS3_16; $i++) {
$foldPairs3_16[$i] = '';
}

open(FH,'<','casefolding.txt') or die("failed to open casefolding.txt: $!\n");
@@ -38,47 +123,153 @@

next if not /\A([a-fA-F0-9]+)\;\s*(.)\;\s*(.+)\;/;
my ($code, $status, $mapping) = ($1, $2, $3);

my $hexxed = hex($code);
my $hashed = (($hexxed ^ ($hexxed >> 8)) & 0xFF);
#print("// code '$code' status '$status' mapping '$mapping'\n");
#print("// hexxed '$hexxed' hashed '$hashed'\n");

if (($status eq 'C') or ($status eq 'F')) {
my ($map1, $map2, $map3) = ('0000', '0000', '0000');
my ($map1, $map2, $map3) = (undef, undef, undef);
$map1 = $1 if $mapping =~ s/\A([a-fA-F0-9]+)(\s*|\Z)//;
$map2 = $1 if $mapping =~ s/\A([a-fA-F0-9]+)(\s*|\Z)//;
$map3 = $1 if $mapping =~ s/\A([a-fA-F0-9]+)(\s*|\Z)//;
die("mapping space too small for '$code'\n") if ($mapping ne '');
$foldPairs[$hashed] .= " { 0x$code, 0x$map1, 0x$map2, 0x$map3 },\n";
die("problem parsing mapping for '$code'\n") if (not defined($map1));

if ($hexxed < 128) {
# Just ignore these, we'll handle the low-ASCII ones ourselves.
} elsif ($hexxed > 0xFFFF) {
# We just need to add the 32-bit 2 and/or 3 codepoint maps if this die()'s here.
die("Uhoh, a codepoint > 0xFFFF that folds to multiple codepoints! Fixme.") if defined($map2);
my $hashed = (($hexxed ^ ($hexxed >> 8)) & ($HASHBUCKETS1_32-1));
#print("// hexxed '$hexxed' hashed1 '$hashed'\n");
$foldPairs1_32[$hashed] .= " { 0x$code, 0x$map1 },\n";
} elsif (not defined($map2)) {
my $hashed = (($hexxed ^ ($hexxed >> 8)) & ($HASHBUCKETS1_16-1));
#print("// hexxed '$hexxed' hashed1 '$hashed'\n");
$foldPairs1_16[$hashed] .= " { 0x$code, 0x$map1 },\n";
} elsif (not defined($map3)) {
my $hashed = (($hexxed ^ ($hexxed >> 8)) & ($HASHBUCKETS2_16-1));
#print("// hexxed '$hexxed' hashed2 '$hashed'\n");
$foldPairs2_16[$hashed] .= " { 0x$code, 0x$map1, 0x$map2 },\n";
} else {
my $hashed = (($hexxed ^ ($hexxed >> 8)) & ($HASHBUCKETS3_16-1));
#print("// hexxed '$hexxed' hashed3 '$hashed'\n");
$foldPairs3_16[$hashed] .= " { 0x$code, 0x$map1, 0x$map2, 0x$map3 },\n";
}
}
}
close(FH);

for (my $i = 0; $i < 256; $i++) {
$foldPairs[$i] =~ s/,\n\Z//;
my $str = $foldPairs[$i];
for (my $i = 0; $i < $HASHBUCKETS1_16; $i++) {
$foldPairs1_16[$i] =~ s/,\n\Z//;
my $str = $foldPairs1_16[$i];
next if $str eq '';
my $num = '000' . $i;
$num =~ s/\A.*?(\d\d\d)\Z/$1/;
my $sym = "case_fold1_16_${num}";
print("static const CaseFoldMapping1_16 ${sym}[] = {\n$str\n};\n\n");
}

for (my $i = 0; $i < $HASHBUCKETS1_32; $i++) {
$foldPairs1_32[$i] =~ s/,\n\Z//;
my $str = $foldPairs1_32[$i];
next if $str eq '';
my $num = '000' . $i;
$num =~ s/\A.*?(\d\d\d)\Z/$1/;
my $sym = "case_fold1_32_${num}";
print("static const CaseFoldMapping1_32 ${sym}[] = {\n$str\n};\n\n");
}

for (my $i = 0; $i < $HASHBUCKETS2_16; $i++) {
$foldPairs2_16[$i] =~ s/,\n\Z//;
my $str = $foldPairs2_16[$i];
next if $str eq '';
my $num = '000' . $i;
$num =~ s/\A.*?(\d\d\d)\Z/$1/;
my $sym = "case_fold2_16_${num}";
print("static const CaseFoldMapping2_16 ${sym}[] = {\n$str\n};\n\n");
}

for (my $i = 0; $i < $HASHBUCKETS3_16; $i++) {
$foldPairs3_16[$i] =~ s/,\n\Z//;
my $str = $foldPairs3_16[$i];
next if $str eq '';
my $num = '000' . $i;
$num =~ s/\A.*?(\d\d\d)\Z/$1/;
my $sym = "case_fold_${num}";
print("static const CaseFoldMapping ${sym}[] = {\n$str\n};\n\n");
my $sym = "case_fold3_16_${num}";
print("static const CaseFoldMapping3_16 ${sym}[] = {\n$str\n};\n\n");
}

print("\nstatic const CaseFoldHashBucket case_fold_hash[256] = {\n");
print("static const CaseFoldHashBucket1_16 case_fold_hash1_16[] = {\n");

for (my $i = 0; $i < 256; $i++) {
my $str = $foldPairs[$i];
for (my $i = 0; $i < $HASHBUCKETS1_16; $i++) {
my $str = $foldPairs1_16[$i];
if ($str eq '') {
print(" { 0, NULL },\n");
print(" { NULL, 0 },\n");
} else {
my $num = '000' . $i;
$num =~ s/\A.*?(\d\d\d)\Z/$1/;
my $sym = "case_fold_${num}";
print(" { __PHYSFS_ARRAYLEN($sym), $sym },\n");
my $sym = "case_fold1_16_${num}";
print(" { $sym, __PHYSFS_ARRAYLEN($sym) },\n");
}
}
print("};\n\n");


print("static const CaseFoldHashBucket1_32 case_fold_hash1_32[] = {\n");

for (my $i = 0; $i < $HASHBUCKETS1_32; $i++) {
my $str = $foldPairs1_32[$i];
if ($str eq '') {
print(" { NULL, 0 },\n");
} else {
my $num = '000' . $i;
$num =~ s/\A.*?(\d\d\d)\Z/$1/;
my $sym = "case_fold1_32_${num}";
print(" { $sym, __PHYSFS_ARRAYLEN($sym) },\n");
}
}
print("};\n\n");


print("static const CaseFoldHashBucket2_16 case_fold_hash2_16[] = {\n");

for (my $i = 0; $i < $HASHBUCKETS2_16; $i++) {
my $str = $foldPairs2_16[$i];
if ($str eq '') {
print(" { NULL, 0 },\n");
} else {
my $num = '000' . $i;
$num =~ s/\A.*?(\d\d\d)\Z/$1/;
my $sym = "case_fold2_16_${num}";
print(" { $sym, __PHYSFS_ARRAYLEN($sym) },\n");
}
}
print("};\n\n");

print("static const CaseFoldHashBucket3_16 case_fold_hash3_16[] = {\n");

for (my $i = 0; $i < $HASHBUCKETS3_16; $i++) {
my $str = $foldPairs3_16[$i];
if ($str eq '') {
print(" { NULL, 0 },\n");
} else {
my $num = '000' . $i;
$num =~ s/\A.*?(\d\d\d)\Z/$1/;
my $sym = "case_fold3_16_${num}";
print(" { $sym, __PHYSFS_ARRAYLEN($sym) },\n");
}
}
print("};\n\n");

print <<__EOF__;
#endif /* _INCLUDE_PHYSFS_CASEFOLDING_H_ */
/* end of physfs_casefolding.h ... */
__EOF__

exit 0;

# end of makecashfoldhashtable.pl ...
@@ -891,14 +891,14 @@ static DirHandle *openDirectory(PHYSFS_Io *io, const char *d, int forWriting)
/* Look for archivers with matching file extensions first... */
for (i = archivers; (*i != NULL) && (retval == NULL); i++)
{
if (__PHYSFS_utf8stricmp(ext, (*i)->info.extension) == 0)
if (PHYSFS_utf8stricmp(ext, (*i)->info.extension) == 0)
retval = tryOpenDir(io, *i, d, forWriting);
} /* for */

/* failing an exact file extension match, try all the others... */
for (i = archivers; (*i != NULL) && (retval == NULL); i++)
{
if (__PHYSFS_utf8stricmp(ext, (*i)->info.extension) != 0)
if (PHYSFS_utf8stricmp(ext, (*i)->info.extension) != 0)
retval = tryOpenDir(io, *i, d, forWriting);
} /* for */
} /* if */
@@ -1442,7 +1442,7 @@ static int doRegisterArchiver(const PHYSFS_Archiver *_archiver)
ext = _archiver->info.extension;
for (i = 0; i < numArchivers; i++)
{
if (__PHYSFS_utf8stricmp(archiveInfo[i]->extension, ext) == 0)
if (PHYSFS_utf8stricmp(archiveInfo[i]->extension, ext) == 0)
BAIL(PHYSFS_ERR_DUPLICATE, 0);
} /* for */

@@ -1518,7 +1518,7 @@ int PHYSFS_deregisterArchiver(const char *ext)
__PHYSFS_platformGrabMutex(stateLock);
for (i = 0; i < numArchivers; i++)
{
if (__PHYSFS_utf8stricmp(archiveInfo[i]->extension, ext) == 0)
if (PHYSFS_utf8stricmp(archiveInfo[i]->extension, ext) == 0)
{
const int retval = doDeregisterArchiver(i);
__PHYSFS_platformReleaseMutex(stateLock);
@@ -1921,7 +1921,7 @@ int PHYSFS_setSaneConfig(const char *organization, const char *appName,
if ((l > extlen) && ((*i)[l - extlen - 1] == '.'))
{
ext = (*i) + (l - extlen);
if (__PHYSFS_utf8stricmp(ext, archiveExt) == 0)
if (PHYSFS_utf8stricmp(ext, archiveExt) == 0)
setSaneCfgAddPath(*i, l, dirsep, archivesFirst);
} /* if */
} /* for */
@@ -2521,6 +2521,30 @@ PHYSFS_DECL void PHYSFS_utf8FromLatin1(const char *src, char *dst,

/* Everything above this line is part of the PhysicsFS 2.0 API. */

/**
* \fn int PHYSFS_utf8stricmp(const char *str1, const char *str2)
* \brief Case-insensitive compare of two UTF-8 strings.
*
* This is a strcasecmp/stricmp replacement that expects both strings
* to be in UTF-8 encoding. It will do "case folding" to decide if the
* Unicode codepoints in the strings match.
*
* It will report which string is "greater than" the other, but be aware that
* this doesn't necessarily mean anything: 'a' may be "less than" 'b', but
* a Japanese kuten has no meaningful alphabetically relationship to
* a Greek lambda, but being able to assign a reliable "value" makes sorting
* algorithms possible, if not entirely sane. Most cases should treat the
* return value as "equal" or "not equal".
*
* Like stricmp, this expects both strings to be NULL-terminated.
*
* \param str1 First string to compare.
* \param str2 Second string to compare.
* \return -1 if str1 is "less than" str2, 1 if "greater than", 0 if equal.
*/
PHYSFS_DECL int PHYSFS_utf8stricmp(const char *str1, const char *str2);


/**
* \fn int PHYSFS_unmount(const char *oldDir)
* \brief Remove a directory or archive from the search path.

0 comments on commit d1f2637

Please sign in to comment.