Skip to content
This repository
Browse code

Patch by Phoenix

* adds line 8 for additional UTF-8 ranges
* adds cjk-utf8 locale
* fixes check 3.2 and adds check 3.3

git-svn-id: http://svn.inspircd.org/repository/trunk/inspircd@11082 e03df62e-2008-0410-955e-edbf42e46eb7
  • Loading branch information...
commit bbf472cb954ce729fba5804c6c49faa473a10748 1 parent 35d20fb
peavey authored

Showing 2 changed files with 58 additions and 17 deletions. Show diff stats Hide diff stats

  1. +3 2 locales/readme.txt
  2. +55 15 src/modules/m_nationalchars.cpp
5 locales/readme.txt
@@ -23,7 +23,9 @@ Can be useful for example for comparing nicknames that contain similar-looking
23 23
24 24 6: List of additional UTF-8 allowed characters
25 25
26   -7: List of additional UTF-8 ranges (character followed by "range"). Strongly experimental. May be replaced in future versions.
  26 +7: List of additional UTF-8 ranges (character followed by 1-byte "range").
  27 +
  28 +8: List of additional UTF-8 ranges given as start/end character pairs (i.e. start1, end1, start2, end2, ...; every UTF-8 character between each start-end pair is assumed valid).
27 29
28 30 *** Line format ***
29 31
@@ -44,5 +46,4 @@ In this case every character of line except first dot specifies one character-co
44 46
45 47 *** TODO ***
46 48
47   -- Maybe replace line 7 with <interval start> <interval end> form?
48 49 - UTF-8 collation rules (Inapplicable to InspIRCd atm).
70 src/modules/m_nationalchars.cpp
@@ -12,7 +12,8 @@
12 12 */
13 13
14 14 /* Contains a code of Unreal IRCd + Bynets patch ( http://www.unrealircd.com/ and http://www.bynets.org/ )
15   - Changed at 2008-06-15 - 2008-12-15
  15 + Original patch is made by Dmitry "Killer{R}" Kononko. ( http://killprog.com/ )
  16 + Changed at 2008-06-15 - 2009-02-11
16 17 by Chernov-Phoenix Alexey (Phoenix@RusNet) mailto:phoenix /email address separator/ pravmail.ru */
17 18
18 19 #include "inspircd.h"
@@ -31,13 +32,14 @@ class lwbNickHandler : public HandlerBase2<bool, const char*, size_t>
31 32 };
32 33
33 34 /*,m_reverse_additionalUp[256];*/
34   -static unsigned char m_reverse_additional[256],m_additionalMB[256],m_additionalUtf8[256],m_additionalUtf8range[256];
  35 +static unsigned char m_reverse_additional[256],m_additionalMB[256],m_additionalUtf8[256],m_additionalUtf8range[256],m_additionalUtf8interval[256];
35 36
36 37 char utf8checkrest(unsigned char * mb, unsigned char cnt)
37 38 {
38 39 for (unsigned char * tmp=mb; tmp<mb+cnt; tmp++)
39 40 {
40   - if ((*tmp < 128) || (*tmp > 191))
  41 + /* & is faster! -- Phoenix (char & b11000000 == b10000000) */
  42 + if ((*tmp & 192) != 128)
41 43 return -1;
42 44 }
43 45 return cnt + 1;
@@ -107,7 +109,7 @@ bool lwbNickHandler::Call(const char* n, size_t max)
107 109 continue;
108 110
109 111 /* 3.1. Check against a simple UTF-8 characters enumeration */
110   - int cursize, ncursize = utf8size((unsigned char *)i);
  112 + int cursize, cursize2, ncursize = utf8size((unsigned char *)i);
111 113 /* do check only if current multibyte character is valid UTF-8 only */
112 114 if (ncursize != -1)
113 115 {
@@ -130,25 +132,39 @@ bool lwbNickHandler::Call(const char* n, size_t max)
130 132 if (found)
131 133 continue;
132 134
133   - /* 3.2. Check against an UTF-8 ranges: <start character> and <lenght of the range>.
134   - Also char. is to be checked if it is a valid UTF-8 one */
  135 + /* 3.2. Check against an UTF-8 ranges: <start character> and <length of the range>. */
135 136 found = false;
136 137 for (unsigned char * mb = m_additionalUtf8range; (utf8size(mb) != -1) && (mb < m_additionalUtf8range + sizeof(m_additionalUtf8range)); mb += cursize + 1)
137 138 {
138 139 cursize = utf8size(mb);
139   - /* Size differs? Pick the next! */
  140 + /* Size differs (or lengthbyte is zero)? Pick the next! */
140 141 if ((cursize != ncursize) || (!mb[cursize]))
141 142 continue;
142 143
143   - unsigned char uright[5] = {0,0,0,0,0};
144   -
  144 + unsigned char uright[5] = {0,0,0,0,0}, range = mb[cursize] - 1;
145 145 strncpy((char* ) uright, (char *) mb, cursize);
146 146
147   - if ((uright[cursize-1] + mb[cursize]-1>0xff) && (cursize != 1))
  147 + for (int temp = cursize - 1; (temp >= 0) && range; --temp)
148 148 {
149   - uright[cursize - 2]+=1;
  149 + /* all but the first char are 64-based */
  150 + if (temp)
  151 + {
  152 + char part64 = range & 63; /* i.e. % 64 */
  153 + /* handle carrying over */
  154 + if (uright[temp] + part64 - 1 > 191)
  155 + {
  156 + uright[temp] -= 64;
  157 + range += 64;
  158 + }
  159 + uright[temp] += part64;
  160 + range >>= 6; /* divide it on a 64 */
  161 + }
  162 + /* the first char of UTF-8 doesn't follow the rule */
  163 + else
  164 + {
  165 + uright[temp] += range;
  166 + }
150 167 }
151   - uright[cursize - 1] = (uright[cursize - 1]+mb[cursize] - 1) % 0x100;
152 168
153 169 if ((strncmp(i, (char *) mb, cursize) >= 0) && (strncmp(i, (char *) uright, cursize) <= 0))
154 170 {
@@ -160,6 +176,30 @@ bool lwbNickHandler::Call(const char* n, size_t max)
160 176 }
161 177 if (found)
162 178 continue;
  179 +
  180 + /* 3.3. Check against an UTF-8 intervals: <start character> and <end character>. */
  181 + found = false;
  182 + for (unsigned char * mb = m_additionalUtf8interval; (utf8size(mb) != -1) && (utf8size(mb+utf8size(mb)) != -1)
  183 + && (mb < m_additionalUtf8interval + sizeof(m_additionalUtf8interval)); mb += (cursize+cursize2) )
  184 + {
  185 + cursize = utf8size(mb);
  186 + cursize2= utf8size(mb+cursize);
  187 +
  188 + int minlen = cursize > ncursize ? ncursize : cursize;
  189 + int minlen2 = cursize2 > ncursize ? ncursize : cursize2;
  190 +
  191 + unsigned char* uright = mb + cursize;
  192 +
  193 + if ((strncmp(i, (char *) mb, minlen) >= 0) && (strncmp(i, (char *) uright, minlen2) <= 0))
  194 + {
  195 + i += cursize - 1;
  196 + p += cursize - 1;
  197 + found = true;
  198 + break;
  199 + }
  200 + }
  201 + if (found)
  202 + continue;
163 203 }
164 204
165 205 /* invalid character! abort */
@@ -216,8 +256,8 @@ class ModuleNationalChars : public Module
216 256 charset = conf->ReadValue("nationalchars", "file", 0);
217 257 casemapping = conf->ReadValue("nationalchars", "casemapping", charset, 0, false);
218 258 charset.insert(0, "../locales/");
219   - unsigned char * tables[7] = { m_additional, m_additionalMB, m_additionalUp, m_lower, m_upper, m_additionalUtf8, m_additionalUtf8range };
220   - loadtables(charset, tables, 7, 5);
  259 + unsigned char * tables[8] = { m_additional, m_additionalMB, m_additionalUp, m_lower, m_upper, m_additionalUtf8, m_additionalUtf8range, m_additionalUtf8interval };
  260 + loadtables(charset, tables, 8, 5);
221 261 forcequit = conf->ReadFlag("nationalchars", "forcequit", 0);
222 262 CheckForceQuit("National character set changed");
223 263 delete conf;
@@ -319,7 +359,7 @@ class ModuleNationalChars : public Module
319 359 if (buf[0] == '.') /* simple plain-text string after dot */
320 360 {
321 361 i = buf.size() - 1;
322   -
  362 +
323 363 if (i > (maxindex + 1))
324 364 i = maxindex + 1;
325 365

0 comments on commit bbf472c

Please sign in to comment.
Something went wrong with that request. Please try again.