-
Notifications
You must be signed in to change notification settings - Fork 251
/
pythonGrammar.jj
3685 lines (3504 loc) · 131 KB
/
pythonGrammar.jj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
options {
// JavaCC generator options for the Python lexer/parser generated from this grammar.
// Tracing of the generated token manager and parser is switched off.
DEBUG_TOKEN_MANAGER = false;
DEBUG_PARSER = false;
// Python source may contain non-ASCII identifiers (see the NAME token), so the
// token manager has to handle Unicode input.
UNICODE_INPUT = true;
// Generate instance (non-static) members for parser and token manager.
STATIC = false;
// The generated Token class extends PositionToken; presumably this is where the
// startPos/endPos fields written by CommonTokenAction are declared.
TOKEN_EXTENDS = "PositionToken";
// Makes the generated token manager call CommonTokenAction(Token) (defined in
// TOKEN_MGR_DECLS) for every matched token.
COMMON_TOKEN_ACTION = true;
// The character stream is provided by the project instead of being generated;
// CommonTokenAction relies on it being a CharStreamImpl.
USER_CHAR_STREAM = true;
}
PARSER_BEGIN(PythonParser)
package io.joern.pythonparser;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Stack;
import io.joern.pythonparser.ast.Module;
import io.joern.pythonparser.ast.*;
public class PythonParser {
// Minimal mutable holder for a single reference of type T.
// The wrapped value is read and replaced directly through the object field.
static class BoxedObject<T> {
    T object;

    BoxedObject(T initialValue) {
        object = initialValue;
    }
}
// Wraps a start token and an end token in a TokenAttributeProvider.
TokenAttributeProvider attributes(Token startToken, Token endToken) {
    TokenAttributeProvider provider = new TokenAttributeProvider(startToken, endToken);
    return provider;
}
// Wraps an already attributed AST node and an end token in a NodeAttributeProvider.
NodeAttributeProvider attributes(iattributes attributeAstNode, Token endToken) {
    NodeAttributeProvider provider = new NodeAttributeProvider(attributeAstNode, endToken);
    return provider;
}
// Error statements recorded by recoverAndCreateErrorStmt, in the order they were created.
private ArrayList<ErrorStatement> errors = new ArrayList<ErrorStatement>();

// Returns the parser's own error list (not a copy), so later recoveries show up in it as well.
ArrayList<ErrorStatement> getErrors() {
    return this.errors;
}
// Error recovery: skips tokens until the current token is a SEMICOLON, NEWLINE or EOF
// (at least one token is always consumed), then records and returns an ErrorStatement
// that carries the original exception and covers the tokens from the one following
// lastCorrectToken up to the token the skipping stopped at.
ErrorStatement recoverAndCreateErrorStmt(Token lastCorrectToken, Exception exception) {
    try {
        do {
            getNextToken();
        } while (!(token.kind == SEMICOLON || token.kind == NEWLINE || token.kind == EOF));
    } catch (Exception ignored) {
        // Deliberately ignored: we are already recovering from an error and only try to
        // advance the token stream to a point where parsing something meaningful is
        // likely again. If advancing itself fails we simply stop here, keep the error
        // that started the recovery and hope the parser can continue from the current state.
        // NOTE(review): only Exception is caught. If the generated token manager reports
        // lexical errors as a java.lang.Error subclass, those still propagate - confirm
        // that this is intended.
    }
    ErrorStatement errorStmt =
        new ErrorStatement(exception, attributes(lastCorrectToken.next, token));
    errors.add(errorStmt);
    return errorStmt;
}
// This grammar handles both python3 and python2 code, so ambiguities between the two
// versions have to be resolved somewhere.
// One such ambiguity is caused by print statements (not expressions) like:
//   print (x), y
// In python3 this is a Tuple(elts = [Call(func = print, args = [x]), y]),
// whereas in python2 everything after "print" belongs to one print statement.
// The grammar parses the input as the python3 style tuple and this method rewrites
// that tuple into a single python2 style call: the arguments of the leading print
// call followed by the remaining tuple elements, i.e. Call(func = print, args = [x, y]).
// The reasoning is that a statement consisting of a top level tuple which is not used
// in any expression is superfluous and therefore a rare, unexpected case. One could
// argue that the print form is equally rare in python2, so only time will tell
// whether this was the right choice.
//
// Returns the rewritten statement, or stmt itself if it does not have the shape
// Expr(Tuple(Call(Name("print"), no keywords), ...)).
istmt printStatmentToPython2StyleRewrite(istmt stmt) {
    if (!(stmt instanceof Expr)) {
        return stmt;
    }
    Expr expr = (Expr) stmt;
    if (!(expr.value() instanceof Tuple)) {
        return stmt;
    }
    Tuple tuple = (Tuple) expr.value();
    if (!tuple.elts().nonEmpty() || !(tuple.elts().head() instanceof Call)) {
        return stmt;
    }
    Call call = (Call) tuple.elts().head();
    if (!(call.func() instanceof Name)) {
        return stmt;
    }
    Name name = (Name) call.func();
    if (!name.id().equals("print") || !call.keywords().isEmpty()) {
        return stmt;
    }

    ArrayList<iexpr> printArgs = new ArrayList<iexpr>();
    // First the arguments that were written inside the parentheses ...
    for (int argIdx = 0, argCount = call.args().size(); argIdx < argCount; argIdx++) {
        printArgs.add(call.args().apply(argIdx));
    }
    // ... then every tuple element after the print call itself (which sits at index 0).
    for (int eltIdx = 1, eltCount = tuple.elts().size(); eltIdx < eltCount; eltIdx++) {
        printArgs.add(tuple.elts().apply(eltIdx));
    }
    return new Expr(
        new Call(call.func(), printArgs, new ArrayList<Keyword>(), call.attributeProvider()));
}
// Makes setTo the parser's current token and resets jj_ntk to -1
// (presumably invalidating the generated parser's cached next-token kind - confirm).
void setCurrentToken(Token setTo) {
    this.token = setTo;
    this.jj_ntk = -1;
}
}
PARSER_END(PythonParser)
TOKEN_MGR_DECLS:
{
// All kinds of parentheses are tracked here: ( { [
// Each entry is the open-bracket counter of one paren scope; the helpers below
// only ever read or modify the topmost entry.
Stack<Integer> parenthesesStack = new Stack<Integer>();
// Indentation bookkeeping. NOTE(review): these three fields are not used in the
// part of the grammar visible here; presumably they drive INDENT/DEDENT emission.
int currentIndent = 0;
int dedentsToEmit;
Stack<Integer> indentStack = new Stack<Integer>();
{
// Instance initializer: start with one paren scope (counter 0) and with 0 on the
// indentation stack.
pushParenScope();
indentStack.push(0);
}
// Opens a new paren scope by pushing a fresh open-bracket counter (0) onto parenthesesStack.
void pushParenScope() {
parenthesesStack.push(0);
}
// Closes the current paren scope by discarding the topmost open-bracket counter.
void popParenScope() {
parenthesesStack.pop();
}
// Increments the open-bracket counter of the current (topmost) paren scope.
void countOpenParen() {
    int openBrackets = parenthesesStack.pop();
    parenthesesStack.push(openBrackets + 1);
}
// Decrements the open-bracket counter of the current (topmost) paren scope.
// There is no lower bound check, so an unmatched closing bracket makes the counter negative.
void countCloseParen() {
    int openBrackets = parenthesesStack.pop();
    parenthesesStack.push(openBrackets - 1);
}
// Returns the number of currently open brackets in the current (topmost) paren scope.
int openParenCounter() {
return parenthesesStack.peek();
}
// matchedToken must have the form: <prefix><oneOrThreeQuotes>
// Returns the token image cut down to <prefix>, i.e. without <oneOrThreeQuotes>,
// and backs up the input stream by length(<oneOrThreeQuotes>) so that the quotes
// are lexed again. The token itself is not modified here; the caller decides what
// to do with the returned image.
String adjustTokenAndBackupInput(Token matchedToken, int prefixLen) {
    String image = matchedToken.image;
    String prefix = image.substring(0, prefixLen);
    input_stream.backup(image.length() - prefixLen);
    return prefix;
}
// Returns the token image without its last quoteLen characters, since we do not want
// the close quote to be part of the content. The token itself is not modified.
String cutContentTokenImage(Token matchedToken, int quoteLen) {
    String image = matchedToken.image;
    return image.substring(0, image.length() - quoteLen);
}
// Returns the content token image without its trailing endToken1 or endToken2, since we
// do not want them in the content. In our case this is either a close quote or "{".
// The input is backed up by the same length so that an extra token is generated for the
// close quote or "{". endToken1 is checked first.
// Throws a RuntimeException if the image ends with neither of the two end tokens.
String cutContentTokenImageAndBackupInput(Token matchedToken, String endToken1, String endToken2) {
    String image = matchedToken.image;
    String terminator;
    if (image.endsWith(endToken1)) {
        terminator = endToken1;
    } else if (image.endsWith(endToken2)) {
        terminator = endToken2;
    } else {
        throw new RuntimeException("Unexpected end of matchedToken " + matchedToken.image);
    }
    int cutLength = terminator.length();
    String content = image.substring(0, image.length() - cutLength);
    input_stream.backup(cutLength);
    return content;
}
// Stores the format string lexing state we were in before switching to the DEFAULT state
// in order to lex a replacement field. This naturally needs to be a stack since replacement
// fields can be nested.
// Note that while lexing a format string that contains multiple replacement fields,
// we switch back and forth multiple times between the lexer's DEFAULT state and the
// state on top of this stack.
// A non-empty stack is what the COLON and CURLY_CLOSE lexical actions use to detect
// that we are currently inside a format string.
Stack<Integer> formatStringLexStateStack = new Stack<Integer>();
// Number of opened curly brackets while lexing a format string in FORMAT_SPEC_LEX.
int formatSpecOpenCurly = 0;
// Invoked for every matched token (COMMON_TOKEN_ACTION = true): stamps the token with
// its start position as reported by the character stream and with an end position of
// start + image length.
// The user supplied char stream is expected to be a CharStreamImpl.
void CommonTokenAction(Token token) {
    token.startPos = ((CharStreamImpl) input_stream).getBeginPos();
    token.endPos = token.image.length() + token.startPos;
}
}
TOKEN: {
// Keywords, punctuation and operators.
// COLON and CURLY_CLOSE carry lexical actions that switch the lexer state while a
// format string replacement field is being lexed; all bracket tokens maintain the
// open-bracket counter of the current paren scope.
<IF: "if">
| <ELSE: "else">
| <ELIF: "elif">
| <OR: "or">
| <AND: "and">
| <NOT: "not">
| <DEF: "def">
| <ASYNC: "async">
| <LAMBDA: "lambda">
| <FOR: "for">
| <WITH: "with">
| <RETURN: "return">
| <TRY: "try">
| <EXCEPT: "except">
| <FINALLY: "finally">
| <CLASS: "class">
| <WHILE: "while">
| <IMPORT: "import">
| <FROM: "from">
| <AS: "as">
| <RAISE: "raise">
| <PASS: "pass">
| <DEL: "del">
| <YIELD: "yield">
| <ASSERT: "assert">
| <BREAK: "break">
| <CONTINUE: "continue">
| <GLOBAL: "global">
| <NONLOCAL: "nonlocal">
| <AWAIT: "await">
| <NONE: "None">
| <TRUE: "True">
| <FALSE: "False">
| <ARROW: "->">
| <ELLIPSIS: "...">
| <COLON: ":"> {
// Only on top level of the replacement field lexing we can
// switch into the format spec lexing. At top level
// openParenCounter() is 1 because of the opening curly bracket.
if (!formatStringLexStateStack.isEmpty() && openParenCounter() == 1) {
// We reached a ":" of a format spec of a replacement field of a format string.
SwitchTo(FORMAT_SPEC_LEX);
}
}
| <SEMICOLON: ";">
| <COMMA: ",">
| <ASSIGN: "=">
| <COLON_ASSIGN: ":=">
| <PLUS_ASSIGN: "+=">
| <MINUS_ASSIGN: "-=">
| <STAR_ASSIGN: "*=">
| <AT_ASSIGN: "@=">
| <DIV_ASSIGN: "/=">
| <MOD_ASSIGN: "%=">
| <BIT_AND_ASSIGN: "&=">
| <BIT_OR_ASSIGN: "|=">
| <BIT_XOR_ASSIGN: "^=">
| <LSHIFT_ASSIGN: "<<=">
| <RSHIFT_ASSIGN: ">>=">
| <POW_ASSIGN: "**=">
| <FLOOR_DIV_ASSIGN: "//=">
| <EQ: "==">
| <NEQ: "!=">
| <LT: "<">
| <LTE: "<=">
| <GT: ">">
| <GTE: ">=">
| <IS: "is">
| <IN: "in">
| <PLUS: "+">
| <MINUS: "-">
| <AT: "@">
| <DIV: "/">
| <FLOOR_DIV: "//">
| <MOD: "%">
| <INVERT: "~">
| <LSHIFT: "<<">
| <RSHIFT: ">>">
| <BIT_OR: "|">
| <BIT_XOR: "^">
| <BIT_AND: "&">
// Round, square and curly brackets all update the same counter: the top entry of
// parenthesesStack.
| <PAREN_OPEN: "("> { countOpenParen(); }
| <PAREN_CLOSE: ")"> { countCloseParen(); }
| <SQUARE_OPEN: "["> { countOpenParen(); }
| <SQUARE_CLOSE: "]"> { countCloseParen(); }
| <CURLY_OPEN: "{"> { countOpenParen(); }
| <CURLY_CLOSE: "}"> {
countCloseParen();
// Inside a format string a counter of 0 means that this "}" closes the replacement
// field itself: drop the field's paren scope and continue lexing in the format
// string state on top of the stack. That state is only peeked here, not popped.
if (!formatStringLexStateStack.isEmpty() && openParenCounter() == 0) {
popParenScope();
SwitchTo(formatStringLexStateStack.peek());
}
}
| <DOT: ".">
| <STAR: "*">
| <DOUBLE_STAR: "**">
// Presumably the !s / !r / !a conversions of format string replacement fields - confirm
// against the productions that use them.
| <STR_CONVERSION: "!s">
| <REPR_CONVERSION: "!r">
| <ASCII_CONVERSION: "!a">
}
// Number tokens:
TOKEN: {
// Integer, float and imaginary literals. Entries written as <#NAME: ...> are private
// helper regular expressions that are only referenced by other definitions here.
// A single optional "_" is accepted between digits.
<DEC_INTEGER: <NON_ZERO_DIGIT> (("_")? <DIGIT>)* | "0" (("_")? "0")*>
| <BIN_INTEGER: "0" ("b" | "B") (("_")? <BIN_DIGIT>)+>
| <OCT_INTEGER:
(
"0" ("o" | "O") (("_")? <OCT_DIGIT>)+
| // Just a leading 0 without a following o or O is python2 style.
// This conflicts with the new python3 decimal integer style of writing
// zero as an arbitrary number of 0.
// Since the DEC_INTEGER rule is defined first, such a literal is
// tokenized as DEC_INTEGER. So we prioritize python3 here.
// Since the resulting integer value is zero in both cases, the slight
// loss of precision for python2 seems acceptable.
"0" (<OCT_DIGIT>)+
)
>
| <HEX_INTEGER: "0" ("x" | "X") (("_")? <HEX_DIGIT>)+>
| <#DIGIT: ["0" - "9"] >
| <#NON_ZERO_DIGIT: ["1" - "9"] >
| <#BIN_DIGIT: "0" | "1" >
| <#OCT_DIGIT: ["0" - "7"] >
| <#HEX_DIGIT: ["0" - "9"] | ["a" - "f"] | ["A" - "F"] >
| <FLOAT:
(
// Three forms: "1." / "1.5" / "1.5e3", ".5" / ".5e3", and "1e3".
(<DIGIT_PART> "." (<DIGIT_PART>)? (<EXPONENT>)?)
| ("." <DIGIT_PART> (<EXPONENT>)?)
| (<DIGIT_PART> <EXPONENT>)
)
>
| <#EXPONENT: ("e" | "E") ("+" | "-")? <DIGIT_PART>>
| <#DIGIT_PART: <DIGIT> (("_")? <DIGIT>)*>
| <IMAGINARY: (<FLOAT> | <DIGIT_PART>) ("j" | "J")>
}
// Based on https://docs.python.org/3/reference/lexical_analysis.html#identifiers
// and https://www.unicode.org/Public/14.0.0/ucd/DerivedCoreProperties.txt.
TOKEN: {
<NAME: <ID_START> (<ID_CONTINUE>)*>
| <#ID_START: ["\u0041" - "\u005A"] // L& [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
| ["\u0061" - "\u007A"] // L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z
| "\u00AA" // Lo FEMININE ORDINAL INDICATOR
| "\u00B5" // L& MICRO SIGN
| "\u00BA" // Lo MASCULINE ORDINAL INDICATOR
| ["\u00C0" - "\u00D6"] // L& [23] LATIN CAPITAL LETTER A WITH GRAVE..LATIN CAPITAL LETTER O WITH DIAERESIS
| ["\u00D8" - "\u00F6"] // L& [31] LATIN CAPITAL LETTER O WITH STROKE..LATIN SMALL LETTER O WITH DIAERESIS
| ["\u00F8" - "\u01BA"] // L& [195] LATIN SMALL LETTER O WITH STROKE..LATIN SMALL LETTER EZH WITH TAIL
| "\u01BB" // Lo LATIN LETTER TWO WITH STROKE
| ["\u01BC" - "\u01BF"] // L& [4] LATIN CAPITAL LETTER TONE FIVE..LATIN LETTER WYNN
| ["\u01C0" - "\u01C3"] // Lo [4] LATIN LETTER DENTAL CLICK..LATIN LETTER RETROFLEX CLICK
| ["\u01C4" - "\u0293"] // L& [208] LATIN CAPITAL LETTER DZ WITH CARON..LATIN SMALL LETTER EZH WITH CURL
| "\u0294" // Lo LATIN LETTER GLOTTAL STOP
| ["\u0295" - "\u02AF"] // L& [27] LATIN LETTER PHARYNGEAL VOICED FRICATIVE..LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL
| ["\u02B0" - "\u02C1"] // Lm [18] MODIFIER LETTER SMALL H..MODIFIER LETTER REVERSED GLOTTAL STOP
| ["\u02C6" - "\u02D1"] // Lm [12] MODIFIER LETTER CIRCUMFLEX ACCENT..MODIFIER LETTER HALF TRIANGULAR COLON
| ["\u02E0" - "\u02E4"] // Lm [5] MODIFIER LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP
| "\u02EC" // Lm MODIFIER LETTER VOICING
| "\u02EE" // Lm MODIFIER LETTER DOUBLE APOSTROPHE
| ["\u0370" - "\u0373"] // L& [4] GREEK CAPITAL LETTER HETA..GREEK SMALL LETTER ARCHAIC SAMPI
| "\u0374" // Lm GREEK NUMERAL SIGN
| ["\u0376" - "\u0377"] // L& [2] GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA..GREEK SMALL LETTER PAMPHYLIAN DIGAMMA
| "\u037A" // Lm GREEK YPOGEGRAMMENI
| ["\u037B" - "\u037D"] // L& [3] GREEK SMALL REVERSED LUNATE SIGMA SYMBOL..GREEK SMALL REVERSED DOTTED LUNATE SIGMA SYMBOL
| "\u037F" // L& GREEK CAPITAL LETTER YOT
| "\u0386" // L& GREEK CAPITAL LETTER ALPHA WITH TONOS
| ["\u0388" - "\u038A"] // L& [3] GREEK CAPITAL LETTER EPSILON WITH TONOS..GREEK CAPITAL LETTER IOTA WITH TONOS
| "\u038C" // L& GREEK CAPITAL LETTER OMICRON WITH TONOS
| ["\u038E" - "\u03A1"] // L& [20] GREEK CAPITAL LETTER UPSILON WITH TONOS..GREEK CAPITAL LETTER RHO
| ["\u03A3" - "\u03F5"] // L& [83] GREEK CAPITAL LETTER SIGMA..GREEK LUNATE EPSILON SYMBOL
| ["\u03F7" - "\u0481"] // L& [139] GREEK CAPITAL LETTER SHO..CYRILLIC SMALL LETTER KOPPA
| ["\u048A" - "\u052F"] // L& [166] CYRILLIC CAPITAL LETTER SHORT I WITH TAIL..CYRILLIC SMALL LETTER EL WITH DESCENDER
| ["\u0531" - "\u0556"] // L& [38] ARMENIAN CAPITAL LETTER AYB..ARMENIAN CAPITAL LETTER FEH
| "\u0559" // Lm ARMENIAN MODIFIER LETTER LEFT HALF RING
| ["\u0560" - "\u0588"] // L& [41] ARMENIAN SMALL LETTER TURNED AYB..ARMENIAN SMALL LETTER YI WITH STROKE
| ["\u05D0" - "\u05EA"] // Lo [27] HEBREW LETTER ALEF..HEBREW LETTER TAV
| ["\u05EF" - "\u05F2"] // Lo [4] HEBREW YOD TRIANGLE..HEBREW LIGATURE YIDDISH DOUBLE YOD
| ["\u0620" - "\u063F"] // Lo [32] ARABIC LETTER KASHMIRI YEH..ARABIC LETTER FARSI YEH WITH THREE DOTS ABOVE
| "\u0640" // Lm ARABIC TATWEEL
| ["\u0641" - "\u064A"] // Lo [10] ARABIC LETTER FEH..ARABIC LETTER YEH
| ["\u066E" - "\u066F"] // Lo [2] ARABIC LETTER DOTLESS BEH..ARABIC LETTER DOTLESS QAF
| ["\u0671" - "\u06D3"] // Lo [99] ARABIC LETTER ALEF WASLA..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE
| "\u06D5" // Lo ARABIC LETTER AE
| ["\u06E5" - "\u06E6"] // Lm [2] ARABIC SMALL WAW..ARABIC SMALL YEH
| ["\u06EE" - "\u06EF"] // Lo [2] ARABIC LETTER DAL WITH INVERTED V..ARABIC LETTER REH WITH INVERTED V
| ["\u06FA" - "\u06FC"] // Lo [3] ARABIC LETTER SHEEN WITH DOT BELOW..ARABIC LETTER GHAIN WITH DOT BELOW
| "\u06FF" // Lo ARABIC LETTER HEH WITH INVERTED V
| "\u0710" // Lo SYRIAC LETTER ALAPH
| ["\u0712" - "\u072F"] // Lo [30] SYRIAC LETTER BETH..SYRIAC LETTER PERSIAN DHALATH
| ["\u074D" - "\u07A5"] // Lo [89] SYRIAC LETTER SOGDIAN ZHAIN..THAANA LETTER WAAVU
| "\u07B1" // Lo THAANA LETTER NAA
| ["\u07CA" - "\u07EA"] // Lo [33] NKO LETTER A..NKO LETTER JONA RA
| ["\u07F4" - "\u07F5"] // Lm [2] NKO HIGH TONE APOSTROPHE..NKO LOW TONE APOSTROPHE
| "\u07FA" // Lm NKO LAJANYALAN
| ["\u0800" - "\u0815"] // Lo [22] SAMARITAN LETTER ALAF..SAMARITAN LETTER TAAF
| "\u081A" // Lm SAMARITAN MODIFIER LETTER EPENTHETIC YUT
| "\u0824" // Lm SAMARITAN MODIFIER LETTER SHORT A
| "\u0828" // Lm SAMARITAN MODIFIER LETTER I
| ["\u0840" - "\u0858"] // Lo [25] MANDAIC LETTER HALQA..MANDAIC LETTER AIN
| ["\u0860" - "\u086A"] // Lo [11] SYRIAC LETTER MALAYALAM NGA..SYRIAC LETTER MALAYALAM SSA
| ["\u0870" - "\u0887"] // Lo [24] ARABIC LETTER ALEF WITH ATTACHED FATHA..ARABIC BASELINE ROUND DOT
| ["\u0889" - "\u088E"] // Lo [6] ARABIC LETTER NOON WITH INVERTED SMALL V..ARABIC VERTICAL TAIL
| ["\u08A0" - "\u08C8"] // Lo [41] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER GRAF
| "\u08C9" // Lm ARABIC SMALL FARSI YEH
| ["\u0904" - "\u0939"] // Lo [54] DEVANAGARI LETTER SHORT A..DEVANAGARI LETTER HA
| "\u093D" // Lo DEVANAGARI SIGN AVAGRAHA
| "\u0950" // Lo DEVANAGARI OM
| ["\u0958" - "\u0961"] // Lo [10] DEVANAGARI LETTER QA..DEVANAGARI LETTER VOCALIC LL
| "\u0971" // Lm DEVANAGARI SIGN HIGH SPACING DOT
| ["\u0972" - "\u0980"] // Lo [15] DEVANAGARI LETTER CANDRA A..BENGALI ANJI
| ["\u0985" - "\u098C"] // Lo [8] BENGALI LETTER A..BENGALI LETTER VOCALIC L
| ["\u098F" - "\u0990"] // Lo [2] BENGALI LETTER E..BENGALI LETTER AI
| ["\u0993" - "\u09A8"] // Lo [22] BENGALI LETTER O..BENGALI LETTER NA
| ["\u09AA" - "\u09B0"] // Lo [7] BENGALI LETTER PA..BENGALI LETTER RA
| "\u09B2" // Lo BENGALI LETTER LA
| ["\u09B6" - "\u09B9"] // Lo [4] BENGALI LETTER SHA..BENGALI LETTER HA
| "\u09BD" // Lo BENGALI SIGN AVAGRAHA
| "\u09CE" // Lo BENGALI LETTER KHANDA TA
| ["\u09DC" - "\u09DD"] // Lo [2] BENGALI LETTER RRA..BENGALI LETTER RHA
| ["\u09DF" - "\u09E1"] // Lo [3] BENGALI LETTER YYA..BENGALI LETTER VOCALIC LL
| ["\u09F0" - "\u09F1"] // Lo [2] BENGALI LETTER RA WITH MIDDLE DIAGONAL..BENGALI LETTER RA WITH LOWER DIAGONAL
| "\u09FC" // Lo BENGALI LETTER VEDIC ANUSVARA
| ["\u0A05" - "\u0A0A"] // Lo [6] GURMUKHI LETTER A..GURMUKHI LETTER UU
| ["\u0A0F" - "\u0A10"] // Lo [2] GURMUKHI LETTER EE..GURMUKHI LETTER AI
| ["\u0A13" - "\u0A28"] // Lo [22] GURMUKHI LETTER OO..GURMUKHI LETTER NA
| ["\u0A2A" - "\u0A30"] // Lo [7] GURMUKHI LETTER PA..GURMUKHI LETTER RA
| ["\u0A32" - "\u0A33"] // Lo [2] GURMUKHI LETTER LA..GURMUKHI LETTER LLA
| ["\u0A35" - "\u0A36"] // Lo [2] GURMUKHI LETTER VA..GURMUKHI LETTER SHA
| ["\u0A38" - "\u0A39"] // Lo [2] GURMUKHI LETTER SA..GURMUKHI LETTER HA
| ["\u0A59" - "\u0A5C"] // Lo [4] GURMUKHI LETTER KHHA..GURMUKHI LETTER RRA
| "\u0A5E" // Lo GURMUKHI LETTER FA
| ["\u0A72" - "\u0A74"] // Lo [3] GURMUKHI IRI..GURMUKHI EK ONKAR
| ["\u0A85" - "\u0A8D"] // Lo [9] GUJARATI LETTER A..GUJARATI VOWEL CANDRA E
| ["\u0A8F" - "\u0A91"] // Lo [3] GUJARATI LETTER E..GUJARATI VOWEL CANDRA O
| ["\u0A93" - "\u0AA8"] // Lo [22] GUJARATI LETTER O..GUJARATI LETTER NA
| ["\u0AAA" - "\u0AB0"] // Lo [7] GUJARATI LETTER PA..GUJARATI LETTER RA
| ["\u0AB2" - "\u0AB3"] // Lo [2] GUJARATI LETTER LA..GUJARATI LETTER LLA
| ["\u0AB5" - "\u0AB9"] // Lo [5] GUJARATI LETTER VA..GUJARATI LETTER HA
| "\u0ABD" // Lo GUJARATI SIGN AVAGRAHA
| "\u0AD0" // Lo GUJARATI OM
| ["\u0AE0" - "\u0AE1"] // Lo [2] GUJARATI LETTER VOCALIC RR..GUJARATI LETTER VOCALIC LL
| "\u0AF9" // Lo GUJARATI LETTER ZHA
| ["\u0B05" - "\u0B0C"] // Lo [8] ORIYA LETTER A..ORIYA LETTER VOCALIC L
| ["\u0B0F" - "\u0B10"] // Lo [2] ORIYA LETTER E..ORIYA LETTER AI
| ["\u0B13" - "\u0B28"] // Lo [22] ORIYA LETTER O..ORIYA LETTER NA
| ["\u0B2A" - "\u0B30"] // Lo [7] ORIYA LETTER PA..ORIYA LETTER RA
| ["\u0B32" - "\u0B33"] // Lo [2] ORIYA LETTER LA..ORIYA LETTER LLA
| ["\u0B35" - "\u0B39"] // Lo [5] ORIYA LETTER VA..ORIYA LETTER HA
| "\u0B3D" // Lo ORIYA SIGN AVAGRAHA
| ["\u0B5C" - "\u0B5D"] // Lo [2] ORIYA LETTER RRA..ORIYA LETTER RHA
| ["\u0B5F" - "\u0B61"] // Lo [3] ORIYA LETTER YYA..ORIYA LETTER VOCALIC LL
| "\u0B71" // Lo ORIYA LETTER WA
| "\u0B83" // Lo TAMIL SIGN VISARGA
| ["\u0B85" - "\u0B8A"] // Lo [6] TAMIL LETTER A..TAMIL LETTER UU
| ["\u0B8E" - "\u0B90"] // Lo [3] TAMIL LETTER E..TAMIL LETTER AI
| ["\u0B92" - "\u0B95"] // Lo [4] TAMIL LETTER O..TAMIL LETTER KA
| ["\u0B99" - "\u0B9A"] // Lo [2] TAMIL LETTER NGA..TAMIL LETTER CA
| "\u0B9C" // Lo TAMIL LETTER JA
| ["\u0B9E" - "\u0B9F"] // Lo [2] TAMIL LETTER NYA..TAMIL LETTER TTA
| ["\u0BA3" - "\u0BA4"] // Lo [2] TAMIL LETTER NNA..TAMIL LETTER TA
| ["\u0BA8" - "\u0BAA"] // Lo [3] TAMIL LETTER NA..TAMIL LETTER PA
| ["\u0BAE" - "\u0BB9"] // Lo [12] TAMIL LETTER MA..TAMIL LETTER HA
| "\u0BD0" // Lo TAMIL OM
| ["\u0C05" - "\u0C0C"] // Lo [8] TELUGU LETTER A..TELUGU LETTER VOCALIC L
| ["\u0C0E" - "\u0C10"] // Lo [3] TELUGU LETTER E..TELUGU LETTER AI
| ["\u0C12" - "\u0C28"] // Lo [23] TELUGU LETTER O..TELUGU LETTER NA
| ["\u0C2A" - "\u0C39"] // Lo [16] TELUGU LETTER PA..TELUGU LETTER HA
| "\u0C3D" // Lo TELUGU SIGN AVAGRAHA
| ["\u0C58" - "\u0C5A"] // Lo [3] TELUGU LETTER TSA..TELUGU LETTER RRRA
| "\u0C5D" // Lo TELUGU LETTER NAKAARA POLLU
| ["\u0C60" - "\u0C61"] // Lo [2] TELUGU LETTER VOCALIC RR..TELUGU LETTER VOCALIC LL
| "\u0C80" // Lo KANNADA SIGN SPACING CANDRABINDU
| ["\u0C85" - "\u0C8C"] // Lo [8] KANNADA LETTER A..KANNADA LETTER VOCALIC L
| ["\u0C8E" - "\u0C90"] // Lo [3] KANNADA LETTER E..KANNADA LETTER AI
| ["\u0C92" - "\u0CA8"] // Lo [23] KANNADA LETTER O..KANNADA LETTER NA
| ["\u0CAA" - "\u0CB3"] // Lo [10] KANNADA LETTER PA..KANNADA LETTER LLA
| ["\u0CB5" - "\u0CB9"] // Lo [5] KANNADA LETTER VA..KANNADA LETTER HA
| "\u0CBD" // Lo KANNADA SIGN AVAGRAHA
| ["\u0CDD" - "\u0CDE"] // Lo [2] KANNADA LETTER NAKAARA POLLU..KANNADA LETTER FA
| ["\u0CE0" - "\u0CE1"] // Lo [2] KANNADA LETTER VOCALIC RR..KANNADA LETTER VOCALIC LL
| ["\u0CF1" - "\u0CF2"] // Lo [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA
| ["\u0D04" - "\u0D0C"] // Lo [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L
| ["\u0D0E" - "\u0D10"] // Lo [3] MALAYALAM LETTER E..MALAYALAM LETTER AI
| ["\u0D12" - "\u0D3A"] // Lo [41] MALAYALAM LETTER O..MALAYALAM LETTER TTTA
| "\u0D3D" // Lo MALAYALAM SIGN AVAGRAHA
| "\u0D4E" // Lo MALAYALAM LETTER DOT REPH
| ["\u0D54" - "\u0D56"] // Lo [3] MALAYALAM LETTER CHILLU M..MALAYALAM LETTER CHILLU LLL
| ["\u0D5F" - "\u0D61"] // Lo [3] MALAYALAM LETTER ARCHAIC II..MALAYALAM LETTER VOCALIC LL
| ["\u0D7A" - "\u0D7F"] // Lo [6] MALAYALAM LETTER CHILLU NN..MALAYALAM LETTER CHILLU K
| ["\u0D85" - "\u0D96"] // Lo [18] SINHALA LETTER AYANNA..SINHALA LETTER AUYANNA
| ["\u0D9A" - "\u0DB1"] // Lo [24] SINHALA LETTER ALPAPRAANA KAYANNA..SINHALA LETTER DANTAJA NAYANNA
| ["\u0DB3" - "\u0DBB"] // Lo [9] SINHALA LETTER SANYAKA DAYANNA..SINHALA LETTER RAYANNA
| "\u0DBD" // Lo SINHALA LETTER DANTAJA LAYANNA
| ["\u0DC0" - "\u0DC6"] // Lo [7] SINHALA LETTER VAYANNA..SINHALA LETTER FAYANNA
| ["\u0E01" - "\u0E30"] // Lo [48] THAI CHARACTER KO KAI..THAI CHARACTER SARA A
| ["\u0E32" - "\u0E33"] // Lo [2] THAI CHARACTER SARA AA..THAI CHARACTER SARA AM
| ["\u0E40" - "\u0E45"] // Lo [6] THAI CHARACTER SARA E..THAI CHARACTER LAKKHANGYAO
| "\u0E46" // Lm THAI CHARACTER MAIYAMOK
| ["\u0E81" - "\u0E82"] // Lo [2] LAO LETTER KO..LAO LETTER KHO SUNG
| "\u0E84" // Lo LAO LETTER KHO TAM
| ["\u0E86" - "\u0E8A"] // Lo [5] LAO LETTER PALI GHA..LAO LETTER SO TAM
| ["\u0E8C" - "\u0EA3"] // Lo [24] LAO LETTER PALI JHA..LAO LETTER LO LING
| "\u0EA5" // Lo LAO LETTER LO LOOT
| ["\u0EA7" - "\u0EB0"] // Lo [10] LAO LETTER WO..LAO VOWEL SIGN A
| ["\u0EB2" - "\u0EB3"] // Lo [2] LAO VOWEL SIGN AA..LAO VOWEL SIGN AM
| "\u0EBD" // Lo LAO SEMIVOWEL SIGN NYO
| ["\u0EC0" - "\u0EC4"] // Lo [5] LAO VOWEL SIGN E..LAO VOWEL SIGN AI
| "\u0EC6" // Lm LAO KO LA
| ["\u0EDC" - "\u0EDF"] // Lo [4] LAO HO NO..LAO LETTER KHMU NYO
| "\u0F00" // Lo TIBETAN SYLLABLE OM
| ["\u0F40" - "\u0F47"] // Lo [8] TIBETAN LETTER KA..TIBETAN LETTER JA
| ["\u0F49" - "\u0F6C"] // Lo [36] TIBETAN LETTER NYA..TIBETAN LETTER RRA
| ["\u0F88" - "\u0F8C"] // Lo [5] TIBETAN SIGN LCE TSA CAN..TIBETAN SIGN INVERTED MCHU CAN
| ["\u1000" - "\u102A"] // Lo [43] MYANMAR LETTER KA..MYANMAR LETTER AU
| "\u103F" // Lo MYANMAR LETTER GREAT SA
| ["\u1050" - "\u1055"] // Lo [6] MYANMAR LETTER SHA..MYANMAR LETTER VOCALIC LL
| ["\u105A" - "\u105D"] // Lo [4] MYANMAR LETTER MON NGA..MYANMAR LETTER MON BBE
| "\u1061" // Lo MYANMAR LETTER SGAW KAREN SHA
| ["\u1065" - "\u1066"] // Lo [2] MYANMAR LETTER WESTERN PWO KAREN THA..MYANMAR LETTER WESTERN PWO KAREN PWA
| ["\u106E" - "\u1070"] // Lo [3] MYANMAR LETTER EASTERN PWO KAREN NNA..MYANMAR LETTER EASTERN PWO KAREN GHWA
| ["\u1075" - "\u1081"] // Lo [13] MYANMAR LETTER SHAN KA..MYANMAR LETTER SHAN HA
| "\u108E" // Lo MYANMAR LETTER RUMAI PALAUNG FA
| ["\u10A0" - "\u10C5"] // L& [38] GEORGIAN CAPITAL LETTER AN..GEORGIAN CAPITAL LETTER HOE
| "\u10C7" // L& GEORGIAN CAPITAL LETTER YN
| "\u10CD" // L& GEORGIAN CAPITAL LETTER AEN
| ["\u10D0" - "\u10FA"] // L& [43] GEORGIAN LETTER AN..GEORGIAN LETTER AIN
| "\u10FC" // Lm MODIFIER LETTER GEORGIAN NAR
| ["\u10FD" - "\u10FF"] // L& [3] GEORGIAN LETTER AEN..GEORGIAN LETTER LABIAL SIGN
| ["\u1100" - "\u1248"] // Lo [329] HANGUL CHOSEONG KIYEOK..ETHIOPIC SYLLABLE QWA
| ["\u124A" - "\u124D"] // Lo [4] ETHIOPIC SYLLABLE QWI..ETHIOPIC SYLLABLE QWE
| ["\u1250" - "\u1256"] // Lo [7] ETHIOPIC SYLLABLE QHA..ETHIOPIC SYLLABLE QHO
| "\u1258" // Lo ETHIOPIC SYLLABLE QHWA
| ["\u125A" - "\u125D"] // Lo [4] ETHIOPIC SYLLABLE QHWI..ETHIOPIC SYLLABLE QHWE
| ["\u1260" - "\u1288"] // Lo [41] ETHIOPIC SYLLABLE BA..ETHIOPIC SYLLABLE XWA
| ["\u128A" - "\u128D"] // Lo [4] ETHIOPIC SYLLABLE XWI..ETHIOPIC SYLLABLE XWE
| ["\u1290" - "\u12B0"] // Lo [33] ETHIOPIC SYLLABLE NA..ETHIOPIC SYLLABLE KWA
| ["\u12B2" - "\u12B5"] // Lo [4] ETHIOPIC SYLLABLE KWI..ETHIOPIC SYLLABLE KWE
| ["\u12B8" - "\u12BE"] // Lo [7] ETHIOPIC SYLLABLE KXA..ETHIOPIC SYLLABLE KXO
| "\u12C0" // Lo ETHIOPIC SYLLABLE KXWA
| ["\u12C2" - "\u12C5"] // Lo [4] ETHIOPIC SYLLABLE KXWI..ETHIOPIC SYLLABLE KXWE
| ["\u12C8" - "\u12D6"] // Lo [15] ETHIOPIC SYLLABLE WA..ETHIOPIC SYLLABLE PHARYNGEAL O
| ["\u12D8" - "\u1310"] // Lo [57] ETHIOPIC SYLLABLE ZA..ETHIOPIC SYLLABLE GWA
| ["\u1312" - "\u1315"] // Lo [4] ETHIOPIC SYLLABLE GWI..ETHIOPIC SYLLABLE GWE
| ["\u1318" - "\u135A"] // Lo [67] ETHIOPIC SYLLABLE GGA..ETHIOPIC SYLLABLE FYA
| ["\u1380" - "\u138F"] // Lo [16] ETHIOPIC SYLLABLE SEBATBEIT MWA..ETHIOPIC SYLLABLE PWE
| ["\u13A0" - "\u13F5"] // L& [86] CHEROKEE LETTER A..CHEROKEE LETTER MV
| ["\u13F8" - "\u13FD"] // L& [6] CHEROKEE SMALL LETTER YE..CHEROKEE SMALL LETTER MV
| ["\u1401" - "\u166C"] // Lo [620] CANADIAN SYLLABICS E..CANADIAN SYLLABICS CARRIER TTSA
| ["\u166F" - "\u167F"] // Lo [17] CANADIAN SYLLABICS QAI..CANADIAN SYLLABICS BLACKFOOT W
| ["\u1681" - "\u169A"] // Lo [26] OGHAM LETTER BEITH..OGHAM LETTER PEITH
| ["\u16A0" - "\u16EA"] // Lo [75] RUNIC LETTER FEHU FEOH FE F..RUNIC LETTER X
| ["\u16EE" - "\u16F0"] // Nl [3] RUNIC ARLAUG SYMBOL..RUNIC BELGTHOR SYMBOL
| ["\u16F1" - "\u16F8"] // Lo [8] RUNIC LETTER K..RUNIC LETTER FRANKS CASKET AESC
| ["\u1700" - "\u1711"] // Lo [18] TAGALOG LETTER A..TAGALOG LETTER HA
| ["\u171F" - "\u1731"] // Lo [19] TAGALOG LETTER ARCHAIC RA..HANUNOO LETTER HA
| ["\u1740" - "\u1751"] // Lo [18] BUHID LETTER A..BUHID LETTER HA
| ["\u1760" - "\u176C"] // Lo [13] TAGBANWA LETTER A..TAGBANWA LETTER YA
| ["\u176E" - "\u1770"] // Lo [3] TAGBANWA LETTER LA..TAGBANWA LETTER SA
| ["\u1780" - "\u17B3"] // Lo [52] KHMER LETTER KA..KHMER INDEPENDENT VOWEL QAU
| "\u17D7" // Lm KHMER SIGN LEK TOO
| "\u17DC" // Lo KHMER SIGN AVAKRAHASANYA
| ["\u1820" - "\u1842"] // Lo [35] MONGOLIAN LETTER A..MONGOLIAN LETTER CHI
| "\u1843" // Lm MONGOLIAN LETTER TODO LONG VOWEL SIGN
| ["\u1844" - "\u1878"] // Lo [53] MONGOLIAN LETTER TODO E..MONGOLIAN LETTER CHA WITH TWO DOTS
| ["\u1880" - "\u1884"] // Lo [5] MONGOLIAN LETTER ALI GALI ANUSVARA ONE..MONGOLIAN LETTER ALI GALI INVERTED UBADAMA
| ["\u1885" - "\u1886"] // Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA
| ["\u1887" - "\u18A8"] // Lo [34] MONGOLIAN LETTER ALI GALI A..MONGOLIAN LETTER MANCHU ALI GALI BHA
| "\u18AA" // Lo MONGOLIAN LETTER MANCHU ALI GALI LHA
| ["\u18B0" - "\u18F5"] // Lo [70] CANADIAN SYLLABICS OY..CANADIAN SYLLABICS CARRIER DENTAL S
| ["\u1900" - "\u191E"] // Lo [31] LIMBU VOWEL-CARRIER LETTER..LIMBU LETTER TRA
| ["\u1950" - "\u196D"] // Lo [30] TAI LE LETTER KA..TAI LE LETTER AI
| ["\u1970" - "\u1974"] // Lo [5] TAI LE LETTER TONE-2..TAI LE LETTER TONE-6
| ["\u1980" - "\u19AB"] // Lo [44] NEW TAI LUE LETTER HIGH QA..NEW TAI LUE LETTER LOW SUA
| ["\u19B0" - "\u19C9"] // Lo [26] NEW TAI LUE VOWEL SIGN VOWEL SHORTENER..NEW TAI LUE TONE MARK-2
| ["\u1A00" - "\u1A16"] // Lo [23] BUGINESE LETTER KA..BUGINESE LETTER HA
| ["\u1A20" - "\u1A54"] // Lo [53] TAI THAM LETTER HIGH KA..TAI THAM LETTER GREAT SA
| "\u1AA7" // Lm TAI THAM SIGN MAI YAMOK
| ["\u1B05" - "\u1B33"] // Lo [47] BALINESE LETTER AKARA..BALINESE LETTER HA
| ["\u1B45" - "\u1B4C"] // Lo [8] BALINESE LETTER KAF SASAK..BALINESE LETTER ARCHAIC JNYA
| ["\u1B83" - "\u1BA0"] // Lo [30] SUNDANESE LETTER A..SUNDANESE LETTER HA
| ["\u1BAE" - "\u1BAF"] // Lo [2] SUNDANESE LETTER KHA..SUNDANESE LETTER SYA
| ["\u1BBA" - "\u1BE5"] // Lo [44] SUNDANESE AVAGRAHA..BATAK LETTER U
| ["\u1C00" - "\u1C23"] // Lo [36] LEPCHA LETTER KA..LEPCHA LETTER A
| ["\u1C4D" - "\u1C4F"] // Lo [3] LEPCHA LETTER TTA..LEPCHA LETTER DDA
| ["\u1C5A" - "\u1C77"] // Lo [30] OL CHIKI LETTER LA..OL CHIKI LETTER OH
| ["\u1C78" - "\u1C7D"] // Lm [6] OL CHIKI MU TTUDDAG..OL CHIKI AHAD
| ["\u1C80" - "\u1C88"] // L& [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK
| ["\u1C90" - "\u1CBA"] // L& [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN
| ["\u1CBD" - "\u1CBF"] // L& [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN
| ["\u1CE9" - "\u1CEC"] // Lo [4] VEDIC SIGN ANUSVARA ANTARGOMUKHA..VEDIC SIGN ANUSVARA VAMAGOMUKHA WITH TAIL
| ["\u1CEE" - "\u1CF3"] // Lo [6] VEDIC SIGN HEXIFORM LONG ANUSVARA..VEDIC SIGN ROTATED ARDHAVISARGA
| ["\u1CF5" - "\u1CF6"] // Lo [2] VEDIC SIGN JIHVAMULIYA..VEDIC SIGN UPADHMANIYA
| "\u1CFA" // Lo VEDIC SIGN DOUBLE ANUSVARA ANTARGOMUKHA
| ["\u1D00" - "\u1D2B"] // L& [44] LATIN LETTER SMALL CAPITAL A..CYRILLIC LETTER SMALL CAPITAL EL
| ["\u1D2C" - "\u1D6A"] // Lm [63] MODIFIER LETTER CAPITAL A..GREEK SUBSCRIPT SMALL LETTER CHI
| ["\u1D6B" - "\u1D77"] // L& [13] LATIN SMALL LETTER UE..LATIN SMALL LETTER TURNED G
| "\u1D78" // Lm MODIFIER LETTER CYRILLIC EN
| ["\u1D79" - "\u1D9A"] // L& [34] LATIN SMALL LETTER INSULAR G..LATIN SMALL LETTER EZH WITH RETROFLEX HOOK
| ["\u1D9B" - "\u1DBF"] // Lm [37] MODIFIER LETTER SMALL TURNED ALPHA..MODIFIER LETTER SMALL THETA
| ["\u1E00" - "\u1F15"] // L& [278] LATIN CAPITAL LETTER A WITH RING BELOW..GREEK SMALL LETTER EPSILON WITH DASIA AND OXIA
| ["\u1F18" - "\u1F1D"] // L& [6] GREEK CAPITAL LETTER EPSILON WITH PSILI..GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA
| ["\u1F20" - "\u1F45"] // L& [38] GREEK SMALL LETTER ETA WITH PSILI..GREEK SMALL LETTER OMICRON WITH DASIA AND OXIA
| ["\u1F48" - "\u1F4D"] // L& [6] GREEK CAPITAL LETTER OMICRON WITH PSILI..GREEK CAPITAL LETTER OMICRON WITH DASIA AND OXIA
| ["\u1F50" - "\u1F57"] // L& [8] GREEK SMALL LETTER UPSILON WITH PSILI..GREEK SMALL LETTER UPSILON WITH DASIA AND PERISPOMENI
| "\u1F59" // L& GREEK CAPITAL LETTER UPSILON WITH DASIA
| "\u1F5B" // L& GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA
| "\u1F5D" // L& GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA
| ["\u1F5F" - "\u1F7D"] // L& [31] GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI..GREEK SMALL LETTER OMEGA WITH OXIA
| ["\u1F80" - "\u1FB4"] // L& [53] GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI..GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI
| ["\u1FB6" - "\u1FBC"] // L& [7] GREEK SMALL LETTER ALPHA WITH PERISPOMENI..GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI
| "\u1FBE" // L& GREEK PROSGEGRAMMENI
| ["\u1FC2" - "\u1FC4"] // L& [3] GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI
| ["\u1FC6" - "\u1FCC"] // L& [7] GREEK SMALL LETTER ETA WITH PERISPOMENI..GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI
| ["\u1FD0" - "\u1FD3"] // L& [4] GREEK SMALL LETTER IOTA WITH VRACHY..GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
| ["\u1FD6" - "\u1FDB"] // L& [6] GREEK SMALL LETTER IOTA WITH PERISPOMENI..GREEK CAPITAL LETTER IOTA WITH OXIA
| ["\u1FE0" - "\u1FEC"] // L& [13] GREEK SMALL LETTER UPSILON WITH VRACHY..GREEK CAPITAL LETTER RHO WITH DASIA
| ["\u1FF2" - "\u1FF4"] // L& [3] GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI
| ["\u1FF6" - "\u1FFC"] // L& [7] GREEK SMALL LETTER OMEGA WITH PERISPOMENI..GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
| "\u2071" // Lm SUPERSCRIPT LATIN SMALL LETTER I
| "\u207F" // Lm SUPERSCRIPT LATIN SMALL LETTER N
| ["\u2090" - "\u209C"] // Lm [13] LATIN SUBSCRIPT SMALL LETTER A..LATIN SUBSCRIPT SMALL LETTER T
| "\u2102" // L& DOUBLE-STRUCK CAPITAL C
| "\u2107" // L& EULER CONSTANT
| ["\u210A" - "\u2113"] // L& [10] SCRIPT SMALL G..SCRIPT SMALL L
| "\u2115" // L& DOUBLE-STRUCK CAPITAL N
| "\u2118" // Sm SCRIPT CAPITAL P
| ["\u2119" - "\u211D"] // L& [5] DOUBLE-STRUCK CAPITAL P..DOUBLE-STRUCK CAPITAL R
| "\u2124" // L& DOUBLE-STRUCK CAPITAL Z
| "\u2126" // L& OHM SIGN
| "\u2128" // L& BLACK-LETTER CAPITAL Z
| ["\u212A" - "\u212D"] // L& [4] KELVIN SIGN..BLACK-LETTER CAPITAL C
| "\u212E" // So ESTIMATED SYMBOL
| ["\u212F" - "\u2134"] // L& [6] SCRIPT SMALL E..SCRIPT SMALL O
| ["\u2135" - "\u2138"] // Lo [4] ALEF SYMBOL..DALET SYMBOL
| "\u2139" // L& INFORMATION SOURCE
| ["\u213C" - "\u213F"] // L& [4] DOUBLE-STRUCK SMALL PI..DOUBLE-STRUCK CAPITAL PI
| ["\u2145" - "\u2149"] // L& [5] DOUBLE-STRUCK ITALIC CAPITAL D..DOUBLE-STRUCK ITALIC SMALL J
| "\u214E" // L& TURNED SMALL F
| ["\u2160" - "\u2182"] // Nl [35] ROMAN NUMERAL ONE..ROMAN NUMERAL TEN THOUSAND
| ["\u2183" - "\u2184"] // L& [2] ROMAN NUMERAL REVERSED ONE HUNDRED..LATIN SMALL LETTER REVERSED C
| ["\u2185" - "\u2188"] // Nl [4] ROMAN NUMERAL SIX LATE FORM..ROMAN NUMERAL ONE HUNDRED THOUSAND
| ["\u2C00" - "\u2C7B"] // L& [124] GLAGOLITIC CAPITAL LETTER AZU..LATIN LETTER SMALL CAPITAL TURNED E
| ["\u2C7C" - "\u2C7D"] // Lm [2] LATIN SUBSCRIPT SMALL LETTER J..MODIFIER LETTER CAPITAL V
| ["\u2C7E" - "\u2CE4"] // L& [103] LATIN CAPITAL LETTER S WITH SWASH TAIL..COPTIC SYMBOL KAI
| ["\u2CEB" - "\u2CEE"] // L& [4] COPTIC CAPITAL LETTER CRYPTOGRAMMIC SHEI..COPTIC SMALL LETTER CRYPTOGRAMMIC GANGIA
| ["\u2CF2" - "\u2CF3"] // L& [2] COPTIC CAPITAL LETTER BOHAIRIC KHEI..COPTIC SMALL LETTER BOHAIRIC KHEI
| ["\u2D00" - "\u2D25"] // L& [38] GEORGIAN SMALL LETTER AN..GEORGIAN SMALL LETTER HOE
| "\u2D27" // L& GEORGIAN SMALL LETTER YN
| "\u2D2D" // L& GEORGIAN SMALL LETTER AEN
| ["\u2D30" - "\u2D67"] // Lo [56] TIFINAGH LETTER YA..TIFINAGH LETTER YO
| "\u2D6F" // Lm TIFINAGH MODIFIER LETTER LABIALIZATION MARK
| ["\u2D80" - "\u2D96"] // Lo [23] ETHIOPIC SYLLABLE LOA..ETHIOPIC SYLLABLE GGWE
| ["\u2DA0" - "\u2DA6"] // Lo [7] ETHIOPIC SYLLABLE SSA..ETHIOPIC SYLLABLE SSO
| ["\u2DA8" - "\u2DAE"] // Lo [7] ETHIOPIC SYLLABLE CCA..ETHIOPIC SYLLABLE CCO
| ["\u2DB0" - "\u2DB6"] // Lo [7] ETHIOPIC SYLLABLE ZZA..ETHIOPIC SYLLABLE ZZO
| ["\u2DB8" - "\u2DBE"] // Lo [7] ETHIOPIC SYLLABLE CCHA..ETHIOPIC SYLLABLE CCHO
| ["\u2DC0" - "\u2DC6"] // Lo [7] ETHIOPIC SYLLABLE QYA..ETHIOPIC SYLLABLE QYO
| ["\u2DC8" - "\u2DCE"] // Lo [7] ETHIOPIC SYLLABLE KYA..ETHIOPIC SYLLABLE KYO
| ["\u2DD0" - "\u2DD6"] // Lo [7] ETHIOPIC SYLLABLE XYA..ETHIOPIC SYLLABLE XYO
| ["\u2DD8" - "\u2DDE"] // Lo [7] ETHIOPIC SYLLABLE GYA..ETHIOPIC SYLLABLE GYO
| "\u3005" // Lm IDEOGRAPHIC ITERATION MARK
| "\u3006" // Lo IDEOGRAPHIC CLOSING MARK
| "\u3007" // Nl IDEOGRAPHIC NUMBER ZERO
| ["\u3021" - "\u3029"] // Nl [9] HANGZHOU NUMERAL ONE..HANGZHOU NUMERAL NINE
| ["\u3031" - "\u3035"] // Lm [5] VERTICAL KANA REPEAT MARK..VERTICAL KANA REPEAT MARK LOWER HALF
| ["\u3038" - "\u303A"] // Nl [3] HANGZHOU NUMERAL TEN..HANGZHOU NUMERAL THIRTY
| "\u303B" // Lm VERTICAL IDEOGRAPHIC ITERATION MARK
| "\u303C" // Lo MASU MARK
| ["\u3041" - "\u3096"] // Lo [86] HIRAGANA LETTER SMALL A..HIRAGANA LETTER SMALL KE
| ["\u309B" - "\u309C"] // Sk [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
| ["\u309D" - "\u309E"] // Lm [2] HIRAGANA ITERATION MARK..HIRAGANA VOICED ITERATION MARK
| "\u309F" // Lo HIRAGANA DIGRAPH YORI
| ["\u30A1" - "\u30FA"] // Lo [90] KATAKANA LETTER SMALL A..KATAKANA LETTER VO
| ["\u30FC" - "\u30FE"] // Lm [3] KATAKANA-HIRAGANA PROLONGED SOUND MARK..KATAKANA VOICED ITERATION MARK
| "\u30FF" // Lo KATAKANA DIGRAPH KOTO
| ["\u3105" - "\u312F"] // Lo [43] BOPOMOFO LETTER B..BOPOMOFO LETTER NN
| ["\u3131" - "\u318E"] // Lo [94] HANGUL LETTER KIYEOK..HANGUL LETTER ARAEAE
| ["\u31A0" - "\u31BF"] // Lo [32] BOPOMOFO LETTER BU..BOPOMOFO LETTER AH
| ["\u31F0" - "\u31FF"] // Lo [16] KATAKANA LETTER SMALL KU..KATAKANA LETTER SMALL RO
| ["\u3400" - "\u4DBF"] // Lo [6592] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DBF
| ["\u4E00" - "\uA014"] // Lo [21013] CJK UNIFIED IDEOGRAPH-4E00..YI SYLLABLE E
| "\uA015" // Lm YI SYLLABLE WU
| ["\uA016" - "\uA48C"] // Lo [1143] YI SYLLABLE BIT..YI SYLLABLE YYR
| ["\uA4D0" - "\uA4F7"] // Lo [40] LISU LETTER BA..LISU LETTER OE
| ["\uA4F8" - "\uA4FD"] // Lm [6] LISU LETTER TONE MYA TI..LISU LETTER TONE MYA JEU
| ["\uA500" - "\uA60B"] // Lo [268] VAI SYLLABLE EE..VAI SYLLABLE NG
| "\uA60C" // Lm VAI SYLLABLE LENGTHENER
| ["\uA610" - "\uA61F"] // Lo [16] VAI SYLLABLE NDOLE FA..VAI SYMBOL JONG
| ["\uA62A" - "\uA62B"] // Lo [2] VAI SYLLABLE NDOLE MA..VAI SYLLABLE NDOLE DO
| ["\uA640" - "\uA66D"] // L& [46] CYRILLIC CAPITAL LETTER ZEMLYA..CYRILLIC SMALL LETTER DOUBLE MONOCULAR O
| "\uA66E" // Lo CYRILLIC LETTER MULTIOCULAR O
| "\uA67F" // Lm CYRILLIC PAYEROK
| ["\uA680" - "\uA69B"] // L& [28] CYRILLIC CAPITAL LETTER DWE..CYRILLIC SMALL LETTER CROSSED O
| ["\uA69C" - "\uA69D"] // Lm [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER LETTER CYRILLIC SOFT SIGN
| ["\uA6A0" - "\uA6E5"] // Lo [70] BAMUM LETTER A..BAMUM LETTER KI
| ["\uA6E6" - "\uA6EF"] // Nl [10] BAMUM LETTER MO..BAMUM LETTER KOGHOM
| ["\uA717" - "\uA71F"] // Lm [9] MODIFIER LETTER DOT VERTICAL BAR..MODIFIER LETTER LOW INVERTED EXCLAMATION MARK
| ["\uA722" - "\uA76F"] // L& [78] LATIN CAPITAL LETTER EGYPTOLOGICAL ALEF..LATIN SMALL LETTER CON
| "\uA770" // Lm MODIFIER LETTER US
| ["\uA771" - "\uA787"] // L& [23] LATIN SMALL LETTER DUM..LATIN SMALL LETTER INSULAR T
| "\uA788" // Lm MODIFIER LETTER LOW CIRCUMFLEX ACCENT
| ["\uA78B" - "\uA78E"] // L& [4] LATIN CAPITAL LETTER SALTILLO..LATIN SMALL LETTER L WITH RETROFLEX HOOK AND BELT
| "\uA78F" // Lo LATIN LETTER SINOLOGICAL DOT
| ["\uA790" - "\uA7CA"] // L& [59] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN SMALL LETTER S WITH SHORT STROKE OVERLAY
| ["\uA7D0" - "\uA7D1"] // L& [2] LATIN CAPITAL LETTER CLOSED INSULAR G..LATIN SMALL LETTER CLOSED INSULAR G
| "\uA7D3" // L& LATIN SMALL LETTER DOUBLE THORN
| ["\uA7D5" - "\uA7D9"] // L& [5] LATIN SMALL LETTER DOUBLE WYNN..LATIN SMALL LETTER SIGMOID S
| ["\uA7F2" - "\uA7F4"] // Lm [3] MODIFIER LETTER CAPITAL C..MODIFIER LETTER CAPITAL Q
| ["\uA7F5" - "\uA7F6"] // L& [2] LATIN CAPITAL LETTER REVERSED HALF H..LATIN SMALL LETTER REVERSED HALF H
| "\uA7F7" // Lo LATIN EPIGRAPHIC LETTER SIDEWAYS I
| ["\uA7F8" - "\uA7F9"] // Lm [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE
| "\uA7FA" // L& LATIN LETTER SMALL CAPITAL TURNED M
| ["\uA7FB" - "\uA801"] // Lo [7] LATIN EPIGRAPHIC LETTER REVERSED F..SYLOTI NAGRI LETTER I
| ["\uA803" - "\uA805"] // Lo [3] SYLOTI NAGRI LETTER U..SYLOTI NAGRI LETTER O
| ["\uA807" - "\uA80A"] // Lo [4] SYLOTI NAGRI LETTER KO..SYLOTI NAGRI LETTER GHO
| ["\uA80C" - "\uA822"] // Lo [23] SYLOTI NAGRI LETTER CO..SYLOTI NAGRI LETTER HO
| ["\uA840" - "\uA873"] // Lo [52] PHAGS-PA LETTER KA..PHAGS-PA LETTER CANDRABINDU
| ["\uA882" - "\uA8B3"] // Lo [50] SAURASHTRA LETTER A..SAURASHTRA LETTER LLA
| ["\uA8F2" - "\uA8F7"] // Lo [6] DEVANAGARI SIGN SPACING CANDRABINDU..DEVANAGARI SIGN CANDRABINDU AVAGRAHA
| "\uA8FB" // Lo DEVANAGARI HEADSTROKE
| ["\uA8FD" - "\uA8FE"] // Lo [2] DEVANAGARI JAIN OM..DEVANAGARI LETTER AY
| ["\uA90A" - "\uA925"] // Lo [28] KAYAH LI LETTER KA..KAYAH LI LETTER OO
| ["\uA930" - "\uA946"] // Lo [23] REJANG LETTER KA..REJANG LETTER A
| ["\uA960" - "\uA97C"] // Lo [29] HANGUL CHOSEONG TIKEUT-MIEUM..HANGUL CHOSEONG SSANGYEORINHIEUH
| ["\uA984" - "\uA9B2"] // Lo [47] JAVANESE LETTER A..JAVANESE LETTER HA
| "\uA9CF" // Lm JAVANESE PANGRANGKEP
| ["\uA9E0" - "\uA9E4"] // Lo [5] MYANMAR LETTER SHAN GHA..MYANMAR LETTER SHAN BHA
| "\uA9E6" // Lm MYANMAR MODIFIER LETTER SHAN REDUPLICATION
| ["\uA9E7" - "\uA9EF"] // Lo [9] MYANMAR LETTER TAI LAING NYA..MYANMAR LETTER TAI LAING NNA
| ["\uA9FA" - "\uA9FE"] // Lo [5] MYANMAR LETTER TAI LAING LLA..MYANMAR LETTER TAI LAING BHA
| ["\uAA00" - "\uAA28"] // Lo [41] CHAM LETTER A..CHAM LETTER HA
| ["\uAA40" - "\uAA42"] // Lo [3] CHAM LETTER FINAL K..CHAM LETTER FINAL NG
| ["\uAA44" - "\uAA4B"] // Lo [8] CHAM LETTER FINAL CH..CHAM LETTER FINAL SS
| ["\uAA60" - "\uAA6F"] // Lo [16] MYANMAR LETTER KHAMTI GA..MYANMAR LETTER KHAMTI FA
| "\uAA70" // Lm MYANMAR MODIFIER LETTER KHAMTI REDUPLICATION
| ["\uAA71" - "\uAA76"] // Lo [6] MYANMAR LETTER KHAMTI XA..MYANMAR LOGOGRAM KHAMTI HM
| "\uAA7A" // Lo MYANMAR LETTER AITON RA
| ["\uAA7E" - "\uAAAF"] // Lo [50] MYANMAR LETTER SHWE PALAUNG CHA..TAI VIET LETTER HIGH O
| "\uAAB1" // Lo TAI VIET VOWEL AA
| ["\uAAB5" - "\uAAB6"] // Lo [2] TAI VIET VOWEL E..TAI VIET VOWEL O
| ["\uAAB9" - "\uAABD"] // Lo [5] TAI VIET VOWEL UEA..TAI VIET VOWEL AN
| "\uAAC0" // Lo TAI VIET TONE MAI NUENG
| "\uAAC2" // Lo TAI VIET TONE MAI SONG
| ["\uAADB" - "\uAADC"] // Lo [2] TAI VIET SYMBOL KON..TAI VIET SYMBOL NUENG
| "\uAADD" // Lm TAI VIET SYMBOL SAM
| ["\uAAE0" - "\uAAEA"] // Lo [11] MEETEI MAYEK LETTER E..MEETEI MAYEK LETTER SSA
| "\uAAF2" // Lo MEETEI MAYEK ANJI
| ["\uAAF3" - "\uAAF4"] // Lm [2] MEETEI MAYEK SYLLABLE REPETITION MARK..MEETEI MAYEK WORD REPETITION MARK
| ["\uAB01" - "\uAB06"] // Lo [6] ETHIOPIC SYLLABLE TTHU..ETHIOPIC SYLLABLE TTHO
| ["\uAB09" - "\uAB0E"] // Lo [6] ETHIOPIC SYLLABLE DDHU..ETHIOPIC SYLLABLE DDHO
| ["\uAB11" - "\uAB16"] // Lo [6] ETHIOPIC SYLLABLE DZU..ETHIOPIC SYLLABLE DZO
| ["\uAB20" - "\uAB26"] // Lo [7] ETHIOPIC SYLLABLE CCHHA..ETHIOPIC SYLLABLE CCHHO
| ["\uAB28" - "\uAB2E"] // Lo [7] ETHIOPIC SYLLABLE BBA..ETHIOPIC SYLLABLE BBO
| ["\uAB30" - "\uAB5A"] // L& [43] LATIN SMALL LETTER BARRED ALPHA..LATIN SMALL LETTER Y WITH SHORT RIGHT LEG
| ["\uAB5C" - "\uAB5F"] // Lm [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK
| ["\uAB60" - "\uAB68"] // L& [9] LATIN SMALL LETTER SAKHA YAT..LATIN SMALL LETTER TURNED R WITH MIDDLE TILDE
| "\uAB69" // Lm MODIFIER LETTER SMALL TURNED W
| ["\uAB70" - "\uABBF"] // L& [80] CHEROKEE SMALL LETTER A..CHEROKEE SMALL LETTER YA
| ["\uABC0" - "\uABE2"] // Lo [35] MEETEI MAYEK LETTER KOK..MEETEI MAYEK LETTER I LONSUM
| ["\uAC00" - "\uD7A3"] // Lo [11172] HANGUL SYLLABLE GA..HANGUL SYLLABLE HIH
| ["\uD7B0" - "\uD7C6"] // Lo [23] HANGUL JUNGSEONG O-YEO..HANGUL JUNGSEONG ARAEA-E
| ["\uD7CB" - "\uD7FB"] // Lo [49] HANGUL JONGSEONG NIEUN-RIEUL..HANGUL JONGSEONG PHIEUPH-THIEUTH
| ["\uF900" - "\uFA6D"] // Lo [366] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA6D
| ["\uFA70" - "\uFAD9"] // Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9
| ["\uFB00" - "\uFB06"] // L& [7] LATIN SMALL LIGATURE FF..LATIN SMALL LIGATURE ST
| ["\uFB13" - "\uFB17"] // L& [5] ARMENIAN SMALL LIGATURE MEN NOW..ARMENIAN SMALL LIGATURE MEN XEH
| "\uFB1D" // Lo HEBREW LETTER YOD WITH HIRIQ
| ["\uFB1F" - "\uFB28"] // Lo [10] HEBREW LIGATURE YIDDISH YOD YOD PATAH..HEBREW LETTER WIDE TAV
| ["\uFB2A" - "\uFB36"] // Lo [13] HEBREW LETTER SHIN WITH SHIN DOT..HEBREW LETTER ZAYIN WITH DAGESH
| ["\uFB38" - "\uFB3C"] // Lo [5] HEBREW LETTER TET WITH DAGESH..HEBREW LETTER LAMED WITH DAGESH
| "\uFB3E" // Lo HEBREW LETTER MEM WITH DAGESH
| ["\uFB40" - "\uFB41"] // Lo [2] HEBREW LETTER NUN WITH DAGESH..HEBREW LETTER SAMEKH WITH DAGESH
| ["\uFB43" - "\uFB44"] // Lo [2] HEBREW LETTER FINAL PE WITH DAGESH..HEBREW LETTER PE WITH DAGESH
| ["\uFB46" - "\uFBB1"] // Lo [108] HEBREW LETTER TSADI WITH DAGESH..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE FINAL FORM
| ["\uFBD3" - "\uFD3D"] // Lo [363] ARABIC LETTER NG ISOLATED FORM..ARABIC LIGATURE ALEF WITH FATHATAN ISOLATED FORM
| ["\uFD50" - "\uFD8F"] // Lo [64] ARABIC LIGATURE TEH WITH JEEM WITH MEEM INITIAL FORM..ARABIC LIGATURE MEEM WITH KHAH WITH MEEM INITIAL FORM
| ["\uFD92" - "\uFDC7"] // Lo [54] ARABIC LIGATURE MEEM WITH JEEM WITH KHAH INITIAL FORM..ARABIC LIGATURE NOON WITH JEEM WITH YEH FINAL FORM
| ["\uFDF0" - "\uFDFB"] // Lo [12] ARABIC LIGATURE SALLA USED AS KORANIC STOP SIGN ISOLATED FORM..ARABIC LIGATURE JALLAJALALOUHOU
| ["\uFE70" - "\uFE74"] // Lo [5] ARABIC FATHATAN ISOLATED FORM..ARABIC KASRATAN ISOLATED FORM
| ["\uFE76" - "\uFEFC"] // Lo [135] ARABIC FATHA ISOLATED FORM..ARABIC LIGATURE LAM WITH ALEF FINAL FORM
| ["\uFF21" - "\uFF3A"] // L& [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER Z
| ["\uFF41" - "\uFF5A"] // L& [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z
| ["\uFF66" - "\uFF6F"] // Lo [10] HALFWIDTH KATAKANA LETTER WO..HALFWIDTH KATAKANA LETTER SMALL TU
| "\uFF70" // Lm HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
| ["\uFF71" - "\uFF9D"] // Lo [45] HALFWIDTH KATAKANA LETTER A..HALFWIDTH KATAKANA LETTER N
| ["\uFF9E" - "\uFF9F"] // Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
| ["\uFFA0" - "\uFFBE"] // Lo [31] HALFWIDTH HANGUL FILLER..HALFWIDTH HANGUL LETTER HIEUH
| ["\uFFC2" - "\uFFC7"] // Lo [6] HALFWIDTH HANGUL LETTER A..HALFWIDTH HANGUL LETTER E
| ["\uFFCA" - "\uFFCF"] // Lo [6] HALFWIDTH HANGUL LETTER YEO..HALFWIDTH HANGUL LETTER OE
| ["\uFFD2" - "\uFFD7"] // Lo [6] HALFWIDTH HANGUL LETTER YO..HALFWIDTH HANGUL LETTER YU
| ["\uFFDA" - "\uFFDC"] // Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I
| "_" >
| <#ID_CONTINUE: <ID_START> | <DIGIT> >
}
// Lexer rules without a state prefix (i.e. for the DEFAULT state) whose
// matches are skipped: inline whitespace, line breaks and explicit line joins.
SKIP: {
// Blanks and tabs between tokens carry no meaning in this state.
<SPACE: " " | "\t">
// Here and for all other places where we match newline,
// it is not necessary to separately match the combined
// versions like "\r\n". This is ok because interpreting
// the second character as a separate newline is always
// fine because blank lines are ignored.
| <SKIPPED_NEWLINE: "\n" | "\r"> {
// Only switch to NEWLINE_EMIT when openParenCounter() reports zero;
// presumably that counter tracks currently open brackets, inside which
// line breaks do not end a logical line - confirm against its definition.
if (openParenCounter() == 0) {
SwitchTo(NEWLINE_EMIT);
}
}
// A backslash directly followed by a line break is skipped as a whole, so the
// line break never triggers the SKIPPED_NEWLINE action above.
| <LINE_JOIN: "\\" ("\n" | "\r" | "\r\n")>
}
// A comment starts at "#" and extends up to, but not including, the next
// "\n" or "\r". It is declared as a SPECIAL_TOKEN, so it is not handed to the
// parser as a regular token.
SPECIAL_TOKEN: {
<COMMENT: "#" (~["\n", "\r"])*>
}
// Same comment rule as in the DEFAULT state, but for the INDENT_CHECK state.
// Matching the comment here keeps its "#" from reaching INDENT_CHECK_END, so
// a comment met during the indentation check does not end the check.
<INDENT_CHECK> SPECIAL_TOKEN: {
<INDENT_CHECK_COMMENT: "#" (~["\n", "\r"])*>
}
// Rules of the INDENT_CHECK state, which is entered after a NEWLINE token was
// emitted. They measure the indentation of the next line in currentIndent and
// compare it with the top of indentStack to decide whether INDENT, DEDENT or
// MISSINDENT tokens have to be emitted before normal lexing continues.
<INDENT_CHECK> SKIP: {
  // A blank advances the indentation column by one.
  <INDENT_CHECK_SPACE: " "> { currentIndent += 1; }
  // A tab advances the indentation column to the next multiple of eight, as
  // specified by the Python language reference (0 -> 8, 3 -> 8, 8 -> 16, ...).
| <INDENT_CHECK_TAB: "\t"> { currentIndent = (currentIndent / 8 + 1) * 8; }
  // The line contained nothing but whitespace (and possibly a comment), so its
  // indentation is irrelevant: start measuring again for the following line.
| <INDENT_CHECK_NEWLINE: "\n" | "\r"> { currentIndent = 0; }
  // Any other character marks the end of the leading whitespace.
| <INDENT_CHECK_END: ~[]> {
    // Rewind the input stream by the one consumed character because we don't
    // really want to skip it. We just used it as the end marker for the
    // indentation check. The documentation says input_stream is read only,
    // so we are a little bit off the beaten path here, but so far it works.
    input_stream.backup(1);
    int lastIndent = indentStack.peek();
    if (currentIndent > lastIndent) {
      // Deeper than the enclosing block: open a new indentation level.
      indentStack.push(currentIndent);
      SwitchTo(INDENT_EMIT);
    } else if (currentIndent < lastIndent) {
      // Shallower than the enclosing block: close one level per stack entry
      // that is deeper than the current line.
      assert(dedentsToEmit == 0);
      while (currentIndent < lastIndent) {
        dedentsToEmit += 1;
        indentStack.pop();
        lastIndent = indentStack.peek();
      }
      if (currentIndent == lastIndent) {
        SwitchTo(DEDENT_EMIT);
      } else {
        // The line does not line up with any enclosing indentation level.
        SwitchTo(MISSDENT_EMIT);
      }
    } else {
      // Same indentation as before: nothing to emit.
      SwitchTo(DEFAULT);
    }
    // Reset the measurement for the next indentation check.
    currentIndent = 0;
  }
}
// Emits a MISSINDENT token without consuming input and then returns to the
// DEFAULT state. The MISSDENT_EMIT state is selected by the indentation check
// when a dedented line matches none of the remaining indentation levels.
<MISSDENT_EMIT> TOKEN: {
// Defining this as ""|"" causes javaCC to produce better readable debug output
// in the form of a proper tokenImage string <MISSINDENT> instead of "". This has no
// impact on the generated matching code in the state.
<MISSINDENT: "" | ""> {
SwitchTo(DEFAULT);
}
}
// Emits an INDENT token without consuming input and then returns to the
// DEFAULT state. The INDENT_EMIT state is selected by the indentation check
// when a line is indented deeper than the current top of indentStack.
<INDENT_EMIT> TOKEN: {
// Defining this as ""|"" causes javaCC to produce better readable debug output
// in the form of a proper tokenImage string <INDENT> instead of "". This has no
// impact on the generated matching code in the state.
<INDENT: "" | ""> {
SwitchTo(DEFAULT);
}
}
// Emits one DEDENT token per match without consuming input. The lexer stays in
// DEDENT_EMIT until dedentsToEmit has been counted down to zero, so a single
// indentation check can produce several consecutive DEDENT tokens.
<DEDENT_EMIT> TOKEN: {
// Defining this as ""|"" causes javaCC to produce better readable debug output
// in the form of a proper tokenImage string <DEDENT> instead of "". This has no
// impact on the generated matching code in the state.
<DEDENT: "" | ""> {
dedentsToEmit -= 1;
if (dedentsToEmit == 0) {
SwitchTo(DEFAULT);
}
// This disables the built-in infinite loop detection.
// We take care of not looping forever by decrementing
// dedentsToEmit.
jjbeenHere[DEDENT_EMIT] = false;
}
}
// Emits a NEWLINE token without consuming input and then hands over to the
// INDENT_CHECK state, which measures the indentation of the following line.
<NEWLINE_EMIT> TOKEN: {
// Defining this as ""|"" causes javaCC to produce better readable debug output
// in the form of a proper tokenImage string <NEWLINE> instead of "". This has no
// impact on the generated matching code in the state.
<NEWLINE: "" | ""> {
SwitchTo(INDENT_CHECK);
}
}
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// String lexer rules:
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// String prefix tokens. Each rule matches the prefix letters together with the
// opening quote so that a prefix is only recognized directly in front of a
// string. The action then calls adjustTokenAndBackupInput with the number of
// prefix characters; presumably that helper cuts the token image down to the
// prefix and pushes the quote characters back to the input - confirm against
// its definition.
TOKEN: {
// One-letter prefixes of normal (non format) strings.
<STRING_PREFIX:
("r" | "R" | "b" | "B" | "u" | "U")
("\"" | "'" | "\"\"\"" | "'''")> {
matchedToken.image = adjustTokenAndBackupInput(matchedToken, 1);
}
// Two-letter prefixes of normal strings. Reported to the parser under the
// same kind as the one-letter variant.
| <STRING_PREFIX2:
("rb" | "rB" | "Rb" | "RB" | "br" | "bR" | "Br" | "BR" | "ur" | "uR" | "Ur" | "UR")
("\"" | "'" | "\"\"\"" | "'''")> {
matchedToken.image = adjustTokenAndBackupInput(matchedToken, 2);
matchedToken.kind = STRING_PREFIX;
}
// One-letter format string prefix. Lexing continues in FORMAT_STRING_LEX.
| <FORMAT_STRING_PREFIX:
("f" | "F")
("\"" | "'" | "\"\"\"" | "'''")> {
matchedToken.image = adjustTokenAndBackupInput(matchedToken, 1);
} : FORMAT_STRING_LEX
// Two-letter (raw) format string prefixes. Reported to the parser under the
// same kind as the one-letter variant. Lexing continues in FORMAT_STRING_LEX.
| <FORMAT_STRING_PREFIX2:
("rf" | "rF" | "Rf" | "RF" | "fr" | "fR" | "Fr" | "FR")
("\"" | "'" | "\"\"\"" | "'''")> {
matchedToken.image = adjustTokenAndBackupInput(matchedToken, 2);
matchedToken.kind = FORMAT_STRING_PREFIX;
} : FORMAT_STRING_LEX
}
///////////////////////////////////////////////////////////////////////////////
// Normal string lexer rules:
///////////////////////////////////////////////////////////////////////////////
// The lexer state NEVER is never reached. We use it only to define common lexer token kinds
// to which the different string lexer map to provide a uniform interface to the parsing stage.
<NEVER> TOKEN: {
// The literal images below are placeholders only; the declarations exist to
// introduce the token kinds STRING_QUOTE_OPEN and STRING_CONTENT, which the
// string lexer actions assign to matchedToken.kind.
<STRING_QUOTE_OPEN: "STRING_QUOTE_OPEN">
| <STRING_CONTENT: "STRING_CONTENT">
}
// NSDQ stands for: normal single double quote
// NSSQ stands for: normal single single quote
// NTDQ stands for: normal triple double quote
// NTSQ stands for: normal triple single quote
// Opening quotes of normal (non format) strings. Every variant is reported to
// the parser as STRING_QUOTE_OPEN and switches to the lexer state that scans
// the content of that particular kind of string.
TOKEN: {
<NSDQ_QUOTE_OPEN: "\"" > { matchedToken.kind = STRING_QUOTE_OPEN; }: NSDQ_LEX
| <NSSQ_QUOTE_OPEN: "'"> { matchedToken.kind = STRING_QUOTE_OPEN; }: NSSQ_LEX
| <NTDQ_QUOTE_OPEN: "\"\"\""> { matchedToken.kind = STRING_QUOTE_OPEN; }: NTDQ_LEX
| <NTSQ_QUOTE_OPEN: "'''"> { matchedToken.kind = STRING_QUOTE_OPEN; }: NTSQ_LEX
}
// Content of a normal string delimited by a single double quote.
// The closing quote ends the string: everything collected by the MORE rules
// below plus the quote forms one token, which is reported as STRING_CONTENT
// before the lexer returns to the DEFAULT state. cutContentTokenImage is
// called with the quote length (1); presumably it removes the closing quote
// from the image - confirm against its definition.
<NSDQ_LEX> TOKEN: {
<NSDQ_CONTENT: "\""> {
matchedToken.image = cutContentTokenImage(matchedToken, 1);
matchedToken.kind = STRING_CONTENT;
}: DEFAULT
}
// Characters inside the string are accumulated via MORE. The two escape rules
// consume a backslash together with the following backslash or quote so that
// an escaped quote is not mistaken for the end of the string.
<NSDQ_LEX> MORE: {
<NSDQ_ESCAPED_ESCAPE: "\\\\">
| <NSDQ_ESCAPED_QUOTE: "\\\"">
| <NSDQ_ANY: ~[]>
}
// Content of a normal string delimited by a single single quote.
// The closing quote ends the string: everything collected by the MORE rules
// below plus the quote forms one token, which is reported as STRING_CONTENT
// before the lexer returns to the DEFAULT state. cutContentTokenImage is
// called with the quote length (1); presumably it removes the closing quote
// from the image - confirm against its definition.
<NSSQ_LEX> TOKEN: {
<NSSQ_CONTENT: "'"> {
matchedToken.image = cutContentTokenImage(matchedToken, 1);
matchedToken.kind = STRING_CONTENT;
}: DEFAULT
}
// Characters inside the string are accumulated via MORE. The two escape rules
// consume a backslash together with the following backslash or quote so that
// an escaped quote is not mistaken for the end of the string.
<NSSQ_LEX> MORE: {
<NSSQ_ESCAPED_ESCAPE: "\\\\">
| <NSSQ_ESCAPED_QUOTE: "\\'">
| <NSSQ_ANY: ~[]>
}
// Content of a normal string delimited by triple double quotes.
// The closing triple quote ends the string: everything collected by the MORE
// rules below plus the quotes forms one token, which is reported as
// STRING_CONTENT before the lexer returns to the DEFAULT state.
// cutContentTokenImage is called with the quote length (3); presumably it
// removes the closing quotes from the image - confirm against its definition.
<NTDQ_LEX> TOKEN: {
<NTDQ_CONTENT: "\"\"\""> {
matchedToken.image = cutContentTokenImage(matchedToken, 3);
matchedToken.kind = STRING_CONTENT;
}: DEFAULT
}
// Characters inside the string are accumulated via MORE. The two escape rules
// consume a backslash together with the following backslash or quote so that
// an escaped quote cannot take part in the closing triple quote.
<NTDQ_LEX> MORE: {
<NTDQ_ESCAPED_ESCAPE: "\\\\">
| <NTDQ_ESCAPED_QUOTE: "\\\"">
| <NTDQ_ANY: ~[]>
}
// Content of a normal string delimited by triple single quotes.
// The closing triple quote ends the string: everything collected by the MORE
// rules below plus the quotes forms one token, which is reported as
// STRING_CONTENT before the lexer returns to the DEFAULT state.
// cutContentTokenImage is called with the quote length (3); presumably it
// removes the closing quotes from the image - confirm against its definition.
<NTSQ_LEX> TOKEN: {
<NTSQ_CONTENT: "'''"> {
matchedToken.image = cutContentTokenImage(matchedToken, 3);
matchedToken.kind = STRING_CONTENT;
}: DEFAULT
}
// Characters inside the string are accumulated via MORE. The two escape rules
// consume a backslash together with the following backslash or quote so that
// an escaped quote cannot take part in the closing triple quote.
<NTSQ_LEX> MORE: {
<NTSQ_ESCAPED_ESCAPE: "\\\\">
| <NTSQ_ESCAPED_QUOTE: "\\'">
| <NTSQ_ANY: ~[]>
}
///////////////////////////////////////////////////////////////////////////////
// Format string lexer rules:
///////////////////////////////////////////////////////////////////////////////
// TODO: currently we parse a format spec as one single string and do not break it down into its elements.
<FORMAT_SPEC_LEX> MORE: {
<FORMAT_SPEC_CURLY_OPEN: "{"> {
formatSpecOpenCurly += 1;
}
| <FORMAT_SPEC_CURLY_CLOSE: "}"> {
formatSpecOpenCurly -= 1;