Merge 1be3499 into 2e29a40

harfbuzz · Mar 15, 2020 · f147d30 · f147d30
2 parents 2e29a40 + 1be3499
commit f147d30
Show file tree

Hide file tree

Showing 21 changed files with 3,541 additions and 3,220 deletions.
diff --git a/src/Makefile.am b/src/Makefile.am
@@ -262,10 +262,11 @@ unicode-tables: \
 	arabic-table \
 	emoji-table \
 	indic-table \
+	os2-table \
 	tag-table \
 	ucd-table \
 	use-table \
-	emoji-table \
+	vowel-constraints \
 	$(NULL)
 
 arabic-table: gen-arabic-table.py ArabicShaping.txt UnicodeData.txt Blocks.txt
@@ -277,6 +278,9 @@ emoji-table: gen-emoji-table.py emoji-data.txt
 indic-table: gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt
 	$(AM_V_GEN) $(builddir)/$^ > $(srcdir)/hb-ot-shape-complex-indic-table.cc \
 	|| ($(RM) $(srcdir)/hb-ot-shape-complex-indic-table.cc; false)
+os2-table: gen-os2-unicode-ranges.py OS2UnicodeRanges.txt
+	$(AM_V_GEN) $(builddir)/$^ > $(srcdir)/hb-ot-os2-unicode-ranges.hh \
+	|| ($(RM) $(srcdir)/hb-ot-os2-unicode-ranges.hh; false)
 tag-table: gen-tag-table.py languagetags language-subtag-registry
 	$(AM_V_GEN) $(builddir)/$^ > $(srcdir)/hb-ot-tag-table.hh \
 	|| ($(RM) $(srcdir)/hb-ot-tag-table.hh; false)

diff --git a/src/OS2UnicodeRanges.txt b/src/OS2UnicodeRanges.txt
@@ -0,0 +1,169 @@
+0	Basic Latin	0000-007F
+1	Latin-1 Supplement	0080-00FF
+2	Latin Extended-A	0100-017F
+3	Latin Extended-B	0180-024F
+4	IPA Extensions	0250-02AF
+	Phonetic Extensions	1D00-1D7F
+	Phonetic Extensions Supplement	1D80-1DBF
+5	Spacing Modifier Letters	02B0-02FF
+	Modifier Tone Letters	A700-A71F
+6	Combining Diacritical Marks	0300-036F
+	Combining Diacritical Marks Supplement	1DC0-1DFF
+7	Greek and Coptic	0370-03FF
+8	Coptic	2C80-2CFF
+9	Cyrillic	0400-04FF
+	Cyrillic Supplement	0500-052F
+	Cyrillic Extended-A	2DE0-2DFF
+	Cyrillic Extended-B	A640-A69F
+10	Armenian	0530-058F
+11	Hebrew	0590-05FF
+12	Vai	A500-A63F
+13	Arabic	0600-06FF
+	Arabic Supplement	0750-077F
+14	NKo	07C0-07FF
+15	Devanagari	0900-097F
+16	Bengali	0980-09FF
+17	Gurmukhi	0A00-0A7F
+18	Gujarati	0A80-0AFF
+19	Oriya	0B00-0B7F
+20	Tamil	0B80-0BFF
+21	Telugu	0C00-0C7F
+22	Kannada	0C80-0CFF
+23	Malayalam	0D00-0D7F
+24	Thai	0E00-0E7F
+25	Lao	0E80-0EFF
+26	Georgian	10A0-10FF
+	Georgian Supplement	2D00-2D2F
+27	Balinese	1B00-1B7F
+28	Hangul Jamo	1100-11FF
+29	Latin Extended Additional	1E00-1EFF
+	Latin Extended-C	2C60-2C7F
+	Latin Extended-D	A720-A7FF
+30	Greek Extended	1F00-1FFF
+31	General Punctuation	2000-206F
+	Supplemental Punctuation	2E00-2E7F
+32	Superscripts And Subscripts	2070-209F
+33	Currency Symbols	20A0-20CF
+34	Combining Diacritical Marks For Symbols	20D0-20FF
+35	Letterlike Symbols	2100-214F
+36	Number Forms	2150-218F
+37	Arrows	2190-21FF
+	Supplemental Arrows-A	27F0-27FF
+	Supplemental Arrows-B	2900-297F
+	Miscellaneous Symbols and Arrows	2B00-2BFF
+38	Mathematical Operators	2200-22FF
+	Supplemental Mathematical Operators	2A00-2AFF
+	Miscellaneous Mathematical Symbols-A	27C0-27EF
+	Miscellaneous Mathematical Symbols-B	2980-29FF
+39	Miscellaneous Technical	2300-23FF
+40	Control Pictures	2400-243F
+41	Optical Character Recognition	2440-245F
+42	Enclosed Alphanumerics	2460-24FF
+43	Box Drawing	2500-257F
+44	Block Elements	2580-259F
+45	Geometric Shapes	25A0-25FF
+46	Miscellaneous Symbols	2600-26FF
+47	Dingbats	2700-27BF
+48	CJK Symbols And Punctuation	3000-303F
+49	Hiragana	3040-309F
+50	Katakana	30A0-30FF
+	Katakana Phonetic Extensions	31F0-31FF
+51	Bopomofo	3100-312F
+	Bopomofo Extended	31A0-31BF
+52	Hangul Compatibility Jamo	3130-318F
+53	Phags-pa	A840-A87F
+54	Enclosed CJK Letters And Months	3200-32FF
+55	CJK Compatibility	3300-33FF
+56	Hangul Syllables	AC00-D7AF
+57	Non-Plane 0	D800-DFFF
+58	Phoenician	10900-1091F
+59	CJK Unified Ideographs	4E00-9FFF
+	CJK Radicals Supplement	2E80-2EFF
+	Kangxi Radicals	2F00-2FDF
+	Ideographic Description Characters	2FF0-2FFF
+	CJK Unified Ideographs Extension A	3400-4DBF
+	CJK Unified Ideographs Extension B	20000-2A6DF
+	Kanbun	3190-319F
+60	Private Use Area (plane 0)	E000-F8FF
+61	CJK Strokes	31C0-31EF
+	CJK Compatibility Ideographs	F900-FAFF
+	CJK Compatibility Ideographs Supplement	2F800-2FA1F
+62	Alphabetic Presentation Forms	FB00-FB4F
+63	Arabic Presentation Forms-A	FB50-FDFF
+64	Combining Half Marks	FE20-FE2F
+65	Vertical Forms	FE10-FE1F
+	CJK Compatibility Forms	FE30-FE4F
+66	Small Form Variants	FE50-FE6F
+67	Arabic Presentation Forms-B	FE70-FEFF
+68	Halfwidth And Fullwidth Forms	FF00-FFEF
+69	Specials	FFF0-FFFF
+70	Tibetan	0F00-0FFF
+71	Syriac	0700-074F
+72	Thaana	0780-07BF
+73	Sinhala	0D80-0DFF
+74	Myanmar	1000-109F
+75	Ethiopic	1200-137F
+	Ethiopic Supplement	1380-139F
+	Ethiopic Extended	2D80-2DDF
+76	Cherokee	13A0-13FF
+77	Unified Canadian Aboriginal Syllabics	1400-167F
+78	Ogham	1680-169F
+79	Runic	16A0-16FF
+80	Khmer	1780-17FF
+	Khmer Symbols	19E0-19FF
+81	Mongolian	1800-18AF
+82	Braille Patterns	2800-28FF
+83	Yi Syllables	A000-A48F
+	Yi Radicals	A490-A4CF
+84	Tagalog	1700-171F
+	Hanunoo	1720-173F
+	Buhid	1740-175F
+	Tagbanwa	1760-177F
+85	Old Italic	10300-1032F
+86	Gothic	10330-1034F
+87	Deseret	10400-1044F
+88	Byzantine Musical Symbols	1D000-1D0FF
+	Musical Symbols	1D100-1D1FF
+	Ancient Greek Musical Notation	1D200-1D24F
+89	Mathematical Alphanumeric Symbols	1D400-1D7FF
+90	Private Use (plane 15)	F0000-FFFFD
+	Private Use (plane 16)	100000-10FFFD
+91	Variation Selectors	FE00-FE0F
+	Variation Selectors Supplement	E0100-E01EF
+92	Tags	E0000-E007F
+93	Limbu	1900-194F
+94	Tai Le	1950-197F
+95	New Tai Lue	1980-19DF
+96	Buginese	1A00-1A1F
+97	Glagolitic	2C00-2C5F
+98	Tifinagh	2D30-2D7F
+99	Yijing Hexagram Symbols	4DC0-4DFF
+100	Syloti Nagri	A800-A82F
+101	Linear B Syllabary	10000-1007F
+	Linear B Ideograms	10080-100FF
+	Aegean Numbers	10100-1013F
+102	Ancient Greek Numbers	10140-1018F
+103	Ugaritic	10380-1039F
+104	Old Persian	103A0-103DF
+105	Shavian	10450-1047F
+106	Osmanya	10480-104AF
+107	Cypriot Syllabary	10800-1083F
+108	Kharoshthi	10A00-10A5F
+109	Tai Xuan Jing Symbols	1D300-1D35F
+110	Cuneiform	12000-123FF
+	Cuneiform Numbers and Punctuation	12400-1247F
+111	Counting Rod Numerals	1D360-1D37F
+112	Sundanese	1B80-1BBF
+113	Lepcha	1C00-1C4F
+114	Ol Chiki	1C50-1C7F
+115	Saurashtra	A880-A8DF
+116	Kayah Li	A900-A92F
+117	Rejang	A930-A95F
+118	Cham	AA00-AA5F
+119	Ancient Symbols	10190-101CF
+120	Phaistos Disc	101D0-101FF
+121	Carian	102A0-102DF
+	Lycian	10280-1029F
+	Lydian	10920-1093F
+122	Domino Tiles	1F030-1F09F
+	Mahjong Tiles	1F000-1F02F
diff --git a/src/gen-arabic-table.py b/src/gen-arabic-table.py
@@ -5,7 +5,7 @@
 if len (sys.argv) != 4:
 	print ("""usage: ./gen-arabic-table.py ArabicShaping.txt UnicodeData.txt Blocks.txt
 
-Input files, as of Unicode 12:
+Input files:
 * https://unicode.org/Public/UCD/latest/ucd/ArabicShaping.txt
 * https://unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
 * https://unicode.org/Public/UCD/latest/ucd/Blocks.txt
@@ -65,7 +65,7 @@ def print_joining_table(f):
 		values[u] = value
 
 	short_value = {}
-	for value in set([v for v in values.values()] + ['JOINING_TYPE_X']):
+	for value in sorted (set ([v for v in values.values ()] + ['JOINING_TYPE_X'])):
 		short = ''.join(x[0] for x in value.split('_')[2:])
 		assert short not in short_value.values()
 		short_value[value] = short

diff --git a/src/gen-emoji-table.py b/src/gen-emoji-table.py
@@ -8,8 +8,8 @@
 if len (sys.argv) != 2:
 	print("""usage: ./gen-emoji-table.py emoji-data.txt
 
-Input file, as of Unicode 12:
-* https://www.unicode.org/Public/emoji/12.0/emoji-data.txt""", file=sys.stderr)
+Input file:
+* https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt""", file=sys.stderr)
 	sys.exit (1)
 
 f = open(sys.argv[1])

diff --git a/src/gen-indic-table.py b/src/gen-indic-table.py
@@ -5,7 +5,7 @@
 if len (sys.argv) != 4:
 	print ("""usage: ./gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt
 
-Input files, as of Unicode 12:
+Input files:
 * https://unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
 * https://unicode.org/Public/UCD/latest/ucd/IndicPositionalCategory.txt
 * https://unicode.org/Public/UCD/latest/ucd/Blocks.txt""", file=sys.stderr)

diff --git a/src/gen-os2-unicode-ranges.py b/src/gen-os2-unicode-ranges.py
@@ -14,7 +14,32 @@
 except NameError:
   pass  # Python 3
 
-print ("""static OS2Range _hb_os2_unicode_ranges[] =
+print ("/* == Start of generated table == */")
+print ("/*")
+print (" * The following table is generated by running:")
+print (" *")
+print (" *   ./gen-os2-unicode-ranges.py OS2UnicodeRanges.txt")
+print (" */")
+print ()
+print ("#ifndef HB_OT_OS2_UNICODE_RANGES_HH")
+print ("#define HB_OT_OS2_UNICODE_RANGES_HH")
+print ()
+print ('#include "hb.hh"')
+print ()
+print ("namespace OT {")
+print ()
+print ("struct OS2Range")
+print ("{")
+print ("  int cmp (hb_codepoint_t key) const")
+print ("  { return (key < start) ? -1 : key <= end ? 0 : +1; }")
+print ()
+print ("  hb_codepoint_t start;")
+print ("  hb_codepoint_t end;")
+print ("  unsigned int bit;")
+print ("};")
+print ()
+print ("/* Note: The contents of this array was generated using gen-os2-unicode-ranges.py. */")
+print ("""static const OS2Range _hb_os2_unicode_ranges[] =
 {""")
 
 args = sys.argv[1:]
@@ -53,3 +78,22 @@
   print ("  {%s, %s, %s}, // %s" % (start, end, bit, ranges[3]))
 
 print ("""};""")
+print ()
+print ("/**")
+print (" * _hb_ot_os2_get_unicode_range_bit:")
+print (" * Returns the bit to be set in os/2 ulUnicodeOS2Range for a given codepoint.")
+print (" **/")
+print ("static unsigned int")
+print ("_hb_ot_os2_get_unicode_range_bit (hb_codepoint_t cp)")
+print ("{")
+print ("  auto* range = hb_bsearch (cp, _hb_os2_unicode_ranges, ARRAY_LENGTH (_hb_os2_unicode_ranges));")
+print ("  if (range != nullptr)")
+print ("    return range->bit;")
+print ("  return -1;")
+print ("}")
+print ("")
+print ("} /* namespace OT */")
+print ()
+print ("#endif /* HB_OT_OS2_UNICODE_RANGES_HH */")
+print ()
+print ("/* == End of generated table == */")
diff --git a/src/gen-tag-table.py b/src/gen-tag-table.py
@@ -32,7 +32,7 @@ def write (s):
 if len (sys.argv) != 3:
 	print ('''usage: ./gen-tag-table.py languagetags language-subtag-registry
 
-Input files, as of Unicode 12:
+Input files:
 * https://docs.microsoft.com/en-us/typography/opentype/spec/languagetags
 * https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry''', file=sys.stderr)
 	sys.exit (1)

diff --git a/src/gen-ucd-table.py b/src/gen-ucd-table.py
@@ -7,7 +7,7 @@
 if len (sys.argv) not in (2, 3):
 	print("""usage: ./gen-ucd-table ucd.nounihan.grouped.xml [/path/to/hb-common.h]
 
-Input file, as of Unicode 12:
+Input file:
 * https://unicode.org/Public/UCD/latest/ucdxml/ucd.nounihan.grouped.zip""", file=sys.stderr)
 	sys.exit(1)
 

diff --git a/src/gen-use-table.py b/src/gen-use-table.py
@@ -7,7 +7,7 @@
 if len (sys.argv) != 5:
 	print ("""usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt
 
-Input file, as of Unicode 12:
+Input files:
 * https://unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
 * https://unicode.org/Public/UCD/latest/ucd/IndicPositionalCategory.txt
 * https://unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
@@ -302,7 +302,7 @@ def is_VOWEL_MOD(U, UISC, UGC):
 	},
 	'M': {
 		'Abv': [Top],
-		'Blw': [Bottom, Bottom_And_Left],
+		'Blw': [Bottom, Bottom_And_Left, Bottom_And_Right],
 		'Pst': [Right],
 		'Pre': [Left],
 	},
@@ -399,6 +399,7 @@ def map_to_use(data):
 		if 0x1CF8 <= U <= 0x1CF9: UIPC = Top
 
 		assert (UIPC in [Not_Applicable, Visual_Order_Left] or
+			USE == 'R' or
 			USE in use_positions), "%s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC)
 
 		pos_mapping = use_positions.get(USE, None)

diff --git a/src/gen-vowel-constraints.py b/src/gen-vowel-constraints.py
@@ -21,7 +21,7 @@ def write (s):
 if len (sys.argv) != 3:
 	print ("""usage: ./gen-vowel-constraints.py ms-use/IndicShapingInvalidCluster.txt Scripts.txt
 
-Input file, as of Unicode 12:
+Input file:
 * https://unicode.org/Public/UCD/latest/ucd/Scripts.txt""", file=sys.stderr)
 	sys.exit (1)
 

diff --git a/src/hb-common.h b/src/hb-common.h
@@ -367,6 +367,14 @@ typedef enum
   /*12.0*/HB_SCRIPT_NYIAKENG_PUACHUE_HMONG	= HB_TAG ('H','m','n','p'),
   /*12.0*/HB_SCRIPT_WANCHO			= HB_TAG ('W','c','h','o'),
 
+  /*
+   * Since REPLACEME
+   */
+  /*13.0*/HB_SCRIPT_CHORASMIAN			= HB_TAG ('C','h','r','s'),
+  /*13.0*/HB_SCRIPT_DIVES_AKURU			= HB_TAG ('D','i','a','k'),
+  /*13.0*/HB_SCRIPT_KHITAN_SMALL_SCRIPT		= HB_TAG ('K','i','t','s'),
+  /*13.0*/HB_SCRIPT_YEZIDI			= HB_TAG ('Y','e','z','i'),
+
   /* No script set. */
   HB_SCRIPT_INVALID				= HB_TAG_NONE,