HPCC-31739 Document changes in moving Unicode regex from ICU to PCRE2

Signed-off-by: Jim DeFabia <jamesdefabia@lexisnexis.com>
hpcc-systems · May 17, 2024 · f880b7a · f880b7a
1 parent fae417c
commit f880b7a
Show file tree

Hide file tree

Showing 3 changed files with 35 additions and 55 deletions.
diff --git a/docs/EN_US/ECLLanguageReference/ECLR_mods/BltInFunc-REGEXFIND.xml b/docs/EN_US/ECLLanguageReference/ECLR_mods/BltInFunc-REGEXFIND.xml
@@ -66,22 +66,16 @@
   find matches. The <emphasis>regex</emphasis> must be a standard Perl regular
   expression<indexterm>
       <primary>Perl regular expression</primary>
-    </indexterm>. We use third-party libraries to support this, so for
-  non-unicode <emphasis>text</emphasis>, see the Perl-compatible Regular
-  Expressions (PCRE2) documentation at <ulink
-  url="https://www.pcre.org/current/doc/html/pcre2pattern.html">https://www.pcre.org/current/doc/html/pcre2pattern.html</ulink>.
-  For unicode <emphasis>text</emphasis>, see the ICU docs, the sections
-  'Regular Expression Metacharacters' and 'Regular Expression Operators' at
-  <emphasis
-  role="underline">http://userguide.icu-project.org/strings/regexp</emphasis>
-  and the links from there, in particular the section 'UnicodeSet patterns' at
-  <emphasis
-  role="underline">http://userguide.icu-project.org/strings/unicodeset</emphasis>.
-  We use version 2.6 which should support all listed features.</para>
+    </indexterm>. </para>
+
+  <para>We use a third-party library, Perl-compatible Regular Expressions
+  (PCRE2), to support this. See <ulink
+  url="https://www.pcre.org/current/doc/html/pcre2syntax.html">https://www.pcre.org/current/doc/html/pcre2syntax.html</ulink>
+  for details on the PCRE2 pattern syntax.</para>
 
   <para>Example:</para>
 
-  <programlisting lang="ECL" role="runnable">namesRecord := RECORD
+  <programlisting lang="ECL_Runnable">namesRecord := RECORD
 STRING20 surname;
 STRING10 forename;
 STRING10 userdate;

diff --git a/docs/EN_US/ECLLanguageReference/ECLR_mods/BltInFunc-REGEXFINDSET.xml b/docs/EN_US/ECLLanguageReference/ECLR_mods/BltInFunc-REGEXFINDSET.xml
@@ -54,25 +54,19 @@
   find matches. The <emphasis>regex</emphasis> must be a standard Perl regular
   expression<indexterm>
       <primary>Perl regular expression</primary>
-    </indexterm>. We use third-party libraries to support this, so for
-  non-unicode <emphasis>text</emphasis>, see the Perl-compatible Regular
-  Expressions (PCRE2) documentation at <ulink
-  url="https://www.pcre.org/current/doc/html/pcre2pattern.html">https://www.pcre.org/current/doc/html/pcre2pattern.html</ulink>.
-  For unicode <emphasis>text</emphasis>, see the ICU docs, the sections
-  'Regular Expression Metacharacters' and 'Regular Expression Operators' at
-  <emphasis
-  role="underline">http://userguide.icu-project.org/strings/regexp</emphasis>
-  and the links from there, in particular the section 'UnicodeSet patterns' at
-  <emphasis
-  role="underline">http://userguide.icu-project.org/strings/unicodeset</emphasis>.
-  We use version 2.6 which should support all listed features.</para>
+    </indexterm>. </para>
+
+  <para>We use a third-party library, Perl-compatible Regular Expressions
+  (PCRE2), to support this. See <ulink
+  url="https://www.pcre.org/current/doc/html/pcre2syntax.html">https://www.pcre.org/current/doc/html/pcre2syntax.html</ulink>
+  for details on the PCRE2 pattern syntax.</para>
 
   <para>REGEXFINDSET ignores capture groups. REGEXFINDSET repeatedly extracts
   the text matching the entire <emphasis>regex</emphasis> pattern.</para>
 
   <para>Example:</para>
 
-  <programlisting lang="ECL" role="runnable">sampleStr := 
+  <programlisting lang="ECL_Runnable">sampleStr := 
   'To: jane@example.com From: john@example.com This is the winter of our discontent.';
 eMails:=REGEXFINDSET('\\w+@[a-zA-Z_]+?\\.[a-zA-Z]{2,3}' , sampleStr);
 OUTPUT(eMails);

diff --git a/docs/EN_US/ECLLanguageReference/ECLR_mods/BltInFunc-REGEXREPLACE.xml b/docs/EN_US/ECLLanguageReference/ECLR_mods/BltInFunc-REGEXREPLACE.xml
@@ -64,51 +64,43 @@
   string. The <emphasis>regex</emphasis> must be a standard Perl regular
   expression<indexterm>
       <primary>Perl regular expression</primary>
-    </indexterm>. We use third-party libraries to support this, so for
-  non-unicode <emphasis>text</emphasis>, see the Perl-compatible Regular
-  Expressions (PCRE2) documentation at <ulink
-  url="https://www.pcre.org/current/doc/html/pcre2pattern.html">https://www.pcre.org/current/doc/html/pcre2pattern.html</ulink>.
-  For unicode <emphasis>text</emphasis>, see the ICU docs, the sections
-  'Regular Expression Metacharacters' and 'Regular Expression Operators' at
-  <emphasis
-  role="underline">http://userguide.icu-project.org/strings/regexp</emphasis>
-  and the links from there, in particular the section 'UnicodeSet patterns' at
-  <emphasis
-  role="underline">http://userguide.icu-project.org/strings/unicodeset</emphasis>.
-  We use version 2.6 which should support all listed features.</para>
+    </indexterm>. </para>
+
+  <para>We use a third-party library, Perl-compatible Regular Expressions
+  (PCRE2), to support this. See <ulink
+  url="https://www.pcre.org/current/doc/html/pcre2syntax.html">https://www.pcre.org/current/doc/html/pcre2syntax.html</ulink>
+  for details on the PCRE2 pattern syntax.</para>
 
   <para>Example:</para>
 
-  <programlisting lang="ECL" role="runnable">REGEXREPLACE('(.a)t', 'the cat sat on the mat', '$1p');
+  <programlisting lang="ECL_Runnable">REGEXREPLACE('(.a)t', 'the cat sat on the mat', '$1p');
         //ASCII
 REGEXREPLACE(u'(.a)t', u'the cat sat on the mat', u'$1p');
         //UNICODE
 // both of these examples return 'the cap sap on the map'
 
 inrec := {STRING10 str, UNICODE10 ustr};
 inset := DATASET([{'She', u'Eins'}, {'Sells', u'Zwei'},
-{'Sea', u'Drei'}, {'Shells', u'Vier'}], inrec);
-outrec := {STRING10 orig, STRING10 withcase, STRING10
-        wocase,
-UNICODE10 uorig,UNICODE10 uwithcase,UNICODE10 uwocase};
+                  {'Sea', u'Drei'}, {'Shells', u'Vier'}], inrec);
+outrec := {STRING10 orig, STRING10 withcase, STRING10 
+           wocase,UNICODE10 uorig,UNICODE10 uwithcase,UNICODE10 uwocase};
 
 outrec trans(inrec l) := TRANSFORM
-SELF.orig := l.str;
-SELF.withcase := REGEXREPLACE('s', l.str, 'f');
-SELF.wocase := REGEXREPLACE('s', l.str, 'f', NOCASE);
-SELF.uorig := l.ustr;
-SELF.uwithcase := REGEXREPLACE(u'e', l.ustr, u'\u00EB');
-SELF.uwocase := REGEXREPLACE(u'e', l.ustr, u'\u00EB',
-        NOCASE);
+  SELF.orig := l.str;
+  SELF.withcase := REGEXREPLACE('s', l.str, 'f');
+  SELF.wocase := REGEXREPLACE('s', l.str, 'f', NOCASE);
+  SELF.uorig := l.ustr;
+  SELF.uwithcase := REGEXREPLACE(u'e', l.ustr, u'\u00EB');
+  SELF.uwocase := REGEXREPLACE(u'e', l.ustr, u'\u00EB',NOCASE);
 END;
 OUTPUT(PROJECT(inset, trans(LEFT)));
 
 /* the result set is:
-orig withcase wocase uorig uwithcase uwocase
-She She fhe Eins Eins \xc3\xabins
-Sells Sellf fellf Zwei Zw\xc3\xabi Zw\xc3\xabi
-Sea Sea fea Drei Dr\xc3\xabi Dr\xc3\xabi
-Shells Shellf fhellf Vier Vi\xc3\xabr Vi\xc3\xabr */
+orig    withcase wocase uorig uwithcase    uwocase
+She     She      fhe    Eins  Eins         \xc3\xabins
+Sells   Sellf    fellf  Zwei  Zw\xc3\xabi  Zw\xc3\xabi
+Sea     Sea      fea    Drei  Dr\xc3\xabi  Dr\xc3\xabi
+Shells  Shellf   fhellf Vier  Vi\xc3\xabr  Vi\xc3\xabr */
 </programlisting>
 
   <para>See Also: <link linkend="PARSE">PARSE</link>, <link