revdep/library/fastDummies/old/stringr/doc/regular-expressions.html

<!DOCTYPE html>

<html xmlns="http://www.w3.org/1999/xhtml" lang="en">

<head>

<meta charset="utf-8" />
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="generator" content="pandoc" />

<meta name="viewport" content="width=device-width, initial-scale=1">


<title>Regular expressions</title>


<style type="text/css">code{white-space: pre;}</style>
<style type="text/css" data-origin="pandoc">
a.sourceLine { display: inline-block; line-height: 1.25; }
a.sourceLine { pointer-events: none; color: inherit; text-decoration: inherit; }
a.sourceLine:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode { white-space: pre; position: relative; }
div.sourceCode { margin: 1em 0; }
pre.sourceCode { margin: 0; }
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
code.sourceCode { white-space: pre-wrap; }
a.sourceLine { text-indent: -1em; padding-left: 1em; }
}
pre.numberSource a.sourceLine
  { position: relative; left: -4em; }
pre.numberSource a.sourceLine::before
  { content: attr(data-line-number);
    position: relative; left: -1em; text-align: right; vertical-align: baseline;
    border: none; pointer-events: all; display: inline-block;
    -webkit-touch-callout: none; -webkit-user-select: none;
    -khtml-user-select: none; -moz-user-select: none;
    -ms-user-select: none; user-select: none;
    padding: 0 4px; width: 4em;
    color: #aaaaaa;
  }
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa;  padding-left: 4px; }
div.sourceCode
  {  }
@media screen {
a.sourceLine::before { text-decoration: underline; }
}
code span.al { color: #ff0000; font-weight: bold; } /* Alert */
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
code span.at { color: #7d9029; } /* Attribute */
code span.bn { color: #40a070; } /* BaseN */
code span.bu { } /* BuiltIn */
code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
code span.ch { color: #4070a0; } /* Char */
code span.cn { color: #880000; } /* Constant */
code span.co { color: #60a0b0; font-style: italic; } /* Comment */
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
code span.do { color: #ba2121; font-style: italic; } /* Documentation */
code span.dt { color: #902000; } /* DataType */
code span.dv { color: #40a070; } /* DecVal */
code span.er { color: #ff0000; font-weight: bold; } /* Error */
code span.ex { } /* Extension */
code span.fl { color: #40a070; } /* Float */
code span.fu { color: #06287e; } /* Function */
code span.im { } /* Import */
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
code span.kw { color: #007020; font-weight: bold; } /* Keyword */
code span.op { color: #666666; } /* Operator */
code span.ot { color: #007020; } /* Other */
code span.pp { color: #bc7a00; } /* Preprocessor */
code span.sc { color: #4070a0; } /* SpecialChar */
code span.ss { color: #bb6688; } /* SpecialString */
code span.st { color: #4070a0; } /* String */
code span.va { color: #19177c; } /* Variable */
code span.vs { color: #4070a0; } /* VerbatimString */
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */

</style>
<script>
// Apply pandoc's div.sourceCode style to pre.sourceCode instead.
//
// Scans every stylesheet whose owning element carries data-origin="pandoc",
// finds rules whose selector is exactly "div.sourceCode", and, when such a
// rule sets a text colour and/or a background colour, replaces it in place
// with an equivalent "pre.sourceCode" rule carrying the same declarations.
(function() {
  var sheets = document.styleSheets;
  for (var i = 0; i < sheets.length; i++) {
    // ownerNode is null for stylesheets pulled in through @import, so guard
    // before reading its dataset.
    var owner = sheets[i].ownerNode;
    if (!owner || !owner.dataset || owner.dataset["origin"] !== "pandoc") continue;
    // Reading cssRules can throw (e.g. cross-origin sheets); skip those.
    var rules;
    try { rules = sheets[i].cssRules; } catch (e) { continue; }
    for (var j = 0; j < rules.length; j++) {
      var rule = rules[j];
      // check if there is a div.sourceCode rule
      if (rule.type !== rule.STYLE_RULE || rule.selectorText !== "div.sourceCode") continue;
      var style = rule.style.cssText;
      // Skip only when NEITHER color nor background-color is set; a rule
      // that sets just one of them still has to be moved.
      if (rule.style.color === '' && rule.style.backgroundColor === '') continue;
      // replace div.sourceCode by a pre.sourceCode rule (same index, so the
      // cascade order of the sheet is preserved)
      sheets[i].deleteRule(j);
      sheets[i].insertRule('pre.sourceCode{' + style + '}', j);
    }
  }
})();
</script>


<style type="text/css">body {
background-color: #fff;
margin: 1em auto;
max-width: 700px;
overflow: visible;
padding-left: 2em;
padding-right: 2em;
font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
font-size: 14px;
line-height: 1.35;
}
#header {
text-align: center;
}
#TOC {
clear: both;
margin: 0 0 10px 10px;
padding: 4px;
width: 400px;
border: 1px solid #CCCCCC;
border-radius: 5px;
background-color: #f6f6f6;
font-size: 13px;
line-height: 1.3;
}
#TOC .toctitle {
font-weight: bold;
font-size: 15px;
margin-left: 5px;
}
#TOC ul {
padding-left: 40px;
margin-left: -1.5em;
margin-top: 5px;
margin-bottom: 5px;
}
#TOC ul ul {
margin-left: -2em;
}
#TOC li {
line-height: 16px;
}
table {
margin: 1em auto;
border-width: 1px;
border-color: #DDDDDD;
border-style: outset;
border-collapse: collapse;
}
table th {
border-width: 2px;
padding: 5px;
border-style: inset;
}
table td {
border-width: 1px;
border-style: inset;
line-height: 18px;
padding: 5px 5px;
}
table, table th, table td {
border-left-style: none;
border-right-style: none;
}
table thead, table tr.even {
background-color: #f7f7f7;
}
p {
margin: 0.5em 0;
}
blockquote {
background-color: #f6f6f6;
padding: 0.25em 0.75em;
}
/* Horizontal rule: a single 1px top line. The `border: none` shorthand
   clears all four sides before the top edge is drawn, so no separate
   border-style declaration is needed ahead of it. */
hr {
border: none;
border-top: 1px solid #777;
margin: 28px 0;
}
dl {
margin-left: 0;
}
dl dd {
margin-bottom: 13px;
margin-left: 13px;
}
dl dt {
font-weight: bold;
}
ul {
margin-top: 0;
}
ul li {
list-style: circle outside;
}
ul ul {
margin-bottom: 0;
}
pre, code {
background-color: #f7f7f7;
border-radius: 3px;
color: #333;
white-space: pre-wrap; 
}
pre {
border-radius: 3px;
margin: 5px 0px 10px 0px;
padding: 10px;
}
pre:not([class]) {
background-color: #f7f7f7;
}
code {
font-family: Consolas, Monaco, 'Courier New', monospace;
font-size: 85%;
}
p > code, li > code {
padding: 2px 0px;
}
div.figure {
text-align: center;
}
/* Images: white matte with a thin rounded frame. Only one border shorthand
   is kept; an earlier #DDDDDD declaration was always overridden by this one. */
img {
background-color: #FFFFFF;
padding: 2px;
border-radius: 3px;
border: 1px solid #CCCCCC;
margin: 0 5px;
}
h1 {
margin-top: 0;
font-size: 35px;
line-height: 40px;
}
h2 {
border-bottom: 4px solid #f7f7f7;
padding-top: 10px;
padding-bottom: 2px;
font-size: 145%;
}
h3 {
border-bottom: 2px solid #f7f7f7;
padding-top: 10px;
font-size: 120%;
}
h4 {
border-bottom: 1px solid #f7f7f7;
margin-left: 8px;
font-size: 105%;
}
h5, h6 {
border-bottom: 1px solid #ccc;
font-size: 105%;
}
a {
color: #0033dd;
text-decoration: none;
}
a:hover {
color: #6666ff; }
a:visited {
color: #800080; }
a:visited:hover {
color: #BB00BB; }
a[href^="http:"] {
text-decoration: underline; }
a[href^="https:"] {
text-decoration: underline; }

code > span.kw { color: #555; font-weight: bold; } 
code > span.dt { color: #902000; } 
code > span.dv { color: #40a070; } 
code > span.bn { color: #d14; } 
code > span.fl { color: #d14; } 
code > span.ch { color: #d14; } 
code > span.st { color: #d14; } 
code > span.co { color: #888888; font-style: italic; } 
code > span.ot { color: #007020; } 
code > span.al { color: #ff0000; font-weight: bold; } 
code > span.fu { color: #900; font-weight: bold; }  code > span.er { color: #a61717; background-color: #e3d2d2; } 
</style>

</head>

<body>


<h1 class="title toc-ignore">Regular expressions</h1>


<p>Regular expressions are a concise and flexible tool for describing patterns in strings. This vignette describes the key features of stringr’s regular expressions, as implemented by <a href="https://github.com/gagolews/stringi">stringi</a>. It is not a tutorial, so if you’re unfamiliar with regular expressions, I’d recommend starting at <a href="http://r4ds.had.co.nz/strings.html" class="uri">http://r4ds.had.co.nz/strings.html</a>. If you want to master the details, I’d recommend reading the classic <a href="https://amzn.com/0596528124"><em>Mastering Regular Expressions</em></a> by Jeffrey E. F. Friedl.</p>
<p>Regular expressions are the default pattern engine in stringr. That means when you use a pattern matching function with a bare string, it’s equivalent to wrapping it in a call to <code>regex()</code>:</p>
<div class="sourceCode" id="cb1"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb1-1" data-line-number="1"><span class="co"># The regular call:</span></a>
<a class="sourceLine" id="cb1-2" data-line-number="2"><span class="kw">str_extract</span>(fruit, <span class="st">&quot;nana&quot;</span>)</a>
<a class="sourceLine" id="cb1-3" data-line-number="3"><span class="co"># Is shorthand for</span></a>
<a class="sourceLine" id="cb1-4" data-line-number="4"><span class="kw">str_extract</span>(fruit, <span class="kw">regex</span>(<span class="st">&quot;nana&quot;</span>))</a></code></pre></div>
<p>You will need to use <code>regex()</code> explicitly if you want to override the default options, as you’ll see in examples below.</p>
<div id="basic-matches" class="section level2">
<h2>Basic matches</h2>
<p>The simplest patterns match exact strings:</p>
<div class="sourceCode" id="cb2"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb2-1" data-line-number="1">x &lt;-<span class="st"> </span><span class="kw">c</span>(<span class="st">&quot;apple&quot;</span>, <span class="st">&quot;banana&quot;</span>, <span class="st">&quot;pear&quot;</span>)</a>
<a class="sourceLine" id="cb2-2" data-line-number="2"><span class="kw">str_extract</span>(x, <span class="st">&quot;an&quot;</span>)</a>
<a class="sourceLine" id="cb2-3" data-line-number="3"><span class="co">#&gt; [1] NA   &quot;an&quot; NA</span></a></code></pre></div>
<p>You can perform a case-insensitive match using <code>ignore_case = TRUE</code>:</p>
<div class="sourceCode" id="cb3"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb3-1" data-line-number="1">bananas &lt;-<span class="st"> </span><span class="kw">c</span>(<span class="st">&quot;banana&quot;</span>, <span class="st">&quot;Banana&quot;</span>, <span class="st">&quot;BANANA&quot;</span>)</a>
<a class="sourceLine" id="cb3-2" data-line-number="2"><span class="kw">str_detect</span>(bananas, <span class="st">&quot;banana&quot;</span>)</a>
<a class="sourceLine" id="cb3-3" data-line-number="3"><span class="co">#&gt; [1]  TRUE FALSE FALSE</span></a>
<a class="sourceLine" id="cb3-4" data-line-number="4"><span class="kw">str_detect</span>(bananas, <span class="kw">regex</span>(<span class="st">&quot;banana&quot;</span>, <span class="dt">ignore_case =</span> <span class="ot">TRUE</span>))</a>
<a class="sourceLine" id="cb3-5" data-line-number="5"><span class="co">#&gt; [1] TRUE TRUE TRUE</span></a></code></pre></div>
<p>The next step up in complexity is <code>.</code>, which matches any character except a newline:</p>
<div class="sourceCode" id="cb4"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb4-1" data-line-number="1"><span class="kw">str_extract</span>(x, <span class="st">&quot;.a.&quot;</span>)</a>
<a class="sourceLine" id="cb4-2" data-line-number="2"><span class="co">#&gt; [1] NA    &quot;ban&quot; &quot;ear&quot;</span></a></code></pre></div>
<p>You can allow <code>.</code> to match everything, including <code>\n</code>, by setting <code>dotall = TRUE</code>:</p>
<div class="sourceCode" id="cb5"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb5-1" data-line-number="1"><span class="kw">str_detect</span>(<span class="st">&quot;</span><span class="ch">\n</span><span class="st">X</span><span class="ch">\n</span><span class="st">&quot;</span>, <span class="st">&quot;.X.&quot;</span>)</a>
<a class="sourceLine" id="cb5-2" data-line-number="2"><span class="co">#&gt; [1] FALSE</span></a>
<a class="sourceLine" id="cb5-3" data-line-number="3"><span class="kw">str_detect</span>(<span class="st">&quot;</span><span class="ch">\n</span><span class="st">X</span><span class="ch">\n</span><span class="st">&quot;</span>, <span class="kw">regex</span>(<span class="st">&quot;.X.&quot;</span>, <span class="dt">dotall =</span> <span class="ot">TRUE</span>))</a>
<a class="sourceLine" id="cb5-4" data-line-number="4"><span class="co">#&gt; [1] TRUE</span></a></code></pre></div>
</div>
<div id="escaping" class="section level2">
<h2>Escaping</h2>
<p>If “<code>.</code>” matches any character, how do you match a literal “<code>.</code>”? You need to use an “escape” to tell the regular expression you want to match it exactly, not use its special behaviour. Like strings, regexps use the backslash, <code>\</code>, to escape special behaviour. So to match an <code>.</code>, you need the regexp <code>\.</code>. Unfortunately this creates a problem. We use strings to represent regular expressions, and <code>\</code> is also used as an escape symbol in strings. So to create the regular expression <code>\.</code> we need the string <code>&quot;\\.&quot;</code>.</p>
<div class="sourceCode" id="cb6"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb6-1" data-line-number="1"><span class="co"># To create the regular expression, we need \\</span></a>
<a class="sourceLine" id="cb6-2" data-line-number="2">dot &lt;-<span class="st"> &quot;</span><span class="ch">\\</span><span class="st">.&quot;</span></a>
<a class="sourceLine" id="cb6-3" data-line-number="3"></a>
<a class="sourceLine" id="cb6-4" data-line-number="4"><span class="co"># But the expression itself only contains one:</span></a>
<a class="sourceLine" id="cb6-5" data-line-number="5"><span class="kw">writeLines</span>(dot)</a>
<a class="sourceLine" id="cb6-6" data-line-number="6"><span class="co">#&gt; \.</span></a>
<a class="sourceLine" id="cb6-7" data-line-number="7"></a>
<a class="sourceLine" id="cb6-8" data-line-number="8"><span class="co"># And this tells R to look for an explicit .</span></a>
<a class="sourceLine" id="cb6-9" data-line-number="9"><span class="kw">str_extract</span>(<span class="kw">c</span>(<span class="st">&quot;abc&quot;</span>, <span class="st">&quot;a.c&quot;</span>, <span class="st">&quot;bef&quot;</span>), <span class="st">&quot;a</span><span class="ch">\\</span><span class="st">.c&quot;</span>)</a>
<a class="sourceLine" id="cb6-10" data-line-number="10"><span class="co">#&gt; [1] NA    &quot;a.c&quot; NA</span></a></code></pre></div>
<p>If <code>\</code> is used as an escape character in regular expressions, how do you match a literal <code>\</code>? Well you need to escape it, creating the regular expression <code>\\</code>. To create that regular expression, you need to use a string, which also needs to escape <code>\</code>. That means to match a literal <code>\</code> you need to write <code>&quot;\\\\&quot;</code> — you need four backslashes to match one!</p>
<div class="sourceCode" id="cb7"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb7-1" data-line-number="1">x &lt;-<span class="st"> &quot;a</span><span class="ch">\\</span><span class="st">b&quot;</span></a>
<a class="sourceLine" id="cb7-2" data-line-number="2"><span class="kw">writeLines</span>(x)</a>
<a class="sourceLine" id="cb7-3" data-line-number="3"><span class="co">#&gt; a\b</span></a>
<a class="sourceLine" id="cb7-4" data-line-number="4"></a>
<a class="sourceLine" id="cb7-5" data-line-number="5"><span class="kw">str_extract</span>(x, <span class="st">&quot;</span><span class="ch">\\\\</span><span class="st">&quot;</span>)</a>
<a class="sourceLine" id="cb7-6" data-line-number="6"><span class="co">#&gt; [1] &quot;\\&quot;</span></a></code></pre></div>
<p>In this vignette, I use <code>\.</code> to denote the regular expression, and <code>&quot;\\.&quot;</code> to denote the string that represents the regular expression.</p>
<p>An alternative quoting mechanism is <code>\Q...\E</code>: all the characters in <code>...</code> are treated as exact matches. This is useful if you want to exactly match user input as part of a regular expression.</p>
<div class="sourceCode" id="cb8"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb8-1" data-line-number="1">x &lt;-<span class="st"> </span><span class="kw">c</span>(<span class="st">&quot;a.b.c.d&quot;</span>, <span class="st">&quot;aeb&quot;</span>)</a>
<a class="sourceLine" id="cb8-2" data-line-number="2">starts_with &lt;-<span class="st"> &quot;a.b&quot;</span></a>
<a class="sourceLine" id="cb8-3" data-line-number="3"></a>
<a class="sourceLine" id="cb8-4" data-line-number="4"><span class="kw">str_detect</span>(x, <span class="kw">paste0</span>(<span class="st">&quot;^&quot;</span>, starts_with))</a>
<a class="sourceLine" id="cb8-5" data-line-number="5"><span class="co">#&gt; [1] TRUE TRUE</span></a>
<a class="sourceLine" id="cb8-6" data-line-number="6"><span class="kw">str_detect</span>(x, <span class="kw">paste0</span>(<span class="st">&quot;^</span><span class="ch">\\</span><span class="st">Q&quot;</span>, starts_with, <span class="st">&quot;</span><span class="ch">\\</span><span class="st">E&quot;</span>))</a>
<a class="sourceLine" id="cb8-7" data-line-number="7"><span class="co">#&gt; [1]  TRUE FALSE</span></a></code></pre></div>
</div>
<div id="special-characters" class="section level2">
<h2>Special characters</h2>
<p>Escapes also allow you to specify individual characters that are otherwise hard to type. You can specify individual unicode characters in five ways, either as a variable number of hex digits (four is most common), or by name:</p>
<ul>
<li><p><code>\xhh</code>: 2 hex digits.</p></li>
<li><p><code>\x{hhhh}</code>: 1-6 hex digits.</p></li>
<li><p><code>\uhhhh</code>: 4 hex digits.</p></li>
<li><p><code>\Uhhhhhhhh</code>: 8 hex digits.</p></li>
<li><p><code>\N{name}</code>, e.g. <code>\N{grinning face}</code> matches the basic smiling emoji.</p></li>
</ul>
<p>Similarly, you can specify many common control characters:</p>
<ul>
<li><p><code>\a</code>: bell.</p></li>
<li><p><code>\cX</code>: match a control-X character.</p></li>
<li><p><code>\e</code>: escape (<code>\u001B</code>).</p></li>
<li><p><code>\f</code>: form feed (<code>\u000C</code>).</p></li>
<li><p><code>\n</code>: line feed (<code>\u000A</code>).</p></li>
<li><p><code>\r</code>: carriage return (<code>\u000D</code>).</p></li>
<li><p><code>\t</code>: horizontal tabulation (<code>\u0009</code>).</p></li>
<li><p><code>\0ooo</code>: matches an octal character. ‘ooo’ is from one to three octal digits, from 000 to 0377. The leading zero is required.</p></li>
</ul>
<p>(Many of these are only of historical interest and are only included here for the sake of completeness.)</p>
</div>
<div id="matching-multiple-characters" class="section level2">
<h2>Matching multiple characters</h2>
<p>There are a number of patterns that match more than one character. You’ve already seen <code>.</code>, which matches any character (except a newline). A closely related operator is <code>\X</code>, which matches a <strong>grapheme cluster</strong>, a set of individual elements that form a single symbol. For example, one way of representing “á” is as the letter “a” plus an accent: <code>.</code> will match the component “a”, while <code>\X</code> will match the complete symbol:</p>
<div class="sourceCode" id="cb9"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb9-1" data-line-number="1">x &lt;-<span class="st"> &quot;a\u0301&quot;</span></a>
<a class="sourceLine" id="cb9-2" data-line-number="2"><span class="kw">str_extract</span>(x, <span class="st">&quot;.&quot;</span>)</a>
<a class="sourceLine" id="cb9-3" data-line-number="3"><span class="co">#&gt; [1] &quot;a&quot;</span></a>
<a class="sourceLine" id="cb9-4" data-line-number="4"><span class="kw">str_extract</span>(x, <span class="st">&quot;</span><span class="ch">\\</span><span class="st">X&quot;</span>)</a>
<a class="sourceLine" id="cb9-5" data-line-number="5"><span class="co">#&gt; [1] &quot;á&quot;</span></a></code></pre></div>
<p>There are five other escaped pairs that match narrower classes of characters:</p>
<ul>
<li><p><code>\d</code>: matches any digit. The complement, <code>\D</code>, matches any character that is not a decimal digit.</p>
<div class="sourceCode" id="cb10"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb10-1" data-line-number="1"><span class="kw">str_extract_all</span>(<span class="st">&quot;1 + 2 = 3&quot;</span>, <span class="st">&quot;</span><span class="ch">\\</span><span class="st">d+&quot;</span>)[[<span class="dv">1</span>]]</a>
<a class="sourceLine" id="cb10-2" data-line-number="2"><span class="co">#&gt; [1] &quot;1&quot; &quot;2&quot; &quot;3&quot;</span></a></code></pre></div>
<p>Technically, <code>\d</code> includes any character in the Unicode Category of Nd (“Number, Decimal Digit”), which also includes numeric symbols from other languages:</p>
<div class="sourceCode" id="cb11"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb11-1" data-line-number="1"><span class="co"># Some Khmer numbers</span></a>
<a class="sourceLine" id="cb11-2" data-line-number="2"><span class="kw">str_detect</span>(<span class="st">&quot;១២៣&quot;</span>, <span class="st">&quot;</span><span class="ch">\\</span><span class="st">d&quot;</span>)</a>
<a class="sourceLine" id="cb11-3" data-line-number="3"><span class="co">#&gt; [1] TRUE</span></a></code></pre></div></li>
<li><p><code>\s</code>: matches any whitespace. This includes tabs, newlines, form feeds, and any character in the Unicode Z Category (which includes a variety of space characters and other separators). The complement, <code>\S</code>, matches any non-whitespace character.</p>
<div class="sourceCode" id="cb12"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb12-1" data-line-number="1">(text &lt;-<span class="st"> &quot;Some  </span><span class="ch">\t</span><span class="st"> badly</span><span class="ch">\n\t\t</span><span class="st">spaced </span><span class="ch">\f</span><span class="st"> text&quot;</span>)</a>
<a class="sourceLine" id="cb12-2" data-line-number="2"><span class="co">#&gt; [1] &quot;Some  \t badly\n\t\tspaced \f text&quot;</span></a>
<a class="sourceLine" id="cb12-3" data-line-number="3"><span class="kw">str_replace_all</span>(text, <span class="st">&quot;</span><span class="ch">\\</span><span class="st">s+&quot;</span>, <span class="st">&quot; &quot;</span>)</a>
<a class="sourceLine" id="cb12-4" data-line-number="4"><span class="co">#&gt; [1] &quot;Some badly spaced text&quot;</span></a></code></pre></div></li>
<li><p><code>\p{property name}</code> matches any character with a specific Unicode property, like <code>\p{Uppercase}</code> or <code>\p{Diacritic}</code>. The complement, <code>\P{property name}</code>, matches all characters without the property. A complete list of Unicode properties can be found at <a href="http://www.unicode.org/reports/tr44/#Property_Index" class="uri">http://www.unicode.org/reports/tr44/#Property_Index</a>.</p>
<div class="sourceCode" id="cb13"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb13-1" data-line-number="1">(text &lt;-<span class="st"> </span><span class="kw">c</span>(<span class="st">'&quot;Double quotes&quot;'</span>, <span class="st">&quot;«Guillemet»&quot;</span>, <span class="st">&quot;“Fancy quotes”&quot;</span>))</a>
<a class="sourceLine" id="cb13-2" data-line-number="2"><span class="co">#&gt; [1] &quot;\&quot;Double quotes\&quot;&quot; &quot;«Guillemet»&quot;       &quot;“Fancy quotes”&quot;</span></a>
<a class="sourceLine" id="cb13-3" data-line-number="3"><span class="kw">str_replace_all</span>(text, <span class="st">&quot;</span><span class="ch">\\</span><span class="st">p{quotation mark}&quot;</span>, <span class="st">&quot;'&quot;</span>)</a>
<a class="sourceLine" id="cb13-4" data-line-number="4"><span class="co">#&gt; [1] &quot;'Double quotes'&quot; &quot;'Guillemet'&quot;     &quot;'Fancy quotes'&quot;</span></a></code></pre></div></li>
<li><p><code>\w</code> matches any “word” character, which includes alphabetic characters, marks and decimal numbers. The complement, <code>\W</code>, matches any non-word character.</p>
<div class="sourceCode" id="cb14"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb14-1" data-line-number="1"><span class="kw">str_extract_all</span>(<span class="st">&quot;Don't eat that!&quot;</span>, <span class="st">&quot;</span><span class="ch">\\</span><span class="st">w+&quot;</span>)[[<span class="dv">1</span>]]</a>
<a class="sourceLine" id="cb14-2" data-line-number="2"><span class="co">#&gt; [1] &quot;Don&quot;  &quot;t&quot;    &quot;eat&quot;  &quot;that&quot;</span></a>
<a class="sourceLine" id="cb14-3" data-line-number="3"><span class="kw">str_split</span>(<span class="st">&quot;Don't eat that!&quot;</span>, <span class="st">&quot;</span><span class="ch">\\</span><span class="st">W&quot;</span>)[[<span class="dv">1</span>]]</a>
<a class="sourceLine" id="cb14-4" data-line-number="4"><span class="co">#&gt; [1] &quot;Don&quot;  &quot;t&quot;    &quot;eat&quot;  &quot;that&quot; &quot;&quot;</span></a></code></pre></div>
<p>Technically, <code>\w</code> also matches connector punctuation, <code>\u200c</code> (zero width non-joiner), and <code>\u200d</code> (zero width joiner), but these are rarely seen in the wild.</p></li>
<li><p><code>\b</code> matches word boundaries, the transition between word and non-word characters. <code>\B</code> matches the opposite: boundaries that have either both word or non-word characters on either side.</p>
<div class="sourceCode" id="cb15"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb15-1" data-line-number="1"><span class="kw">str_replace_all</span>(<span class="st">&quot;The quick brown fox&quot;</span>, <span class="st">&quot;</span><span class="ch">\\</span><span class="st">b&quot;</span>, <span class="st">&quot;_&quot;</span>)</a>
<a class="sourceLine" id="cb15-2" data-line-number="2"><span class="co">#&gt; [1] &quot;_The_ _quick_ _brown_ _fox_&quot;</span></a>
<a class="sourceLine" id="cb15-3" data-line-number="3"><span class="kw">str_replace_all</span>(<span class="st">&quot;The quick brown fox&quot;</span>, <span class="st">&quot;</span><span class="ch">\\</span><span class="st">B&quot;</span>, <span class="st">&quot;_&quot;</span>)</a>
<a class="sourceLine" id="cb15-4" data-line-number="4"><span class="co">#&gt; [1] &quot;T_h_e q_u_i_c_k b_r_o_w_n f_o_x&quot;</span></a></code></pre></div></li>
</ul>
<p>You can also create your own <strong>character classes</strong> using <code>[]</code>:</p>
<ul>
<li><code>[abc]</code>: matches a, b, or c.</li>
<li><code>[a-z]</code>: matches every character between a and z (in Unicode code point order).</li>
<li><code>[^abc]</code>: matches anything except a, b, or c.</li>
<li><code>[\^\-]</code>: matches <code>^</code> or <code>-</code>.</li>
</ul>
<p>There are a number of pre-built classes that you can use inside <code>[]</code>:</p>
<ul>
<li><code>[:punct:]</code>: punctuation.</li>
<li><code>[:alpha:]</code>: letters.</li>
<li><code>[:lower:]</code>: lowercase letters.</li>
<li><code>[:upper:]</code>: uppercase letters.</li>
<li><code>[:digit:]</code>: digits.</li>
<li><code>[:xdigit:]</code>: hex digits.</li>
<li><code>[:alnum:]</code>: letters and numbers.</li>
<li><code>[:cntrl:]</code>: control characters.</li>
<li><code>[:graph:]</code>: letters, numbers, and punctuation.</li>
<li><code>[:print:]</code>: letters, numbers, punctuation, and whitespace.</li>
<li><code>[:space:]</code>: space characters (basically equivalent to <code>\s</code>).</li>
<li><code>[:blank:]</code>: space and tab.</li>
</ul>
<p>These all go inside the <code>[]</code> for character classes, i.e. <code>[[:digit:]AX]</code> matches all digits, A, and X.</p>
<p>You can also use Unicode properties, like <code>[\p{Letter}]</code>, and various set operations, like <code>[\p{Letter}--\p{script=latin}]</code>. See <code>?&quot;stringi-search-charclass&quot;</code> for details.</p>
</div>
<div id="alternation" class="section level2">
<h2>Alternation</h2>
<p><code>|</code> is the <strong>alternation</strong> operator, which will pick between one or more possible matches. For example, <code>abc|def</code> will match <code>abc</code> or <code>def</code>.</p>
<div class="sourceCode" id="cb16"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb16-1" data-line-number="1"><span class="kw">str_detect</span>(<span class="kw">c</span>(<span class="st">&quot;abc&quot;</span>, <span class="st">&quot;def&quot;</span>, <span class="st">&quot;ghi&quot;</span>), <span class="st">&quot;abc|def&quot;</span>)</a>
<a class="sourceLine" id="cb16-2" data-line-number="2"><span class="co">#&gt; [1]  TRUE  TRUE FALSE</span></a></code></pre></div>
<p>Note that the precedence for <code>|</code> is low, so that <code>abc|def</code> matches <code>abc</code> or <code>def</code>, not <code>abcef</code> or <code>abdef</code>.</p>
</div>
<div id="grouping" class="section level2">
<h2>Grouping</h2>
<p>You can use parentheses to override the default precedence rules:</p>
<div class="sourceCode" id="cb17"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb17-1" data-line-number="1"><span class="kw">str_extract</span>(<span class="kw">c</span>(<span class="st">&quot;grey&quot;</span>, <span class="st">&quot;gray&quot;</span>), <span class="st">&quot;gre|ay&quot;</span>)</a>
<a class="sourceLine" id="cb17-2" data-line-number="2"><span class="co">#&gt; [1] &quot;gre&quot; &quot;ay&quot;</span></a>
<a class="sourceLine" id="cb17-3" data-line-number="3"><span class="kw">str_extract</span>(<span class="kw">c</span>(<span class="st">&quot;grey&quot;</span>, <span class="st">&quot;gray&quot;</span>), <span class="st">&quot;gr(e|a)y&quot;</span>)</a>
<a class="sourceLine" id="cb17-4" data-line-number="4"><span class="co">#&gt; [1] &quot;grey&quot; &quot;gray&quot;</span></a></code></pre></div>
<p>Parentheses also define “groups” that you can refer to with <strong>backreferences</strong>, like <code>\1</code>, <code>\2</code> etc, and can be extracted with <code>str_match()</code>. For example, the following regular expression finds all fruits that have a repeated pair of letters:</p>
<div class="sourceCode" id="cb18"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb18-1" data-line-number="1">pattern &lt;-<span class="st"> &quot;(..)</span><span class="ch">\\</span><span class="st">1&quot;</span></a>
<a class="sourceLine" id="cb18-2" data-line-number="2">fruit <span class="op">%&gt;%</span><span class="st"> </span></a>
<a class="sourceLine" id="cb18-3" data-line-number="3"><span class="st">  </span><span class="kw">str_subset</span>(pattern)</a>
<a class="sourceLine" id="cb18-4" data-line-number="4"><span class="co">#&gt; [1] &quot;banana&quot;      &quot;coconut&quot;     &quot;cucumber&quot;    &quot;jujube&quot;      &quot;papaya&quot;     </span></a>
<a class="sourceLine" id="cb18-5" data-line-number="5"><span class="co">#&gt; [6] &quot;salal berry&quot;</span></a>
<a class="sourceLine" id="cb18-6" data-line-number="6"></a>
<a class="sourceLine" id="cb18-7" data-line-number="7">fruit <span class="op">%&gt;%</span><span class="st"> </span></a>
<a class="sourceLine" id="cb18-8" data-line-number="8"><span class="st">  </span><span class="kw">str_subset</span>(pattern) <span class="op">%&gt;%</span><span class="st"> </span></a>
<a class="sourceLine" id="cb18-9" data-line-number="9"><span class="st">  </span><span class="kw">str_match</span>(pattern)</a>
<a class="sourceLine" id="cb18-10" data-line-number="10"><span class="co">#&gt;      [,1]   [,2]</span></a>
<a class="sourceLine" id="cb18-11" data-line-number="11"><span class="co">#&gt; [1,] &quot;anan&quot; &quot;an&quot;</span></a>
<a class="sourceLine" id="cb18-12" data-line-number="12"><span class="co">#&gt; [2,] &quot;coco&quot; &quot;co&quot;</span></a>
<a class="sourceLine" id="cb18-13" data-line-number="13"><span class="co">#&gt; [3,] &quot;cucu&quot; &quot;cu&quot;</span></a>
<a class="sourceLine" id="cb18-14" data-line-number="14"><span class="co">#&gt; [4,] &quot;juju&quot; &quot;ju&quot;</span></a>
<a class="sourceLine" id="cb18-15" data-line-number="15"><span class="co">#&gt; [5,] &quot;papa&quot; &quot;pa&quot;</span></a>
<a class="sourceLine" id="cb18-16" data-line-number="16"><span class="co">#&gt; [6,] &quot;alal&quot; &quot;al&quot;</span></a></code></pre></div>
<p>You can use <code>(?:...)</code>, the non-grouping parentheses, to control precedence but not capture the match in a group. This is slightly more efficient than capturing parentheses.</p>
<div class="sourceCode" id="cb19"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb19-1" data-line-number="1"><span class="kw">str_match</span>(<span class="kw">c</span>(<span class="st">&quot;grey&quot;</span>, <span class="st">&quot;gray&quot;</span>), <span class="st">&quot;gr(e|a)y&quot;</span>)</a>
<a class="sourceLine" id="cb19-2" data-line-number="2"><span class="co">#&gt;      [,1]   [,2]</span></a>
<a class="sourceLine" id="cb19-3" data-line-number="3"><span class="co">#&gt; [1,] &quot;grey&quot; &quot;e&quot; </span></a>
<a class="sourceLine" id="cb19-4" data-line-number="4"><span class="co">#&gt; [2,] &quot;gray&quot; &quot;a&quot;</span></a>
<a class="sourceLine" id="cb19-5" data-line-number="5"><span class="kw">str_match</span>(<span class="kw">c</span>(<span class="st">&quot;grey&quot;</span>, <span class="st">&quot;gray&quot;</span>), <span class="st">&quot;gr(?:e|a)y&quot;</span>)</a>
<a class="sourceLine" id="cb19-6" data-line-number="6"><span class="co">#&gt;      [,1]  </span></a>
<a class="sourceLine" id="cb19-7" data-line-number="7"><span class="co">#&gt; [1,] &quot;grey&quot;</span></a>
<a class="sourceLine" id="cb19-8" data-line-number="8"><span class="co">#&gt; [2,] &quot;gray&quot;</span></a></code></pre></div>
<p>This is most useful for more complex cases where you need to capture matches and control precedence independently.</p>
</div>
<div id="anchors" class="section level2">
<h2>Anchors</h2>
<p>By default, regular expressions will match any part of a string. It’s often useful to <strong>anchor</strong> the regular expression so that it matches from the start or end of the string:</p>
<ul>
<li><code>^</code> matches the start of the string.</li>
<li><code>$</code> matches the end of the string.</li>
</ul>
<div class="sourceCode" id="cb20"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb20-1" data-line-number="1">x &lt;-<span class="st"> </span><span class="kw">c</span>(<span class="st">&quot;apple&quot;</span>, <span class="st">&quot;banana&quot;</span>, <span class="st">&quot;pear&quot;</span>)</a>
<a class="sourceLine" id="cb20-2" data-line-number="2"><span class="kw">str_extract</span>(x, <span class="st">&quot;^a&quot;</span>)</a>
<a class="sourceLine" id="cb20-3" data-line-number="3"><span class="co">#&gt; [1] &quot;a&quot; NA  NA</span></a>
<a class="sourceLine" id="cb20-4" data-line-number="4"><span class="kw">str_extract</span>(x, <span class="st">&quot;a$&quot;</span>)</a>
<a class="sourceLine" id="cb20-5" data-line-number="5"><span class="co">#&gt; [1] NA  &quot;a&quot; NA</span></a></code></pre></div>
<p>To match a literal “$” or “^”, you need to escape them, <code>\$</code>, and <code>\^</code>.</p>
<p>For multiline strings, you can use <code>regex(multiline = TRUE)</code>. This changes the behaviour of <code>^</code> and <code>$</code>, and introduces three new operators:</p>
<ul>
<li><p><code>^</code> now matches the start of each line.</p></li>
<li><p><code>$</code> now matches the end of each line.</p></li>
<li><p><code>\A</code> matches the start of the input.</p></li>
<li><p><code>\z</code> matches the end of the input.</p></li>
<li><p><code>\Z</code> matches the end of the input, but before the final line terminator, if it exists.</p></li>
</ul>
<div class="sourceCode" id="cb21"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb21-1" data-line-number="1">x &lt;-<span class="st"> &quot;Line 1</span><span class="ch">\n</span><span class="st">Line 2</span><span class="ch">\n</span><span class="st">Line 3</span><span class="ch">\n</span><span class="st">&quot;</span></a>
<a class="sourceLine" id="cb21-2" data-line-number="2"><span class="kw">str_extract_all</span>(x, <span class="st">&quot;^Line..&quot;</span>)[[<span class="dv">1</span>]]</a>
<a class="sourceLine" id="cb21-3" data-line-number="3"><span class="co">#&gt; [1] &quot;Line 1&quot;</span></a>
<a class="sourceLine" id="cb21-4" data-line-number="4"><span class="kw">str_extract_all</span>(x, <span class="kw">regex</span>(<span class="st">&quot;^Line..&quot;</span>, <span class="dt">multiline =</span> <span class="ot">TRUE</span>))[[<span class="dv">1</span>]]</a>
<a class="sourceLine" id="cb21-5" data-line-number="5"><span class="co">#&gt; [1] &quot;Line 1&quot; &quot;Line 2&quot; &quot;Line 3&quot;</span></a>
<a class="sourceLine" id="cb21-6" data-line-number="6"><span class="kw">str_extract_all</span>(x, <span class="kw">regex</span>(<span class="st">&quot;</span><span class="ch">\\</span><span class="st">ALine..&quot;</span>, <span class="dt">multiline =</span> <span class="ot">TRUE</span>))[[<span class="dv">1</span>]]</a>
<a class="sourceLine" id="cb21-7" data-line-number="7"><span class="co">#&gt; [1] &quot;Line 1&quot;</span></a></code></pre></div>
</div>
<div id="repetition" class="section level2">
<h2>Repetition</h2>
<p>You can control how many times a pattern matches with the repetition operators:</p>
<ul>
<li><code>?</code>: 0 or 1.</li>
<li><code>+</code>: 1 or more.</li>
<li><code>*</code>: 0 or more.</li>
</ul>
<div class="sourceCode" id="cb22"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb22-1" data-line-number="1">x &lt;-<span class="st"> &quot;1888 is the longest year in Roman numerals: MDCCCLXXXVIII&quot;</span></a>
<a class="sourceLine" id="cb22-2" data-line-number="2"><span class="kw">str_extract</span>(x, <span class="st">&quot;CC?&quot;</span>)</a>
<a class="sourceLine" id="cb22-3" data-line-number="3"><span class="co">#&gt; [1] &quot;CC&quot;</span></a>
<a class="sourceLine" id="cb22-4" data-line-number="4"><span class="kw">str_extract</span>(x, <span class="st">&quot;CC+&quot;</span>)</a>
<a class="sourceLine" id="cb22-5" data-line-number="5"><span class="co">#&gt; [1] &quot;CCC&quot;</span></a>
<a class="sourceLine" id="cb22-6" data-line-number="6"><span class="kw">str_extract</span>(x, <span class="st">'C[LX]+'</span>)</a>
<a class="sourceLine" id="cb22-7" data-line-number="7"><span class="co">#&gt; [1] &quot;CLXXX&quot;</span></a></code></pre></div>
<p>Note that the precedence of these operators is high, so you can write: <code>colou?r</code> to match either American or British spellings. That means most uses will need parentheses, like <code>bana(na)+</code>.</p>
<p>You can also specify the number of matches precisely:</p>
<ul>
<li><code>{n}</code>: exactly n</li>
<li><code>{n,}</code>: n or more</li>
<li><code>{n,m}</code>: between n and m</li>
</ul>
<div class="sourceCode" id="cb23"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb23-1" data-line-number="1"><span class="kw">str_extract</span>(x, <span class="st">&quot;C{2}&quot;</span>)</a>
<a class="sourceLine" id="cb23-2" data-line-number="2"><span class="co">#&gt; [1] &quot;CC&quot;</span></a>
<a class="sourceLine" id="cb23-3" data-line-number="3"><span class="kw">str_extract</span>(x, <span class="st">&quot;C{2,}&quot;</span>)</a>
<a class="sourceLine" id="cb23-4" data-line-number="4"><span class="co">#&gt; [1] &quot;CCC&quot;</span></a>
<a class="sourceLine" id="cb23-5" data-line-number="5"><span class="kw">str_extract</span>(x, <span class="st">&quot;C{2,3}&quot;</span>)</a>
<a class="sourceLine" id="cb23-6" data-line-number="6"><span class="co">#&gt; [1] &quot;CCC&quot;</span></a></code></pre></div>
<p>By default these matches are “greedy”: they will match the longest string possible. You can make them “lazy”, matching the shortest string possible by putting a <code>?</code> after them:</p>
<ul>
<li><code>??</code>: 0 or 1, prefer 0.</li>
<li><code>+?</code>: 1 or more, match as few times as possible.</li>
<li><code>*?</code>: 0 or more, match as few times as possible.</li>
<li><code>{n,}?</code>: n or more, match as few times as possible.</li>
<li><code>{n,m}?</code>: between n and m, match as few times as possible, but at least n.</li>
</ul>
<div class="sourceCode" id="cb24"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb24-1" data-line-number="1"><span class="kw">str_extract</span>(x, <span class="kw">c</span>(<span class="st">&quot;C{2,3}&quot;</span>, <span class="st">&quot;C{2,3}?&quot;</span>))</a>
<a class="sourceLine" id="cb24-2" data-line-number="2"><span class="co">#&gt; [1] &quot;CCC&quot; &quot;CC&quot;</span></a>
<a class="sourceLine" id="cb24-3" data-line-number="3"><span class="kw">str_extract</span>(x, <span class="kw">c</span>(<span class="st">&quot;C[LX]+&quot;</span>, <span class="st">&quot;C[LX]+?&quot;</span>))</a>
<a class="sourceLine" id="cb24-4" data-line-number="4"><span class="co">#&gt; [1] &quot;CLXXX&quot; &quot;CL&quot;</span></a></code></pre></div>
<p>You can also make the matches possessive by putting a <code>+</code> after them, which means that if later parts of the match fail, the repetition will not be re-tried with a smaller number of characters. This is an advanced feature used to improve performance in worst-case scenarios (called “catastrophic backtracking”).</p>
<ul>
<li><code>?+</code>: 0 or 1, possessive.</li>
<li><code>++</code>: 1 or more, possessive.</li>
<li><code>*+</code>: 0 or more, possessive.</li>
<li><code>{n}+</code>: exactly n, possessive.</li>
<li><code>{n,}+</code>: n or more, possessive.</li>
<li><code>{n,m}+</code>: between n and m, possessive.</li>
</ul>
<p>A related concept is the <strong>atomic-match</strong> parenthesis, <code>(?&gt;...)</code>. If a later match fails and the engine needs to back-track, an atomic match is kept as is: it succeeds or fails as a whole. Compare the following two regular expressions:</p>
<div class="sourceCode" id="cb25"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb25-1" data-line-number="1"><span class="kw">str_detect</span>(<span class="st">&quot;ABC&quot;</span>, <span class="st">&quot;(?&gt;A|.B)C&quot;</span>)</a>
<a class="sourceLine" id="cb25-2" data-line-number="2"><span class="co">#&gt; [1] FALSE</span></a>
<a class="sourceLine" id="cb25-3" data-line-number="3"><span class="kw">str_detect</span>(<span class="st">&quot;ABC&quot;</span>, <span class="st">&quot;(?:A|.B)C&quot;</span>)</a>
<a class="sourceLine" id="cb25-4" data-line-number="4"><span class="co">#&gt; [1] TRUE</span></a></code></pre></div>
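<p>The atomic match fails because it matches A, and then the next character is a B rather than the C the pattern requires; because an atomic match is kept as is, the engine cannot back-track into it to try the other alternative. The regular match succeeds because it matches A, but then C doesn’t match, so it back-tracks and tries <code>.B</code> instead.</p>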
</div>
<div id="look-arounds" class="section level2">
<h2>Look arounds</h2>
<p>These assertions look ahead or behind the current match without “consuming” any characters (i.e. changing the input position).</p>
<ul>
<li><p><code>(?=...)</code>: positive look-ahead assertion. Matches if <code>...</code> matches at the current input.</p></li>
<li><p><code>(?!...)</code>: negative look-ahead assertion. Matches if <code>...</code> <strong>does not</strong> match at the current input.</p></li>
<li><p><code>(?&lt;=...)</code>: positive look-behind assertion. Matches if <code>...</code> matches text preceding the current position, with the last character of the match being the character just before the current position. Length must be bounded<br />
(i.e. no <code>*</code> or <code>+</code>).</p></li>
<li><p><code>(?&lt;!...)</code>: negative look-behind assertion. Matches if <code>...</code> <strong>does not</strong> match text preceding the current position. Length must be bounded<br />
(i.e. no <code>*</code> or <code>+</code>).</p></li>
</ul>
<p>These are useful when you want to check that a pattern exists, but you don’t want to include it in the result:</p>
<div class="sourceCode" id="cb26"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb26-1" data-line-number="1">x &lt;-<span class="st"> </span><span class="kw">c</span>(<span class="st">&quot;1 piece&quot;</span>, <span class="st">&quot;2 pieces&quot;</span>, <span class="st">&quot;3&quot;</span>)</a>
<a class="sourceLine" id="cb26-2" data-line-number="2"><span class="kw">str_extract</span>(x, <span class="st">&quot;</span><span class="ch">\\</span><span class="st">d+(?= pieces?)&quot;</span>)</a>
<a class="sourceLine" id="cb26-3" data-line-number="3"><span class="co">#&gt; [1] &quot;1&quot; &quot;2&quot; NA</span></a>
<a class="sourceLine" id="cb26-4" data-line-number="4"></a>
<a class="sourceLine" id="cb26-5" data-line-number="5">y &lt;-<span class="st"> </span><span class="kw">c</span>(<span class="st">&quot;100&quot;</span>, <span class="st">&quot;$400&quot;</span>)</a>
<a class="sourceLine" id="cb26-6" data-line-number="6"><span class="kw">str_extract</span>(y, <span class="st">&quot;(?&lt;=</span><span class="ch">\\</span><span class="st">$)</span><span class="ch">\\</span><span class="st">d+&quot;</span>)</a>
<a class="sourceLine" id="cb26-7" data-line-number="7"><span class="co">#&gt; [1] NA    &quot;400&quot;</span></a></code></pre></div>
</div>
<div id="comments" class="section level2">
<h2>Comments</h2>
<p>There are two ways to include comments in a regular expression. The first is with <code>(?#...)</code>:</p>
<div class="sourceCode" id="cb27"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb27-1" data-line-number="1"><span class="kw">str_detect</span>(<span class="st">&quot;xyz&quot;</span>, <span class="st">&quot;x(?#this is a comment)&quot;</span>)</a>
<a class="sourceLine" id="cb27-2" data-line-number="2"><span class="co">#&gt; [1] TRUE</span></a></code></pre></div>
<p>The second is to use <code>regex(comments = TRUE)</code>. This form ignores spaces and newlines, and everything after <code>#</code>. To match a literal space, you’ll need to escape it: <code>&quot;\\ &quot;</code>. This is a useful way of describing complex regular expressions:</p>
<div class="sourceCode" id="cb28"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb28-1" data-line-number="1">phone &lt;-<span class="st"> </span><span class="kw">regex</span>(<span class="st">&quot;</span></a>
<a class="sourceLine" id="cb28-2" data-line-number="2"><span class="st">  </span><span class="ch">\\</span><span class="st">(?     # optional opening parens</span></a>
<a class="sourceLine" id="cb28-3" data-line-number="3"><span class="st">  (</span><span class="ch">\\</span><span class="st">d{3}) # area code</span></a>
<a class="sourceLine" id="cb28-4" data-line-number="4"><span class="st">  [)- ]?   # optional closing parens, dash, or space</span></a>
<a class="sourceLine" id="cb28-5" data-line-number="5"><span class="st">  (</span><span class="ch">\\</span><span class="st">d{3}) # another three numbers</span></a>
<a class="sourceLine" id="cb28-6" data-line-number="6"><span class="st">  [ -]?    # optional space or dash</span></a>
<a class="sourceLine" id="cb28-7" data-line-number="7"><span class="st">  (</span><span class="ch">\\</span><span class="st">d{3}) # three more numbers</span></a>
<a class="sourceLine" id="cb28-8" data-line-number="8"><span class="st">  &quot;</span>, <span class="dt">comments =</span> <span class="ot">TRUE</span>)</a>
<a class="sourceLine" id="cb28-9" data-line-number="9"></a>
<a class="sourceLine" id="cb28-10" data-line-number="10"><span class="kw">str_match</span>(<span class="st">&quot;514-791-8141&quot;</span>, phone)</a>
<a class="sourceLine" id="cb28-11" data-line-number="11"><span class="co">#&gt;      [,1]          [,2]  [,3]  [,4] </span></a>
<a class="sourceLine" id="cb28-12" data-line-number="12"><span class="co">#&gt; [1,] &quot;514-791-814&quot; &quot;514&quot; &quot;791&quot; &quot;814&quot;</span></a></code></pre></div>
</div>


<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
  // Build a <script> element that points at the MathJax loader and attach it
  // to the document's <head>. Wrapped in an immediately-invoked function so
  // the local variables do not leak into the global scope.
  (function () {
    var headElement = document.getElementsByTagName("head")[0];
    var mathJaxLoader = document.createElement("script");

    mathJaxLoader.src  = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
    mathJaxLoader.type = "text/javascript";

    // The element is fully configured before it is inserted into the page.
    headElement.appendChild(mathJaxLoader);
  })();
</script>

</body>
</html>