From 6ef4be3f8f269f147b5abedecf7da6f19af305d3 Mon Sep 17 00:00:00 2001 From: Liam3851 Date: Wed, 28 Feb 2018 06:14:11 -0500 Subject: [PATCH] ENH: Allow literal (non-regex) replacement using .str.replace #16808 (#19584) --- doc/source/text.rst | 28 +++++++--- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/strings.py | 92 ++++++++++++++++++++++----------- pandas/tests/test_strings.py | 21 ++++++++ 4 files changed, 105 insertions(+), 37 deletions(-) diff --git a/doc/source/text.rst b/doc/source/text.rst index 1e620acb1f88a..da8e40892716e 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -118,8 +118,8 @@ i.e., from the end of the string to the beginning of the string: s2.str.rsplit('_', expand=True, n=1) -Methods like ``replace`` and ``findall`` take `regular expressions -<https://docs.python.org/3/library/re.html>`__, too: +``replace`` by default replaces `regular expressions +<https://docs.python.org/3/library/re.html>`__: .. ipython:: python @@ -146,12 +146,25 @@ following code will cause trouble because of the regular expression meaning of # We need to escape the special character (for >1 len patterns) dollars.str.replace(r'-\$', '-') +.. versionadded:: 0.23.0 + +If you do want literal replacement of a string (equivalent to +:meth:`str.replace`), you can set the optional ``regex`` parameter to +``False``, rather than escaping each character. In this case both ``pat`` +and ``repl`` must be strings: + +.. ipython:: python + + # These lines are equivalent + dollars.str.replace(r'-\$', '-') + dollars.str.replace('-$', '-', regex=False) + +.. versionadded:: 0.20.0 + The ``replace`` method can also take a callable as replacement. It is called on every ``pat`` using :func:`re.sub`. The callable should expect one positional argument (a regex object) and return a string. -.. versionadded:: 0.20.0 - .. ipython:: python # Reverse every lowercase alphabetic word @@ -164,12 +177,12 @@ positional argument (a regex object) and return a string. repl = lambda m: m.group('two').swapcase() pd.Series(['Foo Bar Baz', np.nan]).str.replace(pat, repl) +.. 
versionadded:: 0.20.0 + The ``replace`` method also accepts a compiled regular expression object from :func:`re.compile` as a pattern. All flags should be included in the compiled regular expression object. -.. versionadded:: 0.20.0 - .. ipython:: python import re @@ -186,6 +199,7 @@ regular expression object will raise a ``ValueError``. --------------------------------------------------------------------------- ValueError: case and flags cannot be set when pat is a compiled regex + Indexing with ``.str`` ---------------------- @@ -432,7 +446,7 @@ Method Summary :meth:`~Series.str.join`;Join strings in each element of the Series with passed separator :meth:`~Series.str.get_dummies`;Split strings on the delimiter returning DataFrame of dummy variables :meth:`~Series.str.contains`;Return boolean array if each string contains pattern/regex - :meth:`~Series.str.replace`;Replace occurrences of pattern/regex with some other string or the return value of a callable given the occurrence + :meth:`~Series.str.replace`;Replace occurrences of pattern/regex/string with some other string or the return value of a callable given the occurrence :meth:`~Series.str.repeat`;Duplicate values (``s.str.repeat(3)`` equivalent to ``x * 3``) :meth:`~Series.str.pad`;"Add whitespace to left, right, or both sides of strings" :meth:`~Series.str.center`;Equivalent to ``str.center`` diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 6865428c352c1..542e62aa374be 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -620,6 +620,7 @@ Other API Changes - Set operations (union, difference...) on :class:`IntervalIndex` with incompatible index types will now raise a ``TypeError`` rather than a ``ValueError`` (:issue:`19329`) - :class:`DateOffset` objects render more simply, e.g. ``<DateOffset: days=1>`` instead of ``<DateOffset: kwds={'days': 1}>`` (:issue:`19403`) - ``Categorical.fillna`` now validates its ``value`` and ``method`` keyword arguments. 
It now raises when both or none are specified, matching the behavior of :meth:`Series.fillna` (:issue:`19682`) +- :func:`Series.str.replace` now takes an optional `regex` keyword which, when set to ``False``, uses literal string replacement rather than regex replacement (:issue:`16808`) .. _whatsnew_0230.deprecations: diff --git a/pandas/core/strings.py b/pandas/core/strings.py index ce688f8b16fe5..6b427ed1da834 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -306,7 +306,7 @@ def str_endswith(arr, pat, na=np.nan): return _na_map(f, arr, na, dtype=bool) -def str_replace(arr, pat, repl, n=-1, case=None, flags=0): +def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True): r""" Replace occurrences of pattern/regex in the Series/Index with some other string. Equivalent to :meth:`str.replace` or @@ -337,25 +337,50 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0): flags : int, default 0 (no flags) - re module flags, e.g. re.IGNORECASE - Cannot be set if `pat` is a compiled regex + regex : boolean, default True + - If True, assumes the passed-in pattern is a regular expression. + - If False, treats the pattern as a literal string + - Cannot be set to False if `pat` is a compiled regex or `repl` is + a callable. + + .. versionadded:: 0.23.0 Returns ------- replaced : Series/Index of objects + Raises + ------ + ValueError + * if `regex` is False and `repl` is a callable or `pat` is a compiled + regex + * if `pat` is a compiled regex and `case` or `flags` is set + Notes ----- When `pat` is a compiled regex, all flags should be included in the - compiled regex. Use of `case` or `flags` with a compiled regex will - raise an error. + compiled regex. Use of `case`, `flags`, or `regex=False` with a compiled + regex will raise an error. Examples -------- - When `repl` is a string, every `pat` is replaced as with - :meth:`str.replace`. NaN value(s) in the Series are left as is. 
+ When `pat` is a string and `regex` is True (the default), the given `pat` + is compiled as a regex. When `repl` is a string, it replaces matching + regex patterns as with :meth:`re.sub`. NaN value(s) in the Series are + left as is: + + >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True) + 0 bao + 1 baz + 2 NaN + dtype: object - >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', 'b') - 0 boo - 1 buz + When `pat` is a string and `regex` is False, every `pat` is replaced with + `repl` as with :meth:`str.replace`: + + >>> pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False) + 0 bao + 1 fuz 2 NaN dtype: object @@ -397,6 +422,7 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0): 1 bar 2 NaN dtype: object + """ # Check whether repl is valid (GH 13438, GH 15055) @@ -404,27 +430,33 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0): raise TypeError("repl must be a string or callable") is_compiled_re = is_re(pat) - if is_compiled_re: - if (case is not None) or (flags != 0): - raise ValueError("case and flags cannot be set" - " when pat is a compiled regex") - else: - # not a compiled regex - # set default case - if case is None: - case = True - - # add case flag, if provided - if case is False: - flags |= re.IGNORECASE - - use_re = is_compiled_re or len(pat) > 1 or flags or callable(repl) - - if use_re: - n = n if n >= 0 else 0 - regex = re.compile(pat, flags=flags) - f = lambda x: regex.sub(repl=repl, string=x, count=n) + if regex: + if is_compiled_re: + if (case is not None) or (flags != 0): + raise ValueError("case and flags cannot be set" + " when pat is a compiled regex") + else: + # not a compiled regex + # set default case + if case is None: + case = True + + # add case flag, if provided + if case is False: + flags |= re.IGNORECASE + if is_compiled_re or len(pat) > 1 or flags or callable(repl): + n = n if n >= 0 else 0 + compiled = re.compile(pat, flags=flags) + f = lambda x: compiled.sub(repl=repl, 
string=x, count=n) + else: + f = lambda x: x.replace(pat, repl, n) else: + if is_compiled_re: + raise ValueError("Cannot use a compiled regex as replacement " + "pattern with regex=False") + if callable(repl): + raise ValueError("Cannot use a callable replacement when " + "regex=False") f = lambda x: x.replace(pat, repl, n) return _na_map(f, arr) @@ -1596,9 +1628,9 @@ def match(self, pat, case=True, flags=0, na=np.nan, as_indexer=None): return self._wrap_result(result) @copy(str_replace) - def replace(self, pat, repl, n=-1, case=None, flags=0): + def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): result = str_replace(self._data, pat, repl, n=n, case=case, - flags=flags) + flags=flags, regex=regex) return self._wrap_result(result) @copy(str_repeat) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 178c5ff655b04..a878d6ed7b052 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -530,6 +530,27 @@ def test_replace_compiled_regex(self): exp = Series(['foObaD__baRbaD', NA]) tm.assert_series_equal(result, exp) + def test_replace_literal(self): + # GH16808 literal replace (regex=False vs regex=True) + values = Series(['f.o', 'foo', NA]) + exp = Series(['bao', 'bao', NA]) + result = values.str.replace('f.', 'ba') + tm.assert_series_equal(result, exp) + + exp = Series(['bao', 'foo', NA]) + result = values.str.replace('f.', 'ba', regex=False) + tm.assert_series_equal(result, exp) + + # Cannot do a literal replace if given a callable repl or compiled + # pattern + callable_repl = lambda m: m.group(0).swapcase() + compiled_pat = re.compile('[a-z][A-Z]{2}') + + pytest.raises(ValueError, values.str.replace, 'abc', callable_repl, + regex=False) + pytest.raises(ValueError, values.str.replace, compiled_pat, '', + regex=False) + def test_repeat(self): values = Series(['a', 'b', NA, 'c', NA, 'd'])