cleaned up regex function by introducing constants for patterns #143

Draft: wants to merge 20 commits into base: master.
Changes from 16 commits.
40 changes: 40 additions & 0 deletions tests/test_preprocessing.py
@@ -114,6 +114,46 @@ def test_pipeline_stopwords(self):
pipeline = [preprocessing.lowercase, preprocessing.remove_stopwords]
self.assertEqual(preprocessing.clean(s, pipeline=pipeline), s_true)

"""
Test clean
"""

def _get_default_clean_pipeline(self):
"""
Return a list containing all the functions used in the default cleaning pipeline.

Return a list with the following functions:
1. :meth:`texthero.preprocessing.fillna`
2. :meth:`texthero.preprocessing.lowercase`
3. :meth:`texthero.preprocessing.remove_digits`
4. :meth:`texthero.preprocessing.remove_punctuation`
5. :meth:`texthero.preprocessing.remove_diacritics`
6. :meth:`texthero.preprocessing.remove_stopwords`
7. :meth:`texthero.preprocessing.remove_whitespace`
"""

return [
preprocessing.fillna,
preprocessing.lowercase,
preprocessing.remove_digits,
preprocessing.remove_punctuation,
preprocessing.remove_diacritics,
preprocessing.remove_stopwords,
preprocessing.remove_whitespace,
]

def test_clean(self):
s = pd.Series(
["This serös 42 should bE CLeaned.! I am a stopword \n", np.NAN]
)
s_true = pd.Series(
["This serös 42 should bE CLeaned.! I am a stopword \n", np.NAN]
)
self.assertEqual(
preprocessing.clean(s),
preprocessing.clean(s_true, self._get_default_clean_pipeline()),
)
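In other words, a rough standalone sketch (assuming texthero with the clean pipeline shown in this diff is installed) of what the test asserts: calling clean with no pipeline argument is equivalent to passing the default pipeline explicitly.

import numpy as np
import pandas as pd
from texthero import preprocessing

s = pd.Series(["This serös 42 should bE CLeaned.! I am a stopword \n", np.nan])

default_pipeline = [
    preprocessing.fillna,
    preprocessing.lowercase,
    preprocessing.remove_digits,
    preprocessing.remove_punctuation,
    preprocessing.remove_diacritics,
    preprocessing.remove_stopwords,
    preprocessing.remove_whitespace,
]

# Both calls should produce the same cleaned Series.
pd.testing.assert_series_equal(
    preprocessing.clean(s),
    preprocessing.clean(s, pipeline=default_pipeline),
)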

"""
Test stopwords.
"""
86 changes: 55 additions & 31 deletions texthero/preprocessing.py
@@ -17,6 +17,44 @@

from typing import List, Callable

# REGEX pattern constants
DIGITS_BLOCK = r"\b\d+\b"
PUNCTUATION = rf"([{string.punctuation}])+"
STOPWORD_TOKENIZER = r"""(?x) # Set flag to allow verbose regexps
\w+(?:-\w+)* # Words with optional internal hyphens
| \s* # Any space
| [][!"#$%&'*+,-./:;<=>?@\\^():_`{|}~] # Any symbol
"""
ROUND_BRACKETS = r"\([^()]*\)"
CURLY_BRACKETS = r"\{[^{}]*\}"
SQUARE_BRACKETS = r"\[[^\[\]]*\]"
ANGLE_BRACKETS = r"<[^<>]*>"
HTML_TAG = r"""(?x) # Turn on free-spacing
<[^>]+> # Remove <html> tags
| &([a-z0-9]+|\#[0-9]{1,6}|\#x[0-9a-f]{1,6}); # Remove &nbsp;
"""

URLS = r"http\S+"
TAGS = r"@[a-zA-Z0-9]+"
HASHTAGS = r"#[a-zA-Z0-9_]+"
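As a quick illustration of what a few of these constants match (a standalone sketch using re directly; the sample strings below are made up and not part of the PR):

import re
import string

# Constants copied from this diff.
DIGITS_BLOCK = r"\b\d+\b"
PUNCTUATION = rf"([{string.punctuation}])+"
URLS = r"http\S+"

# Only standalone digit blocks match; digits glued to letters are left alone.
print(re.sub(DIGITS_BLOCK, "*", "42 abc123"))              # * abc123
# A run of punctuation characters collapses into a single replacement symbol.
print(re.sub(PUNCTUATION, "*", "Wait... what?!"))          # Wait* what*
# Everything from "http" up to the next whitespace is treated as a URL.
print(re.sub(URLS, "*", "see https://example.com now"))    # see * now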


def _get_pattern_for_tokenisation(punct: str) -> str:
"""
Return the standard tokenisation pattern.

The standard tokenisation separates all "regex words" ('\w') from each other and
puts each punctuation character into a token of its own.

Parameters
----------
punct : String
the punctuation characters that should become their own tokens; all '_' characters
should already have been removed from this string, as '\w' in regex already includes the underscore
"""
return rf"((\w)([{punct}])(?:\B|$)|(?:^|\B)([{punct}])(\w))"
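For illustration, a minimal standalone sketch (using re directly rather than the pandas Series API, with a made-up sentence) of what the pattern built here does: it inserts spaces around punctuation that touches a word, so that a later split yields separate word and punctuation tokens.

import re
import string

punct = string.punctuation.replace("_", "")
# Same pattern that _get_pattern_for_tokenisation(punct) returns.
pattern = rf"((\w)([{punct}])(?:\B|$)|(?:^|\B)([{punct}])(\w))"

text = "Hello, world! (texthero)"
# Unmatched groups expand to "" in re.sub (Python 3.5+), so only the matched
# word character and punctuation character get a space inserted between them.
tokens = re.sub(pattern, r"\2 \3 \4 \5", text).split()
print(tokens)  # ['Hello', ',', 'world', '!', '(', 'texthero', ')']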


# Ignore gensim annoying warnings
import warnings

@@ -91,8 +129,7 @@ def replace_digits(s: pd.Series, symbols: str = " ", only_blocks=True) -> pd.Ser
"""

if only_blocks:
pattern = r"\b\d+\b"
return s.str.replace(pattern, symbols)
return s.str.replace(DIGITS_BLOCK, symbols)
else:
return s.str.replace(r"\d+", symbols)

@@ -157,7 +194,7 @@ def replace_punctuation(s: pd.Series, symbol: str = " ") -> pd.Series:
dtype: object
"""

return s.str.replace(rf"([{string.punctuation}])+", symbol)
return s.str.replace(PUNCTUATION, symbol)


def remove_punctuation(s: pd.Series) -> pd.Series:
@@ -266,13 +303,9 @@ def _replace_stopwords(text: str, words: Set[str], symbol: str = " ") -> str:

"""

pattern = r"""(?x) # Set flag to allow verbose regexps
\w+(?:-\w+)* # Words with optional internal hyphens
| \s* # Any space
| [][!"#$%&'*+,-./:;<=>?@\\^():_`{|}~] # Any symbol
"""

return "".join(t if t not in words else symbol for t in re.findall(pattern, text))
return "".join(
t if t not in words else symbol for t in re.findall(STOPWORD_TOKENIZER, text)
)
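For context, a small standalone sketch of the tokenizer-based replacement (using re directly; the stopword set below is a made-up illustration, not the default set texthero loads): because whitespace runs are themselves tokens, the join reconstructs the original spacing and only the stopword tokens are swapped out.

import re

STOPWORD_TOKENIZER = r"""(?x)                    # verbose regex
\w+(?:-\w+)*                                     # words with optional internal hyphens
| \s*                                            # any whitespace run
| [][!"#$%&'*+,-./:;<=>?@\\^():_`{|}~]           # any single symbol
"""

words = {"the", "of"}  # hypothetical stopword set, for illustration only
text = "the book of the jungle"

tokens = re.findall(STOPWORD_TOKENIZER, text)
print("".join(t if t not in words else "X" for t in tokens))  # X book X X jungle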


def replace_stopwords(
@@ -525,7 +558,7 @@ def remove_round_brackets(s: pd.Series) -> pd.Series:
:meth:`remove_square_brackets`

"""
return s.str.replace(r"\([^()]*\)", "")
return s.str.replace(ROUND_BRACKETS, "")


def remove_curly_brackets(s: pd.Series) -> pd.Series:
@@ -549,7 +582,7 @@ def remove_curly_brackets(s: pd.Series) -> pd.Series:
:meth:`remove_square_brackets`

"""
return s.str.replace(r"\{[^{}]*\}", "")
return s.str.replace(CURLY_BRACKETS, "")


def remove_square_brackets(s: pd.Series) -> pd.Series:
Expand All @@ -574,7 +607,7 @@ def remove_square_brackets(s: pd.Series) -> pd.Series:


"""
return s.str.replace(r"\[[^\[\]]*\]", "")
return s.str.replace(SQUARE_BRACKETS, "")


def remove_angle_brackets(s: pd.Series) -> pd.Series:
Expand All @@ -598,7 +631,7 @@ def remove_angle_brackets(s: pd.Series) -> pd.Series:
:meth:`remove_square_brackets`

"""
return s.str.replace(r"<[^<>]*>", "")
return s.str.replace(ANGLE_BRACKETS, "")
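A quick standalone check of the bracket patterns (made-up samples, using re directly): each pattern removes the brackets together with their content.

import re

ROUND_BRACKETS = r"\([^()]*\)"
SQUARE_BRACKETS = r"\[[^\[\]]*\]"

print(re.sub(ROUND_BRACKETS, "", "Texthero (is great)"))   # 'Texthero ' (trailing space remains)
print(re.sub(SQUARE_BRACKETS, "", "Texthero [is great]"))  # 'Texthero '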


def remove_brackets(s: pd.Series) -> pd.Series:
Expand Down Expand Up @@ -651,12 +684,7 @@ def remove_html_tags(s: pd.Series) -> pd.Series:

"""

pattern = r"""(?x) # Turn on free-spacing
<[^>]+> # Remove <html> tags
| &([a-z0-9]+|\#[0-9]{1,6}|\#x[0-9a-f]{1,6}); # Remove &nbsp;
"""

return s.str.replace(pattern, "")
return s.str.replace(HTML_TAG, "")
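A brief standalone sketch of what the HTML_TAG pattern strips (using re directly; the sample string is made up):

import re

HTML_TAG = r"""(?x)                                # free-spacing mode
<[^>]+>                                            # <html>-style tags
| &([a-z0-9]+|\#[0-9]{1,6}|\#x[0-9a-f]{1,6});      # entities such as &nbsp;
"""

print(re.sub(HTML_TAG, "", "<b>hello</b>&nbsp;world"))  # helloworld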


def tokenize(s: pd.Series) -> pd.Series:
Expand All @@ -680,12 +708,12 @@ def tokenize(s: pd.Series) -> pd.Series:

"""

punct = string.punctuation.replace("_", "")
# In regex, the metacharacter '\w' matches "a-z, A-Z, 0-9 and the _ (underscore) character". We therefore remove '_' from the punctuation string, as it is already covered by '\w'.
punct = string.punctuation.replace("_", "")

pattern = rf"((\w)([{punct}])(?:\B|$)|(?:^|\B)([{punct}])(\w))"

return s.str.replace(pattern, r"\2 \3 \4 \5").str.split()
return s.str.replace(
_get_pattern_for_tokenisation(punct), r"\2 \3 \4 \5"
).str.split()


# Warning message for not-tokenized inputs
@@ -775,9 +803,7 @@ def replace_urls(s: pd.Series, symbol: str) -> pd.Series:

"""

pattern = r"http\S+"

return s.str.replace(pattern, symbol)
return s.str.replace(URLS, symbol)


def remove_urls(s: pd.Series) -> pd.Series:
Expand Down Expand Up @@ -826,8 +852,7 @@ def replace_tags(s: pd.Series, symbol: str) -> pd.Series:

"""

pattern = r"@[a-zA-Z0-9]+"
return s.str.replace(pattern, symbol)
return s.str.replace(TAGS, symbol)


def remove_tags(s: pd.Series) -> pd.Series:
Expand Down Expand Up @@ -873,8 +898,7 @@ def replace_hashtags(s: pd.Series, symbol: str) -> pd.Series:
dtype: object

"""
pattern = r"#[a-zA-Z0-9_]+"
return s.str.replace(pattern, symbol)
return s.str.replace(HASHTAGS, symbol)
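Similarly, a small standalone sketch of the tag and hashtag patterns (made-up samples; note that TAGS does not include '_' while HASHTAGS does):

import re

TAGS = r"@[a-zA-Z0-9]+"
HASHTAGS = r"#[a-zA-Z0-9_]+"

print(re.sub(TAGS, "@user", "Hi @texthero123, nice lib"))   # Hi @user, nice lib
print(re.sub(HASHTAGS, "#tag", "Trending: #texthero_123"))  # Trending: #tag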


def remove_hashtags(s: pd.Series) -> pd.Series: