From 3ab14fd72348fed1b7c5331d024c78f59d37f85f Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Sun, 26 Jul 2020 22:49:27 +0200 Subject: [PATCH 01/17] Speed up default clean function Sped up the default clean function by rewriting it as a single function that operates directly on strings. Co-authored-by: Henri Froese --- texthero/preprocessing.py | 58 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index 506a9be9..d90b9826 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -458,13 +458,69 @@ def clean(s: pd.Series, pipeline=None) -> pd.Series: """ if not pipeline: - pipeline = get_default_pipeline() + return _optimised_default_clean(s) for f in pipeline: s = s.pipe(f) return s +def _optimised_default_clean(s: pd.Series) -> pd.Series: + """ + Applies the default clean pipeline in an optimised way to a series, + that is about 30% faster. + + Default pipeline: + 1. :meth:`texthero.preprocessing.fillna` + 2. :meth:`texthero.preprocessing.lowercase` + 3. :meth:`texthero.preprocessing.remove_digits` + 4. :meth:`texthero.preprocessing.remove_punctuation` + 5. :meth:`texthero.preprocessing.remove_diacritics` + 6. :meth:`texthero.preprocessing.remove_stopwords` + 7. :meth:`texthero.preprocessing.remove_whitespace` + """ + return s.apply(_optimised_default_clean_single_cell) + + +def _optimised_default_clean_single_cell(text: str) -> str: + """ + Applies the default clean pipeline to one cell. + + Default pipeline: + 1. :meth:`texthero.preprocessing.fillna` + 2. :meth:`texthero.preprocessing.lowercase` + 3. :meth:`texthero.preprocessing.remove_digits` + 4. :meth:`texthero.preprocessing.remove_punctuation` + 5. :meth:`texthero.preprocessing.remove_diacritics` + 6. :meth:`texthero.preprocessing.remove_stopwords` + 7. :meth:`texthero.preprocessing.remove_whitespace` + """ + + # fillna + if pd.isna(text): + return "" + + # lowercase + text = text.lower() + + # remove digits and punctuation + pattern_digits_remove = r"\b\d+\b" + pattern_punctuation_remove = rf"([{string.punctuation}])+" + pattern_mixed_remove = pattern_digits_remove + "|" + pattern_punctuation_remove + text = re.sub(pattern_mixed_remove, "", text) + + # remove diacritics + text = _remove_diacritics(text) + + # remove stopwords + text = _replace_stopwords(text, _stopwords.DEFAULT, "") + + # remove whitespace + text = " ".join(re.sub("\xa0", " ", text).split()) + + return text + + def has_content(s: pd.Series) -> pd.Series: r""" Return a Boolean Pandas Series indicating if the rows have content.
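A rough benchmark sketch of the speed-up PATCH 01/17 claims. The sample corpus, repetition count and timing setup below are illustrative and not part of the patch; clean and get_default_pipeline are the texthero functions visible in the diff above.

import timeit

import pandas as pd
from texthero import preprocessing

# Made-up corpus that exercises every step of the default pipeline.
s = pd.Series(["This serös 42 should bE CLeaned.! I am a stopword \n"] * 10_000)

# Default path: the new single-pass _optimised_default_clean.
single_pass = timeit.timeit(lambda: preprocessing.clean(s), number=3)

# Explicit pipeline path: the old Series.pipe chain.
pipe_based = timeit.timeit(
    lambda: preprocessing.clean(s, pipeline=preprocessing.get_default_pipeline()),
    number=3,
)

print(f"single-pass: {single_pass:.2f}s  pipe-based: {pipe_based:.2f}s")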
From e0f02c53279491beb8309c62f492fb6b58320837 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Mon, 27 Jul 2020 19:18:16 +0200 Subject: [PATCH 02/17] Regex pattern to constancs removed the regex pattern from the functions and placed them in an constant above Co-authored-by: Henri Froese --- texthero/preprocessing.py | 81 +++++++++++++++++++++++---------------- 1 file changed, 47 insertions(+), 34 deletions(-) diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index d90b9826..6a84e888 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -17,6 +17,34 @@ from typing import List, Callable +# REGEX pattern constants +PATTERN_REMOVE_DIGITS_BLOCK = r"\b\d+\b" +PATTERN_REMOVE_PUNCTUATION = rf"([{string.punctuation}])+" +PATTERN_STOPWORD_TOKENIZER = r"""(?x) # Set flag to allow verbose regexps + \w+(?:-\w+)* # Words with optional internal hyphens + | \s* # Any space + | [][!"#$%&'*+,-./:;<=>?@\\^():_`{|}~] # Any symbol + """ +PATTERN_REMOVE_ROUND_BRACKETS = r"\([^()]*\)" +PATERN_REMOVE_CURLY_BRACKETS = r"\{[^{}]*\}" +PATTERN_REMOVE_SQUARE_BRACKETS = r"\[[^\[\]]*\]" +PATTERN_REMOVE_ANGLE_BRACKETS = r"<[^<>]*>" +PATTERN_REMOVE_HTML_TAG = r"""(?x) # Turn on free-spacing + <[^>]+> # Remove tags + | &([a-z0-9]+|\#[0-9]{1,6}|\#x[0-9a-f]{1,6}); # Remove   + """ + + +def GET_PATTERN_TOKENIZATION(punct: str) -> str: + """ + Returns the standart tokenisation pattern + """ + return rf"((\w)([{punct}])(?:\B|$)|(?:^|\B)([{punct}])(\w))" + + +PATTERN_REPLACE_URLS = r"http\S+" +PATTERN_REPLACE_TAGS = r"@[a-zA-Z0-9]+" +PATTERN_REPLACE_HASHTAGS = r"#[a-zA-Z0-9_]+" # Ignore gensim annoying warnings import warnings @@ -92,8 +120,7 @@ def replace_digits(s: pd.Series, symbols: str = " ", only_blocks=True) -> pd.Ser """ if only_blocks: - pattern = r"\b\d+\b" - return s.str.replace(pattern, symbols) + return s.str.replace(PATTERN_REMOVE_DIGITS_BLOCK, symbols) else: return s.str.replace(r"\d+", symbols) @@ -158,7 +185,7 @@ def replace_punctuation(s: pd.Series, symbol: str = " ") -> pd.Series: dtype: object """ - return s.str.replace(rf"([{string.punctuation}])+", symbol) + return s.str.replace(PATTERN_REMOVE_PUNCTUATION, symbol) def remove_punctuation(s: pd.Series) -> pd.Series: @@ -267,13 +294,10 @@ def _replace_stopwords(text: str, words: Set[str], symbol: str = " ") -> str: """ - pattern = r"""(?x) # Set flag to allow verbose regexps - \w+(?:-\w+)* # Words with optional internal hyphens - | \s* # Any space - | [][!"#$%&'*+,-./:;<=>?@\\^():_`{|}~] # Any symbol - """ - - return "".join(t if t not in words else symbol for t in re.findall(pattern, text)) + return "".join( + t if t not in words else symbol + for t in re.findall(PATTERN_STOPWORD_TOKENIZER, text) + ) def replace_stopwords( @@ -504,9 +528,9 @@ def _optimised_default_clean_single_cell(text: str) -> str: text = text.lower() # remove digits and punctuation - pattern_digits_remove = r"\b\d+\b" - pattern_punctuation_remove = rf"([{string.punctuation}])+" - pattern_mixed_remove = pattern_digits_remove + "|" + pattern_punctuation_remove + pattern_mixed_remove = ( + PATTERN_REMOVE_DIGITS_BLOCK + "|" + PATTERN_REMOVE_PUNCTUATION + ) text = re.sub(pattern_mixed_remove, "", text) # remove diacritics @@ -582,7 +606,7 @@ def remove_round_brackets(s: pd.Series) -> pd.Series: :meth:`remove_square_brackets` """ - return s.str.replace(r"\([^()]*\)", "") + return s.str.replace(PATTERN_REMOVE_ROUND_BRACKETS, "") def remove_curly_brackets(s: pd.Series) -> pd.Series: @@ -606,7 +630,7 @@ def remove_curly_brackets(s: pd.Series) -> pd.Series: 
:meth:`remove_square_brackets` """ - return s.str.replace(r"\{[^{}]*\}", "") + return s.str.replace(PATERN_REMOVE_CURLY_BRACKETS, "") def remove_square_brackets(s: pd.Series) -> pd.Series: @@ -631,7 +655,7 @@ def remove_square_brackets(s: pd.Series) -> pd.Series: """ - return s.str.replace(r"\[[^\[\]]*\]", "") + return s.str.replace(PATTERN_REMOVE_SQUARE_BRACKETS, "") def remove_angle_brackets(s: pd.Series) -> pd.Series: @@ -655,7 +679,7 @@ def remove_angle_brackets(s: pd.Series) -> pd.Series: :meth:`remove_square_brackets` """ - return s.str.replace(r"<[^<>]*>", "") + return s.str.replace(PATTERN_REMOVE_ANGLE_BRACKETS, "") def remove_brackets(s: pd.Series) -> pd.Series: @@ -708,12 +732,7 @@ def remove_html_tags(s: pd.Series) -> pd.Series: """ - pattern = r"""(?x) # Turn on free-spacing - <[^>]+> # Remove tags - | &([a-z0-9]+|\#[0-9]{1,6}|\#x[0-9a-f]{1,6}); # Remove   - """ - - return s.str.replace(pattern, "") + return s.str.replace(PATTERN_REMOVE_HTML_TAG, "") def tokenize(s: pd.Series) -> pd.Series: @@ -737,12 +756,10 @@ def tokenize(s: pd.Series) -> pd.Series: """ - punct = string.punctuation.replace("_", "") # In regex, the metacharacter 'w' is "a-z, A-Z, 0-9, including the _ (underscore) character." We therefore remove it from the punctuation string as this is already included in \w + punct = string.punctuation.replace("_", "") - pattern = rf"((\w)([{punct}])(?:\B|$)|(?:^|\B)([{punct}])(\w))" - - return s.str.replace(pattern, r"\2 \3 \4 \5").str.split() + return s.str.replace(GET_PATTERN_TOKENIZATION(punct), r"\2 \3 \4 \5").str.split() def tokenize_with_phrases( @@ -818,9 +835,7 @@ def replace_urls(s: pd.Series, symbol: str) -> pd.Series: """ - pattern = r"http\S+" - - return s.str.replace(pattern, symbol) + return s.str.replace(PATTERN_REPLACE_URLS, symbol) def remove_urls(s: pd.Series) -> pd.Series: @@ -869,8 +884,7 @@ def replace_tags(s: pd.Series, symbol: str) -> pd.Series: """ - pattern = r"@[a-zA-Z0-9]+" - return s.str.replace(pattern, symbol) + return s.str.replace(PATTERN_REPLACE_TAGS, symbol) def remove_tags(s: pd.Series) -> pd.Series: @@ -916,8 +930,7 @@ def replace_hashtags(s: pd.Series, symbol: str) -> pd.Series: dtype: object """ - pattern = r"#[a-zA-Z0-9_]+" - return s.str.replace(pattern, symbol) + return s.str.replace(PATTERN_REPLACE_HASHTAGS, symbol) def remove_hashtags(s: pd.Series) -> pd.Series: From 2ea3caf00bc8742ecf1db9b1d5e687683e936b8f Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Mon, 27 Jul 2020 19:34:59 +0200 Subject: [PATCH 03/17] changed clean function docstring changed Docstring Co-authored-by: Henri Froese --- texthero/preprocessing.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index 6a84e888..5e8056be 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -451,7 +451,12 @@ def get_default_pipeline() -> List[Callable[[pd.Series], pd.Series]]: def clean(s: pd.Series, pipeline=None) -> pd.Series: """ - Pre-process a text-based Pandas Series, by using the following default pipline. + Pre-process a text-based Pandas Series. + + There are two options to use this function. You can either use this function, buy not specifiying an pipeline. + In this case the clean function will use a default pipeline, which was hardcoded, to gain 30% performance improvements, + over the "pipe" method. + If you specify your own cleaning pipeline, the clean function will use this one instead. Default pipeline: 1. 
:meth:`texthero.preprocessing.fillna` @@ -462,6 +467,7 @@ def clean(s: pd.Series, pipeline=None) -> pd.Series: 6. :meth:`texthero.preprocessing.remove_stopwords` 7. :meth:`texthero.preprocessing.remove_whitespace` + Parameters ---------- s : Pandas Series From 697a229110bc363549394523f26eef6a556e2fe5 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Mon, 27 Jul 2020 19:50:11 +0200 Subject: [PATCH 04/17] removed default pipeline for cleaning --- texthero/preprocessing.py | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index 5e8056be..a5da42df 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -425,29 +425,6 @@ def _stem(text): return s.str.split().apply(_stem) -def get_default_pipeline() -> List[Callable[[pd.Series], pd.Series]]: - """ - Return a list contaning all the methods used in the default cleaning pipeline. - - Return a list with the following functions: - 1. :meth:`texthero.preprocessing.fillna` - 2. :meth:`texthero.preprocessing.lowercase` - 3. :meth:`texthero.preprocessing.remove_digits` - 4. :meth:`texthero.preprocessing.remove_punctuation` - 5. :meth:`texthero.preprocessing.remove_diacritics` - 6. :meth:`texthero.preprocessing.remove_stopwords` - 7. :meth:`texthero.preprocessing.remove_whitespace` - """ - return [ - fillna, - lowercase, - remove_digits, - remove_punctuation, - remove_diacritics, - remove_stopwords, - remove_whitespace, - ] - def clean(s: pd.Series, pipeline=None) -> pd.Series: """ From 4bb9860c3618086f3eb7ddc22f8638acf5752f76 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Mon, 27 Jul 2020 19:50:28 +0200 Subject: [PATCH 05/17] added unittest for the clean function --- tests/test_preprocessing.py | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index f661a816..064a355c 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -7,7 +7,6 @@ from texthero import preprocessing, stopwords from . import PandasTestCase - """ Test doctest """ @@ -113,6 +112,39 @@ def test_pipeline_stopwords(self): pipeline = [preprocessing.lowercase, preprocessing.remove_stopwords] self.assertEqual(preprocessing.clean(s, pipeline=pipeline), s_true) + """ + Test clean + """ + + def _get_default_clean_pipeline(self): + """ + Return a list contaning all the methods used in the default cleaning pipeline. + + Return a list with the following functions: + 1. :meth:`texthero.preprocessing.fillna` + 2. :meth:`texthero.preprocessing.lowercase` + 3. :meth:`texthero.preprocessing.remove_digits` + 4. :meth:`texthero.preprocessing.remove_punctuation` + 5. :meth:`texthero.preprocessing.remove_diacritics` + 6. :meth:`texthero.preprocessing.remove_stopwords` + 7. :meth:`texthero.preprocessing.remove_whitespace` + """ + + return [ + preprocessing.fillna, + preprocessing.lowercase, + preprocessing.remove_digits, + preprocessing.remove_punctuation, + preprocessing.remove_diacritics, + preprocessing.remove_stopwords, + preprocessing.remove_whitespace, + ] + + def test_clean(self): + s = pd.Series(["This serös 42 should bE CLeaned.! I am a stopword \n", np.NAN]) + s_true = pd.Series(["This serös 42 should bE CLeaned.! I am a stopword \n", np.NAN]) + self.assertEqual(preprocessing.clean(s), preprocessing.clean(s_true,self._get_default_clean_pipeline())) + """ Test stopwords. 
""" From 57c37c1e46826a5e33a3402f50ee46bb811e2106 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Mon, 27 Jul 2020 20:14:47 +0200 Subject: [PATCH 06/17] format file --- texthero/preprocessing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index a5da42df..62376336 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -425,7 +425,6 @@ def _stem(text): return s.str.split().apply(_stem) - def clean(s: pd.Series, pipeline=None) -> pd.Series: """ Pre-process a text-based Pandas Series. From db4934e820878d07ec5de93c2c19ebcf7ce7c1a4 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Mon, 27 Jul 2020 20:15:01 +0200 Subject: [PATCH 07/17] added unit test for clean function --- tests/test_preprocessing.py | 71 +++++++++++++++++++++++++++++++++++-- 1 file changed, 68 insertions(+), 3 deletions(-) diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index 064a355c..e0e168b5 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -141,9 +141,74 @@ def _get_default_clean_pipeline(self): ] def test_clean(self): - s = pd.Series(["This serös 42 should bE CLeaned.! I am a stopword \n", np.NAN]) - s_true = pd.Series(["This serös 42 should bE CLeaned.! I am a stopword \n", np.NAN]) - self.assertEqual(preprocessing.clean(s), preprocessing.clean(s_true,self._get_default_clean_pipeline())) + s = pd.Series( + ["This serös 42 should bE CLeaned.! I am a stopword \n", np.NAN] + ) + s_true = pd.Series( + ["This serös 42 should bE CLeaned.! I am a stopword \n", np.NAN] + ) + self.assertEqual( + preprocessing.clean(s), + preprocessing.clean(s_true, self._get_default_clean_pipeline()), + ) + + def test_clean_fillna(self): + s = pd.Series(np.NaN) + s_true = pd.Series(np.NaN) + self.assertEqual( + preprocessing.clean(s), + preprocessing.clean(s_true, self._get_default_clean_pipeline()), + ) + + def test_clean_lowercase(self): + s = pd.Series("this text Is MiXed CasE") + s_true = pd.Series("this text Is MiXed CasE") + self.assertEqual( + preprocessing.clean(s), + preprocessing.clean(s_true, self._get_default_clean_pipeline()), + ) + + def test_clean_digits(self): + s = pd.Series("Here are 42 digits blocks 89") + s_true = pd.Series("Here are 42 digits blocks 89") + self.assertEqual( + preprocessing.clean(s), + preprocessing.clean(s_true, self._get_default_clean_pipeline()), + ) + + def test_clean_punctuation(self): + s = pd.Series("Some. wired, punctiation;.:!!!!") + s_true = pd.Series("Some. wired, punctiation;.:!!!") + self.assertEqual( + preprocessing.clean(s), + preprocessing.clean(s_true, self._get_default_clean_pipeline()), + ) + + def test_clean_diacritics(self): + s = pd.Series("Montréal, über, 12.89, Mère, Françoise, noël, 889, اِس, اُس") + s_true = pd.Series( + "Montréal, über, 12.89, Mère, Françoise, noël, 889, اِس, اُس" + ) + self.assertEqual( + preprocessing.clean(s), + preprocessing.clean(s_true, self._get_default_clean_pipeline()), + ) + + def test_clean_stopwords(self): + s = pd.Series("some stopwords are here\nAnd on") + s_true = pd.Series("some stopwords are here\nAnd on") + self.assertEqual( + preprocessing.clean(s), + preprocessing.clean(s_true, self._get_default_clean_pipeline()), + ) + + def test_clean_whitespaces(self): + s = pd.Series("hello world hello world \n there ") + s_true = pd.Series("hello world hello world \n there ") + self.assertEqual( + preprocessing.clean(s), + preprocessing.clean(s_true, self._get_default_clean_pipeline()), + ) """ Test stopwords. 
From 58874859109b33af80951847cb84748438c0deb4 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Wed, 29 Jul 2020 08:08:45 +0200 Subject: [PATCH 08/17] updated naming schema --- texthero/preprocessing.py | 68 +++++++++++++++++++-------------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index 62376336..424f9f92 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -18,33 +18,33 @@ from typing import List, Callable # REGEX pattern constants -PATTERN_REMOVE_DIGITS_BLOCK = r"\b\d+\b" -PATTERN_REMOVE_PUNCTUATION = rf"([{string.punctuation}])+" -PATTERN_STOPWORD_TOKENIZER = r"""(?x) # Set flag to allow verbose regexps - \w+(?:-\w+)* # Words with optional internal hyphens - | \s* # Any space - | [][!"#$%&'*+,-./:;<=>?@\\^():_`{|}~] # Any symbol - """ -PATTERN_REMOVE_ROUND_BRACKETS = r"\([^()]*\)" -PATERN_REMOVE_CURLY_BRACKETS = r"\{[^{}]*\}" -PATTERN_REMOVE_SQUARE_BRACKETS = r"\[[^\[\]]*\]" -PATTERN_REMOVE_ANGLE_BRACKETS = r"<[^<>]*>" -PATTERN_REMOVE_HTML_TAG = r"""(?x) # Turn on free-spacing - <[^>]+> # Remove tags - | &([a-z0-9]+|\#[0-9]{1,6}|\#x[0-9a-f]{1,6}); # Remove   - """ - - -def GET_PATTERN_TOKENIZATION(punct: str) -> str: +DIGITS_BLOCK = r"\b\d+\b" +PUNCTUATION = rf"([{string.punctuation}])+" +STOPWORD_TOKENIZER = r"""(?x) # Set flag to allow verbose regexps + \w+(?:-\w+)* # Words with optional internal hyphens + | \s* # Any space + | [][!"#$%&'*+,-./:;<=>?@\\^():_`{|}~] # Any symbol + """ +ROUND_BRACKETS = r"\([^()]*\)" +CURLY_BRACKETS = r"\{[^{}]*\}" +SQUARE_BRACKETS = r"\[[^\[\]]*\]" +ANGLE_BRACKETS = r"<[^<>]*>" +HTML_TAG = r"""(?x) # Turn on free-spacing + <[^>]+> # Remove tags + | &([a-z0-9]+|\#[0-9]{1,6}|\#x[0-9a-f]{1,6}); # Remove   + """ + + +def _get_pattern_for_tokenisation(punct: str) -> str: """ Returns the standart tokenisation pattern """ return rf"((\w)([{punct}])(?:\B|$)|(?:^|\B)([{punct}])(\w))" -PATTERN_REPLACE_URLS = r"http\S+" -PATTERN_REPLACE_TAGS = r"@[a-zA-Z0-9]+" -PATTERN_REPLACE_HASHTAGS = r"#[a-zA-Z0-9_]+" +URLS = r"http\S+" +TAGS = r"@[a-zA-Z0-9]+" +HASHTAGS = r"#[a-zA-Z0-9_]+" # Ignore gensim annoying warnings import warnings @@ -120,7 +120,7 @@ def replace_digits(s: pd.Series, symbols: str = " ", only_blocks=True) -> pd.Ser """ if only_blocks: - return s.str.replace(PATTERN_REMOVE_DIGITS_BLOCK, symbols) + return s.str.replace(DIGITS_BLOCK, symbols) else: return s.str.replace(r"\d+", symbols) @@ -185,7 +185,7 @@ def replace_punctuation(s: pd.Series, symbol: str = " ") -> pd.Series: dtype: object """ - return s.str.replace(PATTERN_REMOVE_PUNCTUATION, symbol) + return s.str.replace(PUNCTUATION, symbol) def remove_punctuation(s: pd.Series) -> pd.Series: @@ -296,7 +296,7 @@ def _replace_stopwords(text: str, words: Set[str], symbol: str = " ") -> str: return "".join( t if t not in words else symbol - for t in re.findall(PATTERN_STOPWORD_TOKENIZER, text) + for t in re.findall(STOPWORD_TOKENIZER, text) ) @@ -511,7 +511,7 @@ def _optimised_default_clean_single_cell(text: str) -> str: # remove digits and punctuation pattern_mixed_remove = ( - PATTERN_REMOVE_DIGITS_BLOCK + "|" + PATTERN_REMOVE_PUNCTUATION + DIGITS_BLOCK + "|" + PUNCTUATION ) text = re.sub(pattern_mixed_remove, "", text) @@ -588,7 +588,7 @@ def remove_round_brackets(s: pd.Series) -> pd.Series: :meth:`remove_square_brackets` """ - return s.str.replace(PATTERN_REMOVE_ROUND_BRACKETS, "") + return s.str.replace(ROUND_BRACKETS, "") def remove_curly_brackets(s: pd.Series) -> pd.Series: @@ -612,7 +612,7 @@ 
def remove_curly_brackets(s: pd.Series) -> pd.Series: :meth:`remove_square_brackets` """ - return s.str.replace(PATERN_REMOVE_CURLY_BRACKETS, "") + return s.str.replace(CURLY_BRACKETS, "") def remove_square_brackets(s: pd.Series) -> pd.Series: @@ -637,7 +637,7 @@ def remove_square_brackets(s: pd.Series) -> pd.Series: """ - return s.str.replace(PATTERN_REMOVE_SQUARE_BRACKETS, "") + return s.str.replace(SQUARE_BRACKETS, "") def remove_angle_brackets(s: pd.Series) -> pd.Series: @@ -661,7 +661,7 @@ def remove_angle_brackets(s: pd.Series) -> pd.Series: :meth:`remove_square_brackets` """ - return s.str.replace(PATTERN_REMOVE_ANGLE_BRACKETS, "") + return s.str.replace(ANGLE_BRACKETS, "") def remove_brackets(s: pd.Series) -> pd.Series: @@ -714,7 +714,7 @@ def remove_html_tags(s: pd.Series) -> pd.Series: """ - return s.str.replace(PATTERN_REMOVE_HTML_TAG, "") + return s.str.replace(HTML_TAG, "") def tokenize(s: pd.Series) -> pd.Series: @@ -741,7 +741,7 @@ def tokenize(s: pd.Series) -> pd.Series: # In regex, the metacharacter 'w' is "a-z, A-Z, 0-9, including the _ (underscore) character." We therefore remove it from the punctuation string as this is already included in \w punct = string.punctuation.replace("_", "") - return s.str.replace(GET_PATTERN_TOKENIZATION(punct), r"\2 \3 \4 \5").str.split() + return s.str.replace(_get_pattern_for_tokenisation(punct), r"\2 \3 \4 \5").str.split() def tokenize_with_phrases( @@ -817,7 +817,7 @@ def replace_urls(s: pd.Series, symbol: str) -> pd.Series: """ - return s.str.replace(PATTERN_REPLACE_URLS, symbol) + return s.str.replace(URLS, symbol) def remove_urls(s: pd.Series) -> pd.Series: @@ -866,7 +866,7 @@ def replace_tags(s: pd.Series, symbol: str) -> pd.Series: """ - return s.str.replace(PATTERN_REPLACE_TAGS, symbol) + return s.str.replace(TAGS, symbol) def remove_tags(s: pd.Series) -> pd.Series: @@ -912,7 +912,7 @@ def replace_hashtags(s: pd.Series, symbol: str) -> pd.Series: dtype: object """ - return s.str.replace(PATTERN_REPLACE_HASHTAGS, symbol) + return s.str.replace(HASHTAGS, symbol) def remove_hashtags(s: pd.Series) -> pd.Series: From addc23b3b8e49c2147f0181fed1743d19fc28e4a Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Wed, 29 Jul 2020 08:21:10 +0200 Subject: [PATCH 09/17] improved comment on helper function _get_pattern_for_tokenasiton() --- texthero/preprocessing.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index 424f9f92..4c5c2d3e 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -34,18 +34,27 @@ | &([a-z0-9]+|\#[0-9]{1,6}|\#x[0-9a-f]{1,6}); # Remove   """ +URLS = r"http\S+" +TAGS = r"@[a-zA-Z0-9]+" +HASHTAGS = r"#[a-zA-Z0-9_]+" + def _get_pattern_for_tokenisation(punct: str) -> str: """ Returns the standart tokenisation pattern + + The standart tokenisation will seperate all "regex words" '\w' from each other and also + puts the punctuation in its own tokens + + Parameters + ---------- + punct : String + the text, which should get tokenized by this pattern, but all '_' characters should have been removed, + as '\w' in regex already includes this one """ return rf"((\w)([{punct}])(?:\B|$)|(?:^|\B)([{punct}])(\w))" -URLS = r"http\S+" -TAGS = r"@[a-zA-Z0-9]+" -HASHTAGS = r"#[a-zA-Z0-9_]+" - # Ignore gensim annoying warnings import warnings @@ -295,8 +304,7 @@ def _replace_stopwords(text: str, words: Set[str], symbol: str = " ") -> str: """ return "".join( - t if t not in words else symbol - for t in 
re.findall(STOPWORD_TOKENIZER, text) + t if t not in words else symbol for t in re.findall(STOPWORD_TOKENIZER, text) ) @@ -510,9 +518,7 @@ def _optimised_default_clean_single_cell(text: str) -> str: text = text.lower() # remove digits and punctuation - pattern_mixed_remove = ( - DIGITS_BLOCK + "|" + PUNCTUATION - ) + pattern_mixed_remove = DIGITS_BLOCK + "|" + PUNCTUATION text = re.sub(pattern_mixed_remove, "", text) # remove diacritics @@ -741,7 +747,9 @@ def tokenize(s: pd.Series) -> pd.Series: # In regex, the metacharacter 'w' is "a-z, A-Z, 0-9, including the _ (underscore) character." We therefore remove it from the punctuation string as this is already included in \w punct = string.punctuation.replace("_", "") - return s.str.replace(_get_pattern_for_tokenisation(punct), r"\2 \3 \4 \5").str.split() + return s.str.replace( + _get_pattern_for_tokenisation(punct), r"\2 \3 \4 \5" + ).str.split() def tokenize_with_phrases( From 84652eea016ec476c58b9ffb5be891ec1b98adb9 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Wed, 29 Jul 2020 08:22:41 +0200 Subject: [PATCH 10/17] fixed spelling mistake --- texthero/preprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index 4c5c2d3e..cfdeb74d 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -41,7 +41,7 @@ def _get_pattern_for_tokenisation(punct: str) -> str: """ - Returns the standart tokenisation pattern + Return the standart tokenisation pattern The standart tokenisation will seperate all "regex words" '\w' from each other and also puts the punctuation in its own tokens From 16b775fee5e21f2da3ef89f412482367460ec3dd Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Wed, 29 Jul 2020 08:37:11 +0200 Subject: [PATCH 11/17] fixed some merge conflicts --- texthero/preprocessing.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index 99787834..d36391cd 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -752,10 +752,12 @@ def tokenize(s: pd.Series) -> pd.Series: ).str.split() -def tokenize_with_phrases( - s: pd.Series, min_count: int = 5, threshold: int = 10 -) -> pd.Series: - r"""Tokenize and group up collocations words +# Warning message for not-tokenized inputs +_not_tokenized_warning_message = ( + "It seems like the given Pandas Series s is not tokenized. This function will" + " tokenize it automatically using hero.tokenize(s) first. You should consider" + " tokenizing it yourself first with hero.tokenize(s) in the future." +) def phrases(s: pd.Series, min_count: int = 5, threshold: int = 10, symbol: str = "_"): From af6625267029112e2793b1555a7ef1b6f1524c00 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Thu, 6 Aug 2020 16:06:35 +0200 Subject: [PATCH 12/17] rm optimised clean --- texthero/preprocessing.py | 55 --------------------------------------- 1 file changed, 55 deletions(-) diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index d36391cd..de553629 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -478,61 +478,6 @@ def clean(s: pd.Series, pipeline=None) -> pd.Series: s = s.pipe(f) return s - -def _optimised_default_clean(s: pd.Series) -> pd.Series: - """ - Applies the default clean pipeline in an optimised way to a series, - that is about 30% faster. - - Default pipeline: - 1. :meth:`texthero.preprocessing.fillna` - 2. :meth:`texthero.preprocessing.lowercase` - 3. 
:meth:`texthero.preprocessing.remove_digits` - 4. :meth:`texthero.preprocessing.remove_punctuation` - 5. :meth:`texthero.preprocessing.remove_diacritics` - 6. :meth:`texthero.preprocessing.remove_stopwords` - 7. :meth:`texthero.preprocessing.remove_whitespace` - """ - return s.apply(_optimised_default_clean_single_cell) - - -def _optimised_default_clean_single_cell(text: str) -> str: - """ - Applies the default clean pipeline to one cell. - - Default pipeline: - 1. :meth:`texthero.preprocessing.fillna` - 2. :meth:`texthero.preprocessing.lowercase` - 3. :meth:`texthero.preprocessing.remove_digits` - 4. :meth:`texthero.preprocessing.remove_punctuation` - 5. :meth:`texthero.preprocessing.remove_diacritics` - 6. :meth:`texthero.preprocessing.remove_stopwords` - 7. :meth:`texthero.preprocessing.remove_whitespace` - """ - - # fillna - if pd.isna(text): - return "" - - # lowercase - text = text.lower() - - # remove digits and punctuation - pattern_mixed_remove = DIGITS_BLOCK + "|" + PUNCTUATION - text = re.sub(pattern_mixed_remove, "", text) - - # remove diacritics - text = _remove_diacritics(text) - - # remove stopwords - text = _replace_stopwords(text, _stopwords.DEFAULT, "") - - # remove whitespace - text = " ".join(re.sub("\xa0", " ", text).split()) - - return text - - def has_content(s: pd.Series) -> pd.Series: r""" Return a Boolean Pandas Series indicating if the rows have content. From 7ab3a389772e094b213bbef06295c08e8c366929 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Thu, 6 Aug 2020 16:09:40 +0200 Subject: [PATCH 13/17] inserted old clean inserted the old cleaning pipeline Co-authored-by: Henri Froese --- texthero/preprocessing.py | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index de553629..5fb083af 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -433,14 +433,33 @@ def _stem(text): return s.str.split().apply(_stem) +def get_default_pipeline() -> List[Callable[[pd.Series], pd.Series]]: + """ + Return a list contaning all the methods used in the default cleaning pipeline. + + Return a list with the following functions: + 1. :meth:`texthero.preprocessing.fillna` + 2. :meth:`texthero.preprocessing.lowercase` + 3. :meth:`texthero.preprocessing.remove_digits` + 4. :meth:`texthero.preprocessing.remove_punctuation` + 5. :meth:`texthero.preprocessing.remove_diacritics` + 6. :meth:`texthero.preprocessing.remove_stopwords` + 7. :meth:`texthero.preprocessing.remove_whitespace` + """ + return [ + fillna, + lowercase, + remove_digits, + remove_punctuation, + remove_diacritics, + remove_stopwords, + remove_whitespace, + ] + + def clean(s: pd.Series, pipeline=None) -> pd.Series: """ - Pre-process a text-based Pandas Series. - - There are two options to use this function. You can either use this function, buy not specifiying an pipeline. - In this case the clean function will use a default pipeline, which was hardcoded, to gain 30% performance improvements, - over the "pipe" method. - If you specify your own cleaning pipeline, the clean function will use this one instead. + Pre-process a text-based Pandas Series, by using the following default pipline. Default pipeline: 1. :meth:`texthero.preprocessing.fillna` @@ -451,7 +470,6 @@ def clean(s: pd.Series, pipeline=None) -> pd.Series: 6. :meth:`texthero.preprocessing.remove_stopwords` 7. 
:meth:`texthero.preprocessing.remove_whitespace` - Parameters ---------- s : Pandas Series @@ -472,7 +490,7 @@ def clean(s: pd.Series, pipeline=None) -> pd.Series: """ if not pipeline: - return _optimised_default_clean(s) + pipeline = get_default_pipeline() for f in pipeline: s = s.pipe(f) From a8fa33c75586ac4db336fe482227fe3c43745e9a Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Thu, 6 Aug 2020 16:26:39 +0200 Subject: [PATCH 14/17] removed clean tests --- tests/test_preprocessing.py | 58 ------------------------------------- texthero/preprocessing.py | 1 + 2 files changed, 1 insertion(+), 58 deletions(-) diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index dd9a8586..03616dac 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -154,64 +154,6 @@ def test_clean(self): preprocessing.clean(s_true, self._get_default_clean_pipeline()), ) - def test_clean_fillna(self): - s = pd.Series(np.NaN) - s_true = pd.Series(np.NaN) - self.assertEqual( - preprocessing.clean(s), - preprocessing.clean(s_true, self._get_default_clean_pipeline()), - ) - - def test_clean_lowercase(self): - s = pd.Series("this text Is MiXed CasE") - s_true = pd.Series("this text Is MiXed CasE") - self.assertEqual( - preprocessing.clean(s), - preprocessing.clean(s_true, self._get_default_clean_pipeline()), - ) - - def test_clean_digits(self): - s = pd.Series("Here are 42 digits blocks 89") - s_true = pd.Series("Here are 42 digits blocks 89") - self.assertEqual( - preprocessing.clean(s), - preprocessing.clean(s_true, self._get_default_clean_pipeline()), - ) - - def test_clean_punctuation(self): - s = pd.Series("Some. wired, punctiation;.:!!!!") - s_true = pd.Series("Some. wired, punctiation;.:!!!") - self.assertEqual( - preprocessing.clean(s), - preprocessing.clean(s_true, self._get_default_clean_pipeline()), - ) - - def test_clean_diacritics(self): - s = pd.Series("Montréal, über, 12.89, Mère, Françoise, noël, 889, اِس, اُس") - s_true = pd.Series( - "Montréal, über, 12.89, Mère, Françoise, noël, 889, اِس, اُس" - ) - self.assertEqual( - preprocessing.clean(s), - preprocessing.clean(s_true, self._get_default_clean_pipeline()), - ) - - def test_clean_stopwords(self): - s = pd.Series("some stopwords are here\nAnd on") - s_true = pd.Series("some stopwords are here\nAnd on") - self.assertEqual( - preprocessing.clean(s), - preprocessing.clean(s_true, self._get_default_clean_pipeline()), - ) - - def test_clean_whitespaces(self): - s = pd.Series("hello world hello world \n there ") - s_true = pd.Series("hello world hello world \n there ") - self.assertEqual( - preprocessing.clean(s), - preprocessing.clean(s_true, self._get_default_clean_pipeline()), - ) - """ Test stopwords. """ diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index 5fb083af..998b2a03 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -496,6 +496,7 @@ def clean(s: pd.Series, pipeline=None) -> pd.Series: s = s.pipe(f) return s + def has_content(s: pd.Series) -> pd.Series: r""" Return a Boolean Pandas Series indicating if the rows have content. 
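With the optimised single-cell path removed and the pipe-based clean restored, a short usage sketch (the input string and the expected output are illustrative): calling clean without a pipeline falls back to get_default_pipeline(), while a custom list of functions is applied in order through Series.pipe.

import pandas as pd
from texthero import preprocessing

s = pd.Series(["Montréal has 42 CAFÉS!!"])

# Two-step custom pipeline instead of the full default one.
custom = [preprocessing.lowercase, preprocessing.remove_diacritics]
print(preprocessing.clean(s, pipeline=custom))
# expected: 0    montreal has 42 cafes!!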
From ebf6e49124a289711b0980e60b0f3b7addd04de1 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Sat, 8 Aug 2020 11:44:16 +0200 Subject: [PATCH 15/17] changed the reviewed files --- texthero/preprocessing.py | 43 ++++++++++++++------------------------- 1 file changed, 15 insertions(+), 28 deletions(-) diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index 998b2a03..d9edbcb1 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -11,13 +11,20 @@ import numpy as np import pandas as pd import unidecode + +# Ignore gensim annoying warnings +import warnings from nltk.stem import PorterStemmer, SnowballStemmer from texthero import stopwords as _stopwords from typing import List, Callable -# REGEX pattern constants +""" +Define all regex pattern, which will be used in the functions below. They define different charateristics, on how to clean +a text +""" + DIGITS_BLOCK = r"\b\d+\b" PUNCTUATION = rf"([{string.punctuation}])+" STOPWORD_TOKENIZER = r"""(?x) # Set flag to allow verbose regexps @@ -29,35 +36,20 @@ CURLY_BRACKETS = r"\{[^{}]*\}" SQUARE_BRACKETS = r"\[[^\[\]]*\]" ANGLE_BRACKETS = r"<[^<>]*>" -HTML_TAG = r"""(?x) # Turn on free-spacing +HTML_TAGS = r"""(?x) # Turn on free-spacing <[^>]+> # Remove tags | &([a-z0-9]+|\#[0-9]{1,6}|\#x[0-9a-f]{1,6}); # Remove   """ - URLS = r"http\S+" TAGS = r"@[a-zA-Z0-9]+" HASHTAGS = r"#[a-zA-Z0-9_]+" - -def _get_pattern_for_tokenisation(punct: str) -> str: - """ - Return the standart tokenisation pattern - - The standart tokenisation will seperate all "regex words" '\w' from each other and also - puts the punctuation in its own tokens - - Parameters - ---------- - punct : String - the text, which should get tokenized by this pattern, but all '_' characters should have been removed, - as '\w' in regex already includes this one - """ - return rf"((\w)([{punct}])(?:\B|$)|(?:^|\B)([{punct}])(\w))" +# In regex, the metacharacter 'w' is "a-z, A-Z, 0-9, including the _ (underscore) character." We therefore remove it from the punctuation string as this is already included in \w +punct = string.punctuation.replace("_", "") +TOKENIZE = rf"((\w)([{punct}])(?:\B|$)|(?:^|\B)([{punct}])(\w))" # The standart tokenisation will seperate all "regex words" '\w' from each other and also +# puts the punctuation in its own tokens -# Ignore gensim annoying warnings -import warnings - warnings.filterwarnings(action="ignore", category=UserWarning, module="gensim") @@ -684,7 +676,7 @@ def remove_html_tags(s: pd.Series) -> pd.Series: """ - return s.str.replace(HTML_TAG, "") + return s.str.replace(HTML_TAGS, "") def tokenize(s: pd.Series) -> pd.Series: @@ -708,12 +700,7 @@ def tokenize(s: pd.Series) -> pd.Series: """ - # In regex, the metacharacter 'w' is "a-z, A-Z, 0-9, including the _ (underscore) character." 
We therefore remove it from the punctuation string as this is already included in \w - punct = string.punctuation.replace("_", "") - - return s.str.replace( - _get_pattern_for_tokenisation(punct), r"\2 \3 \4 \5" - ).str.split() + return s.str.replace(TOKENIZE, r"\2 \3 \4 \5").str.split() # Warning message for not-tokenized inputs From c3962312039a206bc7037fabd9bb5c9531c9c9e2 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Sat, 8 Aug 2020 20:50:46 +0200 Subject: [PATCH 16/17] fixed docstring length --- tests/test_preprocessing.py | 26 +------------------------- texthero/preprocessing.py | 7 ++++--- 2 files changed, 5 insertions(+), 28 deletions(-) diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index 03616dac..5ec7c27d 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -118,30 +118,6 @@ def test_pipeline_stopwords(self): Test clean """ - def _get_default_clean_pipeline(self): - """ - Return a list contaning all the methods used in the default cleaning pipeline. - - Return a list with the following functions: - 1. :meth:`texthero.preprocessing.fillna` - 2. :meth:`texthero.preprocessing.lowercase` - 3. :meth:`texthero.preprocessing.remove_digits` - 4. :meth:`texthero.preprocessing.remove_punctuation` - 5. :meth:`texthero.preprocessing.remove_diacritics` - 6. :meth:`texthero.preprocessing.remove_stopwords` - 7. :meth:`texthero.preprocessing.remove_whitespace` - """ - - return [ - preprocessing.fillna, - preprocessing.lowercase, - preprocessing.remove_digits, - preprocessing.remove_punctuation, - preprocessing.remove_diacritics, - preprocessing.remove_stopwords, - preprocessing.remove_whitespace, - ] - def test_clean(self): s = pd.Series( ["This serös 42 should bE CLeaned.! I am a stopword \n", np.NAN] @@ -151,7 +127,7 @@ def test_clean(self): ) self.assertEqual( preprocessing.clean(s), - preprocessing.clean(s_true, self._get_default_clean_pipeline()), + preprocessing.clean(s_true, preprocessing.get_default_pipeline()), ) """ diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index d9edbcb1..080f3177 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -1,5 +1,6 @@ """ -The texthero.preprocess module allow for efficient pre-processing of text-based Pandas Series and DataFrame. +The texthero.preprocess module allow for efficient pre-processing of +text-based Pandas Series and DataFrame. """ from gensim.sklearn_api.phrases import PhrasesTransformer @@ -21,8 +22,8 @@ from typing import List, Callable """ -Define all regex pattern, which will be used in the functions below. They define different charateristics, on how to clean -a text +Define all regex pattern, which will be used in the functions below. 
+They define different charateristics, on how to clean a text """ DIGITS_BLOCK = r"\b\d+\b" From 244aca243bdcc386ff62150496d159aaf8a76b4c Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Sat, 8 Aug 2020 20:53:58 +0200 Subject: [PATCH 17/17] fixed formatting --- texthero/preprocessing.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index afba4622..9d8541e1 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -317,7 +317,6 @@ def _replace_stopwords(text: str, words: Set[str], symbol: str = " ") -> str: ) - @InputSeries(TextSeries) def replace_stopwords( s: TextSeries, symbol: str, stopwords: Optional[Set[str]] = None @@ -726,7 +725,6 @@ def remove_html_tags(s: TextSeries) -> TextSeries: return s.str.replace(HTML_TAGS, "") - @InputSeries(TextSeries) def tokenize(s: TextSeries) -> TokenSeries: """ @@ -845,7 +843,6 @@ def replace_urls(s: TextSeries, symbol: str) -> TextSeries: return s.str.replace(URLS, symbol) - @InputSeries(TextSeries) def remove_urls(s: TextSeries) -> TextSeries: r"""Remove all urls from a given Pandas Series.