cleaned up regex function by introducing constants for patterns #143

Draft: wants to merge 20 commits into base: master.
Changes from 16 commits.
40 changes: 40 additions & 0 deletions tests/test_preprocessing.py
@@ -114,6 +114,46 @@ def test_pipeline_stopwords(self):
pipeline = [preprocessing.lowercase, preprocessing.remove_stopwords]
self.assertEqual(preprocessing.clean(s, pipeline=pipeline), s_true)

"""
Test clean
"""

def _get_default_clean_pipeline(self):
"""
Return a list containing all the functions used in the default cleaning pipeline.

Return a list with the following functions:
1. :meth:`texthero.preprocessing.fillna`
2. :meth:`texthero.preprocessing.lowercase`
3. :meth:`texthero.preprocessing.remove_digits`
4. :meth:`texthero.preprocessing.remove_punctuation`
5. :meth:`texthero.preprocessing.remove_diacritics`
6. :meth:`texthero.preprocessing.remove_stopwords`
7. :meth:`texthero.preprocessing.remove_whitespace`
"""

return [
preprocessing.fillna,
preprocessing.lowercase,
preprocessing.remove_digits,
preprocessing.remove_punctuation,
preprocessing.remove_diacritics,
preprocessing.remove_stopwords,
preprocessing.remove_whitespace,
]

def test_clean(self):
s = pd.Series(
["This serös 42 should bE CLeaned.! I am a stopword \n", np.NAN]
)
s_true = pd.Series(
["This serös 42 should bE CLeaned.! I am a stopword \n", np.NAN]
)
self.assertEqual(
preprocessing.clean(s),
preprocessing.clean(s_true, self._get_default_clean_pipeline()),
)
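In other words, a rough standalone sketch (assuming texthero with the clean pipeline shown in this diff is installed) of what the test asserts: calling clean with no pipeline argument is equivalent to passing the default pipeline explicitly.

import numpy as np
import pandas as pd
from texthero import preprocessing

s = pd.Series(["This serös 42 should bE CLeaned.! I am a stopword \n", np.nan])

default_pipeline = [
    preprocessing.fillna,
    preprocessing.lowercase,
    preprocessing.remove_digits,
    preprocessing.remove_punctuation,
    preprocessing.remove_diacritics,
    preprocessing.remove_stopwords,
    preprocessing.remove_whitespace,
]

# Both calls should produce the same cleaned Series.
pd.testing.assert_series_equal(
    preprocessing.clean(s),
    preprocessing.clean(s, pipeline=default_pipeline),
)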

"""
Test stopwords.
"""
86 changes: 55 additions & 31 deletions texthero/preprocessing.py
@@ -17,6 +17,44 @@

from typing import List, Callable

# REGEX pattern constants
DIGITS_BLOCK = r"\b\d+\b"
PUNCTUATION = rf"([{string.punctuation}])+"
STOPWORD_TOKENIZER = r"""(?x) # Set flag to allow verbose regexps
\w+(?:-\w+)* # Words with optional internal hyphens
| \s* # Any space
| [][!"#$%&'*+,-./:;<=>?@\\^():_`{|}~] # Any symbol
"""
ROUND_BRACKETS = r"\([^()]*\)"
CURLY_BRACKETS = r"\{[^{}]*\}"
SQUARE_BRACKETS = r"\[[^\[\]]*\]"
ANGLE_BRACKETS = r"<[^<>]*>"
HTML_TAG = r"""(?x) # Turn on free-spacing
<[^>]+> # Remove <html> tags
| &([a-z0-9]+|\#[0-9]{1,6}|\#x[0-9a-f]{1,6}); # Remove &nbsp;
"""

URLS = r"http\S+"
TAGS = r"@[a-zA-Z0-9]+"
HASHTAGS = r"#[a-zA-Z0-9_]+"
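As a quick illustration of what a few of these constants match (a standalone sketch using re directly; the sample strings below are made up and not part of the PR):

import re
import string

# Constants copied from this diff.
DIGITS_BLOCK = r"\b\d+\b"
PUNCTUATION = rf"([{string.punctuation}])+"
URLS = r"http\S+"

# Only standalone digit blocks match; digits glued to letters are left alone.
print(re.sub(DIGITS_BLOCK, "*", "42 abc123"))              # * abc123
# A run of punctuation characters collapses into a single replacement symbol.
print(re.sub(PUNCTUATION, "*", "Wait... what?!"))          # Wait* what*
# Everything from "http" up to the next whitespace is treated as a URL.
print(re.sub(URLS, "*", "see https://example.com now"))    # see * now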


def _get_pattern_for_tokenisation(punct: str) -> str:
"""
Return the standard tokenisation pattern.

The standard tokenisation separates all "regex words" ('\w') from each other and
puts each punctuation character into a token of its own.

Parameters
----------
punct : String
the punctuation characters that should become their own tokens; all '_' characters
should already have been removed from this string, as '\w' in regex already includes the underscore
"""
return rf"((\w)([{punct}])(?:\B|$)|(?:^|\B)([{punct}])(\w))"
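For illustration, a minimal standalone sketch (using re directly rather than the pandas Series API, with a made-up sentence) of what the pattern built here does: it inserts spaces around punctuation that touches a word, so that a later split yields separate word and punctuation tokens.

import re
import string

punct = string.punctuation.replace("_", "")
# Same pattern that _get_pattern_for_tokenisation(punct) returns.
pattern = rf"((\w)([{punct}])(?:\B|$)|(?:^|\B)([{punct}])(\w))"

text = "Hello, world! (texthero)"
# Unmatched groups expand to "" in re.sub (Python 3.5+), so only the matched
# word character and punctuation character get a space inserted between them.
tokens = re.sub(pattern, r"\2 \3 \4 \5", text).split()
print(tokens)  # ['Hello', ',', 'world', '!', '(', 'texthero', ')']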


# Ignore gensim annoying warnings
import warnings

@@ -91,8 +129,7 @@ def replace_digits(s: pd.Series, symbols: str = " ", only_blocks=True) -> pd.Ser
"""

if only_blocks:
pattern = r"\b\d+\b"
return s.str.replace(pattern, symbols)
return s.str.replace(DIGITS_BLOCK, symbols)
else:
return s.str.replace(r"\d+", symbols)

@@ -157,7 +194,7 @@ def replace_punctuation(s: pd.Series, symbol: str = " ") -> pd.Series:
dtype: object
"""

return s.str.replace(rf"([{string.punctuation}])+", symbol)
return s.str.replace(PUNCTUATION, symbol)


def remove_punctuation(s: pd.Series) -> pd.Series:
@@ -266,13 +303,9 @@ def _replace_stopwords(text: str, words: Set[str], symbol: str = " ") -> str:

"""

pattern = r"""(?x) # Set flag to allow verbose regexps
\w+(?:-\w+)* # Words with optional internal hyphens
| \s* # Any space
| [][!"#$%&'*+,-./:;<=>?@\\^():_`{|}~] # Any symbol
"""

return "".join(t if t not in words else symbol for t in re.findall(pattern, text))
return "".join(
t if t not in words else symbol for t in re.findall(STOPWORD_TOKENIZER, text)
)
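For context, a small standalone sketch of the tokenizer-based replacement (using re directly; the stopword set below is a made-up illustration, not the default set texthero loads): because whitespace runs are themselves tokens, the join reconstructs the original spacing and only the stopword tokens are swapped out.

import re

STOPWORD_TOKENIZER = r"""(?x)                    # verbose regex
\w+(?:-\w+)*                                     # words with optional internal hyphens
| \s*                                            # any whitespace run
| [][!"#$%&'*+,-./:;<=>?@\\^():_`{|}~]           # any single symbol
"""

words = {"the", "of"}  # hypothetical stopword set, for illustration only
text = "the book of the jungle"

tokens = re.findall(STOPWORD_TOKENIZER, text)
print("".join(t if t not in words else "X" for t in tokens))  # X book X X jungle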


def replace_stopwords(
@@ -525,7 +558,7 @@ def remove_round_brackets(s: pd.Series) -> pd.Series:
:meth:`remove_square_brackets`

"""
return s.str.replace(r"\([^()]*\)", "")
return s.str.replace(ROUND_BRACKETS, "")


def remove_curly_brackets(s: pd.Series) -> pd.Series:
@@ -549,7 +582,7 @@ def remove_curly_brackets(s: pd.Series) -> pd.Series:
:meth:`remove_square_brackets`

"""
return s.str.replace(r"\{[^{}]*\}", "")
return s.str.replace(CURLY_BRACKETS, "")


def remove_square_brackets(s: pd.Series) -> pd.Series:
Expand All @@ -574,7 +607,7 @@ def remove_square_brackets(s: pd.Series) -> pd.Series:


"""
return s.str.replace(r"\[[^\[\]]*\]", "")
return s.str.replace(SQUARE_BRACKETS, "")


def remove_angle_brackets(s: pd.Series) -> pd.Series:
Expand All @@ -598,7 +631,7 @@ def remove_angle_brackets(s: pd.Series) -> pd.Series:
:meth:`remove_square_brackets`

"""
return s.str.replace(r"<[^<>]*>", "")
return s.str.replace(ANGLE_BRACKETS, "")
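A quick standalone check of the bracket patterns (made-up samples, using re directly): each pattern removes the brackets together with their content.

import re

ROUND_BRACKETS = r"\([^()]*\)"
SQUARE_BRACKETS = r"\[[^\[\]]*\]"

print(re.sub(ROUND_BRACKETS, "", "Texthero (is great)"))   # 'Texthero ' (trailing space remains)
print(re.sub(SQUARE_BRACKETS, "", "Texthero [is great]"))  # 'Texthero '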


def remove_brackets(s: pd.Series) -> pd.Series:
Expand Down Expand Up @@ -651,12 +684,7 @@ def remove_html_tags(s: pd.Series) -> pd.Series:

"""

pattern = r"""(?x) # Turn on free-spacing
<[^>]+> # Remove <html> tags
| &([a-z0-9]+|\#[0-9]{1,6}|\#x[0-9a-f]{1,6}); # Remove &nbsp;
"""

return s.str.replace(pattern, "")
return s.str.replace(HTML_TAG, "")
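A brief standalone sketch of what the HTML_TAG pattern strips (using re directly; the sample string is made up):

import re

HTML_TAG = r"""(?x)                                # free-spacing mode
<[^>]+>                                            # <html>-style tags
| &([a-z0-9]+|\#[0-9]{1,6}|\#x[0-9a-f]{1,6});      # entities such as &nbsp;
"""

print(re.sub(HTML_TAG, "", "<b>hello</b>&nbsp;world"))  # helloworld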


def tokenize(s: pd.Series) -> pd.Series:
Expand All @@ -680,12 +708,12 @@ def tokenize(s: pd.Series) -> pd.Series:

"""

punct = string.punctuation.replace("_", "")
# In regex, the metacharacter '\w' matches "a-z, A-Z, 0-9 and the _ (underscore) character". We therefore remove '_' from the punctuation string, as it is already covered by '\w'.
punct = string.punctuation.replace("_", "")

pattern = rf"((\w)([{punct}])(?:\B|$)|(?:^|\B)([{punct}])(\w))"

return s.str.replace(pattern, r"\2 \3 \4 \5").str.split()
return s.str.replace(
_get_pattern_for_tokenisation(punct), r"\2 \3 \4 \5"
).str.split()


# Warning message for not-tokenized inputs
@@ -775,9 +803,7 @@ def replace_urls(s: pd.Series, symbol: str) -> pd.Series:

"""

pattern = r"http\S+"

return s.str.replace(pattern, symbol)
return s.str.replace(URLS, symbol)


def remove_urls(s: pd.Series) -> pd.Series:
Expand Down Expand Up @@ -826,8 +852,7 @@ def replace_tags(s: pd.Series, symbol: str) -> pd.Series:

"""

pattern = r"@[a-zA-Z0-9]+"
return s.str.replace(pattern, symbol)
return s.str.replace(TAGS, symbol)


def remove_tags(s: pd.Series) -> pd.Series:
Expand Down Expand Up @@ -873,8 +898,7 @@ def replace_hashtags(s: pd.Series, symbol: str) -> pd.Series:
dtype: object

"""
pattern = r"#[a-zA-Z0-9_]+"
return s.str.replace(pattern, symbol)
return s.str.replace(HASHTAGS, symbol)
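Similarly, a small standalone sketch of the tag and hashtag patterns (made-up samples; note that TAGS does not include '_' while HASHTAGS does):

import re

TAGS = r"@[a-zA-Z0-9]+"
HASHTAGS = r"#[a-zA-Z0-9_]+"

print(re.sub(TAGS, "@user", "Hi @texthero123, nice lib"))   # Hi @user, nice lib
print(re.sub(HASHTAGS, "#tag", "Trending: #texthero_123"))  # Trending: #tag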


def remove_hashtags(s: pd.Series) -> pd.Series: