cleaned up regex function by introducing constants for patterns #143

Draft: wants to merge 20 commits into master
16 changes: 16 additions & 0 deletions tests/test_preprocessing.py
@@ -114,6 +114,22 @@ def test_pipeline_stopwords(self):
        pipeline = [preprocessing.lowercase, preprocessing.remove_stopwords]
        self.assertEqual(preprocessing.clean(s, pipeline=pipeline), s_true)

"""
Test clean
"""

def test_clean(self):
s = pd.Series(
["This serös 42 should bE CLeaned.! I am a stopword \n", np.NAN]
)
s_true = pd.Series(
["This serös 42 should bE CLeaned.! I am a stopword \n", np.NAN]
)
self.assertEqual(
preprocessing.clean(s),
preprocessing.clean(s_true, preprocessing.get_default_pipeline()),
)
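For context, the assertion above checks that calling clean with no pipeline argument behaves exactly as if get_default_pipeline() had been passed explicitly. A minimal standalone sketch of that equivalence, using only the API shown in this diff:

import numpy as np
import pandas as pd
from texthero import preprocessing

s = pd.Series(["This serös 42 should bE CLeaned.! I am a stopword \n", np.nan])

# clean() with no pipeline should fall back to the default pipeline.
implicit = preprocessing.clean(s)
explicit = preprocessing.clean(s, preprocessing.get_default_pipeline())
assert implicit.equals(explicit)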

"""
Test stopwords.
"""
86 changes: 48 additions & 38 deletions texthero/preprocessing.py
@@ -1,5 +1,6 @@
"""
The texthero.preprocess module allow for efficient pre-processing of text-based Pandas Series and DataFrame.
The texthero.preprocess module allows for efficient pre-processing of
text-based Pandas Series and DataFrames.
"""

from gensim.sklearn_api.phrases import PhrasesTransformer
@@ -11,15 +12,45 @@
import numpy as np
import pandas as pd
import unidecode

# Ignore gensim annoying warnings
import warnings
from nltk.stem import PorterStemmer, SnowballStemmer

from texthero import stopwords as _stopwords
from texthero._types import TokenSeries, TextSeries, InputSeries

from typing import List, Callable, Union

# Ignore gensim annoying warnings
import warnings
"""
Define all regex patterns that are used in the functions below.
Each pattern describes one characteristic of how to clean a text.
"""

DIGITS_BLOCK = r"\b\d+\b"
PUNCTUATION = rf"([{string.punctuation}])+"
STOPWORD_TOKENIZER = r"""(?x) # Set flag to allow verbose regexps
    \w+(?:-\w+)* # Words with optional internal hyphens
    | \s* # Any space
    | [][!"#$%&'*+,-./:;<=>?@\\^():_`{|}~] # Any symbol
"""
ROUND_BRACKETS = r"\([^()]*\)"
CURLY_BRACKETS = r"\{[^{}]*\}"
SQUARE_BRACKETS = r"\[[^\[\]]*\]"
ANGLE_BRACKETS = r"<[^<>]*>"
HTML_TAGS = r"""(?x) # Turn on free-spacing
    <[^>]+> # Remove <html> tags
    | &([a-z0-9]+|\#[0-9]{1,6}|\#x[0-9a-f]{1,6}); # Remove &nbsp;
"""
URLS = r"http\S+"
TAGS = r"@[a-zA-Z0-9]+"
HASHTAGS = r"#[a-zA-Z0-9_]+"

# In regex, `\w` matches "a-z, A-Z, 0-9, including the _ (underscore) character".
# We therefore remove the underscore from the punctuation string, as it is
# already covered by \w.
punct = string.punctuation.replace("_", "")

# The standard tokenisation separates all "regex words" (\w) from each other
# and also puts the punctuation into its own tokens.
TOKENIZE = rf"((\w)([{punct}])(?:\B|$)|(?:^|\B)([{punct}])(\w))"


warnings.filterwarnings(action="ignore", category=UserWarning, module="gensim")
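Because the patterns are now plain module-level strings, they can be compiled and inspected independently of the functions that use them. A small sketch, not part of this diff, assuming the constants are importable from texthero.preprocessing as defined above:

import re

from texthero import preprocessing as prep

# Every constant should compile to a valid regular expression;
# re.error would point at a malformed pattern.
for name in ("DIGITS_BLOCK", "PUNCTUATION", "STOPWORD_TOKENIZER",
             "ROUND_BRACKETS", "CURLY_BRACKETS", "SQUARE_BRACKETS",
             "ANGLE_BRACKETS", "HTML_TAGS", "URLS", "TAGS", "HASHTAGS",
             "TOKENIZE"):
    re.compile(getattr(prep, name))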

@@ -97,8 +128,7 @@ def replace_digits(s: TextSeries, symbols: str = " ", only_blocks=True) -> TextS
"""

if only_blocks:
pattern = r"\b\d+\b"
return s.str.replace(pattern, symbols)
return s.str.replace(DIGITS_BLOCK, symbols)
else:
return s.str.replace(r"\d+", symbols)

@@ -166,7 +196,7 @@ def replace_punctuation(s: TextSeries, symbol: str = " ") -> TextSeries:
    dtype: object
    """

    return s.str.replace(rf"([{string.punctuation}])+", symbol)
    return s.str.replace(PUNCTUATION, symbol)
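replace_punctuation now delegates to the shared PUNCTUATION constant; because the pattern matches one or more consecutive punctuation characters, a whole run collapses into a single replacement symbol. A brief hedged sketch:

import pandas as pd
from texthero import preprocessing

s = pd.Series("Well... this, works?!")
# Each punctuation run ("...", ",", "?!") becomes one symbol (a space here).
preprocessing.replace_punctuation(s, " ")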


@InputSeries(TextSeries)
@@ -282,13 +312,9 @@ def _replace_stopwords(text: str, words: Set[str], symbol: str = " ") -> str:

"""

pattern = r"""(?x) # Set flag to allow verbose regexps
\w+(?:-\w+)* # Words with optional internal hyphens
| \s* # Any space
| [][!"#$%&'*+,-./:;<=>?@\\^():_`{|}~] # Any symbol
"""

return "".join(t if t not in words else symbol for t in re.findall(pattern, text))
return "".join(
t if t not in words else symbol for t in re.findall(STOPWORD_TOKENIZER, text)
)
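_replace_stopwords tokenises the text with STOPWORD_TOKENIZER (words, whitespace runs, and single symbols) and swaps any token found in the stopword set for the replacement symbol. A minimal sketch of that mechanism, using a hand-rolled stopword set rather than texthero's default lists:

import re

from texthero.preprocessing import STOPWORD_TOKENIZER  # constant introduced in this PR

words = {"am", "a"}  # illustrative stopword set; matching is case-sensitive
text = "I am a stopword"

# Word and whitespace tokens pass through unchanged unless they are in
# `words`, in which case they are replaced by a single space.
cleaned = "".join(
    t if t not in words else " " for t in re.findall(STOPWORD_TOKENIZER, text)
)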


@InputSeries(TextSeries)
@@ -561,7 +587,7 @@ def remove_round_brackets(s: TextSeries) -> TextSeries:
    :meth:`remove_square_brackets`

    """
    return s.str.replace(r"\([^()]*\)", "")
    return s.str.replace(ROUND_BRACKETS, "")


@InputSeries(TextSeries)
@@ -587,7 +613,7 @@ def remove_curly_brackets(s: TextSeries) -> TextSeries:
    :meth:`remove_square_brackets`

    """
    return s.str.replace(r"\{[^{}]*\}", "")
    return s.str.replace(CURLY_BRACKETS, "")


@InputSeries(TextSeries)
@@ -614,7 +640,7 @@ def remove_square_brackets(s: TextSeries) -> TextSeries:


"""
return s.str.replace(r"\[[^\[\]]*\]", "")
return s.str.replace(SQUARE_BRACKETS, "")


@InputSeries(TextSeries)
@@ -640,7 +666,7 @@ def remove_angle_brackets(s: TextSeries) -> TextSeries:
    :meth:`remove_square_brackets`

    """
    return s.str.replace(r"<[^<>]*>", "")
    return s.str.replace(ANGLE_BRACKETS, "")
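All four bracket helpers now follow the same shape: replace the corresponding *_BRACKETS constant with an empty string, dropping the brackets together with their content. A combined hedged sketch:

import pandas as pd
from texthero import preprocessing

s = pd.Series("Texthero (is not a superhero!) [citation needed] {example} <remove>")

s = preprocessing.remove_round_brackets(s)   # drops "(is not a superhero!)"
s = preprocessing.remove_square_brackets(s)  # drops "[citation needed]"
s = preprocessing.remove_curly_brackets(s)   # drops "{example}"
s = preprocessing.remove_angle_brackets(s)   # drops "<remove>"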


@InputSeries(TextSeries)
@@ -696,12 +722,7 @@ def remove_html_tags(s: TextSeries) -> TextSeries:

"""

pattern = r"""(?x) # Turn on free-spacing
<[^>]+> # Remove <html> tags
| &([a-z0-9]+|\#[0-9]{1,6}|\#x[0-9a-f]{1,6}); # Remove &nbsp;
"""

return s.str.replace(pattern, "")
return s.str.replace(HTML_TAGS, "")
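remove_html_tags strips both <...> tags and HTML entities via the HTML_TAGS constant. A brief hedged sketch:

import pandas as pd
from texthero import preprocessing

s = pd.Series("<html><h1>Title</h1></html> &nbsp;")
# Tags such as <h1> and entities such as &nbsp; are removed,
# leaving only the visible text.
preprocessing.remove_html_tags(s)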


@InputSeries(TextSeries)
@@ -726,14 +747,7 @@ def tokenize(s: TextSeries) -> TokenSeries:

"""

punct = string.punctuation.replace("_", "")
# In regex, the metacharacter 'w' is "a-z, A-Z, 0-9, including the _ (underscore)
# character." We therefore remove it from the punctuation string as this is already
# included in \w.

pattern = rf"((\w)([{punct}])(?:\B|$)|(?:^|\B)([{punct}])(\w))"

return s.str.replace(pattern, r"\2 \3 \4 \5").str.split()
return s.str.replace(TOKENIZE, r"\2 \3 \4 \5").str.split()
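The rewritten tokenize uses the TOKENIZE constant to put spaces around punctuation that touches the edge of a word (re-inserting the captured groups \2 to \5) and then splits on whitespace. A hedged sketch of the expected behaviour:

import pandas as pd
from texthero import preprocessing

s = pd.Series("Today you're looking great!")
# Punctuation attached to a word becomes its own token, so the result is
# roughly ["Today", "you're", "looking", "great", "!"]; the apostrophe inside
# "you're" stays because the pattern only splits punctuation at the start or
# end of a word.
preprocessing.tokenize(s)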


# Warning message for not-tokenized inputs
@@ -826,9 +840,7 @@ def replace_urls(s: TextSeries, symbol: str) -> TextSeries:

"""

pattern = r"http\S+"

return s.str.replace(pattern, symbol)
return s.str.replace(URLS, symbol)


@InputSeries(TextSeries)
@@ -880,8 +892,7 @@ def replace_tags(s: TextSeries, symbol: str) -> TextSeries:

"""

pattern = r"@[a-zA-Z0-9]+"
return s.str.replace(pattern, symbol)
return s.str.replace(TAGS, symbol)


@InputSeries(TextSeries)
@@ -933,8 +944,7 @@ def replace_hashtags(s: TextSeries, symbol: str) -> TextSeries:
    dtype: object

    """
    pattern = r"#[a-zA-Z0-9_]+"
    return s.str.replace(pattern, symbol)
    return s.str.replace(HASHTAGS, symbol)
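replace_urls, replace_tags and replace_hashtags are now thin wrappers around the URLS, TAGS and HASHTAGS constants. A combined hedged sketch (the replacement symbols are arbitrary placeholders):

import pandas as pd
from texthero import preprocessing

s = pd.Series("Check https://github.com/jbesomi/texthero via @jbesomi #texthero_rocks")

s = preprocessing.replace_urls(s, "<URL>")           # http/https links
s = preprocessing.replace_tags(s, "<TAG>")           # @mentions
s = preprocessing.replace_hashtags(s, "<HASHTAG>")   # #hashtags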


@InputSeries(TextSeries)