In [None]:
from src.preprocessing import Pipeline
from src.preprocessing import steps as ps
from nltk.stem import PorterStemmer

# Sample inputs for exercising the preprocessing pipeline.
# `text`: repeated words, sentence punctuation and a contraction ("Isn't").
# It is not passed to the pipeline in the cells visible here.
text = "Python is great! Python is easy to learn. Isn't Python amazing?"
# `sentence`: leading space plus heavily elongated words.
sentence = ' My Schoooooool is reeeeeeaaaallllllllly amaaaaaazingggg!'
# `another_text`: misspellings mixed with elongated words.
another_text = "Thisss is a smaplee sentnce with som misspelled wirds. Coooool!"

def porter_stem_tokens(tokens: list[str]) -> list[str]:
    """
    Apply Porter stemming to every token in a list.

    Parameters
    ----------
    tokens : list of str
        Tokens to be reduced to their stems.

    Returns
    -------
    list of str
        One stem per input token, in the same order as the input.
    """
    # A fresh stemmer is created on every call; only its bound `stem`
    # method is needed afterwards.
    stem = PorterStemmer().stem
    stemmed: list[str] = []
    for word in tokens:
        stemmed.append(stem(word))
    return stemmed

# Assemble the preprocessing pipeline. Steps are applied in the order listed,
# each one receiving the return value of the step before it.
pipeline = Pipeline(steps=[
    ps.to_lowercase,
    ps.expand_contractions,
    ps.remove_punctuation,
    ps.remove_repeated_characters,
    ps.correct_spelling,  # first spelling pass, before tokenization
    ps.tokenize,
    ps.remove_stopwords,
    ps.correct_spelling,  # second spelling pass, after stopword removal
    porter_stem_tokens,  # step defined in this notebook; takes a list of tokens
    ps.lemmatize_tokens,
    # NOTE(review): this third spelling pass runs after stemming and
    # lemmatization. The recorded output for `sentence` ends in 'ama',
    # presumably a stem that the later steps could not restore. Confirm that
    # stemming, lemmatizing and re-correcting together are all intended.
    ps.correct_spelling,
])

# Run the pipeline on the elongated-word sample and display the result.
result = pipeline(sentence)
result

['school', 'really', 'ama']

In [2]:
# Print the docstring exposed by the pipeline object.
print(pipeline.__doc__)


    A simple function-based processing pipeline.

    Each step is a callable that accepts one argument and returns a value.
    The output of one step is passed as the input to the next.

    Parameters
    ----------
    steps : list of Callable[[Any], Any]
        A list of callables that will be applied in sequence.
    


In [7]:
# Run the misspelled sample through the pipeline. print() is used here because
# only the final expression of a cell is displayed automatically.
print(pipeline(another_text))
# Bare expression: displays the pipeline's repr, which lists its steps in order.
pipeline

['smaller', 'sentence', 'som', 'missed', 'word', 'cool']


Pipeline(
  to_lowercase
    ⬇
  expand_contractions
    ⬇
  remove_punctuation
    ⬇
  remove_repeated_characters
    ⬇
  correct_spelling
    ⬇
  tokenize
    ⬇
  remove_stopwords
    ⬇
  correct_spelling
    ⬇
  porter_stem_tokens
    ⬇
  lemmatize_tokens
    ⬇
  correct_spelling
)