# Text preprocessing

In this exercise, you'll need to implement basic text preprocessing steps.

In [1]:
from typing import List, Set
import ipytest
import string
import re

ipytest.autoconfig()

## Task 1: Tokenization

Split an input text into tokens on whitespace, punctuation (including hyphens), and HTML markup; the markup itself is discarded. Additionally, lowercase all tokens.

In [2]:
def tokenize(text: str) -> List[str]:
    """Lowercases a text and breaks it into a list of terms.

    HTML tags are blanked out first, then every character in
    `string.punctuation` (hyphens included) is turned into a space, and
    finally the lowercased text is split on whitespace.
    """
    # Blank out anything that looks like an HTML tag.
    without_markup = re.sub("<[^>]+>", " ", text)
    # Map every punctuation character to a space in one pass.
    punctuation_to_space = str.maketrans(dict.fromkeys(string.punctuation, " "))
    cleaned = without_markup.translate(punctuation_to_space)
    return cleaned.lower().split()

Tests.

In [3]:
%%run_pytest[clean]

def test_whitespace():
    """Terms separated by single spaces come back as separate tokens."""
    expected = ["aaa", "bbb", "ccc"]
    assert tokenize("aaa bbb ccc") == expected
    
def test_punctuation():
    """Punctuation marks act as token separators and are dropped."""
    expected = ["aaa", "bbb", "ccc", "ddd", "eee", "ff", "f"]
    assert tokenize("aaa! bbb.ccc,ddd:eee ff\"f") == expected
    
def test_hyphens():
    """A hyphen splits a compound into two tokens, and tokens are lowercased."""
    expected = ["aaa", "bbb", "ccc"]
    assert tokenize("aaa bbb-Ccc") == expected
    
def test_html():
    """Opening, closing, and self-closing tags are removed; the text between them is kept."""
    markup = "aaa <bbb>ccc <ddd>eee</ddd></bbb>fff <ggg />"
    expected = ["aaa", "ccc", "eee", "fff"]
    assert tokenize(markup) == expected

....                                                                                                      [100%]
4 passed in 0.02s


## Task 2: Stopwords removal

Remove stopwords from a sequence of tokens, given a set of stopwords.

In [4]:
def remove_stopwords(tokens: List[str], stopwords: Set[str]) -> List[str]:
    """Returns the tokens that are not stopwords, in their original order."""
    kept = []
    for token in tokens:
        # Only whole-token matches are dropped; substrings do not count.
        if token in stopwords:
            continue
        kept.append(token)
    return kept

Tests.

In [5]:
%%run_pytest[clean]

def test_no_stopwords():
    """An empty stopword set leaves the token sequence unchanged."""
    # Note: `{}` is an empty dict literal; an empty set must be spelled `set()`.
    assert remove_stopwords(["this", "is", "some", "text"], set()) == ["this", "is", "some", "text"]
    
def test_stopwords():
    """Tokens that appear in the stopword set are removed."""
    tokens = ["this", "is", "some", "text"]
    assert remove_stopwords(tokens, {"is", "this"}) == ["some", "text"]
    
def test_stopwords2():
    """Tokens that merely contain a stopword as a substring are kept."""
    tokens = ["this", "isolate", "otto"]
    assert remove_stopwords(tokens, {"is", "this", "to"}) == ["isolate", "otto"]

...                                                                                                       [100%]
3 passed in 0.01s


## Task 3: Suffix-s stemming

Remove a trailing "s" from every term in a sequence (e.g., "dogs" becomes "dog").

In [6]:
def suffix_s_stemmer(terms: List[str]) -> List[str]:
    """Removes the s-suffix from all terms in a sequence.

    Args:
        terms: Terms to stem.

    Returns:
        A new list with one entry per input term, in the same order. A term
        ending in "s" loses that final character; any other term (including
        the empty string) is returned unchanged.
    """
    # endswith() is safe on an empty string, whereas indexing term[-1]
    # would raise IndexError.
    return [term[:-1] if term.endswith("s") else term for term in terms]

Tests.

In [7]:
%%run_pytest[clean]

def test_stemming():
    """A trailing "s" is stripped; terms without one are left alone."""
    expected = ["dog", "better", "cat"]
    assert suffix_s_stemmer(["dogs", "better", "cats"]) == expected

.                                                                                                         [100%]
1 passed in 0.01s
