In [52]:
import re
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords


TEXT CLEANING

In [21]:
def clean_up(s):
    """
    Lowercase a string and strip URLs, digits, and selected punctuation.

    URL-like tokens are removed *before* punctuation and digits are replaced,
    so links that contain hyphens, underscores, digits, or a path are removed
    as a whole instead of leaving fragments behind.

    Args:
        s: The string to be cleaned up.

    Returns:
        The lowercased string with URL-like tokens deleted and with every
        digit and every one of the characters ``- _ ( ) ' @ #`` replaced by
        a single space. Whitespace is neither collapsed nor trimmed.
    """
    s = s.lower()
    # A URL-like token is a whitespace-delimited run containing a scheme
    # ("http://" or "https://"), a "www." prefix, or a ".com" domain ending.
    # The lookbehind anchors the match at the start of the token so the
    # whole token is dropped, not just the part after the marker.
    url_token = r"(?<!\S)\S*(?:https?://|www\.|\.com\b)\S*"
    s = re.sub(url_token, "", s)
    # Raw strings avoid invalid-escape warnings; a leading '-' inside a
    # character class is a literal hyphen.
    s = re.sub(r"[-_()'@#]", " ", s)
    s = re.sub(r"[0-9]", " ", s)
    return s

In [23]:
# Sample input mixing punctuation, digits, and a URL.
s_to_clean = "@Ironhack's-#Q website 776-is http://ironhack.com"

# Expected result, ignoring extra whitespace: ironhack s  q website  is

# Clean the sample and print it; later cells reuse `clean_string`.
clean_string = clean_up(s_to_clean)
print(clean_string)

 ironhack s  q website     is 


TOKENIZATION


In [24]:
def tokenize(s):
    """
    Split a string into word tokens with NLTK's ``word_tokenize``.

    Args:
        s: The text to split.

    Returns:
        The list of tokens produced by ``word_tokenize`` for ``s``.
    """
    tokens = word_tokenize(s)
    return tokens
    

In [39]:
# Tokenize the cleaned sample; the trailing bare name echoes the result
# as the notebook cell's output.
tokenized_string = tokenize(clean_string)
tokenized_string

['ironhack', 's', 'q', 'website', 'is']

In [50]:
def stem_and_lemmatize(lista):
    """
    Lemmatize and then stem every word in a list.

    Each word is first passed through ``WordNetLemmatizer.lemmatize`` and the
    resulting lemma is then passed through ``PorterStemmer.stem``.

    Args:
        lista: A list of strings.

    Returns:
        A new list of strings, in the same order as ``lista``, holding the
        stemmed lemma of each input word.
    """
    # Build the lemmatizer and stemmer once rather than once per word.
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    return [stemmer.stem(lemmatizer.lemmatize(word)) for word in lista]
    

In [55]:
# Lemmatize and stem the tokens; the trailing bare name echoes the result
# as the notebook cell's output.
stemmed_string = stem_and_lemmatize(tokenized_string)
stemmed_string

['ironhack', 's', 'q', 'websit', 'is']

In [53]:
def remove_stopwords(l):
    """
    Remove English stopwords from a list of strings.

    Args:
        l: A list of strings.

    Returns:
        A one-element list whose only item is the remaining words joined by
        single spaces (an empty string when no word remains).
    """
    # A set gives constant-time membership tests instead of scanning the
    # stopword list once per input word.
    english_stop_words = set(stopwords.words('english'))
    kept_words = [word for word in l if word not in english_stop_words]
    # NOTE(review): the result is wrapped as a single joined string inside a
    # list, not one list item per word. That shape is preserved here for
    # existing callers -- confirm whether it is intended.
    return [' '.join(kept_words)]

In [56]:
# Drop English stopwords from the stemmed tokens; the bare call echoes the
# result as the notebook cell's output.
remove_stopwords(stemmed_string)

['ironhack q websit']