In [None]:
import re

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

import contractions

# Download the NLTK data packages requested by this notebook, in a fixed order.
for resource in ("stopwords", "wordnet", "punkt", "averaged_perceptron_tagger"):
    nltk.download(resource)

# Short function words and leftover single-letter fragments that are dropped from the
# token stream. Only membership is tested, so each entry is listed exactly once.
stop_words_list = [
    "a",
    "an",
    "the",
    "and",
    "at",
    "by",
    "to",
    "in",
    "out",
    "y",
    "are",
    "is",
    "as",
    "s",
    "t",
    "for",
]


def remove_http_links(text):
    """
    Replaces every web link in 'text' with a single space.

    A link is any run of non-whitespace characters that starts with 'http', or that starts with
    the literal prefix 'www.' (three w's followed by a dot).

    Returns the text with each link replaced by one space; surrounding whitespace is left as is.
    """
    # The dot after 'www' is escaped so it matches only a literal '.'. Unescaped, it matched any
    # character and deleted ordinary text that merely contained 'www' (e.g. 'awwward').
    return re.sub(r"http\S+|www\.\S+", " ", text)


def remove_html_tags(text):
    """
    Replaces HTML markup in 'text' with single spaces.

    Two kinds of markup are recognised:
      * tags: the shortest run from '<' to the next '>' on the same line;
      * character entities: '&name;', '&#123;' or '&#x1f;' written in lowercase
        (uppercase entities such as '&AMP;' are not matched).

    Each match is replaced by exactly one space.
    """
    markup = re.compile(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")
    return markup.sub(" ", text)


def get_wordnet_pos(treebank_tag):
    """
    Translates a Penn Treebank POS tag into the matching WordNet POS constant.

    The decision is made on the tag's leading letter:
        J... -> wordnet.ADJ
        V... -> wordnet.VERB
        R... -> wordnet.ADV
    Every other tag, including the N... noun tags, maps to wordnet.NOUN.
    """
    if treebank_tag.startswith("J"):
        return wordnet.ADJ
    if treebank_tag.startswith("V"):
        return wordnet.VERB
    if treebank_tag.startswith("R"):
        return wordnet.ADV
    # Noun tags and anything unrecognised share the same answer.
    return wordnet.NOUN


def lemmatize_word(word, pos_tag):
    """
    Returns the WordNet lemma of 'word' for the given WordNet part of speech 'pos_tag'.

    A new WordNetLemmatizer is created on every call.
    """
    return WordNetLemmatizer().lemmatize(word, pos=pos_tag)


def replace_digits(match):
    """
    Substitution callback for re.sub: returns the look-alike letter for the matched digit.

    '1' -> 'i', '0' -> 'o', '3' -> 'e'. Matched text outside this table raises KeyError.
    """
    lookalikes = {"1": "i", "0": "o", "3": "e"}
    digit = match.group(0)
    return lookalikes[digit]


def replace_digit_with_letter(text):
    """
    Undoes simple leetspeak substitutions inside words.

    A single '1', '0' or '3' that has an ASCII letter directly before it and directly after it
    is rewritten as 'i', 'o' or 'e' respectively (the mapping lives in replace_digits).
    Digits at the edge of a word, or next to another digit, are left untouched.

    Example:
        h3llo w0rld -> hello world
    """
    leet_digit = re.compile(r"(?<=[a-zA-Z])[103](?=[a-zA-Z])")
    return leet_digit.sub(replace_digits, text)


def reduce_repeated_letters(text):
    """
    Compresses sequences of identical letters occurring more than twice in a row within a given string 'text'
    to a single instance of that letter.

    Only letters are affected: repeated digits, punctuation and whitespace are left unchanged, so numbers
    such as '1000' are not corrupted. Matching is case-sensitive, and runs of exactly two letters are kept.

    This function is particularly useful in normalizing text with exaggerated letter repetitions, commonly found in
    informal communication like social media posts or text messages.

    Example:
        heeeello -> hello
        worlllld -> world
        1000 -> 1000
    """
    # [^\W\d_] means "a word character that is neither a digit nor an underscore", i.e. a letter.
    # The previous '(.)' also collapsed digits, punctuation and spaces.
    return re.sub(r"([^\W\d_])\1{2,}", r"\1", text)


def reduce_haha(text):
    """
    Shortens every word that begins with 'haha' to exactly 'haha'.

    The match must start at a word boundary and is case-sensitive, so 'hahahaha' becomes 'haha'
    while 'ahaha' and 'Hahaha' are left unchanged.
    """
    laughter = re.compile(r"\bhaha\w*\b")
    return laughter.sub("haha", text)


def replace_hyphens_with_spaces(text):
    """
    Returns 'text' with every hyphen '-' turned into a single space ' '.

    Example:
        well-known -> well known
    """
    pieces = text.split("-")
    return " ".join(pieces)


def remove_non_word_characters(text):
    """
    Deletes every character that is neither a word character nor whitespace.

    Word characters are letters, digits and the underscore, so punctuation and symbols are
    removed while underscores are kept.

    Example:
        hi, there! -> hi there
    """
    unwanted = re.compile(r"[^\w\s]")
    return unwanted.sub("", text)


def replace_specific_word(word_to_replace, new_word, text):
    """
    Replaces occurrences of a specified word with a new word in the given text.
    This function targets only standalone instances of the specified word,
    not parts of other words.

    Parameters:
        word_to_replace: the literal word to look for (regex metacharacters are escaped).
        new_word: the literal replacement; backslashes and group references in it are
            inserted verbatim, never interpreted.
        text: the string to process.

    Returns 'text' with every standalone occurrence replaced. An empty 'word_to_replace'
    returns 'text' unchanged.
    """
    if not word_to_replace:
        # An empty pattern would match at every boundary and scatter 'new_word' through the text.
        return text

    # Lookarounds instead of \b: they behave the same for ordinary words, and also work when the
    # word starts or ends with a non-word character (e.g. 'c++'), where \b would demand a word
    # character on the outside.
    pattern = r"(?<!\w){}(?!\w)".format(re.escape(word_to_replace))

    # A callable replacement is inserted as-is; a plain string would be treated as a template
    # in which sequences like '\1' or '\g<name>' are expanded or raise re.error.
    return re.sub(pattern, lambda _match: new_word, text)


def remove_spaces_from_spaced_words(text):
    """
    Joins words whose letters have been spaced out, e.g. 'h e l l o' -> 'hello'.

    A spaced word is a run of at least three single word characters, each separated from the next
    by exactly one whitespace character. The separators inside such a run are removed, whatever
    kind of whitespace they are.

    Note that any qualifying run is joined, including genuine one-letter words written in a row.

    Example:
        f u c k you -> fuck you
    """
    pattern = r"(?:\b\w\s){2,}\w\b"

    def replace_spaces(match):
        # The pattern accepts any whitespace separator (\s), so strip all of them; removing only
        # the literal space left tab- or newline-separated letters matched but unchanged.
        return re.sub(r"\s", "", match.group())

    return re.sub(pattern, replace_spaces, text)


def remove_digits(text):
    """
    Returns 'text' with every digit deleted; all other characters, including the whitespace
    around a removed number, are kept.

    Example:
        abc123def -> abcdef
    """
    digit_run = re.compile(r"\d+")
    return digit_run.sub("", text)


def normalize_whitespace(text):
    """
    Normalizes whitespace in the string, replacing multiple consecutive whitespace characters with a single space,
    and trims leading and trailing whitespace.

    Example:
        '  hello   world  ' -> 'hello world'
    """
    # The substitution alone leaves one space at each end when the input is padded, so strip
    # afterwards to deliver the trimming described above.
    return re.sub(r"\s+", " ", text).strip()


def filter_text(text):
    """
    Cleans a raw piece of text and returns it as a space-separated string of lemmatized words.

    Steps, in order:
        1. lowercase the text;
        2. strip web links and HTML tags/entities;
        3. undo leetspeak digits, collapse exaggerated letter repetitions and long 'haha' variants;
        4. turn hyphens into spaces and expand contractions;
        5. drop punctuation, fix the spelling 'fck', drop digits, join spaced-out words;
        6. normalize whitespace and split into word tokens;
        7. remove the words listed in stop_words_list;
        8. POS-tag the remaining tokens and lemmatize each one with its tag.

    Returns the lemmatized tokens joined by single spaces (an empty string when nothing is left).
    """
    # Lowercasing comes first because the patterns used further down are written in lowercase.
    text = text.lower()
    text = remove_http_links(text)
    text = remove_html_tags(text)
    text = replace_digit_with_letter(text)
    text = reduce_repeated_letters(text)
    text = reduce_haha(text)
    text = replace_hyphens_with_spaces(text)
    # Contractions are expanded before punctuation is removed, while the apostrophes still exist.
    text = contractions.fix(text)
    text = remove_non_word_characters(text)
    text = replace_specific_word("fck", "fuck", text)
    text = remove_digits(text)
    text = remove_spaces_from_spaced_words(text)

    # Split into word tokens. Iterating over the normalized string itself would yield single
    # characters, so stop-word filtering, tagging and lemmatization must run on the split list.
    tokens = normalize_whitespace(text).split()

    filtered_tokens = [word for word in tokens if word not in stop_words_list]

    pos_tags = nltk.pos_tag(filtered_tokens)
    lemmatized_tokens = [
        lemmatize_word(word, get_wordnet_pos(pos)) for word, pos in pos_tags
    ]

    return " ".join(lemmatized_tokens)

In [None]:
# Quick check: collapse the padding first, then show the resulting word list.
text = "             hello           world                   aaaaaaaaaaaaa           dddddddddddddddd             rrrrrrrrrrr  rr  "
collapsed = normalize_whitespace(text)
print(collapsed)
print(collapsed.strip().split())

In [25]:
# Tag and lemmatize a whitespace-split sentence that contains a contraction.
filtered_tokens = "I'm tired".strip().split()
pos_tags = nltk.pos_tag(filtered_tokens)
print(pos_tags)

lemmatized_tokens = [
    lemmatize_word(token, get_wordnet_pos(tag))
    for token, tag in pos_tags
]
print(lemmatized_tokens)

[("I'm", 'NNP'), ('tired', 'VBD')]
["I'm", 'tire']


In [28]:
# Sample sentences for checking POS tagging and lemmatization on tokenized input.
sentences = [
    "She is running every day.",
    "He bought running shoes.",
    "The cat is sitting on the mat.",
    "It was a bright, sunny day.",
]

for sentence in sentences:
    # Tokenize with NLTK, then tag the tokens.
    tokens = nltk.word_tokenize(sentence)
    tagged = nltk.pos_tag(tokens)

    print(f"Sentence: '{sentence}'")
    print("Tags:", tagged)

    # Lemmatize each token using the WordNet POS derived from its tag.
    lemmatized_tokens = [
        lemmatize_word(token, get_wordnet_pos(tag))
        for token, tag in tagged
    ]
    print(lemmatized_tokens)

Sentence: 'She is running every day.'
Tags: [('She', 'PRP'), ('is', 'VBZ'), ('running', 'VBG'), ('every', 'DT'), ('day', 'NN'), ('.', '.')]
['She', 'be', 'run', 'every', 'day', '.']
Sentence: 'He bought running shoes.'
Tags: [('He', 'PRP'), ('bought', 'VBD'), ('running', 'VBG'), ('shoes', 'NNS'), ('.', '.')]
['He', 'buy', 'run', 'shoe', '.']
Sentence: 'The cat is sitting on the mat.'
Tags: [('The', 'DT'), ('cat', 'NN'), ('is', 'VBZ'), ('sitting', 'VBG'), ('on', 'IN'), ('the', 'DT'), ('mat', 'NN'), ('.', '.')]
['The', 'cat', 'be', 'sit', 'on', 'the', 'mat', '.']
Sentence: 'It was a bright, sunny day.'
Tags: [('It', 'PRP'), ('was', 'VBD'), ('a', 'DT'), ('bright', 'JJ'), (',', ','), ('sunny', 'JJ'), ('day', 'NN'), ('.', '.')]
['It', 'be', 'a', 'bright', ',', 'sunny', 'day', '.']
