In [None]:
import re
from typing import Generator

import contractions
from autocorrect import Speller
from nltk.corpus import wordnet


class TextProcessor:

    def __init__(self, lang: str = 'en'):
        self.autocorrect = Speller(lang=lang)


    def to_lowercase(self, text: str) -> str:
        return text.lower()
    
    def remove_punctuation(self, text: str) -> str:
        return re.sub(r'[^\w\s]', '', text)

    def correct_spelling(self, text: Generator[str, None, None]) -> Generator[str, None, None]:
        for word in text:
            yield self.autocorrect(word)

    def expand_contractions(self, text: str) -> str:
        return contractions.fix(text)
    
    def tokenize(self, text: str) -> Generator[str, None, None]:
        for token in text.split():
            yield token
    
    def is_valid_word(self, word: str) -> bool:
        return bool(wordnet.synsets(word))

    def remove_repeated_characters(self, tokens: Generator[str, None, None]) -> Generator[str, None, None]:
        repeatPattern = re.compile(r'(\w*)(\w)\2(\w*)')

        def replace_with_valid_word(word: str) -> str:
            while not self.is_valid_word(word):
                newWord = repeatPattern.sub(r'\1\2\3', word)
                if newWord == word:
                    break
                word = newWord
            return word
    
        for word in tokens:
            yield replace_with_valid_word(word)


    def process(self, text: str) -> str:
        text = self.to_lowercase(text)
        text = self.expand_contractions(text)
        text = self.remove_punctuation(text)

        text = self.tokenize(text)
        text = self.remove_repeated_characters(text)
        text = self.correct_spelling(text)
        return text
    
    __call__ = process

class FrequencyAnalyzer:

    def __init__(self):
        pass

    def word_frequencies(self, text: list[str]) -> dict[str, int]:
        freq = {}
        for word in text:
            freq[word] = freq.get(word, 0) + 1
        return freq
    
    __call__ = word_frequencies


In [17]:
text = "Python is great! Python is easy to learn. Isn't Python amazing?"
sentence = ' My Schoooooool is reeeeeeaaaallllllllly amaaaaaazingggg!'

processor = TextProcessor()
analyzer = FrequencyAnalyzer()

processed_text = processor(text)
processed_text

processed_sentence = processor(sentence)
processed_sentence

<generator object TextProcessor.correct_spelling at 0x7f365dba5540>

In [18]:
frequencies = analyzer(processed_sentence)
frequencies

{'my': 1, 'school': 1, 'is': 1, 'really': 1, 'amazing': 1}