In [32]:
import re
from collections import Counter
from typing import Generator, Iterable, Union

import contractions
from autocorrect import Speller


class TextProcessor:
    """Normalise raw text and optionally split it into tokens.

    ``process`` (also reachable by calling the instance) runs the steps in a
    fixed order: lowercase, expand contractions, strip punctuation,
    spell-correct and, if requested, tokenize on whitespace.
    """

    def __init__(self, lang: str = 'en'):
        """Build the spell checker used by ``correct_spelling``.

        Args:
            lang: Language code forwarded to ``Speller``.
        """
        self.autocorrect = Speller(lang=lang)

    def to_lowercase(self, text: str) -> str:
        """Return ``text`` converted to lowercase."""
        return text.lower()

    def remove_punctuation(self, text: str) -> str:
        """Remove every character that is neither a word character nor whitespace.

        Underscores are kept because the regex word class includes them.
        """
        return re.sub(r'[^\w\s]', '', text)

    def correct_spelling(self, text: str) -> str:
        """Return ``text`` after passing it through the configured ``Speller``."""
        return self.autocorrect(text)

    def expand_contractions(self, text: str) -> str:
        """Return ``text`` after passing it through ``contractions.fix``."""
        return contractions.fix(text)

    def tokenize(self, text: str) -> Generator[str, None, None]:
        """Lazily yield the whitespace-separated tokens of ``text``.

        The result is a generator, so it can be iterated only once.
        """
        yield from text.split()

    def process(
        self, text: str, tokenize: bool = True
    ) -> Union[str, Generator[str, None, None]]:
        """Run the full normalisation pipeline on ``text``.

        Args:
            text: Raw input string.
            tokenize: When true (the default) the cleaned text is split into
                tokens; when false the cleaned string itself is returned.

        Returns:
            A single-use generator of tokens if ``tokenize`` is true,
            otherwise the cleaned string.
        """
        text = self.to_lowercase(text)
        # Contractions are expanded before punctuation is stripped, while the
        # apostrophes are still present in the text.
        text = self.expand_contractions(text)
        text = self.remove_punctuation(text)
        text = self.correct_spelling(text)
        if tokenize:
            return self.tokenize(text)
        return text

    __call__ = process

class FrequencyAnalyzer:
    """Count how often each token occurs in a stream of tokens."""

    def word_frequencies(self, text: Iterable[str]) -> dict[str, int]:
        """Tally the occurrences of every word in ``text``.

        Args:
            text: Any iterable of tokens (list, tuple, generator, ...). It is
                traversed exactly once, so a generator is exhausted afterwards.

        Returns:
            A plain ``dict`` mapping each distinct word to its count, with
            keys in order of first appearance. An empty iterable gives ``{}``.
        """
        # iter() makes Counter count elements even when handed a mapping;
        # without it Counter would read the mapping's values as counts.
        return dict(Counter(iter(text)))

    __call__ = word_frequencies


In [34]:
# Sample input: three short sentences, one of which contains a contraction.
text = "Python is great! Python is easy to learn. Isn't Python amazing?"

analyzer = FrequencyAnalyzer()
processor = TextProcessor()

# With tokenize=True the pipeline returns a lazy, single-use generator, which
# is why this cell displays a generator object rather than the tokens.
processed_text = processor.process(text, tokenize=True)
processed_text

<generator object TextProcessor.tokenize at 0x7f7ab65dcee0>

In [35]:
# Tokenise afresh here rather than reusing `processed_text`: that generator is
# exhausted after one pass, so re-running this cell would otherwise return {}.
frequencies = analyzer(processor(text, tokenize=True))
frequencies

{'python': 3,
 'is': 3,
 'great': 1,
 'easy': 1,
 'to': 1,
 'learn': 1,
 'not': 1,
 'amazing': 1}