In [1]:
from collections import Counter
from abc import ABC, abstractmethod
import string

class TextProcessor(ABC):
    """Abstract base class for a single text-transformation step.

    Concrete subclasses implement ``transform`` to turn one piece of text
    into another. The class cannot be instantiated directly because
    ``transform`` is abstract.
    """

    @abstractmethod
    def transform(self, text):
        """Return the processed version of ``text``.

        The signature matches the one every subclass in this file uses, so
        ``super().transform(text)`` is callable from an override.

        Args:
            text: The input text to process.

        Returns:
            The processed text; this base implementation does nothing and
            returns ``None``.
        """
    
#----------------------------------------------------    
class ConvertCase(TextProcessor):
    """Change the letter case of an entire string.

    With ``casing='lower'`` the text is lower-cased; any other value of
    ``casing`` results in upper-casing.
    """

    def __init__(self, casing='lower'):
        # Stored as-is; it is only compared against 'lower' in transform().
        self.casing = casing

    def transform(self, text):
        """Return ``text`` converted to the case chosen at construction."""
        wants_lower = self.casing == 'lower'
        return text.lower() if wants_lower else text.upper()
    
#-----------------------------------------------------
class RemoveDigit(TextProcessor):
    """Drop every character that appears in ``string.digits`` (0-9)."""

    def transform(self, text):
        """Return ``text`` with the digit characters removed."""
        kept = [ch for ch in text if ch not in string.digits]
        return ''.join(kept)

#-----------------------------------------------------
class RemoveSpace(TextProcessor):
    """Normalise whitespace: runs collapse to one space, ends are trimmed."""

    def transform(self, text):
        """Return the whitespace-separated pieces of ``text`` joined by single spaces."""
        pieces = text.split()
        return ' '.join(pieces)

#---------------------------------------------------
class RemovePunkt(TextProcessor):
    """Drop every character that appears in ``string.punctuation``."""

    def transform(self, text):
        """Return ``text`` with the punctuation characters removed."""
        kept = [ch for ch in text if ch not in string.punctuation]
        return ''.join(kept)

#---------------------------------------------------
class PipeLineText(TextProcessor):
    """Apply a sequence of processors one after another.

    The processors are given positionally and run in that order; each one
    receives the output of the previous one.
    """

    def __init__(self, *args):
        # Tuple of processor objects, each exposing ``transform(text)``.
        self.args = args

    def __repr__(self):
        """Show the class names of the steps joined by ' ==> '."""
        step_names = (step.__class__.__name__ for step in self.args)
        return ' ==> '.join(step_names)

    def transform(self, text):
        """Feed ``text`` through every step and return the final result."""
        result = text
        for step in self.args:
            result = step.transform(result)
        return result

In [2]:
# define functions:
def preprocess_txt(text, *processes):
    """Run ``text`` through the given processors, in order.

    Args:
        text: The text to process.
        *processes: Processor objects passed on to ``PipeLineText``.

    Returns:
        Whatever the resulting pipeline's ``transform`` returns for ``text``.
    """
    pipeline = PipeLineText(*processes)
    return pipeline.transform(text)


def count_words(text):
    """Count how often each whitespace-separated word occurs in ``text``.

    Args:
        text: The string to tokenise with ``str.split()``.

    Returns:
        A ``Counter`` mapping each word to its number of occurrences.
    """
    tokens = text.split()
    tally = Counter(tokens)
    return tally

In [6]:
# Build the cleaning pipeline once: the steps are the same for every line,
# so there is no need to re-create the processor objects inside the loop.
pipeline = PipeLineText(
    ConvertCase(),
    RemoveDigit(),
    RemovePunkt(),
    RemoveSpace(),
)

counter = Counter()
# Explicit encoding so the result does not depend on the platform default.
with open('data/file.txt', encoding='utf-8') as f:
    for line in f:
        # Accumulate the per-line word counts into the running total.
        counter.update(count_words(pipeline.transform(line)))

print(dict(counter))
# default=None keeps max() from raising ValueError when no words were found
# (e.g. an empty file).
print('most freq word = ', max(counter, key=counter.get, default=None))
print('longest word   = ', max(counter, key=len, default=None))


{'woman': 3, 'life': 5, 'freedom': 1, 'or': 1, 'liberty': 1, 'is': 1, 'a': 3, 'popular': 1, 'political': 1, 'kurdish': 1, 'slogan': 1, 'that': 1, 'became': 1, 'rallying': 1, 'cry': 1, 'during': 1, 'the': 2, 'protests': 1, 'which': 1, 'occurred': 1, 'as': 1, 'response': 1, 'to': 1, 'death': 1, 'of': 1, 'mahsa': 1, 'amini': 1, 'voman': 1}
most freq word =  life
longest word   =  political
