In [17]:
from collections import Counter


class CountVectorizer:
    def __init__(self):
        self.feature_names = set()

    def fit_transform(self, corpus):
        self.fit(corpus)
        return self.transform(corpus)

    def fit(self, corpus):
        for one_text in corpus:
            words = one_text.lower().split()
            for word in words:
                self.feature_names.add(word)

    def transform(self, corpus):
        count_matrix = []
        for one_text in corpus:
            row = []
            counter = Counter(one_text.lower().split())
            for word in self.feature_names:
                row.append(counter.get(word, 0))
            count_matrix.append(row)
        return count_matrix

    def get_feature_names(self):
        return self.feature_names

class TfIdfTransformer:
    def tf_transform(self, count_matrix: []) -> []:
        transform = []
        for one_matrix in count_matrix:
            sum_ = sum(one_matrix)
            one_transform = [round(el / sum_, 3) for el in one_matrix]
            transform.append(one_transform)
        return transform

    def idf_transform(self, count_matrix: []) -> []:
        result = []
        document_count = len(count_matrix) + 1
        for column in range(len(count_matrix[0])):
            cur_sum = 0
            for row in range(len(count_matrix)):
                cur_sum += bool(count_matrix[row][column])
            result.append(cur_sum + 1)
        for i in range(len(result)):
            result[i] = round(math.log(result[i] / document_count) + 1, 3)
        return result

    def fit_transform(self, count_matrix: []) -> []:
        tf = self.tf_transform(count_matrix)
        idf = self.idf_transform(count_matrix)
        tf_idf = []
        for text in tf:
            tf_idf.append([round(a * b, 3) for a, b in zip(text, idf)])
        return tf_idf


class TfIdfVectorizer(CountVectorizer):
    def __init__(self) -> None:
        super().__init__()
        self._tfidf_transformer = TfIdfTransformer()

    def fit_transform(self, corpus):
        count_matrix = super().fit_transform(corpus)
        return self._tfidf_transformer.fit_transform(count_matrix)
    
if __name__ == '__main__':
    corpus_ = [
        'Crock Pot Pasta Never boil pasta again',
        'Pasta Pomodoro Fresh ingredients Parmesan to taste'
    ]
    vectorizer = CountVectorizer()
    count_matrix_ = vectorizer.fit_transform(corpus_)
    transformer = TfIdfTransformer()
    tf_vectorizer = TfIdfVectorizer()
    print(vectorizer.get_feature_names())
    print(count_matrix_)
    print(transformer.tf_transform(count_matrix_))
    print(transformer.idf_transform(count_matrix_))
    print(transformer.fit_transform(count_matrix_))
    print(tf_vectorizer.fit_transform(corpus_))

{'crock', 'again', 'fresh', 'never', 'to', 'taste', 'pot', 'pasta', 'pomodoro', 'parmesan', 'ingredients', 'boil'}
[[1, 1, 0, 1, 0, 0, 1, 2, 0, 0, 0, 1], [0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0]]
[[0.143, 0.143, 0.0, 0.143, 0.0, 0.0, 0.143, 0.286, 0.0, 0.0, 0.0, 0.143], [0.0, 0.0, 0.143, 0.0, 0.143, 0.143, 0.0, 0.143, 0.143, 0.143, 0.143, 0.0]]
[0.595, 0.595, 0.595, 0.595, 0.595, 0.595, 0.595, 1.0, 0.595, 0.595, 0.595, 0.595]
[[0.085, 0.085, 0.0, 0.085, 0.0, 0.0, 0.085, 0.286, 0.0, 0.0, 0.0, 0.085], [0.0, 0.0, 0.085, 0.0, 0.085, 0.085, 0.0, 0.143, 0.085, 0.085, 0.085, 0.0]]
[[0.085, 0.085, 0.0, 0.085, 0.0, 0.0, 0.085, 0.286, 0.0, 0.0, 0.0, 0.085], [0.0, 0.0, 0.085, 0.0, 0.085, 0.085, 0.0, 0.143, 0.085, 0.085, 0.085, 0.0]]
