In [4]:
import itertools
import math
import re
from collections import Counter

import numpy as np
from faker import Faker

In [5]:
class Tokenizer:
    """Whitespace tokenizer that maps normalized words to integer ids.

    A word is normalized by removing every character outside ``[a-zA-Z0-9]``
    and lower-casing what remains. Id ``0`` is reserved under the key
    ``'oov'`` and is returned for any word missing from the vocabulary.

    Attributes:
        word_dict: Mapping from normalized word to integer id.
        fit_checker: True once ``fit`` has completed at least once.
    """

    # Matches every character that is not an ASCII letter or digit.
    _NON_ALNUM = re.compile(r"[^a-zA-Z0-9]")

    def __init__(self):
        self.word_dict = {'oov': 0}
        self.fit_checker = False

    def preprocessing(self, sequences):
        """Split each text on whitespace and normalize every word.

        Args:
            sequences: Iterable of strings.

        Returns:
            A list with one list of normalized words per input string.
        """
        tokenized = []
        for sentence in sequences:
            words = (self._NON_ALNUM.sub("", raw).lower() for raw in sentence.split())
            # A word made only of punctuation collapses to "" after
            # normalization; drop it so the empty string never becomes a
            # vocabulary entry.
            tokenized.append([word for word in words if word])
        return tokenized

    def fit(self, sequences):
        """Add every unseen word in ``sequences`` to the vocabulary.

        Ids already assigned are never changed, so ``fit`` may be called
        repeatedly. New words are numbered contiguously after the current
        highest id, in sorted order, so the same input always produces the
        same mapping regardless of string-hash randomization.

        Args:
            sequences: Iterable of strings.
        """
        seen = set(itertools.chain.from_iterable(self.preprocessing(sequences)))
        unseen = sorted(word for word in seen if word not in self.word_dict)

        # Continue after the highest existing id so that dictionary values are
        # neither duplicated nor lost.
        first_id = max(self.word_dict.values()) + 1

        merged = dict(self.word_dict)
        merged.update((word, first_id + offset) for offset, word in enumerate(unseen))
        self.word_dict = merged

        self.fit_checker = True

    def transform(self, sequences):
        """Convert each text into the list of ids of its words.

        Args:
            sequences: Iterable of strings.

        Returns:
            A list with one list of integer ids per input string; words that
            are not in the vocabulary get the ``'oov'`` id.

        Raises:
            RuntimeError: If ``fit`` has not been called yet.
        """
        if not self.fit_checker:
            raise RuntimeError("Tokenizer instance is not fitted yet.")

        oov_id = self.word_dict["oov"]
        return [
            [self.word_dict.get(word, oov_id) for word in words]
            for words in self.preprocessing(sequences)
        ]

    def fit_transform(self, sequences):
        """Run ``fit`` on ``sequences`` and return ``transform`` of the same input."""
        self.fit(sequences)
        return self.transform(sequences)
    

In [6]:
class TfidfVectorizer:
    """Compute a TF-IDF weight for every token position of every document.

    The output is not a vocabulary-sized vector: each document becomes a list
    with one weight per token, in token order. For a token ``t`` in document
    ``d`` the weight is ``tf * idf`` where ``tf`` is the number of times ``t``
    occurs in ``d`` and ``idf = log(n / (1 + df))``, with ``n`` the number of
    fitted documents and ``df`` the number of fitted documents containing
    ``t``. With this smoothing, a token present in every document gets a
    slightly negative idf.

    Attributes:
        tokenizer: Object providing ``fit_transform`` and ``transform`` that
            turn texts into lists of hashable token ids.
        fit_checker: True once ``fit`` has completed.
        idf_matrix: Idf of each token position of the fitted documents.
        tfidf_matrix: Result of the most recent ``transform`` call.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.fit_checker = False
        self.idf_matrix = []
        self.tfidf_matrix = []
        # Idf per token id learned by fit(); lets transform() score documents
        # other than the ones it was fitted on.
        self._idf_by_token = {}
        # Idf for ids that occur in no fitted document (document frequency 0).
        self._unseen_idf = 0.0

    def _idf(self, token_id):
        """Return the fitted idf of ``token_id``, or the df=0 value if unseen."""
        return self._idf_by_token.get(token_id, self._unseen_idf)

    def fit(self, sequences):
        """Fit the tokenizer and learn an idf value for every token id.

        Args:
            sequences: Iterable of strings forming the corpus.
        """
        tokenized = self.tokenizer.fit_transform(sequences)
        n_docs = len(tokenized)

        # Document frequency: each document counts a token at most once.
        doc_freq = Counter()
        for doc in tokenized:
            doc_freq.update(set(doc))

        if n_docs:
            self._idf_by_token = {
                token_id: math.log(n_docs / (1 + df))
                for token_id, df in doc_freq.items()
            }
            self._unseen_idf = math.log(n_docs)
        else:
            # log(0) is undefined; with an empty corpus every weight is 0.
            self._idf_by_token = {}
            self._unseen_idf = 0.0

        self.idf_matrix = [[self._idf(token_id) for token_id in doc] for doc in tokenized]
        self.fit_checker = True

    def transform(self, sequences):
        """Return the TF-IDF weights of ``sequences``.

        Args:
            sequences: Iterable of strings; it does not have to be the corpus
                that was passed to ``fit``.

        Returns:
            A list with one list of floats per document, one weight per token.
            The same list is stored in ``tfidf_matrix``.

        Raises:
            RuntimeError: If ``fit`` has not been called yet.
        """
        if not self.fit_checker:
            raise RuntimeError("TfidfVectorizer instance is not fitted yet.")

        weighted = []
        for doc in self.tokenizer.transform(sequences):
            term_freq = Counter(doc)
            weighted.append([term_freq[token_id] * self._idf(token_id) for token_id in doc])

        self.tfidf_matrix = weighted
        return self.tfidf_matrix

    def fit_transform(self, sequences):
        """Run ``fit`` on ``sequences`` and return ``transform`` of the same input."""
        self.fit(sequences)
        return self.transform(sequences)

In [7]:
# Seed Faker's shared random generator so the generated corpus, and every
# result derived from it, is the same after Restart Kernel -> Run All.
SEED = 42
Faker.seed(SEED)

fake = Faker()
# The vectorizer keeps a reference to this tokenizer, so `tk` can be used later
# to tokenize texts exactly the way the vectorizer does.
tk = Tokenizer()
tfidf = TfidfVectorizer(tk)

In [8]:
# Generate 100 fake texts with Faker to serve as the sample corpus.
text = fake.texts(nb_texts = 100)

In [10]:
# Preview the first three generated texts.
text[:3]

['Space must management order answer month stage when. Number financial east leave condition social. Beat avoid candidate school raise.',
 'Explain than whether. Guess lot blood land little. Yet hard compare year may against ready.',
 'Nothing rule amount fire population. They although form grow allow clearly.\nEffect type teach seek wide. Hundred ground unit energy head.']

In [12]:
# Fit the vectorizer on the corpus and get one TF-IDF weight per token of each text.
value = tfidf.fit_transform(text)
# Tokenize the same texts so a position in `value[i]` can be mapped back to its word.
original = tk.preprocessing(text)

In [22]:
# For each of the first ten texts, print the text and then the word whose
# TF-IDF weight is the largest (the first such word when there is a tie).
for i in range(10):
    print(text[i])
    weights = value[i]
    top_position = weights.index(max(weights))
    print(original[i][top_position])
    print()

Space must management order answer month stage when. Number financial east leave condition social. Beat avoid candidate school raise.
answer

Explain than whether. Guess lot blood land little. Yet hard compare year may against ready.
yet

Nothing rule amount fire population. They although form grow allow clearly.
Effect type teach seek wide. Hundred ground unit energy head.
they

Throughout course land any. Usually produce themselves senior. Baby need drop either morning such subject money.
Form which begin attorney door phone. Financial authority prove ball.
begin

Develop stock many day beyond. Body lay knowledge at enough environmental.
Performance instead must find. Pay kitchen scene miss maintain. Them great mouth check wide.
beyond

Try black produce behavior write my. Order focus speech player school actually thousand. Power for sense low.
Action again quite. Suffer while about thank expect. Red statement authority student.
black

Rest discuss team if population. Agent old often