In [120]:
import re
import math
import itertools
import numpy as np
import pandas as pd
from faker import Faker

In [628]:
class Tokenizer():
    """Lower-casing alphanumeric tokenizer that maps words to integer ids.

    Attributes:
        word_dict: Mapping ``word -> id``. Id ``0`` is reserved for the
            ``'oov'`` (out-of-vocabulary) entry; learned words get
            consecutive ids starting at 1 in first-seen order.
        fit_checker: ``True`` once :meth:`fit` has completed.
        token: Learned vocabulary in id order, i.e. ``token[i]`` has id
            ``i + 1``. It is cumulative across repeated :meth:`fit` calls.
    """

    def __init__(self):
        self.word_dict = {'oov': 0}
        self.fit_checker = False
        self.token = []

    def preprocessing(self, sequences):
        """Split each text into lower-cased alphanumeric words.

        Every character outside ``[a-zA-Z0-9]`` is treated as a separator.

        Args:
            sequences: Iterable of strings.

        Returns:
            A list with one list of word strings per input text.
        """
        return [
            re.sub(r"[^a-zA-Z0-9]", " ", text).lower().split()
            for text in sequences
        ]

    def fit(self, sequences):
        """Learn ids for every word in ``sequences`` that is not known yet.

        Words that already have an id keep it; new words receive the next
        free ids in the order they are first seen, so the result is
        reproducible across runs and contains no gaps.

        Args:
            sequences: Iterable of strings.
        """
        self.fit_checker = False

        tokenized = self.preprocessing(sequences)

        # dict.fromkeys de-duplicates while preserving first-seen order.
        # A set would make the id assignment depend on string hash
        # randomisation and therefore change between interpreter runs.
        seen_words = dict.fromkeys(itertools.chain.from_iterable(tokenized))
        new_words = [word for word in seen_words if word not in self.word_dict]

        # Continue numbering after the highest id handed out so far so that
        # no existing id is duplicated or lost.
        st_point = max(self.word_dict.values())
        new_ids = {
            word: word_id
            for word_id, word in enumerate(new_words, start=st_point + 1)
        }
        self.word_dict = {**self.word_dict, **new_ids}

        # Keep the vocabulary list aligned with the ids (token[i] <-> i + 1).
        self.token = sorted(
            (word for word in self.word_dict if word != 'oov'),
            key=self.word_dict.get,
        )

        self.fit_checker = True

    def transform(self, sequences):
        """Convert texts to lists of word ids.

        Args:
            sequences: Iterable of strings.

        Returns:
            A list with one list of integer ids per input text. Words that
            were never seen during fitting map to the ``'oov'`` id.

        Raises:
            RuntimeError: If :meth:`fit` has not been called yet.
        """
        if not self.fit_checker:
            raise RuntimeError("Tokenizer instance is not fitted yet.")

        oov_id = self.word_dict["oov"]
        return [
            [self.word_dict.get(word, oov_id) for word in words]
            for words in self.preprocessing(sequences)
        ]

    def fit_transform(self, sequences):
        """Fit on ``sequences`` and return their id representation."""
        self.fit(sequences)
        return self.transform(sequences)
    

In [636]:
class TfidfVectorizer:
    """TF-IDF weighting on top of a word-id tokenizer.

    The tokenizer must provide ``fit_transform``/``transform`` returning
    lists of integer ids, a ``word_dict`` mapping ``word -> id`` and a
    ``token`` list holding the vocabulary words.

    Attributes:
        tokenizer: The tokenizer passed to the constructor.
        fit_checker: ``True`` once :meth:`fit` has completed.
        df_matrix: Document frequency of each word in ``tokenizer.token``.
        idf_matrix: IDF of each word in ``tokenizer.token`` (same order).
        tfidf_matrix: Result of the most recent :meth:`transform` call.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.fit_checker = False
        self.idf_matrix = []
        self.df_matrix = []
        self.tfidf_matrix = []
        # id -> idf lookup, so transform() does not rely on ids being
        # exactly "position in tokenizer.token + 1".
        self._idf_by_id = {}
        # IDF used for ids that have no entry in the lookup (df == 0).
        self._unseen_idf = 0.0

    @staticmethod
    def _smoothed_idf(n_docs, doc_freq):
        """Return ``log(n_docs / (1 + doc_freq))`` clamped at zero."""
        if n_docs <= 0:
            return 0.0
        # Words present in every document would get a negative value;
        # those are clamped to 0.
        return max(0.0, math.log(n_docs / (1 + doc_freq)))

    def fit(self, sequences):
        """Fit the tokenizer and compute document frequencies and IDF.

        Args:
            sequences: Sized, re-iterable collection of strings.
        """
        self.fit_checker = False

        tokenized = self.tokenizer.fit_transform(sequences)
        n = len(sequences)

        # One pass over the corpus: count in how many documents each id
        # occurs.
        doc_freq = {}
        for doc in tokenized:
            for token_id in set(doc):
                doc_freq[token_id] = doc_freq.get(token_id, 0) + 1

        word_dict = self.tokenizer.word_dict
        vocab_ids = [word_dict[word] for word in self.tokenizer.token]
        self.df_matrix = [doc_freq.get(token_id, 0) for token_id in vocab_ids]
        self.idf_matrix = [self._smoothed_idf(n, df) for df in self.df_matrix]

        self._idf_by_id = dict(zip(vocab_ids, self.idf_matrix))
        self._unseen_idf = self._smoothed_idf(n, 0)
        # The out-of-vocabulary id 0 gets the IDF of its own document
        # frequency instead of borrowing the first vocabulary word's.
        self._idf_by_id.setdefault(0, self._smoothed_idf(n, doc_freq.get(0, 0)))

        self.fit_checker = True

    def _tfidf_row(self, doc):
        """Return the per-position TF-IDF array for one list of ids."""
        counts = {}
        for token_id in doc:
            counts[token_id] = counts.get(token_id, 0) + 1
        tf = np.array([counts[token_id] for token_id in doc], dtype=float)
        idf = np.array(
            [self._idf_by_id.get(token_id, self._unseen_idf) for token_id in doc],
            dtype=float,
        )
        return tf * idf

    def transform(self, sequences):
        """Compute TF-IDF weights for each text.

        Args:
            sequences: Iterable of strings.

        Returns:
            A list with one NumPy array per text. Entry ``k`` of an array is
            ``(count of the k-th word in that text) * idf(k-th word)``, so
            each array has one value per word position, not per vocabulary
            entry.

        Raises:
            RuntimeError: If :meth:`fit` has not been called yet.
        """
        if not self.fit_checker:
            raise RuntimeError("TfidfVectorizer instance is not fitted yet.")

        tokenized = self.tokenizer.transform(sequences)
        self.tfidf_matrix = [self._tfidf_row(doc) for doc in tokenized]
        return self.tfidf_matrix

    def fit_transform(self, sequences):
        """Fit on ``sequences`` and return their TF-IDF representation."""
        self.fit(sequences)
        return self.transform(sequences)

In [637]:
# Build one tokenizer and hand it to the vectorizer. gogogo() reads both
# names as module-level globals, so they must exist before it is called.
tk = Tokenizer()
tfidf = TfidfVectorizer(tk)

In [638]:
# Small sample corpus: five short restaurant-themed sentences.
test_data = [
    "the best Italian restaurant enjoy the best pasta",
    "American restaurant enjoy the best hamburger",
    "Korean restaurant enjoy the best korean bibimbap",
    "the best the best American restaurant",
    "world best hamburger Macdonald",
]

In [639]:
def gogogo(test_data):
    """Fit the global vectorizer on ``test_data`` and tabulate TF-IDF per word.

    Uses the module-level ``tfidf`` and ``tk`` objects.

    Args:
        test_data: Sized, re-iterable collection of strings.

    Returns:
        A DataFrame indexed by vocabulary word (the ``"oov"`` entry is
        excluded) with one ``Text_<k>`` column per input text. A cell holds
        the TF-IDF weight of the word in that text, or 0 when the word does
        not occur in it.
    """
    value = tfidf.fit_transform(test_data)
    original = tk.preprocessing(test_data)
    index = [word for word in tk.word_dict if word != "oov"]

    _data = {}
    for i, doc_words in enumerate(original):
        # Position of the first occurrence of each word in this text; the
        # vectorizer returns one weight per word position.
        first_pos = {}
        for pos, word in enumerate(doc_words):
            first_pos.setdefault(word, pos)

        # An explicit membership test replaces a catch-all handler, so real
        # errors are no longer silently turned into zeros.
        _data["Text_" + str(i + 1)] = [
            value[i][first_pos[word]] if word in first_pos else 0
            for word in index
        ]

    results = pd.DataFrame(data=_data, index=index)

    return results

In [640]:
# Fit on the sample corpus and collect the per-word TF-IDF table.
results = gogogo(test_data)

In [641]:
# Highlight the largest value of each column (axis=0) in light green.
results.style.highlight_max(color = 'lightgreen', axis = 0)

Unnamed: 0,Text_1,Text_2,Text_3,Text_4,Text_5
restaurant,0.0,0.0,0.0,0.0,0.0
enjoy,0.223144,0.223144,0.223144,0.0,0.0
pasta,0.916291,0.0,0.0,0.0,0.0
american,0.0,0.510826,0.0,0.510826,0.0
hamburger,0.0,0.510826,0.0,0.0,0.510826
korean,0.0,0.0,1.832581,0.0,0.0
the,0.0,0.0,0.0,0.0,0.0
italian,0.916291,0.0,0.0,0.0,0.0
bibimbap,0.0,0.0,0.916291,0.0,0.0
macdonald,0.0,0.0,0.0,0.0,0.916291


In [642]:
test_data

['the best Italian restaurant enjoy the best pasta',
 'American restaurant enjoy the best hamburger',
 'Korean restaurant enjoy the best korean bibimbap',
 'the best the best American restaurant',
 'world best hamburger Macdonald']