In [740]:
import re
import math
import itertools
import numpy as np
import pandas as pd

In [741]:
class Tokenizer():
    def __init__(self):
        self.word_dict = {'oov': 0}
        self.fit_checker = False
        self.token = []

    def preprocessing(self, sequences):
        # 조건 1, 2 수행
        fx_make = lambda x: re.sub(r"[^a-zA-Z0-9]", " ", x).lower().split()
        result = [fx_make(s) for s in sequences]
        return result
  
    def fit(self, sequences):
        self.fit_checker = False
        
        # 조건 1 수행
        tokenized = self.preprocessing(sequences) 
        self.token = list(set(itertools.chain(*tokenized)))

        # 조건 2 수행
        _dict = {v: (i + 1) for i, v in enumerate(self.token)}
        self.word_dict = dict(_dict, **self.word_dict) 
        
        self.fit_checker = True
    
    def transform(self, sequences):
        result = []
        tokens = self.preprocessing(sequences)
        
        if self.fit_checker:
            fx_get_key = lambda x: x if x in self.word_dict.keys() else "oov"
            fx_get_val = lambda x: self.word_dict[fx_get_key(x)]
            result = [list(map(fx_get_val, t)) for t in tokens]
            
            return result
        else:
            raise Exception("Tokenizer instance is not fitted yet.")
      
    def fit_transform(self, sequences):
        self.fit(sequences)
        result = self.transform(sequences)
        return result
    

In [742]:
class TfidfVectorizer:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.fit_checker = False
        
        self.df_matrix = []
        self.idf_matrix = []
        self.tfidf_matrix = []
  
    def fit(self, sequences):
        tokenized = self.tokenizer.fit_transform(sequences)
        
        # list type의 df_matrix 생성
        fx_df = lambda x: sum([1 for tk in tokenized if x in tk])
        fx_get_idx = lambda x: self.tokenizer.word_dict[x]
        self.df_matrix = [fx_df(fx_get_idx(t)) for t in self.tokenizer.token]
    
        # idf 계산, 음수 처리 함수
        n = len(sequences)
        fx_idf = lambda x: math.log(n / (1 + x))
        fx_filter = lambda x: x if x >= 0 else 0
        
        # list type의 idf_matrix 생성
        self.idf_matrix = [fx_filter(fx_idf(df)) for df in self.df_matrix]  
        
        self.fit_checker = True
    
    def transform(self, sequences):
        if self.fit_checker:
            tokenized = self.tokenizer.transform(sequences)
            
            # idf 함수 -> [idf(1,1), idf(1,2) ... idf(1,n)]
            fx_get_idf = lambda y: np.array(list(map(lambda x: self.idf_matrix[x - 1], y)))

            # tf 함수 -> [tf(1,1), tf(1,2) ... tf(1,n)]
            fx_tf = lambda y: np.array(list(map(lambda x: y.count(x), y)))
            
            # array type의 tf, idf 곱셈 연산 후 list 형 변환 함수
            fx_multi = lambda x, y: (x * y).tolist()
            
            # tfidf 계산 함수, nested list 형태 출력
            self.tfidf_matrix = [fx_multi(fx_tf(tk), fx_get_idf(tk)) for tk in tokenized]
            
            return self.tfidf_matrix
        else:
            raise Exception("TfidfVectorizer instance is not fitted yet.")


    def fit_transform(self, sequences):
        self.fit(sequences)
        return self.transform(sequences)

# 결과 확인

In [743]:
# 실행 및 출력용 DF 생성 함수
def test(test_data):
    tk = Tokenizer()
    tfidf = TfidfVectorizer(tk)
    
    value = tfidf.fit_transform(test_data)
    original = tk.preprocessing(test_data)
    index = tk.token
    
    _data = {"Text_" + str(i+1) :[] for i in range(len(test_data))}

    for i,v in enumerate(original):
        for idx in index:
            key = "Text_" + str(i+1)
            try:
                _data[key].append(value[i][v.index(idx)])
            except:
                _data[key].append(0)

    results = pd.DataFrame(data=_data, index=index) 
                
    return results, tk, tfidf

### Sample Data

In [744]:
test_data = ["the best Italian restaurant enjoy the best pasta",
             "American restaurant enjoy the best hamburger",
             "Korean restaurant enjoy the best korean bibimbap",
             "the best the best American restaurant",
             "world best hamburger Macdonald"]

In [745]:
results, tk, tfidf = test(test_data)
results.style.highlight_max(color = 'lightgreen', axis = 0)

Unnamed: 0,Text_1,Text_2,Text_3,Text_4,Text_5
restaurant,0.0,0.0,0.0,0.0,0.0
enjoy,0.223144,0.223144,0.223144,0.0,0.0
pasta,0.916291,0.0,0.0,0.0,0.0
american,0.0,0.510826,0.0,0.510826,0.0
hamburger,0.0,0.510826,0.0,0.0,0.510826
korean,0.0,0.0,1.832581,0.0,0.0
the,0.0,0.0,0.0,0.0,0.0
italian,0.916291,0.0,0.0,0.0,0.0
bibimbap,0.0,0.0,0.916291,0.0,0.0
macdonald,0.0,0.0,0.0,0.0,0.916291


## 각 요소의 형태 확인

In [746]:
# 어휘 사전
print("Type :", type(tk.word_dict))
print("Value :\n", tk.word_dict)

Type : <class 'dict'>
Value :
 {'restaurant': 1, 'enjoy': 2, 'pasta': 3, 'american': 4, 'hamburger': 5, 'korean': 6, 'the': 7, 'italian': 8, 'bibimbap': 9, 'macdonald': 10, 'world': 11, 'best': 12, 'oov': 0}


In [747]:
# Tokenizer - transform() - Output
tk.transform(test_data)

[[7, 12, 8, 1, 2, 7, 12, 3],
 [4, 1, 2, 7, 12, 5],
 [6, 1, 2, 7, 12, 6, 9],
 [7, 12, 7, 12, 4, 1],
 [11, 12, 5, 10]]

In [748]:
# df_matrix
tfidf.df_matrix

[4, 3, 1, 2, 2, 1, 4, 1, 1, 1, 1, 5]

In [749]:
# idf_matrix
print("Type :", type(tfidf.idf_matrix))
print("Value :\n", tfidf.idf_matrix)

Type : <class 'list'>
Value :
 [0.0, 0.22314355131420976, 0.9162907318741551, 0.5108256237659907, 0.5108256237659907, 0.9162907318741551, 0.0, 0.9162907318741551, 0.9162907318741551, 0.9162907318741551, 0.9162907318741551, 0]


In [750]:
# TfidfVectorizer - transform() - Output(TF-IDF matrix)
print("Type :", type(tfidf.tfidf_matrix))
print("Value :")
for s in tfidf.tfidf_matrix:
    print(s)

Type : <class 'list'>
Value :
[0.0, 0.0, 0.9162907318741551, 0.0, 0.22314355131420976, 0.0, 0.0, 0.9162907318741551]
[0.5108256237659907, 0.0, 0.22314355131420976, 0.0, 0.0, 0.5108256237659907]
[1.8325814637483102, 0.0, 0.22314355131420976, 0.0, 0.0, 1.8325814637483102, 0.9162907318741551]
[0.0, 0.0, 0.0, 0.0, 0.5108256237659907, 0.0]
[0.9162907318741551, 0.0, 0.5108256237659907, 0.9162907318741551]
