In [1]:
import re
import math
import itertools
import numpy as np
import pandas as pd

In [2]:
class Tokenizer():
    def __init__(self):
        self.word_dict = {'oov': 0}
        self.fit_checker = False

    def preprocessing(self, sequences):
        result = []
        
        ##########
        
        fx_make = lambda x: re.sub(r"[^a-zA-Z0-9]", " ", x).lower().split()
        result = [fx_make(s) for s in sequences]
        
        ##########
        
        return result

    def fit(self, sequences):
        self.fit_checker = False
        
        ##########
        
        tokenized = self.preprocessing(sequences) 
        token = list(set(itertools.chain(*tokenized)))

        _dict = {v: (i + 1) for i, v in enumerate(token)}
        self.word_dict = dict(_dict, **self.word_dict) 
        
        ##########
        
        self.fit_checker = True
  
    def transform(self, sequences):
        result = []
        tokens = self.preprocessing(sequences)
        if self.fit_checker:
            
            ##########
            
            fx_get_key = lambda x: x if x in self.word_dict.keys() else "oov"
            fx_get_val = lambda x: self.word_dict[fx_get_key(x)]
            result = [list(map(fx_get_val, t)) for t in tokens]
            
            ##########
            
            return result
        else:
            raise Exception("Tokenizer instance is not fitted yet.")
      
    def fit_transform(self, sequences):
        self.fit(sequences)
        result = self.transform(sequences)
        return result

In [3]:
class TfidfVectorizer:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.fit_checker = False

    def fit(self, sequences):
        tokenized = self.tokenizer.fit_transform(sequences)
        
        ##########
        
        token = self.tokenizer.word_dict.keys()

        fx_df = lambda x: sum([1 for tk in tokenized if x in tk])
        fx_get_idx = lambda x: self.tokenizer.word_dict[x]
        df_matrix = [fx_df(fx_get_idx(t)) for t in token]
    
        n = len(sequences)
        fx_idf = lambda x: math.log(n / (1 + x))
        self.tfidf_matrix = [fx_idf(df) for df in df_matrix]  
        
        ##########
        
        self.fit_checker = True


    def transform(self, sequences):
        if self.fit_checker:
            tokenized = self.tokenizer.transform(sequences)
            
            ##########
            
            fx_get_idf = lambda y: np.array(list(map(lambda x: self.tfidf_matrix[x - 1], y)))
            fx_tf = lambda y: np.array(list(map(lambda x: y.count(x), y)))
            fx_multi = lambda x, y: (x * y).tolist()
            
            self.tfidf_matrix = [fx_multi(fx_tf(tk), fx_get_idf(tk)) for tk in tokenized]
            
            ##########
            
            return self.tfidf_matrix
        else:
            raise Exception("TfidfVectorizer instance is not fitted yet.")


    def fit_transform(self, sequences):
        self.fit(sequences)
        return self.transform(sequences)

# 결과 확인

In [4]:
# 실행 및 출력용 DF 생성 함수
def test(test_data):
    tk = Tokenizer()
    tfidf = TfidfVectorizer(tk)
    
    value = tfidf.fit_transform(test_data)
    original = tk.preprocessing(test_data)
    index = tk.word_dict.keys()
    
    _data = {"Text_" + str(i+1) :[] for i in range(len(test_data))}

    for i,v in enumerate(original):
        for idx in index:
            key = "Text_" + str(i+1)
            try:
                _data[key].append(value[i][v.index(idx)])
            except:
                _data[key].append(0)

    results = pd.DataFrame(data=_data, index=index) 
                
    return results, tk, tfidf

### Sample Data

In [5]:
test_data = ["the best Italian restaurant enjoy the best pasta",
             "American restaurant enjoy the best hamburger",
             "Korean restaurant enjoy the best korean bibimbap",
             "the best the best American restaurant",
             "world best hamburger Macdonald"]

In [6]:
results, tk, tfidf = test(test_data)
results.style.highlight_max(color = 'lightgreen', axis = 0)

Unnamed: 0,Text_1,Text_2,Text_3,Text_4,Text_5
best,-0.364643,-0.182322,-0.182322,-0.364643,-0.182322
the,0.0,0.0,0.0,0.0,0.0
hamburger,0.0,0.510826,0.0,0.0,0.510826
restaurant,0.0,0.0,0.0,0.0,0.0
pasta,0.916291,0.0,0.0,0.0,0.0
bibimbap,0.0,0.0,0.916291,0.0,0.0
enjoy,0.223144,0.223144,0.223144,0.0,0.0
korean,0.0,0.0,1.832581,0.0,0.0
american,0.0,0.510826,0.0,0.510826,0.0
italian,0.916291,0.0,0.0,0.0,0.0


## 각 요소의 형태 확인

In [7]:
# 어휘 사전
print("Type :", type(tk.word_dict))
print("Value :\n", tk.word_dict)

Type : <class 'dict'>
Value :
 {'best': 1, 'the': 2, 'hamburger': 3, 'restaurant': 4, 'pasta': 5, 'bibimbap': 6, 'enjoy': 7, 'korean': 8, 'american': 9, 'italian': 10, 'world': 11, 'macdonald': 12, 'oov': 0}


In [8]:
# Tokenizer - transform() - Output
tk.transform(test_data)

[[2, 1, 10, 4, 7, 2, 1, 5],
 [9, 4, 7, 2, 1, 3],
 [8, 4, 7, 2, 1, 8, 6],
 [2, 1, 2, 1, 9, 4],
 [11, 1, 3, 12]]

In [16]:
# TfidfVectorizer - transform() - Output(TF-IDF matrix)
print("Type :", type(tfidf.tfidf_matrix))
print("Value :")
tokens = tk.preprocessing(test_data)

for i,s in enumerate(tfidf.tfidf_matrix):
    for j in range(len(tokens[i])):
        print(tokens[i][j], round(s[j]), end = " ")
    print()

Type : <class 'list'>
Value :
the 0 best 0 italian 1 restaurant 0 enjoy 0 the 0 best 0 pasta 1 
american 1 restaurant 0 enjoy 0 the 0 best 0 hamburger 1 
korean 2 restaurant 0 enjoy 0 the 0 best 0 korean 2 bibimbap 1 
the 0 best 0 the 0 best 0 american 1 restaurant 0 
world 1 best 0 hamburger 1 macdonald 1 


In [17]:
sample_text = ['I go to school.', 'I LIKE pizza!']

In [19]:
sample, tk, tfidf = test(sample_text)
sample.style.highlight_max(color = 'lightgreen', axis = 0)

Unnamed: 0,Text_1,Text_2
to,0.0,0.0
pizza,0.0,0.0
go,0.0,0.0
like,0.0,0.0
i,-0.405465,-0.405465
school,0.0,0.0
oov,0.0,0.0


In [21]:
tk.preprocessing(sample_text)

[['i', 'go', 'to', 'school'], ['i', 'like', 'pizza']]

In [22]:
tk.word_dict

{'to': 1, 'pizza': 2, 'go': 3, 'like': 4, 'i': 5, 'school': 6, 'oov': 0}

In [24]:
tk.fit_transform(sample_text)

[[5, 3, 1, 6], [5, 4, 2]]

In [26]:
tfidf.tfidf_matrix

[[-0.40546510810816444, 0.0, 0.0, 0.0], [-0.40546510810816444, 0.0, 0.0]]