In [315]:
import re
import math
import itertools
import numpy as np
import pandas as pd

In [316]:
class Tokenizer():
    def __init__(self):
        self.word_dict = {'oov': 0}
        self.fit_checker = False

    def preprocessing(self, sequences):
        result = []
        
        # 문제 1-1        
        # 조건 1: 소문자로 변경 및 특수문자 제거
        # 조건 2: whited space 단위 토크나이징
        # output 조건: nested list type
        
        fx_make = lambda x: re.sub(r"[^a-zA-Z0-9]", " ", x).lower().split()
        result = [fx_make(s) for s in sequences]
        
        #########
        
        return result

    def fit(self, sequences):
        self.fit_checker = False
        
        # 문제 1-2
        # 조건 1: preprocessing 사용하여 토크나이징
        # 조건 2: 각 토큰의 정수 인덱싱을 위한 어휘 사전 생성
        # 어휘 사전 조건: slef.word_dict 활용
        
        tokenized = self.preprocessing(sequences) 
        tokens = list(set(itertools.chain(*tokenized)))

        _dict = {v: (i + 1) for i, v in enumerate(tokens)}
        self.word_dict = dict(_dict, **self.word_dict) 
        
        #########
        
        self.fit_checker = True
  
    def transform(self, sequences):
        result = []
        tokens = self.preprocessing(sequences)
        if self.fit_checker:
            
            # 문제 1-3
            # 조건 1: 어휘 사전(slef.word_dict)에 없는 단어는 "oov"의 index 0으로 변환
            # output 조건: nested list type (각 문장의 정수 인덱싱) 
            
            fx_get_key = lambda x: x if x in self.word_dict.keys() else "oov"
            fx_get_val = lambda x: self.word_dict[fx_get_key(x)]
            result = [list(map(fx_get_val, t)) for t in tokens]
            
            #########
            
            return result
        else:
            raise Exception("Tokenizer instance is not fitted yet.")
      
    def fit_transform(self, sequences):
        self.fit(sequences)
        result = self.transform(sequences)
        return result

In [317]:
class TfidfVectorizer:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.fit_checker = False

    def fit(self, sequences):
        tokenized = self.tokenizer.fit_transform(sequences)
        
        # 문제 2-1
        # 조건 1: IDF 행렬은 list 형태
        # 조건 2: IDF 값 계산 시 주어진 공식 사용
        # 조건 3: 입력된 문장의 토큰화에는 문제 1에서 만든 Tokenizer 사용
        
        token_word = self.tokenizer.word_dict.keys()

        fx_df = lambda x: sum([1 for tk in tokenized if x in tk])
        fx_get_idx = lambda x: self.tokenizer.word_dict[x]
        df_matrix = [fx_df(fx_get_idx(tw)) for tw in token_word]

        n = len(sequences)
        fx_idf = lambda x: math.log(n / (1 + x))
        
        # class init 내 변수 추가 불가, idf 값을 transform에서도 사용하기 위해
        # transform에서 최종 return하는 self.tfidf_matrix에 idf 값 임시 저장
        self.tfidf_matrix = [fx_idf(df) for df in df_matrix]  
        
        #########
        
        self.fit_checker = True


    def transform(self, sequences):
        if self.fit_checker:
            tokenized = self.tokenizer.transform(sequences)
            
            # 문제 2-2
            # 조건 1: 입력 문장의 TF 행렬 생성
            # 조건 2: 문제 2-1의 fit()에서 만든 IDF 행렬과 조건식을 사용하여 TF-IDF 행렬 생성
            # output 조건: nested list type
            
            token_number = list(self.tokenizer.word_dict.values())
            
            fx_tf = lambda x: np.array([x.count(tn) for tn in token_number])
            fx_multi = lambda x, y: (x * np.array(y)).tolist()
            
            self.tfidf_matrix = [fx_multi(fx_tf(tk), self.tfidf_matrix) for tk in tokenized]
            
            #########
            
            return self.tfidf_matrix
        else:
            raise Exception("TfidfVectorizer instance is not fitted yet.")


    def fit_transform(self, sequences):
        self.fit(sequences)
        return self.transform(sequences)

# 결과 확인

In [318]:
# 실행 및 출력용 DataFrame 생성 함수
def test(test_data):
    tk = Tokenizer()
    tfidf = TfidfVectorizer(tk)
    
    value = tfidf.fit_transform(test_data)
    original = list(tk.word_dict.keys())
        

    results = pd.DataFrame(data = value, columns = original, index = ["".join(["Text_0", str(i+1)]) for i in range(len(test_data))]) 
                
    return results, tk, tfidf

### Sample Data

In [319]:
test_data = ["the best Italian restaurant enjoy the best pasta",
             "American restaurant enjoy the best hamburger",
             "Korean restaurant enjoy the best korean bibimbap",
             "the best the best American restaurant",
             "world best hamburger Macdonald"]

In [320]:
results, tk, tfidf = test(test_data)
results.style.highlight_max(color = 'lightgreen', axis = 1)

Unnamed: 0,korean,best,bibimbap,restaurant,pasta,the,american,hamburger,world,macdonald,italian,enjoy,oov
Text_01,0.0,-0.364643,0.0,0.0,0.916291,0.0,0.0,0.0,0.0,0.0,0.916291,0.223144,0.0
Text_02,0.0,-0.182322,0.0,0.0,0.0,0.0,0.510826,0.510826,0.0,0.0,0.0,0.223144,0.0
Text_03,1.832581,-0.182322,0.916291,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.223144,0.0
Text_04,0.0,-0.364643,0.0,0.0,0.0,0.0,0.510826,0.0,0.0,0.0,0.0,0.0,0.0
Text_05,0.0,-0.182322,0.0,0.0,0.0,0.0,0.0,0.510826,0.916291,0.916291,0.0,0.0,0.0
