## 문제 1) Tokenizer 생성하기
### 1-1 preprocessing()
- 텍스트 전처리 하는 함수

### 1-2 fit()
- 어휘 사전을 구축하는 함수

### 1-3 transform()
- 어휘 사전을 활용하여 입력 문장을 정수 인덱싱하는 함수

In [1]:
import re
from collections import Counter
import numpy as np

In [6]:
class Tokenizer():
  """Word-level tokenizer that maps lowercase alphanumeric tokens to integer ids.

  Attributes:
    word_dict: token -> integer id mapping. The key 'oov' is reserved with
      id 0 and is used for tokens that are not in the vocabulary.
    fit_checker: True once fit() has completed successfully.
  """

  def __init__(self):
    self.word_dict = {'oov': 0}
    self.fit_checker = False

  def preprocessing(self, sequences):
    """Lowercase each sentence, replace non-alphanumeric runs with a space, and split.

    Args:
      sequences: iterable of sentence strings.

    Returns:
      A list with one list of string tokens per input sentence.
    """
    return [
        re.sub(r"[^a-zA-Z0-9]+", ' ', sentence.lower()).split()
        for sentence in sequences
    ]

  def fit(self, sequences):
    """Add every new token found in `sequences` to the vocabulary.

    Ids are handed out in order of first appearance. Entries already present
    in word_dict are kept, so repeated calls extend the vocabulary.

    Args:
      sequences: iterable of sentence strings.
    """
    # Stay "unfitted" until the whole vocabulary has been built.
    self.fit_checker = False
    vocab = self.word_dict
    for sentence_tokens in self.preprocessing(sequences):
        for token in sentence_tokens:
            if token not in vocab:
                vocab[token] = len(vocab)
    self.fit_checker = True

  def transform(self, sequences):
    """Convert sentences into lists of integer ids using the fitted vocabulary.

    Args:
      sequences: iterable of sentence strings.

    Returns:
      A list with one list of integer ids per sentence; unknown tokens are
      mapped to the id stored under 'oov'.

    Raises:
      Exception: if fit() has not been completed on this instance.
    """
    tokenized = self.preprocessing(sequences)
    if not self.fit_checker:
        raise Exception("Tokenizer instance is not fitted yet.")
    vocab = self.word_dict
    return [
        [vocab[token] if token in vocab else vocab['oov'] for token in sentence_tokens]
        for sentence_tokens in tokenized
    ]

  def fit_transform(self, sequences):
    """Fit the vocabulary on `sequences`, then return their integer encoding."""
    self.fit(sequences)
    return self.transform(sequences)

## 문제 2) Tfidf Vectorizer 생성하기
TF-IDF - 문서 내 단어 빈도(TF)와 역문서 빈도(IDF)를 곱하여 단어의 중요도를 수치화한 점수

### 2-1 fit()
- 입력 문장들을 이용해 단어별 IDF 값(벡터) 생성

### 2-2 transform()
- 입력 문장들을 이용해 TF-IDF 행렬 생성



In [7]:
class TfidfVectorizer:
  """TF-IDF vectorizer built on top of an integer-indexing tokenizer.

  The tokenizer must provide fit_transform(sequences) and
  transform(sequences), each returning one list of integer ids per sentence.
  Id 0 is treated as "out of vocabulary"; ids 1..V map to columns 0..V-1.

  Attributes:
    tokenizer: the tokenizer passed to the constructor.
    fit_checker: True once fit() has completed.
    idf_lst: list of IDF values; idf_lst[k] belongs to token id k + 1.
    tfidf_matrix: result of the most recent transform() call.
  """

  def __init__(self, tokenizer):
    self.tokenizer = tokenizer
    self.fit_checker = False
    self.idf_lst = []

  def fit(self, sequences):
    """Compute one IDF value per token id from `sequences`.

    IDF is log(N / (df + 1)), where N is the number of sentences and df is
    the number of sentences that contain the token.

    Args:
      sequences: iterable of sentence strings.
    """
    tokenized = self.tokenizer.fit_transform(sequences)
    n_docs = len(tokenized)
    # set(ids) makes this a document frequency: a token repeated inside one
    # sentence is counted once for that sentence.
    doc_freq = Counter(idx for ids in tokenized for idx in set(ids))
    # Largest id seen while fitting defines the number of columns.
    vocab_size = max(doc_freq, default=0)
    # Rebuild from scratch so calling fit() again does not append to old values.
    self.idf_lst = [
        np.log(n_docs / (doc_freq[idx] + 1))
        for idx in range(1, vocab_size + 1)
    ]
    self.fit_checker = True

  def transform(self, sequences):
    """Build the TF-IDF matrix for `sequences`.

    Args:
      sequences: iterable of sentence strings.

    Returns:
      numpy array of shape (number of sentences, fitted vocabulary size)
      holding raw term counts multiplied by the fitted IDF values.

    Raises:
      Exception: if fit() has not been called on this instance.
    """
    if not self.fit_checker:
        raise Exception("TfidfVectorizer instance is not fitted yet.")
    tokenized = self.tokenizer.transform(sequences)
    # Width comes from the fitted vocabulary, not from the current input, so
    # the columns always line up with idf_lst.
    vocab_size = len(self.idf_lst)
    matrix = np.zeros((len(tokenized), vocab_size))
    for row, ids in enumerate(tokenized):
        for idx in ids:
            # Skip id 0 (out of vocabulary) and ids unknown at fit time;
            # idx - 1 for id 0 would otherwise hit the last column.
            if 1 <= idx <= vocab_size:
                matrix[row, idx - 1] += 1
    self.tfidf_matrix = matrix * np.asarray(self.idf_lst, dtype=float)
    return self.tfidf_matrix

  def fit_transform(self, sequences):
    """Fit the IDF values on `sequences` and return their TF-IDF matrix."""
    self.fit(sequences)
    return self.transform(sequences)

In [8]:
# Test: fit the TF-IDF vectorizer on four short sentences and transform them.
lst = ['I go to school.', 'I LIKE pizza!', 'i to', 'journey']
tk = Tokenizer()
tfv = TfidfVectorizer(tk)
x = tfv.fit_transform(lst)
# Trailing expression so the notebook cell displays the matrix shape.
x.shape

(4, 7)

In [9]:
x

array([[0.        , 0.69314718, 0.28768207, 0.69314718, 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.69314718,
        0.69314718, 0.        ],
       [0.        , 0.        , 0.28768207, 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.69314718]])