<a href="https://colab.research.google.com/github/hyelimchoi1223/wanted_pre_onboarding/blob/main/%EC%9B%90%ED%8B%B0%EB%93%9C_%ED%94%84%EB%A6%AC%EC%98%A8%EB%B3%B4%EB%94%A9_%EC%BD%94%EC%8A%A4_%EC%A0%9C%EC%B6%9C.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tokenizer 생성하기

In [5]:
import re

In [6]:
class Tokenizer():
  """Word-level tokenizer that maps lowercase alphabetic words to integer ids.

  The vocabulary lives in ``word_dict`` (word -> id). Id 0 is reserved under
  the key ``'oov'`` and is emitted for every token missing from the
  vocabulary at transform time.
  """

  # Matches every character that is neither an ASCII letter nor a space.
  _NON_ALPHA = re.compile(r'[^a-zA-Z ]')

  def __init__(self):
    self.word_dict = {'oov': 0}
    self.fit_checker = False

  def preprocessing(self, sequences):
    """Normalize and split each input string into a list of word tokens.

    Each string is lowercased, stripped of every character other than ASCII
    letters and spaces, and then split on runs of spaces.

    Args:
      sequences: iterable of strings.

    Returns:
      A list with one token list per input string. A string that contains
      no letters yields an empty list.
    """
    result = []
    for s in sequences:
      cleaned = self._NON_ALPHA.sub('', s.lower())
      # split() without an argument drops the empty strings that
      # split(' ') would produce for repeated/leading/trailing spaces or
      # an empty input; those would otherwise become vocabulary entries.
      result.append(cleaned.split())
    return result

  def fit(self, sequences):
    """Add every previously unseen word in ``sequences`` to the vocabulary.

    New words receive consecutive ids in order of first appearance. The
    existing vocabulary is kept, so repeated calls extend it.

    Args:
      sequences: iterable of strings.
    """
    self.fit_checker = False

    # Compute the next free id once instead of rescanning the whole
    # vocabulary for every token.
    next_idx = max(self.word_dict.values(), default=-1) + 1
    for tokens in self.preprocessing(sequences):
      for token in tokens:
        if token not in self.word_dict:
          self.word_dict[token] = next_idx
          next_idx += 1

    self.fit_checker = True

  def transform(self, sequences):
    """Encode each input string as a list of vocabulary ids.

    Args:
      sequences: iterable of strings.

    Returns:
      A list with one id list per input string; tokens missing from the
      vocabulary are encoded with the ``'oov'`` id.

    Raises:
      Exception: if ``fit`` has not completed on this instance.
    """
    if not self.fit_checker:
      raise Exception("Tokenizer instance is not fitted yet.")

    oov_idx = self.word_dict['oov']
    return [
      [self.word_dict.get(token, oov_idx) for token in tokens]
      for tokens in self.preprocessing(sequences)
    ]

  def fit_transform(self, sequences):
    """Fit the vocabulary on ``sequences`` and return their encoding."""
    self.fit(sequences)
    return self.transform(sequences)
  

## 테스트 결과

In [71]:
# Demo: fit a fresh tokenizer on two sentences and encode them in one call.
# As the last expression of the cell, the encoded result is shown as output.
tokenizer = Tokenizer()
tokenizer.fit_transform(['I go to school.', 'I LIKE pizza!'])

[[1, 2, 3, 4], [1, 5, 6]]

# TfidfVectorizer 생성하기


In [15]:
import math

In [74]:
class TfidfVectorizer:
  """TF-IDF vectorizer built on top of a tokenizer object.

  The tokenizer must provide ``fit_transform(sequences)`` and
  ``transform(sequences)`` returning lists of token ids, and a ``word_dict``
  mapping whose values are those ids. Each output row has one column per
  ``word_dict`` entry, in the mapping's iteration order.

  Attributes:
    tokenizer: the wrapped tokenizer.
    fit_checker: True once ``fit`` has completed.
    idf_array: IDF weight per vocabulary entry, from the latest ``fit``.
    tfidf_matrix: result of the latest ``transform`` call.
  """

  def __init__(self, tokenizer):
    self.tokenizer = tokenizer
    self.fit_checker = False
    self.idf_array = []
    self.tfidf_matrix = []

  def __idf(self, n, df):
    """Return ``log(n / (1 + df))`` clamped to be non-negative.

    Args:
      n: number of documents in the fitted corpus.
      df: number of those documents containing the term.
    """
    if n == 0:
      # An empty corpus carries no information; also avoids log(0).
      return 0.0
    result = math.log(n / (1 + df))
    if result <= 0:
      result = 0.0
    return result

  def __tf_idf(self, d):
    """Return the TF-IDF row for one tokenized document ``d``."""
    # Count every token once instead of calling d.count() per vocab entry.
    counts = {}
    for t in d:
      counts[t] = counts.get(t, 0) + 1
    return [
      counts.get(token_id, 0) * self.idf_array[col]
      for col, token_id in enumerate(self.tokenizer.word_dict.values())
    ]

  def fit(self, sequences):
    """Fit the tokenizer on ``sequences`` and compute the IDF weights.

    Any IDF weights from a previous ``fit`` are discarded so that
    ``idf_array`` always lines up with the tokenizer's vocabulary.

    Args:
      sequences: iterable of strings (the training corpus).
    """
    tokenized = self.tokenizer.fit_transform(sequences)
    n = len(tokenized)

    # Document frequency in a single pass over the corpus; set(d) makes
    # each document count at most once per token.
    doc_freq = {}
    for d in tokenized:
      for t in set(d):
        doc_freq[t] = doc_freq.get(t, 0) + 1

    self.idf_array = [
      self.__idf(n, doc_freq.get(token_id, 0))
      for token_id in self.tokenizer.word_dict.values()
    ]
    self.fit_checker = True

  def transform(self, sequences):
    """Return the TF-IDF matrix for ``sequences``.

    Args:
      sequences: iterable of strings.

    Returns:
      A list with one row (list of floats) per input string. The same list
      is stored in ``tfidf_matrix``, replacing the previous call's result.

    Raises:
      Exception: if ``fit`` has not completed on this instance.
    """
    if not self.fit_checker:
      raise Exception("TfidfVectorizer instance is not fitted yet.")

    tokenized = self.tokenizer.transform(sequences)
    # Build a fresh matrix so rows from earlier calls never leak into this
    # result and previously returned matrices are not mutated.
    self.tfidf_matrix = [self.__tf_idf(d) for d in tokenized]
    return self.tfidf_matrix

  def fit_transform(self, sequences):
    """Fit on ``sequences`` and return their TF-IDF matrix."""
    self.fit(sequences)
    return self.transform(sequences)

## 테스트 결과

In [75]:
# Demo: wrap a fresh Tokenizer in a TfidfVectorizer, fit it on a
# three-sentence corpus, vectorize that same corpus, and print the matrix.
tokenizer = Tokenizer()
tf = TfidfVectorizer(tokenizer)
matrix = tf.fit_transform(['I go to school.', 'I LIKE pizza!', 'I went to school.'])
print(matrix)

[[0.0, 0.0, 0.4054651081081644, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.4054651081081644, 0.4054651081081644, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4054651081081644]]
