# 문제 1

In [1]:
import re
import math
import numpy as np

In [14]:
# Sample corpus used by the cells below.
# NOTE: this name shadows the builtin input(); it is kept because later
# cells refer to it by this name.
input = [
    'I go to school.',
    'I LIKE pizza!',
    "Let's get interested in Korean traditional games.",
    'games like pizza',
]

In [15]:
class Tokenizer:
  """Word-level tokenizer mapping lower-cased words to integer ids.

  Id 0 is reserved for out-of-vocabulary words and stored under the key
  ``'oov'`` in ``word_dict``; real words receive ids starting at 1 in order
  of first appearance.

  NOTE(review): because the OOV marker is the plain key ``'oov'``, the
  literal word "oov" in a text is also mapped to id 0.
  """

  # Drop everything except ASCII letters, digits, underscore and whitespace.
  # Whitespace (not only the space character) is preserved so that words
  # separated by tabs or newlines are not glued together.
  _STRIP_PATTERN = re.compile(r'[^a-zA-Z0-9_\s]')

  def __init__(self):
    self.word_dict = {'oov': 0}
    self.fit_checker = False

  def preprocessing(self, sequences):
    """Lower-case, strip punctuation and split each sentence into words.

    Args:
      sequences: iterable of strings (one sentence each).

    Returns:
      A list with one list of cleaned word strings per input sentence.
    """
    return [self._STRIP_PATTERN.sub('', s.lower()).split() for s in sequences]

  def fit(self, sequences):
    """Add every unseen word in ``sequences`` to the vocabulary.

    Calling ``fit`` again extends the existing vocabulary: known words keep
    their ids and new words continue after the highest id in use, so ids
    never collide across calls.

    Args:
      sequences: iterable of strings (one sentence each).
    """
    # Stay "unfitted" if preprocessing raises part-way through.
    self.fit_checker = False

    next_id = max(self.word_dict.values(), default=0) + 1
    for sentence in self.preprocessing(sequences):
      for word in sentence:
        if word not in self.word_dict:
          self.word_dict[word] = next_id
          next_id += 1
    self.fit_checker = True

  def transform(self, sequences):
    """Convert sentences into lists of vocabulary ids.

    Args:
      sequences: iterable of strings (one sentence each).

    Returns:
      A list of lists of ints; words missing from the vocabulary are
      replaced by the OOV id.

    Raises:
      RuntimeError: if the tokenizer has not been fitted yet.
    """
    if not self.fit_checker:
      raise RuntimeError("Tokenizer instance is not fitted yet.")

    oov_id = self.word_dict['oov']
    return [
        [self.word_dict.get(word, oov_id) for word in sentence]
        for sentence in self.preprocessing(sequences)
    ]

  def fit_transform(self, sequences):
    """Fit on ``sequences`` and return their id representation."""
    self.fit(sequences)
    return self.transform(sequences)

# 문제 2

In [16]:
class TfidfVectorizer:
  """TF-IDF vectorizer built on a tokenizer that yields integer token ids.

  The tokenizer must provide ``fit_transform(sequences)`` and
  ``transform(sequences)`` returning lists of id lists, plus a ``word_dict``
  mapping whose size is the vocabulary size plus one (id 0 is the
  out-of-vocabulary id, real tokens are 1..nof_token).

  Term frequency is the raw count of a token in a sentence and the inverse
  document frequency is ``log(N / (1 + df))`` where ``N`` is the number of
  fitted sentences and ``df`` the number of them containing the token.
  """

  def __init__(self, tokenizer):
    self.tokenizer = tokenizer
    self.fit_checker = False
    self.nof_sen = 0
    self.nof_token = 0
    self.idf = []

  def fit(self, sequences):
    """Fit the tokenizer on ``sequences`` and compute per-token IDF weights.

    Refitting replaces the previously computed weights instead of appending
    to them.

    Args:
      sequences: sized collection of strings (one sentence each).
    """
    # Stay "unfitted" if anything below raises.
    self.fit_checker = False
    tokenized = self.tokenizer.fit_transform(sequences)

    self.nof_token = len(self.tokenizer.word_dict) - 1  # tokens are numbered 1..nof_token
    self.nof_sen = len(sequences)

    # Document frequency in a single pass: each sentence contributes at most
    # once per token, hence the set().
    doc_freq = [0] * (self.nof_token + 1)
    for sentence in tokenized:
      for tok_id in set(sentence):
        if 1 <= tok_id <= self.nof_token:
          doc_freq[tok_id] += 1

    # Index i of self.idf holds the weight of token id i + 1.
    self.idf = [math.log(self.nof_sen / (1 + df)) for df in doc_freq[1:]]

    self.fit_checker = True

  def transform(self, sequences):
    """Return the TF-IDF matrix of ``sequences``.

    Args:
      sequences: iterable of strings (one sentence each).

    Returns:
      A float32 array of shape (number of input sentences, nof_token);
      column j corresponds to token id j + 1. The same array is stored in
      ``self.tfidf_matrix``.

    Raises:
      RuntimeError: if the vectorizer has not been fitted yet.
    """
    if not self.fit_checker:
      raise RuntimeError("TfidfVectorizer instance is not fitted yet.")

    tokenized = self.tokenizer.transform(sequences)

    # rows: input sentences, columns: tokens 1..nof_token
    counts = np.zeros((len(tokenized), self.nof_token), dtype=np.float32)
    for row, sentence in enumerate(tokenized):
      for tok_id in sentence:
        # Skip the OOV id (0) and ids unknown at fit time; without this
        # check id 0 would index column -1 and inflate the last token.
        if 1 <= tok_id <= self.nof_token:
          counts[row, tok_id - 1] += 1

    counts *= self.idf  # broadcast the IDF weights over every row

    self.tfidf_matrix = counts
    return counts

  def fit_transform(self, sequences):
    """Fit on ``sequences`` and return their TF-IDF matrix."""
    self.fit(sequences)
    return self.transform(sequences)

In [17]:
# Build the pipeline: the vectorizer keeps a reference to this tokenizer
# and fits it itself when its own fit() is called.
tokenizer = Tokenizer()
tfidfvectorizer = TfidfVectorizer(tokenizer)

In [18]:
# Fit on the sample corpus and display its TF-IDF matrix
# (rows: sentences, columns: token ids 1..nof_token).
tfidfvectorizer.fit_transform(input)

array([[0.2876821, 0.6931472, 0.6931472, 0.6931472, 0.       , 0.       ,
        0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
        0.       ],
       [0.2876821, 0.       , 0.       , 0.       , 0.2876821, 0.2876821,
        0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
        0.6931472, 0.6931472, 0.6931472, 0.6931472, 0.6931472, 0.6931472,
        0.2876821],
       [0.       , 0.       , 0.       , 0.       , 0.2876821, 0.2876821,
        0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
        0.2876821]], dtype=float32)

In [19]:
# Show the learned vocabulary as (word, id) pairs ordered by id.
sorted(tokenizer.word_dict.items(), key=lambda x:x[1])

[('oov', 0),
 ('i', 1),
 ('go', 2),
 ('to', 3),
 ('school', 4),
 ('like', 5),
 ('pizza', 6),
 ('lets', 7),
 ('get', 8),
 ('interested', 9),
 ('in', 10),
 ('korean', 11),
 ('traditional', 12),
 ('games', 13)]

In [20]:
# Echo the sample corpus for reference.
input

['I go to school.',
 'I LIKE pizza!',
 "Let's get interested in Korean traditional games.",
 'games like pizza']

In [21]:
# Encode the corpus as lists of token ids using the vocabulary fitted above.
tokenizer.transform(input)

[[1, 2, 3, 4], [1, 5, 6], [7, 8, 9, 10, 11, 12, 13], [13, 5, 6]]

In [22]:
# Per-token IDF weights computed by fit(); list index i holds the weight
# of token id i + 1.
tfidfvectorizer.idf

[0.28768207245178085,
 0.6931471805599453,
 0.6931471805599453,
 0.6931471805599453,
 0.28768207245178085,
 0.28768207245178085,
 0.6931471805599453,
 0.6931471805599453,
 0.6931471805599453,
 0.6931471805599453,
 0.6931471805599453,
 0.6931471805599453,
 0.28768207245178085]