In [39]:
class Tokenizer():
  """Whitespace tokenizer that maps lower-cased words to integer ids.

  The vocabulary lives in ``word_dict``; it starts with the single entry
  ``'oov': 0`` and grows each time ``fit`` is called. ``fit_checker`` is
  True once ``fit`` has completed at least once.
  """

  def __init__(self):
    self.fit_checker = False
    self.word_dict = {'oov': 0}

  def preprocessing(self, sequences):
    """Clean and split every string in ``sequences``.

    Characters that are neither alphanumeric nor a plain space are dropped,
    the remainder is lower-cased and split on whitespace.

    Returns a list with one list of words per input string.
    """
    # Exercise 1-1
    cleaned = []
    for text in sequences:
      kept_chars = [ch for ch in text if str.isalnum(ch) or ch == ' ']
      words = ''.join(kept_chars).lower().split()
      cleaned.append(words)
    return cleaned

  def fit(self, sequences):
    """Add every unseen word in ``sequences`` to ``word_dict``.

    New words receive consecutive ids, continuing after the largest id
    already stored in the vocabulary.
    """
    self.fit_checker = False
    # Exercise 1-2
    tokenized = self.preprocessing(sequences)
    next_id = max(self.word_dict.values()) + 1
    for words in tokenized:
      for word in words:
        if word not in self.word_dict:
          self.word_dict[word] = next_id
          next_id += 1
    self.fit_checker = True

  def transform(self, sequences):
    """Convert ``sequences`` into lists of word ids.

    Words missing from the vocabulary (and any word whose stored id is
    falsy, i.e. 0) are encoded with the id stored under ``'oov'``.

    Raises:
      Exception: if ``fit`` has not completed on this instance.
    """
    tokenized = self.preprocessing(sequences)

    if not self.fit_checker:
      raise Exception("Tokenizer instance is not fitted yet.")

    # Exercise 1-3
    # `or` only looks up the 'oov' entry when the word's own id is missing/0.
    return [
      [self.word_dict.get(word) or self.word_dict['oov'] for word in words]
      for words in tokenized
    ]

  def fit_transform(self, sequences):
    """Run ``fit`` on ``sequences`` and return ``transform`` of the same input."""
    self.fit(sequences)
    return self.transform(sequences)

In [141]:
from collections import Counter
from math import log

class TfidfVectorizer:
  """Build TF-IDF vectors on top of a tokenizer that yields integer ids.

  The tokenizer must provide ``fit_transform(sequences)``,
  ``transform(sequences)`` and a ``word_dict`` mapping words to ids that
  contains the entry ``'oov': 0``.

  Attributes set by ``fit``:
    tokens: vocabulary ids from ``word_dict`` with id 0 removed.
    tokens_name: vocabulary words from ``word_dict`` with 'oov' removed.
    IDF: one value per entry of ``tokens``: ``log(N / (1 + df))`` where N is
      ``len(sequences)`` and df is the number of documents containing the id.

  Attributes set by ``transform``:
    tf_matrix: per document, the raw count of every id in ``tokens``.
    tfidf_matrix: per document, count * IDF for every id in ``tokens``.
  """

  def __init__(self, tokenizer):
    self.tokenizer = tokenizer
    self.fit_checker = False

  def fit(self, sequences):
    """Fit the tokenizer on ``sequences`` and compute the IDF table.

    Note: ``log`` raises ValueError when ``sequences`` is empty while the
    tokenizer already holds vocabulary (the ratio becomes 0).
    """
    # Drop the fitted flag first so that a failed (re)fit cannot leave the
    # instance claiming to be fitted while its state is only half rebuilt.
    self.fit_checker = False
    tokenized = self.tokenizer.fit_transform(sequences)

    # Exercise 2-1
    self.tokens = list(self.tokenizer.word_dict.values())
    self.tokens.remove(0)
    self.tokens_name = list(self.tokenizer.word_dict.keys())
    self.tokens_name.remove('oov')

    # Document frequency: each document contributes at most once per id,
    # hence the set() before counting.
    doc_freq = Counter()
    for doc in tokenized:
      doc_freq.update(set(doc))

    n_docs = len(sequences)
    self.IDF = [log(n_docs / (1 + doc_freq[token])) for token in self.tokens]
    self.fit_checker = True

  def transform(self, sequences):
    """Return the TF-IDF matrix (list of rows) for ``sequences``.

    Also stores the result in ``tfidf_matrix`` and the raw counts in
    ``tf_matrix``. Ids that are not in ``tokens`` (such as the oov id 0) do
    not get a column.

    Raises:
      Exception: if ``fit`` has not completed on this instance.
    """
    if not self.fit_checker:
      raise Exception("TfidfVectorizer instance is not fitted yet.")

    tokenized = self.tokenizer.transform(sequences)

    # Exercise 2-2
    self.tfidf_matrix = []
    self.tf_matrix = []
    for doc in tokenized:
      # Count every id once per document instead of rescanning the
      # document for each vocabulary entry.
      counts = Counter(doc)
      tf_row = []
      tfidf_row = []
      for token, idf in zip(self.tokens, self.IDF):
        tf = counts[token]
        tf_row.append(tf)
        tfidf_row.append(tf * idf)
      self.tf_matrix.append(tf_row)
      self.tfidf_matrix.append(tfidf_row)
    return self.tfidf_matrix

  def fit_transform(self, sequences):
    """Run ``fit`` on ``sequences`` and return ``transform`` of the same input."""
    self.fit(sequences)
    return self.transform(sequences)