In [1]:
import re
import math
import numpy as np
from collections import Counter

# 문제 1

In [3]:
class Tokenizer():
    """Whitespace tokenizer that maps lower-cased alphanumeric words to integer ids.

    Attributes:
        word_dict: Mapping from word to integer id. Id 0 is reserved under the
            key ``'oov'`` and is used for every word missing from the mapping.
        fit_checker: True once ``fit`` has completed at least once.
    """

    def __init__(self):
        # Id 0 is reserved for out-of-vocabulary words.
        self.word_dict = {'oov': 0}
        self.fit_checker = False

    def preprocessing(self, sequences):
        """Lower-case each sentence, strip special characters and split into words.

        Args:
            sequences: Iterable of strings (one sentence per item).

        Returns:
            A list with one list of word strings per input sentence.
        """
        result = []
        for s in sequences:
            # Keep letters, digits and *any* whitespace. Tabs/newlines must survive
            # this step, otherwise the words around them are glued together before
            # the whitespace split below.
            rm_special = re.sub(r'[^a-zA-Z0-9\s]', '', s.lower())
            result.append(rm_special.split())  # split on runs of whitespace

        return result

    def fit(self, sequences):
        """Add every unseen word in ``sequences`` to ``word_dict``.

        Ids are assigned in order of first appearance. Calling ``fit`` again
        extends the existing vocabulary: known words keep their ids and new
        words receive the next unused id.

        Args:
            sequences: Iterable of strings (one sentence per item).
        """
        self.fit_checker = False

        tokens = self.preprocessing(sequences)
        for s in tokens:
            for word in s:
                if word not in self.word_dict:
                    # The next free id is the current vocabulary size; a counter
                    # restarted at 1 would collide with ids from an earlier fit.
                    self.word_dict[word] = len(self.word_dict)

        self.fit_checker = True

    def transform(self, sequences):
        """Convert sentences into lists of word ids.

        Args:
            sequences: Iterable of strings (one sentence per item).

        Returns:
            A list with one list of integer ids per input sentence. Words that
            are not in ``word_dict`` are mapped to ``word_dict['oov']``.

        Raises:
            Exception: If ``fit`` has not been called yet.
        """
        if not self.fit_checker:
            raise Exception("Tokenizer instance is not fitted yet.")

        oov_id = self.word_dict['oov']
        return [
            [self.word_dict.get(w, oov_id) for w in s]
            for s in self.preprocessing(sequences)
        ]

    def fit_transform(self, sequences):
        """Fit the vocabulary on ``sequences`` and return their id representation."""
        self.fit(sequences)
        return self.transform(sequences)

# 문제 2

In [4]:
class TfidfVectorizer:
    """Builds a TF-IDF matrix on top of a tokenizer that yields integer token ids.

    The tokenizer must provide ``fit_transform(sequences)`` and
    ``transform(sequences)`` returning one list of integer ids per sentence,
    and a ``word_dict`` whose length is the vocabulary size.

    Attributes:
        tokenizer: The tokenizer instance used to turn sentences into ids.
        fit_checker: True once ``fit`` has completed.
        idf: List of IDF values indexed by token id (set by ``fit``).
        tfidf_matrix: Result of the most recent ``transform`` call.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.fit_checker = False

    def fit(self, sequences):
        """Compute the IDF value of every token from the input sentences.

        Uses ``idf(t) = log(N / (1 + df(t)))`` where ``N`` is the number of
        sentences and ``df(t)`` is the number of sentences containing ``t``.

        Args:
            sequences: Collection of strings (one sentence per item).

        Raises:
            ValueError: If ``sequences`` is empty (the IDF would be ``log(0)``).
        """
        tokenized = self.tokenizer.fit_transform(sequences)

        s_len = len(tokenized)  # number of sentences
        if s_len == 0:
            raise ValueError("Cannot fit TfidfVectorizer on an empty collection of sequences.")
        token_len = len(self.tokenizer.word_dict)  # vocabulary size

        # Document frequency: each token counts at most once per sentence,
        # hence the set(). One pass over the data instead of a full scan of
        # every sentence for every token.
        df = Counter()
        for s in tokenized:
            df.update(set(s))

        # Tokens that never occur (e.g. the OOV id) get df == 0 from the Counter.
        self.idf = [math.log(s_len / (1 + df[t])) for t in range(token_len)]

        self.fit_checker = True

    def transform(self, sequences):
        """Build the TF-IDF matrix for ``sequences`` using the fitted IDF values.

        Args:
            sequences: Collection of strings (one sentence per item).

        Returns:
            A float32 numpy array of shape (number of sentences, vocabulary size)
            where entry ``[i][t]`` is ``count of token t in sentence i * idf[t]``.
            The same array is also stored in ``self.tfidf_matrix``.

        Raises:
            Exception: If ``fit`` has not been called yet.
        """
        if not self.fit_checker:
            raise Exception("TfidfVectorizer instance is not fitted yet.")

        tokenized = self.tokenizer.transform(sequences)
        s_len = len(tokenized)  # number of sentences
        token_len = len(self.tokenizer.word_dict)  # vocabulary size

        self.tfidf_matrix = np.zeros((s_len, token_len), dtype='f')
        for si, s in enumerate(tokenized):
            # Term frequency is the number of occurrences, not mere presence.
            for t, count in Counter(s).items():
                self.tfidf_matrix[si][t] = count

        # idf has one value per column, so it broadcasts across all rows.
        self.tfidf_matrix *= self.idf

        return self.tfidf_matrix

    def fit_transform(self, sequences):
        """Fit the IDF values on ``sequences`` and return their TF-IDF matrix."""
        self.fit(sequences)
        return self.transform(sequences)