In [1]:
#1. Tokenizer 생성하기
import re
class Tokenizer():
    """Word-level tokenizer that maps each distinct word to an integer id.

    Id 0 is reserved for the ``'oov'`` (out-of-vocabulary) entry. Words seen
    by :meth:`fit` are numbered 1, 2, ... in order of first appearance.
    """

    def __init__(self):
        # word -> integer id; id 0 is the out-of-vocabulary placeholder.
        self.word_dict = {'oov': 0}
        # Set to True by fit(); transform() refuses to run before that.
        self.fit_checker = False

    def preprocessing(self, sequences):
        """Strip punctuation, lower-case and whitespace-split every sentence.

        Args:
            sequences: iterable of sentence strings.

        Returns:
            A list holding one list of word tokens per input sentence.
        """
        # The pattern removes every character that is neither a word
        # character nor whitespace, i.e. punctuation.
        return [re.sub(r'[^\w\s]', '', sentence).lower().split()
                for sentence in sequences]

    def fit(self, sequences):
        """Build the vocabulary from ``sequences`` and print it.

        Any vocabulary from a previous fit is discarded, so ids are always
        contiguous and never shared between two words.

        Args:
            sequences: iterable of sentence strings.
        """
        self.fit_checker = False
        tokens = self.preprocessing(sequences)

        # Start from a fresh mapping: re-fitting on top of the old one would
        # keep stale words and hand already-used ids to new words.
        word_dict = {'oov': 0}
        for sentence in tokens:
            for word in sentence:
                # setdefault keeps the id of the first occurrence; a literal
                # "oov" token in the text keeps the reserved id 0.
                word_dict.setdefault(word, len(word_dict))

        self.word_dict = word_dict
        self.fit_checker = True
        print(self.word_dict)

    def transform(self, sequences):
        """Convert sentences into lists of word ids.

        Words that were not seen during :meth:`fit` are mapped to the
        ``'oov'`` id instead of raising ``KeyError``.

        Args:
            sequences: iterable of sentence strings.

        Returns:
            A list holding one list of integer ids per input sentence.

        Raises:
            Exception: if the tokenizer has not been fitted yet.
        """
        if not self.fit_checker:
            raise Exception("Tokenizer instance is not fitted yet.")

        oov_id = self.word_dict.get('oov', 0)
        return [[self.word_dict.get(word, oov_id) for word in sentence]
                for sentence in self.preprocessing(sequences)]

    def fit_transform(self, sequences):
        """Fit on ``sequences`` and return their id representation."""
        self.fit(sequences)
        result = self.transform(sequences)
        return result

In [2]:
### 1-1
import re
sentences = ['I go to school.', 'I LIKE pizza!']

# Drop punctuation, lower-case, then split each sentence on whitespace.
clean_sentences = [
    re.sub(r'[^\w\s]', '', sentence).lower().split()
    for sentence in sentences
]

print(clean_sentences)


[['i', 'go', 'to', 'school'], ['i', 'like', 'pizza']]


In [3]:
### 1-2
word_dict = {'oov': 0}

# Distinct words in first-seen order, numbered from 1 (0 belongs to 'oov').
wordlist = list(dict.fromkeys(sum(clean_sentences, [])))
for index, word in enumerate(wordlist, start=1):
    word_dict[word] = index

print(word_dict)

{'oov': 0, 'i': 1, 'go': 2, 'to': 3, 'school': 4, 'like': 5, 'pizza': 6}


In [4]:
### 1-3
print(word_dict)

# Swap every token for its integer id, keeping one inner list per sentence.
final = [
    [word_dict[token] for token in sentence]
    for sentence in clean_sentences
]

print(final)

{'oov': 0, 'i': 1, 'go': 2, 'to': 3, 'school': 4, 'like': 5, 'pizza': 6}
[[1, 2, 3, 4], [1, 5, 6]]


In [5]:
### 1 Test Case Set Up
# A new Tokenizer and three sample sentences used by the tokenizer test-case cells that follow.
tok1 = Tokenizer()
sample =  ['I go to school.', 'I LIKE pizza!', 'Do I LiKe School?']

In [6]:
### 1-1 Test Case
# preprocessing() returns the lower-cased, punctuation-free word list of each sample sentence.
tok1.preprocessing(sample)

[['i', 'go', 'to', 'school'],
 ['i', 'like', 'pizza'],
 ['do', 'i', 'like', 'school']]

In [7]:
### 1-2 Test Case
# fit() fills tok1.word_dict from the sample sentences and prints the mapping.
tok1.fit(sample)

{'oov': 0, 'i': 1, 'go': 2, 'to': 3, 'school': 4, 'like': 5, 'pizza': 6, 'do': 7}


In [8]:
### 1-3 Test Case
# transform() replaces each word of the sample sentences with its id from tok1.word_dict.
tok1.transform(sample)

[[1, 2, 3, 4], [1, 5, 6], [7, 1, 5, 4]]

In [9]:
# fit_transform() runs fit() (which prints the vocabulary) and then returns transform()'s result.
tok1.fit_transform(sample)

{'oov': 0, 'i': 1, 'go': 2, 'to': 3, 'school': 4, 'like': 5, 'pizza': 6, 'do': 7}


[[1, 2, 3, 4], [1, 5, 6], [7, 1, 5, 4]]

In [10]:
#2. TfidfVectorizer 생성하기
import math
import numpy as np
class TfidfVectorizer:
    """TF-IDF vectorizer built on a tokenizer that yields integer word ids.

    The tokenizer must provide ``fit_transform(sequences)`` and
    ``transform(sequences)``, each returning one list of integer ids per
    sentence. Ids are assumed to be non-negative.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        # Set to True by fit(); transform() refuses to run before that.
        self.fit_checker = False
        # Filled by fit(): idf[k] is the inverse-document-frequency of id k.
        self.idf = []

    def fit(self, sequences):
        """Fit the tokenizer and compute one IDF weight per word id.

        ``idf[k] = log(n / (1 + df_k))`` where ``n`` is the number of
        sentences and ``df_k`` the number of sentences containing id ``k``.
        The weights are stored on ``self.idf`` so that :meth:`transform`
        does not depend on any variable outside the instance.

        Args:
            sequences: iterable of sentence strings.

        Returns:
            The list of IDF weights, indexed by word id (index 0 included).

        Raises:
            ValueError: if ``sequences`` is empty (IDF is undefined).
        """
        tokenized = self.tokenizer.fit_transform(sequences)
        n = len(tokenized)  # number of input sentences
        if n == 0:
            raise ValueError("Cannot fit TfidfVectorizer on an empty list of sequences.")

        # One slot per id from 0 up to the largest id produced by the tokenizer.
        largest_id = max((max(sentence) for sentence in tokenized if sentence), default=0)
        doc_freq = [0] * (largest_id + 1)
        for sentence in tokenized:
            # set(): a word repeated inside one sentence still counts once.
            for word_id in set(sentence):
                doc_freq[word_id] += 1

        self.idf = [math.log(n / (1 + count)) for count in doc_freq]
        self.fit_checker = True
        return self.idf

    def transform(self, sequences):
        """Turn sentences into TF-IDF vectors using the fitted IDF weights.

        Every vector has ``len(self.idf)`` entries regardless of which
        sentences are passed in, so it always lines up with the weights
        learned in :meth:`fit`.

        Args:
            sequences: iterable of sentence strings.

        Returns:
            A list with one numpy array per sentence; entry ``k`` is the
            count of id ``k`` in that sentence times ``idf[k]``. The same
            list is also stored on ``self.tfidf_matrix``.

        Raises:
            Exception: if the vectorizer has not been fitted yet.
        """
        if not self.fit_checker:
            raise Exception("TfidfVectorizer instance is not fitted yet.")

        tokenized = self.tokenizer.transform(sequences)
        vocab_size = len(self.idf)
        tfidf = []

        for sentence in tokenized:
            term_counts = [0] * vocab_size
            for word_id in sentence:
                # Ids outside the fitted range carry no IDF weight; skip them.
                if 0 <= word_id < vocab_size:
                    term_counts[word_id] += 1
            tfidf.append(np.multiply(term_counts, self.idf))

        self.tfidf_matrix = tfidf
        return self.tfidf_matrix

    def fit_transform(self, sequences):
        """Fit on ``sequences`` and return their TF-IDF vectors."""
        self.fit(sequences)
        return self.transform(sequences)

In [11]:
### 2-1

tokenized = [[1, 2, 3, 4], [1, 5, 6], [7, 1, 5, 4]]

n = len(tokenized)  # number of input sentences
totwords = len(dict.fromkeys(sum(tokenized, [])))  # number of distinct word ids

# idf[k] = log(n / (1 + number of sentences containing id k)) for k = 0..totwords
idf = [
    math.log(n / (1 + sum(1 for sentence in tokenized if word_id in sentence)))
    for word_id in range(totwords + 1)
]

print(idf)

[1.0986122886681098, -0.2876820724517809, 0.4054651081081644, 0.4054651081081644, 0.0, 0.0, 0.4054651081081644, 0.4054651081081644]


In [12]:
### 2-2
import numpy as np
tokenized = [[1, 2, 3, 4], [1, 5, 6], [7, 1, 5, 4]]

n = len(tokenized)  # number of input sentences
totwords = len(dict.fromkeys(sum(tokenized, [])))  # number of distinct word ids

# For each sentence, count every id 0..totwords and weight the counts by idf.
tfidf = [
    np.multiply([sentence.count(word_id) for word_id in range(totwords + 1)], idf)
    for sentence in tokenized
]
    

In [13]:
# Sanity check: np.multiply on two equal-length lists multiplies them element by element.
np.multiply([2,2,2],[1,2,3])

array([2, 4, 6])

In [14]:
### 2-1 Test Case
import math
# Rebind tok1 to a new Tokenizer and wrap it in a TfidfVectorizer.
tok1 = Tokenizer()
tf1 = TfidfVectorizer(tok1)
sample =  ['I go to school.', 'I LIKE pizza!', 'Do I LiKe School?']

# Sentence counts for word ids 0..7 are [0, 3, 1, 1, 2, 2, 1, 1]; fit() returns log(3 / (1 + count)) for each id.
tf1.fit(sample)


{'oov': 0, 'i': 1, 'go': 2, 'to': 3, 'school': 4, 'like': 5, 'pizza': 6, 'do': 7}


[1.0986122886681098,
 -0.2876820724517809,
 0.4054651081081644,
 0.4054651081081644,
 0.0,
 0.0,
 0.4054651081081644,
 0.4054651081081644]

In [15]:
### 2-2 Test Case
import numpy as np
sample =  ['I go to school.', 'I LIKE pizza!', 'Do I LiKe School?']

# NOTE(review): the value returned by transform() is not assigned or displayed here.
tf1.transform(sample)

# Only this last expression is shown as the cell output: the notebook-level `idf` list,
# not the TF-IDF matrix.
idf

[1.0986122886681098,
 -0.2876820724517809,
 0.4054651081081644,
 0.4054651081081644,
 0.0,
 0.0,
 0.4054651081081644,
 0.4054651081081644]

In [16]:
# xx[k] = number of sample sentences that contain word id k.
xx = [0, 3, 1, 1, 2, 2, 1, 1]
# Same formula as the vectorizer, with the sentence count fixed at 3.
idf = [math.log(3 / (1 + doc_count)) for doc_count in xx]
print(idf)

[1.0986122886681098, -0.2876820724517809, 0.4054651081081644, 0.4054651081081644, 0.0, 0.0, 0.4054651081081644, 0.4054651081081644]


In [17]:
# Hand-written term counts, one row per sample sentence; column k holds the count of word id k.
tf=[[0,1,1,1,1,0,0,0],[0,1,0,0,0,1,1,0],[0,1,0,0,1,1,0,1]]

In [18]:
# Multiply every term-count row by the idf weights (idf is broadcast across the rows);
# this hand-computed matrix is the reference answer for the TF-IDF result.
print(np.multiply(idf,tf))

[[ 0.         -0.28768207  0.40546511  0.40546511  0.          0.
   0.          0.        ]
 [ 0.         -0.28768207  0.          0.          0.          0.
   0.40546511  0.        ]
 [ 0.         -0.28768207  0.          0.          0.          0.
   0.          0.40546511]]
