Natural Language Processing and RNN
==============

Natural Language Processing
------------

### 1. 전체 문서 읽어오기  
### 2. 문서 나누기  
### 3. BOW(Bag of Words) 구하기  
### 4. TFIDF 구하기  
### 5. 유사도 구하기(cosine, jaccard)  

<img src="pictures/tfidf.jpg" style="width: 600px"></img>

In [4]:
import numpy as np
from konlpy.tag import Twitter

# Paths of the three text documents that main() tokenizes and compares.
PATH_1 = "datasets/robbie.txt"
PATH_2 = "datasets/runaround.txt"
PATH_3 = "datasets/reason.txt"

# Cosine similarity (similarity based on the angle between two vectors).
def cosine_similarity(x, y):
    """Return the cosine of the angle between vectors ``x`` and ``y``.

    Args:
        x: First vector (a 1-D sequence of numbers).
        y: Second vector, with the same length as ``x``.

    Returns:
        ``dot(x, y) / (norm(x) * norm(y))``, or ``0.0`` when either vector
        has zero norm. The angle is undefined in that case and the plain
        division would yield NaN.
    """
    denominator = np.linalg.norm(x) * np.linalg.norm(y)
    if denominator == 0:
        return 0.0
    return np.dot(x, y) / denominator

# Jaccard similarity:
# (size of the intersection of two sets) / (size of the union of two sets)
def jaccard(X, Y):
    """Return the Jaccard similarity of the element sets of ``X`` and ``Y``.

    Both inputs are reduced to sets first, so elements repeated inside a
    single input are not mistaken for shared elements.

    Args:
        X: Iterable of hashable items.
        Y: Iterable of hashable items.

    Returns:
        ``len(set(X) & set(Y)) / len(set(X) | set(Y))`` as a float, or
        ``0.0`` when both inputs are empty (the ratio is undefined there).
    """
    set_x, set_y = set(X), set(Y)
    union_size = len(set_x | set_y)
    if union_size == 0:
        return 0.0
    return len(set_x & set_y) / union_size

# Store each word and its frequency in a dictionary.
def bag_of_words(tokenized_sentences):
    """Count how often each token occurs across all tokenized sentences.

    Args:
        tokenized_sentences: Iterable of token sequences.

    Returns:
        A dict mapping each distinct token to its total number of
        occurrences, in first-seen order.
    """
    word_dict = {}
    for tokenized_sentence in tokenized_sentences:
        for token in tokenized_sentence:
            # dict.get avoids a bare ``except`` that would also swallow
            # unrelated errors such as KeyboardInterrupt.
            word_dict[token] = word_dict.get(token, 0) + 1
    return word_dict

def read_txt(path, encoding=None):
    """Read and return the entire contents of the text file at ``path``.

    Args:
        path: Path of the file to read.
        encoding: Text encoding passed to ``open``. The default ``None``
            keeps the platform default encoding.

    Returns:
        The file contents as a single string.
    """
    # The context manager closes the file even if reading raises.
    with open(path, 'r', encoding=encoding) as file:
        return file.read()

# Split a document into morpheme-level tokens.
def get_splited_doc(path):
    """Read the file at ``path`` and return the result of running the
    ``Twitter`` analyzer's ``morphs`` on its text."""
    return Twitter().morphs(read_txt(path))

def tf(doc, word):
    """Term frequency: how many times ``word`` occurs in ``doc``."""
    occurrences = doc.count(word)
    return occurrences
    
def idf(docs, word):
    """Inverse document frequency of ``word`` over ``docs``.

    Returns ``log(N / (1 + n))``, where ``N`` is the number of documents and
    ``n`` is the number of documents that contain ``word`` at least once.
    """
    containing = sum(1 for document in docs if document.count(word) > 0)
    return np.log(len(docs) / (1 + containing))


def tf_idf(docs, bow):
    """Build one TF-IDF vector per document.

    Args:
        docs: Sequence of tokenized documents.
        bow: Dict whose keys define the vocabulary; the key order defines
            the order of the vector components.

    Returns:
        A list with one vector (list) per document. Component ``j`` of a
        document's vector is ``tf(doc, key_j) * idf(docs, key_j)``.
    """
    if not docs:
        return []

    keys = list(bow.keys())
    # IDF depends only on the corpus, so compute it once per key instead of
    # once per (document, key) pair.
    idf_weights = [idf(docs, key) for key in keys]

    return [
        [tf(doc, key) * weight for key, weight in zip(keys, idf_weights)]
        for doc in docs
    ]
    
def main():
    """Compare the document at PATH_1 against those at PATH_2 and PATH_3.

    Tokenizes the three documents, builds TF-IDF vectors over their shared
    bag of words, and prints the cosine similarity (on the TF-IDF vectors)
    and the Jaccard similarity (on the token vocabularies) of each pair.
    """
    robbie_tokens = get_splited_doc(PATH_1)
    runaround_tokens = get_splited_doc(PATH_2)
    reason_tokens = get_splited_doc(PATH_3)

    total = [robbie_tokens, runaround_tokens, reason_tokens]

    bow = bag_of_words(total)

    # Keep the vectors under separate names so the token lists stay
    # available for the set-based Jaccard comparison below.
    robbie_vec, runaround_vec, reason_vec = tf_idf(total, bow)

    csml_ro_run = cosine_similarity(robbie_vec, runaround_vec)
    csml_ro_rea = cosine_similarity(robbie_vec, reason_vec)

    # Jaccard is a set measure: compare the distinct tokens of each document,
    # not the TF-IDF weights. De-duplicating here keeps repeated tokens
    # within one document from being counted as shared.
    robbie_vocab = list(set(robbie_tokens))
    runaround_vocab = list(set(runaround_tokens))
    reason_vocab = list(set(reason_tokens))

    jsml_ro_run = jaccard(robbie_vocab, runaround_vocab)
    jsml_ro_rea = jaccard(robbie_vocab, reason_vocab)

    print("Cosine similarity between robbie and runaround is", csml_ro_run)
    print("Cosine similarity between robbie and reason is", csml_ro_rea)
    print("Jaccard similarty beteween robbie and runaround is", jsml_ro_run)
    print("Jaccard similarty beteween robbie and reason is", jsml_ro_rea)

if __name__=="__main__":
    main()

RNN
-----

<img src="pictures/rnn.jpg" style="width: 600px"></img>

In [8]:
import numpy as np

def rnn(inputs, output_size, bias = False):
    """Run a simple tanh recurrent layer over ``inputs``, one step at a time.

    Both weight matrices are filled with ones. Each step computes
    ``tanh(w . input + u . previous_output + b)`` and feeds the result back
    in as the previous output of the next step.

    Args:
        inputs: Sequence of input vectors; the length of the first one is
            used as the input size.
        output_size: Length of the state and of each per-step output.
        bias: If true, ``b`` is drawn from ``np.random.random``; otherwise
            ``b`` is all zeros.

    Returns:
        The per-step outputs stacked along axis 0.
    """
    n_features = len(inputs[0])

    # Weights for the current input and for the previous output.
    input_weights = np.ones((output_size, n_features))
    recurrent_weights = np.ones((output_size, output_size))

    # Bias. The random draw happens on every call, even when it is then
    # replaced by zeros because ``bias`` is false.
    offset = np.random.random((output_size,))
    if not bias:
        offset = np.zeros((output_size,))

    # Previous output; all zeros before the first step.
    hidden = np.zeros((output_size,))
    history = []
    for step_input in inputs:
        pre_activation = (
            np.dot(input_weights, step_input)
            + np.dot(recurrent_weights, hidden)
            + offset
        )
        hidden = np.tanh(pre_activation)
        history.append(hidden)

    return np.stack(history, axis=0)


def main():
    """Print the toy RNN's outputs for five all-zero inputs, first without
    a bias and then with a random bias."""
    zero_sequence = [[0] for _ in range(5)]
    for use_bias in (False, True):
        print(rnn(zero_sequence, output_size=1, bias=use_bias))

if __name__ == '__main__':
    main()

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]
[[0.46214543]
 [0.7452483 ]
 [0.84695597]
 [0.87334142]
 [0.87945962]]
