In [21]:
import os
import re
from collections import OrderedDict
import math

In [1]:
def get_file_list(dir_name):
    return os.listdir(dir_name)

In [3]:
def get_content(file_list):
    y_class = []
    X_text = []
    # 야구선수(0)와 축구선수(1)로 묶기
    class_dict = {1 : "0", 2 : "0", 3 : "0", 4 : "0", 5 : "1", 6 : "1", 7 : "1", 8 : "1"}
    
    for file_name in file_list:
        try:
            f = open(file_name, "r", encoding = "cp949") # cp949 (windows file), utf (mac, linux)
            category = int(file_name.split(os.sep)[1].split("_")[0])
            y_class.append(class_dict[category])
            X_text.append(f.read())
            f.close()
        except UnicodeDecodeError as e:
            print(e)
            print(file_name)
    
    return X_text, y_class

In [7]:
def get_cleaned_word(word):
    word = re.sub('\W+', '', word.lower()) # \W : Any non-word character
    return word

In [16]:
def get_corpus_dict(text):
    text = [document.split() for document in text]
    cleaned_words = [get_cleaned_word(word) for words in text for word in words]
    
    corpus_dict = OrderedDict()
    for i, v in enumerate(set(cleaned_words)):
        corpus_dict[v] = i
    
    return corpus_dict

### Bag of words vector

In [17]:
def get_count_vector(text, corpus):
    text = [document.split() for document in text]
    word_number_list = [[corpus[(get_cleaned_word(word))] for word in words] for words in text]
    X_vector = [[0 for _ in range(len(corpus))] for x in range(len(text))]
    
    for i, text in enumerate(word_number_list):
        for word_number in text:
            X_vector[i][word_number] += 1
    
    return X_vector

In [22]:
def get_cosine_similarity(v1, v2):
    sumxx, sumxy, sumyy = 0, 0, 0
    
    for i in range(len(v1)):
        x = v1[i]
        y = v2[i]
        sumxx += x*x
        sumyy += y*y
        sumxy += x*y
    
    return sumxy/math.sqrt(sumxx*sumyy)

In [20]:
if __name__ == "__main__":
    dir_name = "news_data"
    file_list = get_file_list(dir_name)
    file_list = [os.path.join(dir_name, file_name) for file_name in file_list]
    
    X_text, y_class = get_content(file_list)
    corpus = get_corpus_dict(X_text)
    #print("Number of words : {0}".format(len(corpus)))
    
    X_vector = get_count_vector(X_text, corpus)
    print(X_vector[0])

[6, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 