# 목적

이 노트북에는 **frequency** 데이터를 처리하는 다양한 코드가 있으며, 같은 데이터 형식에 대해서는 재사용이 가능하도록 작성하였다.

# 데이터타입

frequency 데이터는 튜플 `(count, token)`을 담고 있는 리스트이며 데이터 소스가 다르더라도 최대한 동일한 형식을 유지하도록 한다.



# 처리과정
1. 데이터 전처리 (만약에 데이터타입이 다르다면 처리해 준다.)
2. 데이터 준비 (변수에 담아서 사용하기 쉽도록 준비한다.)
3. 최종 목적에 따라 다양한 처리를 실행한다.
    * sorting (순서대로 배열한다.)
    * filter (특정 횟수 이상의 단어만 남기거나, 불필요한 단어를 제거한다.)


------

# 샘플 파일 처리하기

`../data/sample_frequency.txt` 파일에 있는 데이터를 불러와 `count`에 따라 정렬한 후, 영문 소문자가 아닌 문자가 포함된 단어는 목록에서 제거하는 코드를 작성하도록 한다.

In [22]:
def gen_freq(path):
    """Yield ``(count, token)`` tuples parsed from a frequency file.

    Each non-blank line of the file is expected to hold exactly two
    whitespace-separated fields: an integer count followed by a token.

    Args:
        path: Path of the text file to read.

    Yields:
        tuple: ``(count, token)`` with ``count`` converted to ``int``.

    Raises:
        ValueError: If a non-blank line does not have exactly two fields,
            or if its first field is not an integer.
    """
    with open(path) as f:
        # Iterate the file object directly so lines are streamed instead of
        # loading the whole file into memory with readlines().
        for line in f:
            fields = line.split()
            if not fields:
                # Blank lines (e.g. an empty line at the end of the file)
                # carry no data; skip them instead of failing to unpack.
                continue
            str_count, token = fields
            yield (int(str_count), token)

In [23]:
# Show results: print every (count, token) pair parsed from the sample file
for count, token in gen_freq("../data/sample_frequency.txt"):
    print(count, token)

332 apple
121 banana
433 ds-aa
11 year2018
21 the
31 mutable


In [24]:
import re

def gen_valid_pair(gen, path):
    """Yield only the pairs whose token consists solely of ``a``-``z``.

    Args:
        gen: Callable that takes ``path`` and returns an iterable of
            ``(count, token)`` pairs (for example ``gen_freq``).
        path: Argument forwarded unchanged to ``gen``.

    Yields:
        The pairs produced by ``gen(path)`` whose token contains no
        character outside lowercase ASCII letters. Pairs are passed
        through untouched.
    """
    # Compile once, outside the loop: the pattern never changes.
    invalid_char = re.compile(r'[^a-z]+')

    for pair in gen(path):
        _, token = pair
        # search() stops at the first offending character; there is no need
        # to collect every match just to test whether one exists.
        if invalid_char.search(token) is None:
            yield pair

In [59]:
# Print the sample pairs whose token passes the lowercase-letters-only filter
for vp in gen_valid_pair(gen_freq, "../data/sample_frequency.txt"):
    print(vp)

(332, 'apple')
(121, 'banana')
(21, 'the')
(31, 'mutable')
(55, 'plausable')
(123, 'p')


In [60]:
# Convert NLTK's stopwords into a set
from nltk.corpus import stopwords

def gen_no_stopwords_pair(gen):
    """Filter out pairs whose token is an NLTK English stopword.

    Args:
        gen: Iterable of ``(count, token)`` pairs.

    Yields:
        Every pair from ``gen`` whose token is not in the set built from
        ``stopwords.words('english')``, in the original order.
    """
    english_stops = set(stopwords.words('english'))

    for entry in gen:
        _, word = entry
        if word in english_stops:
            continue
        yield entry
        

In [61]:
# Print the sample pairs left after the valid-token and stopword filters
for vp in gen_no_stopwords_pair(gen_valid_pair(gen_freq, "../data/sample_frequency.txt")):
    print(vp)

(332, 'apple')
(121, 'banana')
(31, 'mutable')
(55, 'plausable')
(123, 'p')


In [62]:
import json

def json2dict(file_path):
    """Load the JSON document stored at ``file_path``.

    Args:
        file_path: Path of the JSON file; it is opened as UTF-8 text.

    Returns:
        The object produced by ``json.load`` for the file's content.
    """
    with open(file_path, 'r', encoding='utf8') as json_file:
        return json.load(json_file)

# Load the ban-word file and display its entries as a set
result = json2dict('../data/ban.json')
set(result)

{'woodland',
 'mid',
 'choose',
 'unacceptable',
 'dancer',
 'lazy',
 'worker',
 'hotly',
 'cup',
 'leaf',
 'mask',
 'shorten',
 'statistically',
 'outsider',
 'risk',
 'nickname',
 'love',
 'salad',
 'hang',
 'umbrella',
 'device',
 'veil',
 'backup',
 'rumen',
 'weapon',
 'midst',
 'congratulate',
 'ok',
 'grebe',
 'burn',
 'min',
 'twenty',
 'bomber',
 'nine',
 'jacket',
 'poker',
 'study',
 'quo',
 'ed',
 'want',
 'graphical',
 'gatherer',
 'soul',
 'gee',
 'hear',
 'negatively',
 'occurrence',
 'yep',
 'lacrosse',
 'mosaic',
 'dalai',
 'napkin',
 'pea',
 'warbler',
 'colored',
 'hominem',
 'totems',
 'en',
 'hyena',
 'medical',
 'become',
 'everything',
 'candlestick',
 'helplessness',
 'rainy',
 'office',
 'mountaintop',
 'quick',
 'jack',
 'gossip',
 'autumn',
 'pond',
 'terribly',
 'haiti',
 'neuroplasticity',
 'paraplegic',
 'total',
 'aristotelian',
 'pianist',
 'kingdom',
 'candlelight',
 'with',
 'essex',
 'worship',
 'ink',
 'essay',
 'percentage',
 'eighteen',
 'sofrito',

In [63]:
def gen_no_banwords_pair(gen, ban_path='../data/ban.json'):
    """Yield only the pairs whose token is not a banned word.

    Args:
        gen: Iterable of ``(count, token)`` pairs.
        ban_path: Path of the JSON file holding the banned words. Defaults
            to the path this function previously had hard-coded.

    Yields:
        Every pair from ``gen`` whose token is not among the banned words,
        in the original order.
    """
    # Build a set once so each membership test is O(1) whichever container
    # the JSON file decodes to (a list would be scanned for every token).
    banwords = set(json2dict(ban_path))

    for pair in gen:
        _, token = pair
        if token not in banwords:
            yield pair

In [64]:
# Print the sample pairs left after the valid-token, stopword and ban-word filters
for vp in gen_no_banwords_pair(gen_no_stopwords_pair(gen_valid_pair(gen_freq, "../data/sample_frequency.txt"))):
    print(vp)

(31, 'mutable')
(55, 'plausable')
(123, 'p')


In [71]:
import string

def gen_no_single_letters(gen):
    """Yield only the pairs whose token is not a single ASCII letter.

    Args:
        gen: Iterable of ``(count, token)`` pairs.

    Yields:
        Every pair from ``gen`` whose token is not exactly one character
        from ``string.ascii_letters``, in the original order.
    """
    # Use a set of individual characters. Testing ``token in
    # string.ascii_letters`` directly is a *substring* test, which would
    # also drop real words that happen to be runs of the alphabet, such as
    # "no", "hi" or "abc".
    single_letters = frozenset(string.ascii_letters)

    for pair in gen:
        _, token = pair
        if token not in single_letters:
            yield pair

In [72]:
# Print the sample pairs left after all four filters have been applied
for vp in gen_no_single_letters(gen_no_banwords_pair(gen_no_stopwords_pair(gen_valid_pair(gen_freq, "../data/sample_frequency.txt")))):
    print(vp)

(31, 'mutable')
(55, 'plausable')


In [73]:
def write_tuple(path, gen):
    """Write pairs to ``path`` as tab-separated lines.

    Args:
        path: Destination file. It is opened in ``'w'`` mode, so any
            existing content is replaced.
        gen: Iterable of indexable pairs; items 0 and 1 of each pair are
            written as one ``<item0>\\t<item1>`` line.
    """
    with open(path, 'w') as out_file:
        out_file.writelines(
            "%s\t%s\n" % (record[0], record[1]) for record in gen
        )

In [74]:
# Chain the filters one stage at a time, then save the surviving pairs.
sample_gen = gen_valid_pair(gen_freq, "../data/sample_frequency.txt")
sample_gen = gen_no_stopwords_pair(sample_gen)
sample_gen = gen_no_banwords_pair(sample_gen)
sample_gen = gen_no_single_letters(sample_gen)
write_tuple('../data/sample_frequency_filtered.txt', sample_gen)

# 도움이 될 수도 있는 code snippet

In [10]:
# Get all ASCII letters
import string

string.ascii_letters

'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'

In [11]:
# Hash a string with SHA-1 and show the digest in hexadecimal
import hashlib

hash_str = 'This is a sentence.'
print(hash_str.encode())  # the bytes that are fed to the hash
hasher = hashlib.sha1(hash_str.encode())
print(hasher)

hasher.hexdigest()

b'This is a sentence.'
<sha1 HASH object @ 0x7ffb04629f58>


'3d5ff4dddec779b9fd473c545b253c17ae37d104'

# 실제 데이터 정제하기

In [82]:

# fiction
# sample_gen = gen_no_single_letters(gen_no_banwords_pair(gen_no_stopwords_pair(gen_valid_pair(gen_freq, "../data/fiction_frequency_unfiltered.txt"))))
# write_tuple('../data/fiction_frequency_filtered.txt', sample_gen)

# magazine
# sample_gen = gen_no_single_letters(gen_no_banwords_pair(gen_no_stopwords_pair(gen_valid_pair(gen_freq, "../data/magazine_frequency_unfiltered.txt"))))
# write_tuple('../data/magazine_frequency_filtered.txt', sample_gen)

# news
# sample_gen = gen_no_single_letters(gen_no_banwords_pair(gen_no_stopwords_pair(gen_valid_pair(gen_freq, "../data/news_frequency_unfiltered.txt"))))
# write_tuple('../data/news_frequency_filtered.txt', sample_gen)

# now
# sample_gen = gen_no_single_letters(gen_no_banwords_pair(gen_no_stopwords_pair(gen_valid_pair(gen_freq, "../data/now_frequency_unfiltered.txt"))))
# write_tuple('../data/now_frequency_filtered.txt', sample_gen)

# spoken
# sample_gen = gen_no_single_letters(gen_no_banwords_pair(gen_no_stopwords_pair(gen_valid_pair(gen_freq, "../data/spoken_frequency_unfiltered.txt"))))
# write_tuple('../data/spoken_frequency_filtered.txt', sample_gen)

# wiki
# sample_gen = gen_no_single_letters(gen_no_banwords_pair(gen_no_stopwords_pair(gen_valid_pair(gen_freq, "../data/wiki_frequency_unfiltered.txt"))))
# write_tuple('../data/wiki_frequency_filtered.txt', sample_gen)

# academy
# sample_gen = gen_no_single_letters(gen_no_banwords_pair(gen_no_stopwords_pair(gen_valid_pair(gen_freq, "../data/academy_frequency_unfiltered.txt"))))
# write_tuple('../data/academy_frequency_filtered.txt', sample_gen)

# all
# Chain the filters one stage at a time, then save the surviving pairs.
sample_gen = gen_valid_pair(gen_freq, "../data/all_frequency_unfiltered.txt")
sample_gen = gen_no_stopwords_pair(sample_gen)
sample_gen = gen_no_banwords_pair(sample_gen)
sample_gen = gen_no_single_letters(sample_gen)
write_tuple('../data/all_frequency_filtered.txt', sample_gen)




