In [2]:
import json
import glob
import os
from typing import Dict, List
from itertools import chain
from collections import Counter

import MeCab

In [3]:
# Module-level MeCab tagger shared by tokenize(). The '-Owakati' output format
# is expected to return the input as one line of whitespace-separated tokens,
# which tokenize() then splits into a list.
mc = MeCab.Tagger('-Owakati')

In [4]:


def load_jsons(data_dir: str) -> List[Dict[str, str]]:
    """Load every ``*.json`` file directly under ``data_dir`` into one list.

    Each file's parsed content is appended with ``list.extend``, so every file
    is expected to hold a JSON array of article objects. Files are read in
    sorted path order so that the order of the returned articles does not
    depend on the file system's directory listing order.

    Args:
        data_dir: Directory that contains the ``*.json`` files. Sub-directories
            are not searched.

    Returns:
        The concatenation of the arrays found in all files. An empty list is
        returned when the directory contains no ``*.json`` file.

    Raises:
        json.JSONDecodeError: If a file does not contain valid JSON.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    article_dicts: List[Dict[str, str]] = []
    # glob returns paths in a file-system dependent order; sort for reproducibility.
    for json_path in sorted(glob.glob(os.path.join(data_dir, "*.json"))):
        # Decode explicitly as UTF-8 instead of relying on the locale's default encoding.
        with open(json_path, encoding="utf-8") as json_file:
            article_dicts.extend(json.load(json_file))

    return article_dicts

def tokenize(text: str) -> List[str]:
    """Split ``text`` into a list of tokens using the module-level tagger ``mc``."""
    # The tagger returns a single string; trim it, then break it on whitespace.
    wakati_line = mc.parse(text).strip()
    return wakati_line.split()

In [7]:
article_dicts = load_jsons("/home/sugimoto/document_summarizer/data")
# One string per article: the body text with newline characters removed.
details = [entry['body'].replace('\n', '') for entry in article_dicts]
# Every article's 'summary' is iterated item by item, producing one flat list
# of summary items across all articles.
summaries = [item for entry in article_dicts for item in entry['summary']]
summaries[0]

'岡むら屋から、期間限定の新メニュー「じゃが肉めし」が登場する'

In [8]:
# Average length in characters of the article bodies and of the summary items.
# Only the last expression of a cell is displayed, so the body average was
# previously computed and thrown away; keep both values and show both.
avg_detail_len = sum(len(detail) for detail in details) / len(details)
avg_summary_len = sum(len(summary) for summary in summaries) / len(summaries)
print(f'average body length: {avg_detail_len}')
avg_summary_len

31.926085915395326

In [9]:
# Tokenize every article body (source side) and every summary item (target side).
source_tokens = [tokenize(detail) for detail in details]
target_tokens = [tokenize(summary) for summary in summaries]

In [11]:
# Token frequencies over the whole corpus, counted straight from the nested
# token lists without first building one flat list of every token.
source_tokens_counter = Counter(token for tokens in source_tokens for token in tokens)
target_tokens_counter = Counter(token for tokens in target_tokens for token in tokens)

In [12]:
# Number of distinct tokens seen on the source side and on the target side.
len(source_tokens_counter), len(target_tokens_counter)

(337755, 83666)

In [13]:
vocab_size = 25000
# NOTE(review): 4 entries fewer than vocab_size are kept on each side --
# presumably slots reserved for special tokens; confirm against the consumer
# of the vocabulary files.
# most_common() yields (word, count) pairs, most frequent first; keep the word only.
source_words = [pair[0] for pair in source_tokens_counter.most_common(vocab_size - 4)]
target_words = [pair[0] for pair in target_tokens_counter.most_common(vocab_size - 4)]

In [14]:
# Write the source-side vocabulary, one word per line, most frequent first.
# The encoding is set explicitly so the Japanese tokens are written as UTF-8
# regardless of the locale's default encoding (which could otherwise produce a
# differently encoded file or raise UnicodeEncodeError).
with open('/home/sugimoto/document_summarizer/vocab/source_vocab_25k.txt', 'w', encoding='utf-8') as source_file:
    for source_word in source_words:
        source_file.write(f'{source_word}\n')

In [15]:
# Write the target-side vocabulary, one word per line, most frequent first.
# The encoding is set explicitly so the Japanese tokens are written as UTF-8
# regardless of the locale's default encoding (which could otherwise produce a
# differently encoded file or raise UnicodeEncodeError).
with open('/home/sugimoto/document_summarizer/vocab/target_vocab_25k.txt', 'w', encoding='utf-8') as target_file:
    for target_word in target_words:
        target_file.write(f'{target_word}\n')