In [3]:
def next_tag(file, tag, ambiguous_begins=(), ambiguous_ends=(), buffer='', chunk_size=100000):
    """Return the text enclosed by the next ``tag[0]`` ... ``tag[1]`` pair of a stream.

    The search starts in ``buffer`` (text left over from a previous call) and
    reads further chunks from ``file`` only when more input is needed.

    Args:
        file: Object whose ``read(size)`` returns the next piece of input and an
            empty value once the input is exhausted (typically a text-mode file).
        tag: Sequence whose first two items are the opening and the closing
            delimiter.
        ambiguous_begins: Strings that disqualify an opening delimiter when the
            input right after it starts with one of them.
        ambiguous_ends: Strings that disqualify a closing delimiter when the
            input right before it ends with one of them (for example a
            backslash, to skip escaped quotes).
        buffer: Unconsumed input returned by the previous call.
        chunk_size: Size passed to every ``file.read`` call.

    Returns:
        A ``(text, buffer)`` pair: the text between the two delimiters and the
        unconsumed remainder, which starts with the closing delimiter.

    Raises:
        EOFError: With the message ``'eof'`` when the input ends before a
            complete pair has been found.
    """
    opening, closing = tag[0], tag[1]
    ambiguous_begins = tuple(ambiguous_begins)
    ambiguous_ends = tuple(ambiguous_ends)
    # Only this many trailing characters can still be the start of an opening
    # delimiter that is split across two chunks; anything older is discarded
    # so the scanned text does not grow without bound.
    carry = len(opening) - 1
    # This much text must follow an opening delimiter before every banned
    # prefix can be tested reliably.
    lookahead = max((len(banned) for banned in ambiguous_begins), default=0)
    text = buffer

    while True:
        token_pos = text.find(opening)
        if token_pos == -1:
            new_text = file.read(chunk_size)
            if len(new_text) == 0:
                raise EOFError('eof')
            text = text[max(0, len(text) - carry):] + new_text
            continue
        text = text[token_pos + len(opening):]
        # A banned prefix may straddle a chunk boundary: read ahead until it
        # can be compared in full (or the input ends).
        while len(text) < lookahead:
            new_text = file.read(chunk_size)
            if len(new_text) == 0:
                break
            text += new_text
        if not text.startswith(ambiguous_begins):
            break

    search_from = 0
    while True:
        token_pos = text.find(closing, search_from)
        if token_pos == -1:
            new_text = file.read(chunk_size)
            if len(new_text) == 0:
                raise EOFError('eof')
            # Everything up to here was scanned already; step back just far
            # enough to catch a closing delimiter split across the boundary.
            search_from = max(search_from, len(text) - len(closing) + 1)
            text += new_text
            continue
        # Test the text before the candidate in place instead of slicing a copy.
        if not text.endswith(ambiguous_ends, 0, token_pos):
            return text[:token_pos], text[token_pos:]
        search_from = token_pos + 1

def replace_tag_with_symmetric(text, tag, replacing_string):
    """Replace every span delimited by two occurrences of ``tag``.

    The span, including both delimiters, is swapped for ``replacing_string``.
    Scanning then resumes at the position where the span started. A
    delimiter without a partner is left untouched.
    """
    width = len(tag)
    start = text.find(tag, 0)
    while start != -1:
        stop = text.find(tag, start + width)
        if stop == -1:
            # Unpaired delimiter: nothing more to replace.
            break
        text = ''.join((text[:start], replacing_string, text[stop + width:]))
        start = text.find(tag, start)
    return text

def replace_tag_with_asymmetric(text, tag, replacing_string):
    """Replace every ``tag[0]`` ... ``tag[1]`` span with ``replacing_string``.

    For each closing delimiter the nearest opening delimiter before it is
    used, so nested spans are resolved from the inside out. Closing
    delimiters without any preceding opening delimiter are skipped. After a
    replacement, scanning resumes at the position where the span started.
    """
    opener, closer = tag[0], tag[1]
    cursor = 0
    close_at = text.find(closer, cursor)
    while close_at != -1:
        open_at = text.rfind(opener, 0, close_at)
        if open_at == -1:
            # Stray closing delimiter: step over it.
            cursor = close_at + len(closer)
        else:
            text = text[:open_at] + replacing_string + text[close_at + len(closer):]
            cursor = open_at
        close_at = text.find(closer, cursor)
    return text

def replace_tag_with(text, tag, replacing_string):
    """Replace every span delimited by ``tag`` with ``replacing_string``.

    ``tag`` is an (opening, closing) pair. Identical delimiters are handled by
    ``replace_tag_with_symmetric``, different ones by
    ``replace_tag_with_asymmetric``.
    """
    opener, closer = tag[0], tag[1]
    if opener != closer:
        return replace_tag_with_asymmetric(text, tag, replacing_string)
    return replace_tag_with_symmetric(text, opener, replacing_string)

def remove_tag(text, tag):
    """Delete every span delimited by the ``tag`` pair, delimiters included."""
    nothing = ''
    return replace_tag_with(text, tag, nothing)

def remove_repeats(text, repeating_string):
    """Collapse back-to-back repetitions of ``repeating_string`` into one copy.

    Args:
        text: String to clean.
        repeating_string: Substring whose immediate repetitions are dropped.
            An empty string has nothing to collapse and leaves ``text`` as is.

    Returns:
        ``text`` with every run of adjacent ``repeating_string`` copies
        reduced to a single copy.
    """
    step = len(repeating_string)
    if step == 0:
        # An empty pattern matches everywhere without advancing; bail out
        # instead of looping forever.
        return text

    # Collect the kept slices and join once, rather than rebuilding the whole
    # string for every occurrence.
    pieces = []
    pos = 0
    while True:
        hit = text.find(repeating_string, pos)
        if hit == -1:
            pieces.append(text[pos:])
            break
        keep_end = hit + step
        pieces.append(text[pos:keep_end])
        pos = keep_end
        # Skip the copies that directly follow the one just kept.
        while text.startswith(repeating_string, pos):
            pos += step
    return ''.join(pieces)

def remove_string(text, string_to_remove):
    """Return ``text`` with every occurrence of ``string_to_remove`` deleted."""
    nothing = ''
    stripped = text.replace(string_to_remove, nothing)
    return stripped

In [4]:
def count_texts(file):
    """Count the ``"text"`` values in ``file`` that are non-empty and do not start with '#'.

    Values are read one after another with ``next_tag``. Any exception,
    including the one ``next_tag`` raises at the end of the input, is printed
    and ends the count.
    """
    text_tag = ('"text":"', '"')
    total = 0
    leftover = ''
    try:
        while True:
            text, leftover = next_tag(file, text_tag, ambiguous_ends=['\\'], buffer=leftover)
            if text and text[0] != '#':
                total += 1
    except Exception as e:
        print(e)
    return total

In [84]:
def next_text(file, buffer=''):
    """Read the next usable title/text pair from the stream and clean the text.

    Pairs are pulled with ``next_tag`` until one is found whose text is
    non-empty and does not start with '#'. The text then goes through a fixed
    sequence of substitutions and removals. The order of the steps matters
    because each step works on the output of the previous one.

    Args:
        file: Input stream handed through to ``next_tag``.
        buffer: Unconsumed input returned by the previous call.

    Returns:
        A ``(title, text, buffer)`` triple: the title exactly as found in the
        input, the cleaned text, and the unconsumed remainder for the next
        call.
    """
    text = ''
    # The empty initial text forces the first read; keep reading while the
    # entry is empty or starts with '#'.
    while len(text) == 0 or text[0] == '#':
        title, buffer = next_tag(file, ('"title":"', '"'), ambiguous_ends=['\\'], buffer=buffer)
        text, buffer = next_tag(file, ('"text":"', '"'), ambiguous_ends=['\\'], buffer=buffer)

    # Undo backslash escapes first, then drop a '*' that follows a newline or a
    # space and a space that follows a newline.
    substitutions = (
        ('\\\\', '\\'),
        ('\\n', '\n'),
        ("\\'", "'"),
        ('\\"', '"'),
        ('\\#', '#'),
        ('\n*', '\n'),
        (' *', ' '),
        ('\n ', '\n'),
    )
    for old, new in substitutions:
        text = text.replace(old, new)

    # Lines wrapped in '||' (presumably table rows) collapse to a single
    # newline; the newline is prepended so the very first line can match too.
    text = replace_tag_with('\n' + text, ('\n||', '||\n'), '\n')

    # Drop '[[파일...]]' spans and the part of '[[target|label' up to the '|' ...
    for markup in (('[[파일', ']]'), ('[[', '|')):
        text = remove_tag(text, markup)
    # ... then strip the double brackets that are still left.
    for leftover in ('[[', ']]'):
        text = remove_string(text, leftover)

    # Remaining bracketed/angled spans, '=' lines, '~~' spans and URLs up to the next '|'.
    for markup in (('[', ']'), ('<', '>'), ('\n=', '=\n'), ('~~', '~~'), ('http://', '|'), ('https://', '|')):
        text = remove_tag(text, markup)
    # Longer quote runs are removed before shorter ones.
    for leftover in ('{{{', '}}}', "'''''", "''''", "'''", "''", '#!HTML'):
        text = remove_string(text, leftover)

    # Finally squeeze repeated newlines and spaces.
    for filler in ('\n', ' '):
        text = remove_repeats(text, filler)

    return title, text.strip(), buffer

In [523]:
# Count the usable article texts in the dump. The encoding is passed
# explicitly so the read does not depend on the platform's locale default
# (assumes the dump is UTF-8, the standard JSON encoding -- confirm if the
# source of the file changes).
with open('namuwiki_20210301.json', encoding='utf-8') as file:
    print(count_texts(file))

eof
569646


In [85]:
import os
import json
from tqdm import tqdm

In [86]:
os.makedirs('./output', exist_ok=True)

# The byte size only serves as a loop bound that cannot run out before the
# input does; the loop really ends when next_text reports the end of input.
file_size = os.path.getsize('namuwiki_20210301.json')
# Explicit encoding so the read does not depend on the platform's locale
# default (assumes the dump is UTF-8 -- confirm if the source changes).
with open('namuwiki_20210301.json', encoding='utf-8') as file:
    buffer = ''
    # total is the article count printed by the count_texts cell.
    for index in tqdm(range(file_size), total=569646):
        # Only the parsing step may end the loop quietly; a failure while
        # writing an output file must surface instead of looking like "eof".
        try:
            title, text, buffer = next_text(file, buffer=buffer)
        except Exception as e:
            print(e)
            break
        # File name: running index plus the alphanumeric characters of the title.
        filename = f'{index:06d}_' + ''.join(c for c in title if c.isalnum())
        with open(f'./output/{filename}.json', 'w', encoding='UTF-8-sig') as json_file:
            json.dump({'title': title, 'text': text}, json_file, ensure_ascii=False)

100%|██████████| 569646/569646 [41:12<00:00, 230.39it/s]

eof





In [87]:
from glob import glob

# Gather the exported article files in name order and show how many exist.
files = glob('output/*.json')
files.sort()
len(files)

569646

In [145]:
import numpy as np

# Spot-check one randomly chosen exported article.
index = np.random.randint(len(files))
print(index)
# Open through a context manager so the file handle is closed after loading.
with open(files[index], encoding='UTF-8-sig') as sample_file:
    print(json.load(sample_file))

355388
{'title': 'WWF 인 유어 하우스 16: 캐나디안 스탬피드', 'text': '분류:WWF 인 유어 하우스'}
