In [1]:
#================================================================================
# The kowiki corpus is distributed as XML, so wikiextractor is installed to parse it.
# - The latest kowiki XML dumps can be downloaded from https://dumps.wikimedia.org/kowiki/
# - Install command:
#pip install wikiextractor

# Usage: wikiextractor <kowiki dump file> -o <output folder>
# - After extraction the output is split into many files: folders AA, AB, AC, AD, ...
#   each containing wiki_00, wiki_01, ...
#================================================================================
kowiki_file_path='kowiki-20220620-pages-articles-multistream.xml.bz2'  # kowiki dump file
kowiki_extra_folder='./kowiki-20220620' # folder where the extracted files are stored

# Run wikiextractor => start the extraction.
# IPython shell escape: {name} is replaced with the value of the Python variable.
!wikiextractor {kowiki_file_path} -o {kowiki_extra_folder}

INFO: Preprocessing 'kowiki-20220620-pages-articles-multistream.xml.bz2' to collect template definitions: this may take some time.
INFO: Preprocessed 100000 pages
INFO: Preprocessed 200000 pages
INFO: Preprocessed 300000 pages
INFO: Preprocessed 400000 pages
INFO: Preprocessed 500000 pages
INFO: Preprocessed 600000 pages
INFO: Preprocessed 700000 pages
INFO: Preprocessed 800000 pages
INFO: Preprocessed 900000 pages
INFO: Preprocessed 1000000 pages
INFO: Preprocessed 1100000 pages
INFO: Preprocessed 1200000 pages
INFO: Preprocessed 1300000 pages
INFO: Preprocessed 1400000 pages
INFO: Preprocessed 1500000 pages
INFO: Preprocessed 1600000 pages
INFO: Preprocessed 1700000 pages
INFO: Loaded 59911 templates in 171.5s
INFO: Starting page extraction from kowiki-20220620-pages-articles-multistream.xml.bz2.
INFO: Using 31 extract processes.
INFO: Extracted 100000 articles (2989.6 art/s)
INFO: Extracted 200000 articles (4516.0 art/s)
INFO: Extracted 300000 articles (4942.6 art/s)
INFO: Extracted

In [2]:
#=============================================================
# Merge the parsed wiki files
#
# Combine the files that wikiextractor produced from the XML dump into one file.
# Extracting the XML with wikiextractor creates wiki_00, wiki_01, ... files inside
# folders AA, AB, AC, AD, ...; this cell merges those generated files into a single file.
#=============================================================

in_folder_path = kowiki_extra_folder       # root folder that holds the files to merge
out_corpos_file = 'kowiki-20220620-corpus.txt' # name of the file created by the merge


import os
from tqdm.notebook import tqdm

def load_file(path):
    """Return the full contents of the text file at ``path``, decoded as UTF-8."""
    with open(path, mode="r", encoding="utf-8") as reader:
        contents = reader.read()
    return contents


def parse_text(path):
    """Load the text of every file found in the subdirectories of ``path``.

    The tree rooted at ``path`` is walked top-down with ``os.walk``. The first
    entry yielded is ``path`` itself and is skipped, so files located directly
    in ``path`` are not read; only files in nested folders are loaded.

    ``os.walk`` reports directory and file names in arbitrary order, so both
    lists are sorted to make the order of the returned texts reproducible
    from run to run.

    Args:
        path: Root folder to walk.

    Returns:
        A list of strings, one per file read, in sorted traversal order.
    """
    texts = []
    for idx, (current, dirs, files) in enumerate(os.walk(path)):
        # Sorting in place (not rebinding) is what makes os.walk descend into
        # the subdirectories in name order during a top-down walk.
        dirs.sort()
        if idx == 0:
            # First entry is the root folder itself; its own files are skipped.
            continue
        files.sort()
        print(current, dirs, files)
        for file in tqdm(files, desc="[Parsing]"):
            texts.append(load_file(os.path.join(current, file)))

    return texts


def save_file(path, src):
    """Write the string ``src`` to ``path`` as UTF-8, replacing any existing content."""
    with open(path, mode="w", encoding="utf-8") as writer:
        writer.write(src)
        
# Merge the files that wikiextractor produced from the XML dump into one corpus file.

# Every file's text is loaded into memory first, then joined with a newline between files.
texts = parse_text(in_folder_path)
save_file(out_corpos_file, "\n".join(texts))

./kowiki-20220620/AA [] ['wiki_00', 'wiki_01', 'wiki_02', 'wiki_03', 'wiki_04', 'wiki_05', 'wiki_06', 'wiki_07', 'wiki_08', 'wiki_09', 'wiki_10', 'wiki_11', 'wiki_12', 'wiki_13', 'wiki_14', 'wiki_15', 'wiki_16', 'wiki_17', 'wiki_18', 'wiki_19', 'wiki_20', 'wiki_21', 'wiki_22', 'wiki_23', 'wiki_24', 'wiki_25', 'wiki_26', 'wiki_27', 'wiki_28', 'wiki_29', 'wiki_30', 'wiki_31', 'wiki_32', 'wiki_33', 'wiki_34', 'wiki_35', 'wiki_36', 'wiki_37', 'wiki_38', 'wiki_39', 'wiki_40', 'wiki_41', 'wiki_42', 'wiki_43', 'wiki_44', 'wiki_45', 'wiki_46', 'wiki_47', 'wiki_48', 'wiki_49', 'wiki_50', 'wiki_51', 'wiki_52', 'wiki_53', 'wiki_54', 'wiki_55', 'wiki_56', 'wiki_57', 'wiki_58', 'wiki_59', 'wiki_60', 'wiki_61', 'wiki_62', 'wiki_63', 'wiki_64', 'wiki_65', 'wiki_66', 'wiki_67', 'wiki_68', 'wiki_69', 'wiki_70', 'wiki_71', 'wiki_72', 'wiki_73', 'wiki_74', 'wiki_75', 'wiki_76', 'wiki_77', 'wiki_78', 'wiki_79', 'wiki_80', 'wiki_81', 'wiki_82', 'wiki_83', 'wiki_84', 'wiki_85', 'wiki_86', 'wiki_87', 'wiki_8

[Parsing]:   0%|          | 0/100 [00:00<?, ?it/s]

./kowiki-20220620/AB [] ['wiki_00', 'wiki_01', 'wiki_02', 'wiki_03', 'wiki_04', 'wiki_05', 'wiki_06', 'wiki_07', 'wiki_08', 'wiki_09', 'wiki_10', 'wiki_11', 'wiki_12', 'wiki_13', 'wiki_14', 'wiki_15', 'wiki_16', 'wiki_17', 'wiki_18', 'wiki_19', 'wiki_20', 'wiki_21', 'wiki_22', 'wiki_23', 'wiki_24', 'wiki_25', 'wiki_26', 'wiki_27', 'wiki_28', 'wiki_29', 'wiki_30', 'wiki_31', 'wiki_32', 'wiki_33', 'wiki_34', 'wiki_35', 'wiki_36', 'wiki_37', 'wiki_38', 'wiki_39', 'wiki_40', 'wiki_41', 'wiki_42', 'wiki_43', 'wiki_44', 'wiki_45', 'wiki_46', 'wiki_47', 'wiki_48', 'wiki_49', 'wiki_50', 'wiki_51', 'wiki_52', 'wiki_53', 'wiki_54', 'wiki_55', 'wiki_56', 'wiki_57', 'wiki_58', 'wiki_59', 'wiki_60', 'wiki_61', 'wiki_62', 'wiki_63', 'wiki_64', 'wiki_65', 'wiki_66', 'wiki_67', 'wiki_68', 'wiki_69', 'wiki_70', 'wiki_71', 'wiki_72', 'wiki_73', 'wiki_74', 'wiki_75', 'wiki_76', 'wiki_77', 'wiki_78', 'wiki_79', 'wiki_80', 'wiki_81', 'wiki_82', 'wiki_83', 'wiki_84', 'wiki_85', 'wiki_86', 'wiki_87', 'wiki_8

[Parsing]:   0%|          | 0/100 [00:00<?, ?it/s]

./kowiki-20220620/AC [] ['wiki_00', 'wiki_01', 'wiki_02', 'wiki_03', 'wiki_04', 'wiki_05', 'wiki_06', 'wiki_07', 'wiki_08', 'wiki_09', 'wiki_10', 'wiki_11', 'wiki_12', 'wiki_13', 'wiki_14', 'wiki_15', 'wiki_16', 'wiki_17', 'wiki_18', 'wiki_19', 'wiki_20', 'wiki_21', 'wiki_22', 'wiki_23', 'wiki_24', 'wiki_25', 'wiki_26', 'wiki_27', 'wiki_28', 'wiki_29', 'wiki_30', 'wiki_31', 'wiki_32', 'wiki_33', 'wiki_34', 'wiki_35', 'wiki_36', 'wiki_37', 'wiki_38', 'wiki_39', 'wiki_40', 'wiki_41', 'wiki_42', 'wiki_43', 'wiki_44', 'wiki_45', 'wiki_46', 'wiki_47', 'wiki_48', 'wiki_49', 'wiki_50', 'wiki_51', 'wiki_52', 'wiki_53', 'wiki_54', 'wiki_55', 'wiki_56', 'wiki_57', 'wiki_58', 'wiki_59', 'wiki_60', 'wiki_61', 'wiki_62', 'wiki_63', 'wiki_64', 'wiki_65', 'wiki_66', 'wiki_67', 'wiki_68', 'wiki_69', 'wiki_70', 'wiki_71', 'wiki_72', 'wiki_73', 'wiki_74', 'wiki_75', 'wiki_76', 'wiki_77', 'wiki_78', 'wiki_79', 'wiki_80', 'wiki_81', 'wiki_82', 'wiki_83', 'wiki_84', 'wiki_85', 'wiki_86', 'wiki_87', 'wiki_8

[Parsing]:   0%|          | 0/100 [00:00<?, ?it/s]

./kowiki-20220620/AD [] ['wiki_00', 'wiki_01', 'wiki_02', 'wiki_03', 'wiki_04', 'wiki_05', 'wiki_06', 'wiki_07', 'wiki_08', 'wiki_09', 'wiki_10', 'wiki_11', 'wiki_12', 'wiki_13', 'wiki_14', 'wiki_15', 'wiki_16', 'wiki_17', 'wiki_18', 'wiki_19', 'wiki_20', 'wiki_21', 'wiki_22', 'wiki_23', 'wiki_24', 'wiki_25', 'wiki_26', 'wiki_27', 'wiki_28', 'wiki_29', 'wiki_30', 'wiki_31', 'wiki_32', 'wiki_33', 'wiki_34', 'wiki_35', 'wiki_36', 'wiki_37', 'wiki_38', 'wiki_39', 'wiki_40', 'wiki_41', 'wiki_42', 'wiki_43', 'wiki_44', 'wiki_45', 'wiki_46', 'wiki_47', 'wiki_48', 'wiki_49', 'wiki_50', 'wiki_51', 'wiki_52', 'wiki_53', 'wiki_54', 'wiki_55', 'wiki_56', 'wiki_57', 'wiki_58', 'wiki_59', 'wiki_60', 'wiki_61', 'wiki_62', 'wiki_63', 'wiki_64', 'wiki_65', 'wiki_66', 'wiki_67', 'wiki_68', 'wiki_69', 'wiki_70', 'wiki_71', 'wiki_72', 'wiki_73', 'wiki_74', 'wiki_75', 'wiki_76', 'wiki_77', 'wiki_78', 'wiki_79', 'wiki_80', 'wiki_81', 'wiki_82', 'wiki_83', 'wiki_84', 'wiki_85', 'wiki_86', 'wiki_87', 'wiki_8

[Parsing]:   0%|          | 0/100 [00:00<?, ?it/s]

./kowiki-20220620/AE [] ['wiki_00', 'wiki_01', 'wiki_02', 'wiki_03', 'wiki_04', 'wiki_05', 'wiki_06', 'wiki_07', 'wiki_08', 'wiki_09', 'wiki_10', 'wiki_11', 'wiki_12', 'wiki_13', 'wiki_14', 'wiki_15', 'wiki_16', 'wiki_17', 'wiki_18', 'wiki_19', 'wiki_20', 'wiki_21', 'wiki_22', 'wiki_23', 'wiki_24', 'wiki_25', 'wiki_26', 'wiki_27', 'wiki_28', 'wiki_29', 'wiki_30', 'wiki_31', 'wiki_32', 'wiki_33', 'wiki_34', 'wiki_35', 'wiki_36', 'wiki_37', 'wiki_38', 'wiki_39', 'wiki_40', 'wiki_41', 'wiki_42', 'wiki_43', 'wiki_44', 'wiki_45', 'wiki_46', 'wiki_47', 'wiki_48', 'wiki_49', 'wiki_50', 'wiki_51', 'wiki_52', 'wiki_53', 'wiki_54', 'wiki_55', 'wiki_56', 'wiki_57', 'wiki_58', 'wiki_59', 'wiki_60', 'wiki_61', 'wiki_62', 'wiki_63', 'wiki_64', 'wiki_65', 'wiki_66', 'wiki_67', 'wiki_68', 'wiki_69', 'wiki_70', 'wiki_71', 'wiki_72', 'wiki_73', 'wiki_74', 'wiki_75', 'wiki_76', 'wiki_77', 'wiki_78', 'wiki_79', 'wiki_80', 'wiki_81', 'wiki_82', 'wiki_83', 'wiki_84', 'wiki_85', 'wiki_86', 'wiki_87', 'wiki_8

[Parsing]:   0%|          | 0/100 [00:00<?, ?it/s]

./kowiki-20220620/AF [] ['wiki_00', 'wiki_01', 'wiki_02', 'wiki_03', 'wiki_04', 'wiki_05', 'wiki_06', 'wiki_07', 'wiki_08', 'wiki_09', 'wiki_10', 'wiki_11', 'wiki_12', 'wiki_13', 'wiki_14', 'wiki_15', 'wiki_16', 'wiki_17', 'wiki_18', 'wiki_19', 'wiki_20', 'wiki_21', 'wiki_22', 'wiki_23', 'wiki_24', 'wiki_25', 'wiki_26', 'wiki_27', 'wiki_28', 'wiki_29', 'wiki_30', 'wiki_31', 'wiki_32', 'wiki_33', 'wiki_34', 'wiki_35', 'wiki_36', 'wiki_37', 'wiki_38', 'wiki_39', 'wiki_40', 'wiki_41', 'wiki_42', 'wiki_43', 'wiki_44', 'wiki_45', 'wiki_46', 'wiki_47', 'wiki_48', 'wiki_49', 'wiki_50', 'wiki_51', 'wiki_52', 'wiki_53', 'wiki_54', 'wiki_55', 'wiki_56', 'wiki_57', 'wiki_58', 'wiki_59', 'wiki_60', 'wiki_61', 'wiki_62', 'wiki_63', 'wiki_64', 'wiki_65', 'wiki_66', 'wiki_67', 'wiki_68', 'wiki_69', 'wiki_70', 'wiki_71', 'wiki_72', 'wiki_73', 'wiki_74', 'wiki_75', 'wiki_76', 'wiki_77', 'wiki_78', 'wiki_79', 'wiki_80', 'wiki_81', 'wiki_82', 'wiki_83', 'wiki_84', 'wiki_85', 'wiki_86', 'wiki_87', 'wiki_8

[Parsing]:   0%|          | 0/100 [00:00<?, ?it/s]

./kowiki-20220620/AG [] ['wiki_00', 'wiki_01', 'wiki_02', 'wiki_03', 'wiki_04', 'wiki_05', 'wiki_06', 'wiki_07', 'wiki_08', 'wiki_09', 'wiki_10', 'wiki_11', 'wiki_12', 'wiki_13', 'wiki_14', 'wiki_15', 'wiki_16', 'wiki_17', 'wiki_18', 'wiki_19', 'wiki_20', 'wiki_21', 'wiki_22', 'wiki_23', 'wiki_24', 'wiki_25', 'wiki_26', 'wiki_27', 'wiki_28', 'wiki_29', 'wiki_30', 'wiki_31', 'wiki_32', 'wiki_33', 'wiki_34', 'wiki_35', 'wiki_36', 'wiki_37', 'wiki_38', 'wiki_39', 'wiki_40', 'wiki_41', 'wiki_42', 'wiki_43', 'wiki_44', 'wiki_45', 'wiki_46', 'wiki_47', 'wiki_48', 'wiki_49', 'wiki_50', 'wiki_51', 'wiki_52', 'wiki_53', 'wiki_54', 'wiki_55', 'wiki_56', 'wiki_57', 'wiki_58', 'wiki_59', 'wiki_60', 'wiki_61', 'wiki_62', 'wiki_63', 'wiki_64', 'wiki_65', 'wiki_66', 'wiki_67', 'wiki_68', 'wiki_69', 'wiki_70', 'wiki_71', 'wiki_72', 'wiki_73', 'wiki_74', 'wiki_75', 'wiki_76', 'wiki_77', 'wiki_78', 'wiki_79', 'wiki_80', 'wiki_81', 'wiki_82', 'wiki_83', 'wiki_84', 'wiki_85', 'wiki_86', 'wiki_87', 'wiki_8

[Parsing]:   0%|          | 0/100 [00:00<?, ?it/s]

./kowiki-20220620/AH [] ['wiki_00', 'wiki_01', 'wiki_02', 'wiki_03', 'wiki_04', 'wiki_05', 'wiki_06', 'wiki_07', 'wiki_08', 'wiki_09', 'wiki_10', 'wiki_11', 'wiki_12', 'wiki_13', 'wiki_14', 'wiki_15', 'wiki_16', 'wiki_17', 'wiki_18', 'wiki_19', 'wiki_20', 'wiki_21', 'wiki_22', 'wiki_23', 'wiki_24', 'wiki_25', 'wiki_26', 'wiki_27', 'wiki_28', 'wiki_29', 'wiki_30', 'wiki_31', 'wiki_32', 'wiki_33', 'wiki_34', 'wiki_35', 'wiki_36', 'wiki_37', 'wiki_38', 'wiki_39', 'wiki_40', 'wiki_41', 'wiki_42', 'wiki_43', 'wiki_44', 'wiki_45', 'wiki_46', 'wiki_47', 'wiki_48', 'wiki_49', 'wiki_50', 'wiki_51', 'wiki_52', 'wiki_53', 'wiki_54', 'wiki_55', 'wiki_56', 'wiki_57', 'wiki_58', 'wiki_59', 'wiki_60', 'wiki_61', 'wiki_62', 'wiki_63', 'wiki_64', 'wiki_65', 'wiki_66', 'wiki_67', 'wiki_68', 'wiki_69', 'wiki_70', 'wiki_71', 'wiki_72', 'wiki_73', 'wiki_74', 'wiki_75', 'wiki_76', 'wiki_77', 'wiki_78', 'wiki_79', 'wiki_80', 'wiki_81', 'wiki_82', 'wiki_83', 'wiki_84', 'wiki_85', 'wiki_86', 'wiki_87', 'wiki_8

[Parsing]:   0%|          | 0/100 [00:00<?, ?it/s]

./kowiki-20220620/AI [] ['wiki_00', 'wiki_01', 'wiki_02', 'wiki_03', 'wiki_04', 'wiki_05', 'wiki_06', 'wiki_07', 'wiki_08', 'wiki_09', 'wiki_10', 'wiki_11', 'wiki_12', 'wiki_13', 'wiki_14', 'wiki_15', 'wiki_16', 'wiki_17', 'wiki_18', 'wiki_19', 'wiki_20', 'wiki_21', 'wiki_22', 'wiki_23', 'wiki_24', 'wiki_25', 'wiki_26', 'wiki_27', 'wiki_28', 'wiki_29', 'wiki_30', 'wiki_31', 'wiki_32', 'wiki_33', 'wiki_34', 'wiki_35', 'wiki_36', 'wiki_37', 'wiki_38', 'wiki_39', 'wiki_40', 'wiki_41', 'wiki_42', 'wiki_43', 'wiki_44', 'wiki_45', 'wiki_46', 'wiki_47', 'wiki_48', 'wiki_49', 'wiki_50', 'wiki_51', 'wiki_52', 'wiki_53', 'wiki_54', 'wiki_55', 'wiki_56', 'wiki_57', 'wiki_58', 'wiki_59', 'wiki_60', 'wiki_61', 'wiki_62', 'wiki_63', 'wiki_64', 'wiki_65', 'wiki_66', 'wiki_67', 'wiki_68', 'wiki_69', 'wiki_70', 'wiki_71', 'wiki_72', 'wiki_73', 'wiki_74', 'wiki_75', 'wiki_76', 'wiki_77', 'wiki_78', 'wiki_79', 'wiki_80', 'wiki_81', 'wiki_82', 'wiki_83', 'wiki_84', 'wiki_85', 'wiki_86', 'wiki_87', 'wiki_8

[Parsing]:   0%|          | 0/100 [00:00<?, ?it/s]

./kowiki-20220620/AJ [] ['wiki_00', 'wiki_01']


[Parsing]:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
#==================================================================================
# Remove the <doc> tags
#
# <doc id="document number" url="actual Wikipedia article URL" title="article title">
# content
# </doc>
# - Every document has the form shown above, so these tag lines have to be removed.
#==================================================================================

import re
from tqdm.notebook import tqdm

in_new_corpus_file = out_corpos_file # input file
out_new_corpus_file = 'kowiki-20220620-corpus-1.txt'  # output file

# Whether to delete short sentences: when enabled, non-empty lines whose length
# is not greater than remove_short_sentence_len are dropped.
remove_short_sentence=False
remove_short_sentence_len = 20


# Read the whole merged corpus into memory, logging before and after the read.
print(f'Start readfile=>{in_new_corpus_file}')
with open(in_new_corpus_file, mode='r', encoding='utf8') as corpus_in:
    data = corpus_in.read()
print(f'End readfile=>{in_new_corpus_file}')

# Remove the tag lines.
splits = data.split('\n')
# Both patterns are applied with match(), so a line is treated as a tag line
# only when it begins with the tag.
start = re.compile('<doc')
# '/' needs no escaping in a Python regex; writing '\/' in a normal string
# literal is an invalid escape sequence (SyntaxWarning on Python 3.12+).
end = re.compile(r'</doc>')
docs = []

for split in tqdm(splits):
    # Skip the opening <doc ...> and closing </doc> tag lines.
    if start.match(split) or end.match(split):
        continue
    # When short-sentence removal is enabled (remove_short_sentence=True), keep a
    # non-empty line only if it is longer than remove_short_sentence_len.
    # Empty lines are always kept.
    if remove_short_sentence and split and len(split) <= remove_short_sentence_len:
        continue
    docs.append(split)

result = '\n'.join(docs)

# Write the tag-free corpus to the output file, logging before and after the write.
print(f'Start writefile=>{out_new_corpus_file}')
with open(out_new_corpus_file, mode='w', encoding='UTF8') as corpus_out:
    corpus_out.write(result)
print(f'End writefile=>{out_new_corpus_file}')

Start readfile=>kowiki-20220620-corpus.txt
End readfile=>kowiki-20220620-corpus.txt


  0%|          | 0/10286061 [00:00<?, ?it/s]

Start writefile=>kowiki-20220620-corpus-1.txt
End writefile=>kowiki-20220620-corpus-1.txt
