# 载入相关库

In [1]:
# 把一些警告的讯息暂时关掉
import warnings
warnings.filterwarnings('ignore')

# Utilities相关库
import os
import numpy as np
import mmap
from tqdm import tqdm

# 自然语言处理 (分词、语料、简繁转换) 与可视化相关库
import jieba
from gensim.corpora import WikiCorpus
from gensim.models import word2vec
from hanziconv import HanziConv
import matplotlib.pyplot as plt

# 参数设定

In [3]:
# Root of the project: the current working directory
ROOT_DIR = os.getcwd()

# DATA_PATH: directory with the training/validation documents
# MODEL_PATH: model directory
DATA_PATH, MODEL_PATH = (
    os.path.join(ROOT_DIR, sub_dir) for sub_dir in ("data", "model")
)


# 步骤 1. 取得语料 (Corpus)

In [4]:
# Input: the downloaded bz2-compressed zhwiki XML dump.
# Output: a plain-text file with one article per line.
wiki_articles_xml_file = os.path.join(DATA_PATH, "zhwiki-latest-pages-articles.xml.bz2")
wiki_articles_txt_file = os.path.join(DATA_PATH, "zhwiki_plaintext.txt")

# Read the corpus out of the wiki XML with gensim's WikiCorpus.
# NOTE(review): dictionary={} presumably skips gensim's vocabulary scan — TODO confirm.
wiki_corpus = WikiCorpus(wiki_articles_xml_file, dictionary={})

# Stream the extracted articles to disk, each as a single space-joined line of tokens.
text_count = 0  # remains 0 when the corpus yields no article
with open(wiki_articles_txt_file, 'w', encoding='utf-8') as output:
    for text_count, text in enumerate(wiki_corpus.get_texts(), start=1):
        output.write(' '.join(text) + '\n')
        # Report progress once every 10000 articles
        if not text_count % 10000:
            print("目前已处理 %d 篇文章" % text_count)
print("总共处理了 %d 篇文章!"% text_count)

简繁转换已完毕, 总共处理了 2999 篇文章!


# 步骤 2. 进行中文分词与 stop-word 移除

In [6]:
# Helper that counts the lines of a file
def get_num_lines(file_path):
    """Return the number of lines in the file at ``file_path``.

    The file is memory-mapped read-only and scanned with ``readline``, so a
    final line without a trailing newline still counts as one line.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        int: Number of lines; 0 for an empty file.

    Raises:
        OSError: If the file cannot be opened or mapped.
    """
    # Binary read-only mode is enough for counting and needs no write permission.
    with open(file_path, 'rb') as fp:
        # mmap refuses to map a zero-length file, so answer directly.
        if os.fstat(fp.fileno()).st_size == 0:
            return 0
        # Context managers release both the mapping and the file handle.
        with mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ) as buf:
            lines = 0
            while buf.readline():
                lines += 1
            return lines


In [9]:
# Run the plain-text corpus through HanziConv and store the result in its own file.
# NOTE(review): the original comment and the "zh_tw" file name describe a
# Simplified -> Traditional conversion, yet the code calls HanziConv.toSimplified.
# The call is kept as-is; confirm the intended direction before changing it.
wiki_articles_zh_tw_file = os.path.join(DATA_PATH, "zhwiki_zh_tw.txt")

# Both files are managed by one `with`, so the output is flushed and closed
# even if the conversion loop raises part-way through.
with open(wiki_articles_txt_file, 'r', encoding='utf-8') as wiki_articles_txt, \
        open(wiki_articles_zh_tw_file, "w", encoding="utf-8") as wiki_articles_zh_tw:
    # The line count is computed up front so tqdm can show a total.
    for line in tqdm(wiki_articles_txt, total=get_num_lines(wiki_articles_txt_file)):
        wiki_articles_zh_tw.write(HanziConv.toSimplified(line))

print("成功简繁转换!")

100%|█████████████████████████████████████████████████████████████████████████████| 2999/2999 [00:08<00:00, 352.00it/s]


成功简繁转换!
