In [24]:
from gensim.corpora import WikiCorpus    #导入Wiki语料库
from gensim.models import word2vec      #导入word2vec模型
import zhconv                          #导入zhconv模块
import jieba                           #导入jieba分词模块
import re                              #导入正则表达式模块
import multiprocessing                 #导入多进程模块
import csv
from utils.configs import convert_to_simplified_output_path
from utils.configs import remove_non_chinese_output_path
from utils.configs import process_csv_out_path

读取文件并处理数据

In [26]:
def remove_non_chinese(input_file_path, output_file_path):
    """Simplify, strip and segment the content column of a two-column CSV.

    Every input row is read as ``[id, content]``. The content is converted
    to Simplified Chinese with ``zhconv``, every character outside the range
    U+4E00-U+9FA5 is discarded, and what remains is segmented with ``jieba``
    into space-separated tokens. One ``[id, segmented_content]`` row is
    written for each input row, and progress is printed every 10000 rows.

    Args:
        input_file_path: Path of the UTF-8 CSV file to read.
        output_file_path: Path of the UTF-8 CSV file to create or overwrite.

    Raises:
        IndexError: If a row has fewer than two columns.
    """
    # Character class matching a single character in the kept range.
    cn_reg = '[\u4e00-\u9fa5]'

    with open(input_file_path, 'r', encoding='utf-8') as source, \
            open(output_file_path, 'w', encoding='utf-8', newline='') as target:
        sink = csv.writer(target)
        for count, row in enumerate(csv.reader(source), start=1):
            record_id = row[0]
            raw_text = row[1]

            # Traditional -> Simplified first, so the filter sees final characters.
            simplified = zhconv.convert(raw_text, 'zh-hans')
            # Keep only the matched characters, in their original order.
            chinese_only = ''.join(re.findall(cn_reg, simplified))
            # jieba yields tokens; they are stored space-separated.
            tokens = jieba.cut(chinese_only)

            sink.writerow([record_id, ' '.join(tokens)])
            if count % 10000 == 0:
                print(f'已处理 {count} 条数据')
        print("处理完成！")

加载停用词列表

In [27]:
def load_stop_words(file_path):
    """Read a stop-word list from a UTF-8 text file, one entry per line.

    Args:
        file_path: Path of the stop-word file.

    Returns:
        set: Every line of the file with surrounding whitespace stripped.
        A blank line contributes the empty string.
    """
    stop_words = set()
    with open(file_path, 'r', encoding='utf-8') as handle:
        for line in handle:
            stop_words.add(line.strip())
    return stop_words

In [28]:
def preprocess_text(text, stop_words):
    """Segment ``text`` with jieba and drop stop words and blank tokens.

    Args:
        text: The string to process.
        stop_words: Container of tokens to discard (tested with ``in``).

    Returns:
        str: The surviving tokens joined by single spaces.
    """
    # Trim the ends, then delete every remaining line-break character.
    flattened = text.strip()
    for line_break in ("\n", "\r"):
        flattened = flattened.replace(line_break, "")

    kept = []
    for token in jieba.cut(flattened):
        if token in stop_words:
            continue
        # Whitespace-only tokens carry no content.
        if token.strip() == "":
            continue
        kept.append(token)
    return " ".join(kept)

In [29]:
def process_csv(input_file_path, output_file_path, stop_words_file_path):
    """Run ``preprocess_text`` over the content column of a two-column CSV.

    The stop-word list is loaded once with ``load_stop_words``. Each input
    row is then read as ``[id, content]`` and written back as
    ``[id, preprocess_text(content, stop_words)]``. Progress is printed
    every 10000 rows.

    Args:
        input_file_path: Path of the UTF-8 CSV file to read.
        output_file_path: Path of the UTF-8 CSV file to create or overwrite.
        stop_words_file_path: Path passed to ``load_stop_words``.

    Raises:
        IndexError: If a row has fewer than two columns.
    """
    stop_words = load_stop_words(stop_words_file_path)

    with open(input_file_path, 'r', encoding='utf-8') as source, \
            open(output_file_path, 'w', encoding='utf-8', newline='') as target:
        sink = csv.writer(target)
        for count, row in enumerate(csv.reader(source), start=1):
            # Column 0 is treated as the ID, column 1 as the content.
            record_id = row[0]
            raw_text = row[1]
            sink.writerow([record_id, preprocess_text(raw_text, stop_words)])
            if count % 10000 == 0:
                print(f'已处理 {count} 条数据')
        print("处理完成！")

In [30]:
# Path of the raw input CSV for this run.
input_file_name = './out/train_data/cn_sample_data/sample.negative.csv'

# Derive each stage's output path from the previous stage's path. The helpers
# are imported from utils.configs and their implementation is not shown in
# this notebook (the recorded output suggests they also print the paths they
# build - TODO confirm).
c2s_file_name = convert_to_simplified_output_path(input_file_name)
rcn_file_name = remove_non_chinese_output_path(c2s_file_name)
stop_words_file_path = './stopword.txt'  # stop-word list file
output_file_path = process_csv_out_path(rcn_file_name)

# Print the derived paths so they can be checked by eye.
print(f"c2s_file_name: {c2s_file_name}")
print(f"rcn_file_name: {rcn_file_name}")
print(f"output_file_path: {output_file_path}")

# Run the three stages in order; each stage reads the file the previous one
# wrote. NOTE: convert_to_simplified is not defined in any cell above this
# one, so it must already be in the kernel namespace when this cell runs.
convert_to_simplified(input_file_name, c2s_file_name)
remove_non_chinese(c2s_file_name, rcn_file_name)
process_csv(rcn_file_name, output_file_path, stop_words_file_path)

Output file path: ./out/train_data/cn_sample_data\sample.negative.c2s.rcn.csv
Relative directory: ..\out\train_data\cn_sample_data
Full output directory: ./files\..\out\train_data\cn_sample_data
Output file path: ./files\..\out\train_data\cn_sample_data\sample.negative.c2s.rcn.output.csv
c2s_file_name: ./out/train_data/cn_sample_data\sample.negative.c2s.csv
rcn_file_name: ./out/train_data/cn_sample_data\sample.negative.c2s.rcn.csv
output_file_path: ./files\..\out\train_data\cn_sample_data\sample.negative.c2s.rcn.output.csv
转换完成！
处理完成！
处理完成！


转换繁体字为简体字

def convert_to_simplified(input_file_path, output_file_path):
    """Convert the content column of a CSV file to Simplified Chinese.

    The first row of the input is treated as a header and discarded; the
    output always starts with the header ``ID,Content``. Every following row
    is read as ``[id, content]`` and written as
    ``[id, zhconv.convert(content, 'zh-hans')]``. Progress is printed every
    10000 rows.

    Args:
        input_file_path: Path of the UTF-8 CSV file to read.
        output_file_path: Path of the UTF-8 CSV file to create or overwrite.

    Raises:
        IndexError: If a data row has fewer than two columns.
    """
    # Open the path that was passed in. The previous version opened the
    # notebook-level variable ``input_file_name`` here, so the argument was
    # silently ignored.
    with open(input_file_path, 'r', encoding='utf-8') as input_file:
        reader = csv.reader(input_file)
        # Skip the source header. The default keeps an empty input file from
        # raising StopIteration; the output then contains only the header.
        next(reader, None)
        with open(output_file_path, 'w', encoding='utf-8', newline='') as output_file:
            writer = csv.writer(output_file)
            # The output header is fixed, whatever the input header was.
            writer.writerow(['ID', 'Content'])
            count = 0
            for row in reader:
                # Column 0 is treated as the ID, column 1 as the content.
                id_column = row[0]
                content_column = row[1]

                simplified_content = zhconv.convert(content_column, 'zh-hans')

                writer.writerow([id_column, simplified_content])
                count += 1
                if count % 10000 == 0:
                    print(f'已转换{count}条数据')
            print('转换完成！')

分词

input_file_name = 'corpus_cn_simple.txt'
output_file_name = 'corpus_cn_simple_separate.txt'

# Segment the corpus one line at a time. The input is streamed rather than
# loaded with readlines(), so memory use does not grow with corpus size.
count = 0
with open(input_file_name, 'r', encoding='utf-8') as input_file, \
        open(output_file_name, 'w', encoding='utf-8') as output_file:
    for line in input_file:
        # jieba treats spaces and line breaks as tokens of their own, so the
        # trailing newline and all spaces are removed before segmentation.
        # jieba.cut yields tokens, which are joined back with single spaces.
        text = line.rstrip('\n').replace(' ', '')
        output_file.write(' '.join(jieba.cut(text)) + '\n')
        count += 1
        if count % 10000 == 0:
            print('已分词%d条数据' % count)
print('处理完成！')

# Preview the result: print a list holding the first output line (an empty
# list when the file is empty) without reading the rest of the file.
with open(output_file_name, "r", encoding="utf8") as f:
    first_line = f.readline()
    print([first_line] if first_line else [])

去除非中文

def remove_non_chinese(input_file_path, output_file_path):
    """Keep only the all-Chinese tokens in the content column of a CSV file.

    Every input row is read as ``[id, content]``. The content is split on
    whitespace, and a token is kept only if it consists entirely of
    characters in the range U+4E00-U+9FA5. The kept tokens are written back
    space-separated as ``[id, filtered_content]``. Progress is printed every
    10000 rows.

    Note: an earlier cell of this notebook defines a different function with
    the same name; whichever definition was executed last is the one in
    effect.

    Args:
        input_file_path: Path of the UTF-8 CSV file to read.
        output_file_path: Path of the UTF-8 CSV file to create or overwrite.

    Raises:
        IndexError: If a row has fewer than two columns.
    """
    # Anchored pattern: the token must be Chinese from start to end.
    cn_reg = '^[\u4e00-\u9fa5]+$'

    with open(input_file_path, 'r', encoding='utf-8') as source, \
            open(output_file_path, 'w', encoding='utf-8', newline='') as target:
        sink = csv.writer(target)
        for count, row in enumerate(csv.reader(source), start=1):
            record_id = row[0]
            text = row[1]
            # The content is assumed to be whitespace-separated tokens.
            chinese_tokens = [tok for tok in text.split() if re.match(cn_reg, tok)]
            sink.writerow([record_id, ' '.join(chinese_tokens)])
            if count % 10000 == 0:
                print(f'已处理 {count} 条数据')
        print("处理完成！")

训练词向量

input_file_name = 'corpus.txt'
model_file_name = 'wordvec.model'

# LineSentence streams the corpus file: one sentence per line, tokens
# separated by whitespace.
sentences = word2vec.LineSentence(input_file_name)
# Word2Vec hyperparameters
model = word2vec.Word2Vec(sentences,
            vector_size=300,                  # dimensionality of the word vectors
            window=5,                 # maximum distance between the current and the predicted word within a sentence
            min_count=5,
            sg=0,                     # 1 = skip-gram, 0 = CBOW
            hs=0,                     # 1 = hierarchical softmax, 0 = negative sampling.
                                      # Hierarchical softmax turns an N-way classification into about log(N) binary decisions;
                                      # negative sampling predicts over a sampled subset of the classes.
                                      # Both are training techniques for the model.
            negative=5,                # number of negative samples
            workers=multiprocessing.cpu_count())   # one worker thread per CPU core

model.save(model_file_name)             # persist the trained model
print("训练模型结束...")

加载模型并测试效果

model_path = "wordvec.model"
# Load the saved Word2Vec model from model_path.
wordvec = word2vec.Word2Vec.load(model_path)
# Look up the vector for the word "华为" (Huawei). As the last expression of
# the cell, the vector is displayed as the cell output.
wordvec.wv.get_vector("华为")

def process_data(input_file_name, output_file_name):
    """Copy a two-column CSV, stripping whitespace around the second column.

    Every input row is read as ``[id, content]`` and written back as
    ``[id, content.strip()]``. Progress is printed every 10000 rows.

    Both files are opened with ``newline=''``, as the ``csv`` module
    requires. Without it, line breaks inside quoted fields are rewritten on
    read, and on platforms that use CRLF line endings every written row ends
    in ``\\r\\r\\n``.

    Args:
        input_file_name: Path of the UTF-8 CSV file to read.
        output_file_name: Path of the UTF-8 CSV file to create or overwrite.

    Raises:
        IndexError: If a row has fewer than two columns.
    """
    with open(input_file_name, 'r', encoding="utf8", newline='') as input_file, \
            open(output_file_name, 'w', encoding="utf8", newline='') as output_file:
        print("开始处理数据")
        reader = csv.reader(input_file)
        writer = csv.writer(output_file)
        count = 0

        for row in reader:
            # Column 0 is treated as the ID, column 1 as the content.
            id_column = row[0]
            second_column = row[1].strip()
            # Extension point: apply any further transformation of the
            # content here. At present the stripped value is passed through.
            processed_content = second_column
            writer.writerow([id_column, processed_content])
            count += 1
            if count % 10000 == 0:
                print(f'已处理{count}条数据')
        print('处理完成！')