In [None]:
import gensim
import os
import csv
import numpy as np
import pandas as pd
import zhconv                          #导入zhconv模块
import jieba                           #导入jieba分词模块
import re                              #导入正则表达式模块
import multiprocessing                 #导入多进程模块
from pprint import pprint
from utils.configs import csv_to_txt_path
from utils.configs import process_csv_out_path
from gensim.corpora import WikiCorpus    #导入Wiki语料库
from gensim.models import word2vec      #导入word2vec模型
from gensim.utils import simple_preprocess

加载停用词列表

In [None]:
def load_stop_words(file_path):
    """Load a stop-word list from a UTF-8 text file, one entry per line.

    Args:
        file_path: Path of the stop-word file.

    Returns:
        set[str]: Every line of the file with surrounding whitespace removed.
        Duplicates collapse, and a blank line contributes the empty string.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        # Iterating the handle yields the same lines as readlines().
        return {line.strip() for line in handle}

In [None]:
def preprocess_text(text, stop_words):
    """Segment ``text`` with jieba and drop stop words and blank tokens.

    Args:
        text: Raw input string.
        stop_words: Container of tokens to discard (membership-tested with ``in``).

    Returns:
        str: The surviving tokens joined by single spaces.
    """
    # Trim the ends, then remove embedded line feeds and carriage returns.
    flattened = text.strip().replace("\n", "").replace("\r", "")

    kept_tokens = []
    for token in jieba.cut(flattened):
        if token in stop_words:
            continue                      # stop word
        if token.strip() == "":
            continue                      # whitespace-only token
        kept_tokens.append(token)
    return " ".join(kept_tokens)

中文数据处理

In [None]:
def word_to_vector(input_file_path, output_file_path, stop_words_file_path,
                   remove_stop_words=False):
    """Clean a two-column CSV of Chinese text and write the segmented result.

    Despite its name, this function produces no vectors. For every data row it
    converts the second column to Simplified Chinese, segments it with jieba,
    keeps the tokens accepted by the Chinese-character filter and writes
    ``[ID, space-joined tokens]`` to the output CSV.

    Args:
        input_file_path: Source CSV; the first row is treated as a header and
            discarded, column 0 is the ID and column 1 the text.
        output_file_path: Destination CSV, written with header ``ID,Content``.
        stop_words_file_path: Stop-word file (one entry per line). It is only
            read when ``remove_stop_words`` is true.
        remove_stop_words: When true, tokens found in the stop-word list are
            dropped as well. Defaults to False, which yields the same output
            as before this option existed.

    Raises:
        IndexError: If a non-blank data row has fewer than two columns.
    """
    # Compiled once instead of per token. `match` anchors at the start only, so a
    # token is kept when its FIRST character lies in U+4E00..U+9FA5.
    # NOTE(review): mixed tokens that merely start with a Chinese character pass
    # this filter too - confirm that is intended.
    cn_pattern = re.compile('[\u4e00-\u9fa5]')
    stop_words = load_stop_words(stop_words_file_path) if remove_stop_words else frozenset()

    with open(input_file_path, 'r', encoding='utf-8') as input_file, \
            open(output_file_path, 'w', encoding='utf-8', newline='') as output_file:
        reader = csv.reader(input_file)
        next(reader, None)                      # drop the header; tolerate an empty file
        writer = csv.writer(output_file)
        writer.writerow(['ID', 'Content'])      # output header

        count = 0
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; they are not records.
                continue
            id_column, content_column = row[0], row[1]

            simplified_content = zhconv.convert(content_column, 'zh-hans')   # to Simplified Chinese
            filtered_words = [
                word
                for word in jieba.cut(simplified_content)                    # word segmentation
                if cn_pattern.match(word) and word not in stop_words
            ]
            writer.writerow([id_column, ' '.join(filtered_words)])

            count += 1
            if count % 10000 == 0:
                print(f'已处理 {count} 条数据')
        print("处理完成！")

英文数据处理

将处理好的.csv文件转换为.txt文件用于训练词向量

In [None]:
def csv_to_text(input_csv_file, output_text_file):
    """Dump the second column of a CSV to a text file, one row per line.

    The first CSV row is treated as a header and skipped. Each remaining row's
    second column is stripped of surrounding whitespace and written followed by
    a newline.

    Args:
        input_csv_file: UTF-8 CSV whose second column holds the text.
        output_text_file: Destination text file (overwritten).

    Raises:
        IndexError: If a non-blank data row has fewer than two columns.
    """
    with open(input_csv_file, 'r', encoding='utf-8') as csv_file, \
            open(output_text_file, 'w', encoding='utf-8') as text_file:
        reader = csv.reader(csv_file)
        next(reader, None)          # skip the header; an empty file yields an empty output
        for row in reader:
            if not row:
                # Blank lines come back from csv.reader as []; indexing them
                # would raise IndexError and abort the whole conversion.
                continue
            text_file.write(row[1].strip() + '\n')

训练词向量

In [None]:
def word2vector_model(input_file_name,model_file_name):
    """Train a Word2Vec model on a line-per-sentence corpus and save it.

    Args:
        input_file_name: Text corpus read through ``word2vec.LineSentence``.
        model_file_name: Path the trained model is saved to.
    """
    corpus = word2vec.LineSentence(input_file_name)

    # Word2Vec hyperparameters.
    hyperparameters = dict(
        vector_size=300,                        # embedding dimensionality
        window=5,                               # max distance between current and predicted word
        min_count=5,
        sg=0,                                   # 1 = skip-gram, 0 = CBOW
        # hs: 1 = hierarchical softmax, 0 = negative sampling.
        # Hierarchical softmax turns an N-way classification into ~log(N) binary
        # decisions; negative sampling predicts over a subset of the classes.
        # Both are training tricks.
        hs=0,
        negative=5,                             # number of negative samples
        workers=multiprocessing.cpu_count(),    # one worker per reported CPU
    )
    trained_model = word2vec.Word2Vec(corpus, **hyperparameters)

    trained_model.save(model_file_name)         # persist the model
    print("训练模型结束...")

将句子转化为一个固定长度的向量，返回句子的固定长度特征向量（如平均词向量）

In [None]:
def get_sentence_vector(sentence, model):
    """Average the word vectors of the in-vocabulary items of ``sentence``.

    Args:
        sentence: Iterable of tokens. Each item is looked up individually, so a
            plain string is processed character by character.
        model: Object exposing ``wv`` (supports ``in`` and indexing) and
            ``vector_size``.

    Returns:
        numpy.ndarray: Element-wise mean of the found vectors, or a zero vector
        of length ``model.vector_size`` when no item is in the vocabulary.
    """
    embeddings = []
    for token in sentence:
        if token in model.wv:
            embeddings.append(model.wv[token])

    if not embeddings:
        # Nothing in the sentence is known to the model.
        return np.zeros(model.vector_size)
    return np.mean(embeddings, axis=0)

In [None]:
测试集提取特征向量

In [None]:
def extract_features(data_file, model_file, output_file_features):
    """Turn each row of a processed CSV into a sentence vector and save them.

    Args:
        data_file: UTF-8 CSV with a header row; column 1 holds the text as
            whitespace-separated tokens.
        model_file: Path of a saved Word2Vec model.
        output_file_features: Destination ``.npy`` path; its parent directory
            is created when missing.

    Returns:
        numpy.ndarray: One row per non-blank data row, ``model.vector_size``
        columns. A file without data rows yields shape ``(0, vector_size)``.

    Raises:
        IndexError: If a non-blank data row has fewer than two columns.
    """
    model = word2vec.Word2Vec.load(model_file)         # load the trained embeddings

    features = []
    with open(data_file, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        next(reader, None)                             # skip the header; tolerate an empty file
        for row in reader:
            if not row:
                continue                               # blank line, not a record
            # Split into tokens first: handing the raw string to
            # get_sentence_vector would iterate it character by character.
            tokens = row[1].split()
            features.append(get_sentence_vector(tokens, model))

    if features:
        features = np.array(features)
    else:
        # Keep the result 2-D even when there is nothing to stack.
        features = np.zeros((0, model.vector_size))

    # Create the output directory if it does not exist yet.
    output_dir_features = os.path.dirname(output_file_features)
    if output_dir_features:
        os.makedirs(output_dir_features, exist_ok=True)

    np.save(output_file_features, features)            # persist as .npy
    return features

In [None]:
# Paths for the Chinese test split.
stop_words_file_path = './stopword.txt'
model_file_name = './wordvec.cn.test.model'
output_file_features = '../files/test_cn_features.npy'
input_file_name = './out/test_data/test.cn.csv'   # input CSV
output_file_path = process_csv_out_path(input_file_name)
output_text_path = csv_to_txt_path(output_file_path)

# Pipeline: clean the CSV -> dump text corpus -> train embeddings -> extract features.
# NOTE(review): the Word2Vec model is trained on this same test file - confirm
# that is intended rather than reusing a model trained on the training split.
word_to_vector(input_file_name, output_file_path, stop_words_file_path)
csv_to_text(output_file_path, output_text_path)
word2vector_model(output_text_path, model_file_name)
test_features = extract_features(output_file_path, model_file_name, output_file_features)

# The label previously said "训练集" (training set) although these are test-set features.
print("测试集特征维度：", test_features.shape)