In [None]:
import pandas as pd
import jieba
import random
from gensim.models import KeyedVectors

# 1️⃣ Load the pretrained word vectors from a binary word2vec-format file.
# NOTE(review): the file name suggests fastText Common Crawl Chinese 300-d vectors — confirm.
w2v_model = KeyedVectors.load_word2vec_format("cc.zh.300.bin", binary=True)

# 2️⃣ Cache each word's nearest neighbours so most_similar() runs at most once per word
synonym_cache = {}

def get_w2v_synonym(word):
    """Return a random near neighbour of ``word`` from ``w2v_model``, or ``word`` itself.

    The neighbour list (top 3 by similarity) is cached per word in
    ``synonym_cache``. The random pick is made on every call, so repeated
    occurrences of the same word can receive different replacements.

    Args:
        word: Token to look up.

    Returns:
        One of the word's cached neighbours, or ``word`` unchanged when it is
        not in the model's vocabulary or has no neighbours.
    """
    candidates = synonym_cache.get(word)
    if candidates is None:
        try:
            if word not in w2v_model:
                return word
            # Keep only the neighbour strings; the similarity scores are not used.
            candidates = [
                neighbour for neighbour, _score in w2v_model.most_similar(word, topn=3)
            ]
        except KeyError:
            # Safety net in case the lookup still fails after the membership check.
            return word
        synonym_cache[word] = candidates

    # random.choice() raises IndexError on an empty sequence.
    if not candidates:
        return word
    return random.choice(candidates)

# 3️⃣ Replace a fraction of the tokens in a comment
def replace_with_w2v(text, replace_ratio=0.3):
    """Return ``text`` with a random subset of its jieba tokens passed through ``get_w2v_synonym``.

    Args:
        text: Sentence to augment.
        replace_ratio: Fraction of tokens to attempt to replace. At least one
            token is always attempted, and the count is capped at the number
            of tokens.

    Returns:
        The tokens re-joined without separators. Tokens that
        ``get_w2v_synonym`` returns unchanged stay as they were, so the result
        may equal ``text``. Empty input is returned as-is.
    """
    # Nothing to tokenise; also avoids random.sample() raising ValueError
    # ("sample larger than population") on an empty token list.
    if not text:
        return text

    words = jieba.lcut(text)  # lcut() returns a list directly
    if not words:
        return text

    # Clamp to [1, len(words)] so replace_ratio > 1 cannot over-sample.
    num_replace = min(len(words), max(1, int(len(words) * replace_ratio)))

    for i in random.sample(range(len(words)), num_replace):
        words[i] = get_w2v_synonym(words[i])

    return "".join(words)

# 4️⃣ Read the cleaned reviews (utf-8-sig strips a leading BOM if present)
df = pd.read_csv("neutral(clean).csv", encoding="utf-8-sig")

# 5️⃣ Build the augmented copy: same rows and columns, only Comment is rewritten.
# Using assign() leaves `df` untouched, so no helper column leaks into the
# concatenated output (previously `df` kept a stray "Modified_Comment" column).
modified_df = df.assign(
    Comment=[replace_with_w2v(text) for text in df["Comment"].astype(str)]
)

# 6️⃣ Stack original and augmented rows; every other column is carried over unchanged
result_df = pd.concat([df, modified_df], ignore_index=True)

# 7️⃣ Write the combined data back to CSV
result_df.to_csv("neutral_change.csv", encoding="utf-8-sig", index=False)



✅ 处理完成，已保存到 bad_change.csv


In [None]:
from gensim.models import KeyedVectors

# Sanity check: load the vectors, then print the 5 nearest neighbours of "快速".
w2v_model = KeyedVectors.load_word2vec_format("cc.zh.300.bin", binary=True)
nearest_neighbours = w2v_model.most_similar("快速", topn=5)
print(nearest_neighbours)


[('棒哒', 0.6634965538978577), ('棒噠', 0.635358989238739), ('棒了', 0.6146213412284851), ('好棒', 0.587383508682251), ('電捲', 0.5845859050750732)]


In [4]:
# Mount Google Drive in the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
import pandas as pd
import jieba
import random
from gensim.models import KeyedVectors

# 1️⃣ Load the pretrained word vectors (binary word2vec format) from the mounted Drive path.
# NOTE(review): the file name suggests fastText Common Crawl Chinese 300-d vectors — confirm.
w2v_model = KeyedVectors.load_word2vec_format("/content/drive/MyDrive/NLP電商專案/word2vec/cc.zh.300.bin", binary=True)

# 2️⃣ Cache each word's nearest neighbours so most_similar() runs at most once per word
synonym_cache = {}

def get_w2v_synonym(word):
    """Pick a random near neighbour of ``word`` from ``w2v_model``; fall back to ``word``.

    Only the neighbour list (top 3 by similarity) is stored in
    ``synonym_cache``. The random choice happens on each call, so the same
    word is not pinned to a single replacement for the whole run.

    Args:
        word: Token to look up.

    Returns:
        A neighbour word, or ``word`` unchanged when it is not in the model's
        vocabulary or has no neighbours.
    """
    candidates = synonym_cache.get(word)
    if candidates is None:
        try:
            if word not in w2v_model:
                return word
            # Keep only the neighbour strings; the similarity scores are not used.
            candidates = [
                neighbour for neighbour, _score in w2v_model.most_similar(word, topn=3)
            ]
        except KeyError:
            # Safety net in case the lookup still fails after the membership check.
            return word
        synonym_cache[word] = candidates

    # random.choice() raises IndexError on an empty sequence.
    if not candidates:
        return word
    return random.choice(candidates)

def replace_with_w2v(text, replace_ratio=0.3):
    """Swap a random share of the jieba tokens in ``text`` via ``get_w2v_synonym``.

    Args:
        text: Sentence to augment.
        replace_ratio: Fraction of tokens to attempt to replace. At least one
            token is always attempted, and the count never exceeds the number
            of tokens.

    Returns:
        The tokens joined back together with no separator. Empty input is
        returned unchanged.
    """
    # Empty input would otherwise reach random.sample(range(0), 1), which
    # raises ValueError ("sample larger than population").
    if not text:
        return text

    words = jieba.lcut(text)  # lcut() returns a list directly
    if not words:
        return text

    # Clamp to [1, len(words)] so replace_ratio > 1 cannot over-sample.
    num_replace = min(len(words), max(1, int(len(words) * replace_ratio)))

    for i in random.sample(range(len(words)), num_replace):
        words[i] = get_w2v_synonym(words[i])

    return "".join(words)

# Smoke test on one sample sentence; token selection is random, so reruns may differ.
replace_with_w2v("商品快速送到，質感不錯")

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
DEBUG:jieba:Dumping model to file cache /tmp/jieba.cache
Loading model cost 3.059 seconds.
DEBUG:jieba:Loading model cost 3.059 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


'商品更快送到，質感不錯'

In [8]:
# Show the sentence before and after substitution (both strings are hard-coded).
print(
    "換詞前: 商品快速送到，質感不錯",
    "換詞後: 商品更快送到，質感不錯",
    sep="\n",
)

換詞前: 商品快速送到，質感不錯
換詞後: 商品更快送到，質感不錯
