In [None]:
# %pip install transformers
# %pip install datasets

In [None]:
import kagglehub
from pathlib import Path
# Download the `terrychanorg/lcqmcdata` Kaggle dataset (Chinese text similarity)
# and keep the returned location as a Path for the read_csv calls below.
path = Path(kagglehub.dataset_download("terrychanorg/lcqmcdata"))

print("Path to dataset files:", path)

In [None]:
import pandas as pd
# train.txt is parsed as tab-separated with no header row; the three columns are
# named text1, text2 and score here.
df = pd.read_csv(path / "train.txt", sep="\t", header=None, names=["text1", "text2", "score"])
# Summarise only the object-dtype (string) columns.
df.describe(include='object')

In [None]:
df.head()

In [None]:
from pathlib import Path
# Decide on the pretrained model first.
# This is a reasonably good Chinese model that can be used for Chinese text classification.
model_nm = "chinese-roberta-wwm-ext"
# The checkpoint is looked up in a local directory relative to the notebook.
local_path = Path(f"../model/{model_nm}")
local_path

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# For an NLP model every vocabulary item has to become a unique number; this process is
# called tokenization, and the object holding the resulting mapping is the tokenizer.
# Each model has its own mapping, so the matching tokenizer must be used. AutoTokenizer
# selects the right tokenizer class automatically.
# local_files_only=True: load from local_path only.
tokz  = AutoTokenizer.from_pretrained(local_path,local_files_only=True)

In [None]:
# tokenize() splits the input text into tokens (sub-word units) from the model's
# vocabulary and returns them as a list.
tokz.tokenize("我爱吃饭！")

In [None]:
from datasets import Dataset,DatasetDict

ds = Dataset.from_pandas(df)

def tok_func(x):
    """Tokenize the ``text1`` field of a record (or batch) with the global tokenizer ``tokz``.

    Only ``text1`` is passed to the tokenizer here; ``text2`` is not used.
    """
    first_sentences = x['text1']
    return tokz(first_sentences)

In [None]:
tok_ds = ds.map(tok_func, batched=True)
tok_ds

In [None]:
row = tok_ds[0]
row['text1'], row['input_ids'], row['attention_mask']

In [None]:
# Look up the vocabulary id that a single character maps to
tokz.vocab['爱']

In [None]:
tok_ds = tok_ds.rename_columns({'score':'labels'})


In [None]:
dds = tok_ds.train_test_split(test_size=0.25,seed=42)

In [None]:
tok_ds,dds

In [None]:
eval_df = pd.read_csv(path / "test.txt", sep="\t", header=None, names=["text1", "text2", "score"])
eval_df.describe()

## 这里复用 deeplearning.ipynb 里的代码

In [None]:
from numpy.random import normal,seed,uniform
import matplotlib.pyplot as plt
import numpy as np

np.random.seed(42)
def noise(x, scale):
    """Draw Gaussian noise with standard deviation ``scale`` in the same shape as ``x``."""
    return normal(size=x.shape, scale=scale)


def add_noise(x, mult, add):
    """Perturb ``x`` with multiplicative noise (scale ``mult``), then additive noise (scale ``add``)."""
    # The multiplicative draw happens first, then the additive one, so the
    # sequence of random numbers consumed is fixed for a given seed.
    scaled = x * (1 + noise(x, mult))
    return scaled + noise(x, add)


def f(x):
    """Reference quadratic ``3x^2 + 4x + 1`` used to generate the synthetic data."""
    return 3 * x**2 + 4 * x + 1


In [None]:
# 20 evenly spaced points on [-2, 2]; [:, None] turns them into a column vector.
x = np.linspace(-2, 2, num=20)[:,None]
# Noisy samples of f: multiplicative noise scale 0.2, additive noise scale 1.3.
y = add_noise(f(x), 0.2, 1.3)
# Trailing ';' suppresses the repr of the returned artist in the notebook output.
plt.scatter(x,y);

In [None]:
def plot_function(f, min=-2.1, max=2.1, color='r'):
    """Plot ``f`` evaluated on 100 evenly spaced points between ``min`` and ``max``.

    ``f`` is called once with the sample points as a column vector of shape (100, 1).
    ``color`` is forwarded to ``plt.plot`` as its third positional argument.
    """
    grid = np.linspace(min, max, 100).reshape(-1, 1)
    plt.plot(grid, f(grid), color)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

"""
plot_poly 用指定阶数（degree）的多项式回归拟合数据 x 和 y。
绘制原始数据的散点图（x, y）。
绘制模型预测的多项式曲线（通过 plot_function）。
"""

def plot_poly(degree):
    """Fit a polynomial regression of the given ``degree`` to the global ``x``/``y`` and plot it.

    Draws a scatter plot of the data and overlays the fitted curve via ``plot_function``.
    """
    features = PolynomialFeatures(degree)
    regressor = LinearRegression()
    fitted = make_pipeline(features, regressor).fit(x, y)
    plt.scatter(x, y)
    plot_function(fitted.predict)

In [None]:
plot_poly(2)
plot_function(f,color='b')

## 数据的相关性

In [None]:
# from sklearn.datasets import fetch_california_housing
# housing = fetch_california_housing(as_frame=True)   破玩意，总403  手动下载到本地来用
# !wget https://ndownloader.figshare.com/files/5976036 --timeout 3

In [None]:
import pandas as pd
# Load the California housing training CSV from a local copy.
housing = pd.read_csv(Path( "../database/CaliforniaHousing/california_housing_train.csv"))
# Rename the columns to the names used by fetch_california_housing.
# Note: right after the rename, AveRooms / AveBedrms / AveOccup still hold the raw
# totals (total_rooms, total_bedrooms, households); they are turned into ratios below.
housing = housing.rename(columns={
    'median_income': 'MedInc',
    'housing_median_age': 'HouseAge',
    'total_rooms': 'AveRooms',
    'total_bedrooms': 'AveBedrms',
    'population': 'Population',
    'households': 'AveOccup',
    'latitude': 'Latitude',
    'longitude': 'Longitude',
    'median_house_value': 'MedHouseVal'
})
# Convert the totals into per-household averages.
# Order matters: AveOccup must keep the household count until the last of these three
# lines, because the first two divide by it.
housing['AveRooms'] = housing['AveRooms'] / housing['AveOccup']  # total_rooms / households
housing['AveBedrms'] = housing['AveBedrms'] / housing['AveOccup'] # total_bedrooms / households
housing['AveOccup'] = housing['Population'] / housing['AveOccup'] # population / households
# Separate the features (data) from the target
data = housing.drop(columns=['MedHouseVal'])
target = housing['MedHouseVal']
# Join them back together and draw a fixed-seed random sample of 1000 rows
housing = data.join(target).sample(1000, random_state=52)
housing.head()

In [None]:
# Print numpy arrays with two decimals and without scientific notation
np.set_printoptions(precision=2, suppress=True)
# Compute the correlation coefficients. A correlation coefficient lies between -1 and 1:
# -1 means perfectly negatively correlated, 1 perfectly positively correlated, 0 no correlation.
# Put simply, it shows how two variables relate: do they grow together, or does one shrink
# as the other grows.
# rowvar=False: each column of `housing` is treated as one variable.
np.corrcoef(housing, rowvar=False)

In [None]:
np.corrcoef(housing.MedInc,housing.MedHouseVal)

In [None]:
def corr(x, y):
    """Return the correlation coefficient between ``x`` and ``y``.

    This is the off-diagonal entry of the 2x2 matrix produced by ``np.corrcoef``.
    """
    return np.corrcoef(x, y)[0, 1]

corr(housing.MedInc,housing.MedHouseVal)

In [None]:
def show_corr(df, a, b):
    """Scatter-plot column ``a`` against column ``b`` of ``df``, with their correlation in the title."""
    xs = df[a]
    ys = df[b]
    plt.scatter(xs, ys, alpha=0.5, s=4)
    r = corr(xs, ys)
    plt.title(f'{a} vs {b} ;r:{r:.2f}')

In [None]:
show_corr(housing,'MedInc','MedHouseVal')

In [None]:
show_corr(housing,'MedInc','AveRooms')

In [None]:
def corr_d(eval_pred):
    """Return ``{'pearson': r}`` where ``r`` is ``corr`` applied to the unpacked ``eval_pred``."""
    pearson = corr(*eval_pred)
    return {'pearson': pearson}

## 开始训练 相关性识别模型

### 引入各种依赖

In [None]:
import kagglehub
from pathlib import Path
from transformers import TrainingArguments,DataCollatorWithPadding,Trainer, EarlyStoppingCallback,AutoTokenizer,AutoModelForSequenceClassification,AutoConfig,pipeline
import pandas as pd
from datasets import Dataset,load_dataset
import torch.nn.functional as F
from multiprocessing import Pool
import kagglehub

# # 设置多进程启动方法为 'spawn'
# import multiprocessing as mp
# mp.set_start_method('spawn')

In [None]:
from transformers import pipeline
import time


# Load the translation models (Chinese -> English and English -> Chinese)
zh_to_en = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en")
en_to_zh = pipeline("translation", model="Helsinki-NLP/opus-mt-en-zh")
# Load the tokenizers and models
# tokenizer_zh_en = MarianTokenizer.from_pretrained(model_name_zh_en)
# model_zh_en = MarianMTModel.from_pretrained(model_name_zh_en)
# tokenizer_en_zh = MarianTokenizer.from_pretrained(model_name_en_zh)
# model_en_zh = MarianMTModel.from_pretrained(model_name_en_zh)

# # Save them to a local directory
# tokenizer_zh_en.save_pretrained("./models/opus-mt-zh-en")
# model_zh_en.save_pretrained("./models/opus-mt-zh-en")
# tokenizer_en_zh.save_pretrained("./models/opus-mt-en-zh")
# model_en_zh.save_pretrained("./models/opus-mt-en-zh")
# Test the translation: one zh -> en -> zh round trip, timed
start = time.time()
test_sentence = "这是一个测试句子。"
en_result = zh_to_en(test_sentence)[0]['translation_text']
print(f"中文 -> 英语: {en_result}")
zh_result = en_to_zh(en_result)[0]['translation_text']
print(f"英语 -> 中文: {zh_result}")
print(f"单次回译耗时: {time.time() - start} 秒")

### 预先配置各种数据和东西

In [None]:
# Load the translation models (original note: assumes the models were already downloaded locally)
# NOTE(review): this rebinds zh_to_en / en_to_zh, which the previous cell created without a device argument.
zh_to_en = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en", device=0)  # device=0 uses the GPU
en_to_zh = pipeline("translation", model="Helsinki-NLP/opus-mt-en-zh", device=0)


In [None]:
# Locate the `terrychanorg/lcqmcdata` dataset files via kagglehub.
path = Path(kagglehub.dataset_download("terrychanorg/lcqmcdata"))

# # Load the translation models (on GPU; assumes the models are already downloaded locally)
# zh_to_en = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en", device=0)
# en_to_zh = pipeline("translation", model="Helsinki-NLP/opus-mt-en-zh", device=0)

# # Back-translation helper
# def back_translate(sentence):
#     en_sentence = zh_to_en(sentence)[0]['translation_text']
#     zh_sentence = en_to_zh(en_sentence)[0]['translation_text']
#     return zh_sentence

# Read the data (tab-separated, no header row; the third column is named `labels` here)
df_tr = pd.read_csv(path / "train.txt", sep="\t", header=None, names=["text1", "text2", "labels"])
df_ts = pd.read_csv(path / "test.txt", sep="\t", header=None, names=["text1", "text2", "labels"])

# # Augment only the first 10,000 rows
# df_tr_subset = df_tr[:10000]

# # Back-translate the subset (single-threaded)
# df_tr_subset['text1_aug'] = df_tr_subset['text1'].apply(back_translate)
# df_tr_subset['text2_aug'] = df_tr_subset['text2'].apply(back_translate)

# # Build the augmented samples
# augmented_pairs = pd.concat([
#     pd.DataFrame({'text1': df_tr_subset['text1_aug'], 'text2': df_tr_subset['text2'], 'labels': df_tr_subset['labels']}),
#     pd.DataFrame({'text1': df_tr_subset['text1'], 'text2': df_tr_subset['text2_aug'], 'labels': df_tr_subset['labels']})
# ])

# # Merge the original and the augmented data
# df_tr = pd.concat([df_tr, augmented_pairs], ignore_index=True)

# # Save the augmented data
# df_tr.to_csv("df_tr_partial_augmented.csv", index=False, encoding='utf-8')


In [None]:

# Use the back-translation-augmented training set when it has been generated.
# The code that writes this CSV is commented out in the previous cell, so on a fresh
# run the file may not exist; in that case keep the df_tr already loaded from train.txt
# instead of failing with FileNotFoundError.
aug_csv = Path("df_tr_partial_augmented.csv")
if aug_csv.exists():
    df_tr = pd.read_csv(aug_csv, encoding='utf-8')
else:
    print(f"{aug_csv} not found; keeping the un-augmented training set")

# Length filtering and tokenization are intentionally NOT done here: the following cell
# (which loads the tokenizer) performs exactly those steps on df_tr / df_ts, so repeating
# them in this cell only tokenized the whole dataset twice.

# # Rename columns
# tok_ds_tr = tok_ds_tr.rename_columns({'score':'labels'})
# tok_ds_ts = tok_ds_ts.rename_columns({'score':'labels'})

# Split the dataset

# dds = tok_ds.train_test_split(test_size=0.25,seed=42)

In [None]:
# Model to fine-tune
model_nm = "chinese-roberta-wwm-ext"
# Local checkpoint directory
local_path = Path(f"../model/{model_nm}")
# Load the tokenizer that belongs to this checkpoint
tokz  = AutoTokenizer.from_pretrained(local_path, local_files_only=True)
text_max = 64

def tok_func(x):
    """Tokenize a batch of (text1, text2) sentence pairs, truncated to ``text_max`` tokens.

    No padding is applied here. With ``batched=True`` a ``padding=True`` would pad every
    row to the longest row of its map chunk and store those pad tokens in the dataset,
    which defeats the per-batch padding done by the ``DataCollatorWithPadding`` that is
    passed to the Trainer later in this notebook.
    """
    return tokz(x['text1'], x['text2'], truncation=True, max_length=text_max)

# Keep only pairs where both sentences are at most text_max characters long
df_tr_clean = df_tr[(df_tr['text1'].str.len() <= text_max) & (df_tr['text2'].str.len() <= text_max)]
df_ts_clean = df_ts[(df_ts['text1'].str.len() <= text_max) & (df_ts['text2'].str.len() <= text_max)]

ds_tr = Dataset.from_pandas(df_tr_clean)
ds_ts = Dataset.from_pandas(df_ts_clean)

# Tokenize both datasets
tok_ds_tr = ds_tr.map(tok_func, batched=True)
tok_ds_ts = ds_ts.map(tok_func, batched=True)

In [None]:
tok_ds_tr[:5]

In [None]:
# Load the Chinese PAWS-X dataset; it is better suited to binary classification
dataset = load_dataset("paws-x", "zh")
def rename_columns(example):
    """Map one PAWS-X example onto the text1 / text2 / labels schema used in this notebook.

    Not called in this cell (``Dataset.rename_columns`` is used below instead).
    """
    return {
        "text1": example["sentence1"],
        "text2": example["sentence2"],
        "labels": example["label"]
    }
# Take the three splits and rename their columns to the notebook's schema
train_ds = dataset['train'].rename_columns({"sentence1": "text1", "sentence2": "text2", "label": "labels"})
valid_ds = dataset['validation'].rename_columns({"sentence1": "text1", "sentence2": "text2", "label": "labels"})
test_ds = dataset['test'].rename_columns({"sentence1": "text1", "sentence2": "text2", "label": "labels"})
text_max = 64
def tok_func(x):
    """Tokenize a batch of (text1, text2) pairs, padded and truncated to at most ``text_max`` tokens."""
    return tokz(x['text1'], x['text2'], padding=True, truncation=True, max_length=text_max)
train_df = train_ds.to_pandas()
# Bug fix: the validation frame used to be built from train_ds, which made the
# "validation" set a copy of the training set.
valid_df = valid_ds.to_pandas()
# test_df = test_ds.to_pandas()
# Keep only pairs where both sentences are at most text_max characters long
train_df_clean = train_df[(train_df['text1'].str.len() <= text_max) & (train_df['text2'].str.len() <= text_max)]
valid_df_clean = valid_df[(valid_df['text1'].str.len() <= text_max) & (valid_df['text2'].str.len() <= text_max)]
train_ds = Dataset.from_pandas(train_df_clean)
valid_ds = Dataset.from_pandas(valid_df_clean)
# Tokenize the datasets
tok_train_ds = train_ds.map(tok_func, batched=True)
tok_valid_ds = valid_ds.map(tok_func, batched=True)
# tok_test_ds = test_ds.map(tok_func, batched=True)
tok_train_ds[:5]

In [None]:
tok_train_ds.to_pandas().head()

In [None]:
from collections import Counter
print(Counter(tok_train_ds['labels']))
print(Counter(tok_valid_ds['labels']))

In [None]:
# Character-length statistics for the two sentence columns of the tokenized training set
tok_train_df = tok_train_ds.to_pandas()
tok_train_df['len1'] = tok_train_df['text1'].str.len()
tok_train_df['len2'] = tok_train_df['text2'].str.len()
print(tok_train_df[['len1', 'len2']].describe())  # inspect the mean and the maximum length

In [None]:
bs = 256 # batch size: how many samples are processed per training step; adjust to the available GPU memory
epochs = 3# number of training epochs
lr = 8e-5 # learning rate

In [None]:
# Training configuration (checkpoints are written under 'NLP1').
args = TrainingArguments(
    'NLP1',
    learning_rate=lr,
    warmup_ratio=0.1,
    lr_scheduler_type='linear',
    fp16=True,
    # `eval_strategy` is the current name of this option; the old spelling
    # `evaluation_strategy` is deprecated in recent transformers releases.
    eval_strategy="epoch",
    # Must match the evaluation strategy because load_best_model_at_end=True.
    save_strategy="epoch",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs,
    weight_decay=0.3,
    # Label smoothing is a Trainer option: it only takes effect when set here.
    # (Passing it to the model config is silently ignored.)
    label_smoothing_factor=0.2,
    report_to='none',
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)


In [None]:
# Pads each training batch to the length of its longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokz)
# Two output classes (similar / not similar).
# `label_smoothing_factor` used to be passed here, but the model config does not act on
# it: label smoothing is a TrainingArguments option, so the extra keyword was a no-op and
# has been removed. Set it on TrainingArguments if label smoothing is wanted.
config = AutoConfig.from_pretrained(local_path, num_labels=2)
# Load the model
model = AutoModelForSequenceClassification.from_pretrained(
    local_path,
    config=config
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tok_ds_tr,
    eval_dataset=tok_ds_ts,
    data_collator=data_collator,
    # Accuracy: fraction of rows whose argmax prediction equals the label.
    compute_metrics=lambda p: {"accuracy": (p.predictions.argmax(-1) == p.label_ids).mean()},
    # Stop when the monitored metric has not improved for 2 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.0)]
)

In [None]:
trainer.train();

## 上面的模型微调效果太差，换一个模型试试

In [42]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from pathlib import Path
import pandas as pd
import kagglehub

In [None]:
from sentence_transformers import SentenceTransformer
import os

# Create the directory the model will be saved to
save_path = './models/simcse-chinese-roberta-wwm-ext'
os.makedirs(save_path, exist_ok=True)

# Download the model
model = SentenceTransformer('cyclone/simcse-chinese-roberta-wwm-ext')

# Save it locally
model.save(save_path)
print(f"模型已保存至 {save_path}")

In [51]:
import sentence_transformers, torch
# print(streamlit.__version__)
print(sentence_transformers.__version__)
print(torch.__version__)

3.4.1
2.6.0+cu124


In [45]:
path = Path(kagglehub.dataset_download("terrychanorg/lcqmcdata"))
df_tr = pd.read_csv(str(path / "train.txt"), sep="\t", header=None, names=["text1", "text2", "labels"])
df_ts = pd.read_csv(str(path / "test.txt"), sep="\t", header=None, names=["text1", "text2", "labels"])

# Filter the data: keep only pairs where both sentences are at most text_max characters long
text_max = 64
df_tr_clean = df_tr[(df_tr['text1'].str.len() <= text_max) & (df_tr['text2'].str.len() <= text_max)]
df_ts_clean = df_ts[(df_ts['text1'].str.len() <= text_max) & (df_ts['text2'].str.len() <= text_max)]

# Prepare the training data: one InputExample per sentence pair, with the label as a float.
# Iterating the three columns in lockstep avoids DataFrame.iterrows(), which builds a
# Series object for every row and is very slow on a frame of this size.
train_examples = [
    InputExample(texts=[text1, text2], label=float(label))
    for text1, text2, label in zip(df_tr_clean['text1'], df_tr_clean['text2'], df_tr_clean['labels'])
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=64)

NameError: name 'sentence_transformers' is not defined

In [48]:
# Load the model from a local directory
# NOTE(review): the download cell saves to './models/simcse-chinese-roberta-wwm-ext',
# while this loads from '../model/...'; confirm the model was copied to this location.
model_nm = "simcse-chinese-roberta-wwm-ext"
local_path = Path(f"../model/{model_nm}")
model = SentenceTransformer(str(local_path))
# Define the contrastive loss
train_loss = losses.ContrastiveLoss(model=model)

# Train
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=3,
    warmup_steps=100,
    use_amp=True,  # FP16 mixed precision (original note: suited to a 4070 Ti Super)
    output_path='./simcse_finetuned'
)

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0124
1000,0.0101
1500,0.0096
2000,0.0091
2500,0.0089
3000,0.0088
3500,0.0084
4000,0.0076
4500,0.007
5000,0.007


### 测试成果

In [None]:
import torch
import numpy as np

# Get the model and tokenizer
# NOTE(review): `model` was rebound to a SentenceTransformer in the cells above; this
# rebinds it to the classifier held by `trainer`, so `trainer` must still be defined.
model = trainer.model
# tokz = trainer.processing_class  # newer API that replaces trainer.tokenizer

# Make sure the model is on the GPU (when one is available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

def test_similarity(text1, text2):
    """Classify one sentence pair with the global ``model`` / ``tokz`` and print the result.

    Prints both inputs, the argmax label (label 1 is reported as similar) and the
    softmax probability of class 1. Returns nothing.
    """
    # Encode the pair and move every tensor to `device`
    encoded = tokz(text1, text2, padding=True, truncation=True, max_length=128, return_tensors="pt")
    batch = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Forward pass without gradient tracking
    with torch.no_grad():
        logits = model(**batch).logits

    # Class probabilities and predicted label for the single pair
    probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]
    pred_label = logits.argmax(-1).item()
    verdict = '相似' if pred_label == 1 else '不相似'

    # Report: four result lines followed by one empty line
    report = [
        f"Text1: {text1}",
        f"Text2: {text2}",
        f"Predicted Label: {pred_label} ({verdict})",
        f"Similarity Probability: {probs[1]:.4f}",
        "",
    ]
    print("\n".join(report))

# Interactive test: keeps prompting for sentence pairs until 'exit' (any case) is
# entered as the first sentence. This blocks on input(), so the cell never finishes
# during an unattended "Run All".
while True:
    text1 = input("请输入第一句话（输入 'exit' 退出）：")
    if text1.lower() == 'exit':
        break
    text2 = input("请输入第二句话：")
    test_similarity(text1, text2)

In [None]:
# import streamlit
import transformers
import torch
# print(streamlit.__version__)
print(transformers.__version__)
print(torch.__version__)

In [None]:
# Save the model and the tokenizer into the same directory
output_dir = "./my_trained_model"
trainer.save_model(output_dir)  # save the model
tokz.save_pretrained(output_dir)  # save the tokenizer