In [None]:
# %pip install transformers
# %pip install datasets

In [None]:
import kagglehub
from pathlib import Path
# 下载中文 文本相似度数据集
path = Path(kagglehub.dataset_download("terrychanorg/lcqmcdata"))

print("Path to dataset files:", path)

In [None]:
import pandas as pd
df = pd.read_csv(path / "train.txt", sep="\t", header=None, names=["text1", "text2", "score"])
df.describe(include='object')

In [None]:
df.head()

In [None]:
from pathlib import Path
#先决定使用的预训练模型
# 这个模型是中文比较不错的模型，可以用来做中文文本分类
model_nm = "chinese-roberta-wwm-ext"
local_path = Path(f"../model/{model_nm}")
local_path

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# 对于NLP模型来说，不同词汇要变成唯一的数字，这个过程叫做tokenization,然后得到了一个映射表，叫做tokenizer
# 每个模型都是不一样的映射表，所以要用对应的tokenizer，这里用的是AutoTokenizer，可以自动选择对应的tokenizer
tokz  = AutoTokenizer.from_pretrained(local_path,local_files_only=True)

In [None]:
# tokenize 方法会将输入文本分解为模型词汇表中的 token（子词单元），并返回一个 token 列表。
tokz.tokenize("我爱吃饭！")

In [None]:
from datasets import Dataset,DatasetDict

ds = Dataset.from_pandas(df)

def tok_func(x) : return tokz(x['text1'])

In [113]:
tok_ds = ds.map(tok_func, batched=True)
tok_ds

Map:   0%|          | 0/238766 [00:00<?, ? examples/s]

Dataset({
    features: ['text1', 'text2', 'score', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 238766
})

In [None]:
row = tok_ds[0]
row['text1'], row['input_ids'], row['attention_mask']

In [None]:
# 查找某个字的映射
tokz.vocab['爱']

In [114]:
tok_ds = tok_ds.rename_columns({'score':'labels'})


In [128]:
dds = tok_ds.train_test_split(test_size=0.25,seed=42)

Map:   0%|          | 0/179074 [00:00<?, ? examples/s]

Map:   0%|          | 0/59692 [00:00<?, ? examples/s]

In [120]:
tok_ds,dds

(Dataset({
     features: ['text1', 'text2', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 238766
 }),
 DatasetDict({
     train: Dataset({
         features: ['text1', 'text2', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
         num_rows: 179074
     })
     test: Dataset({
         features: ['text1', 'text2', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
         num_rows: 59692
     })
 }))

In [None]:
eval_df = pd.read_csv(path / "test.txt", sep="\t", header=None, names=["text1", "text2", "score"])
eval_df.describe()

## 这里引入使用一下，deeplearning.ipynb里的代码

In [None]:
from numpy.random import normal,seed,uniform
import matplotlib.pyplot as plt
import numpy as np

np.random.seed(42)
def noise(x, scale): return normal(scale=scale, size=x.shape)
def add_noise(x, mult, add): return x * (1+noise(x,mult)) + noise(x,add)
def f(x): return 3*x**2+ 4*x+ 1


In [None]:
x = np.linspace(-2, 2, num=20)[:,None]
y = add_noise(f(x), 0.2, 1.3)
plt.scatter(x,y);

In [None]:
def plot_function(f, min=-2.1, max=2.1, color='r'):
    x = np.linspace(min,max, 100)[:,None]
    plt.plot(x, f(x), color)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

"""
plot_poly 用指定阶数（degree）的多项式回归拟合数据 x 和 y。
绘制原始数据的散点图（x, y）。
绘制模型预测的多项式曲线（通过 plot_function）。
"""

def plot_poly(degree):
    model = make_pipeline(PolynomialFeatures(degree), LinearRegression()) #
    model.fit(x,y)
    plt.scatter(x,y)
    plot_function(model.predict)

In [None]:
plot_poly(2)
plot_function(f,color='b')

## 数据的相关性

In [None]:
# from sklearn.datasets import fetch_california_housing
# housing = fetch_california_housing(as_frame=True)   破玩意，总403  手动下载到本地来用
# !wget https://ndownloader.figshare.com/files/5976036 --timeout 3

In [None]:
import pandas as pd
housing = pd.read_csv(Path( "../database/CaliforniaHousing/california_housing_train.csv"))
# 重命名列以匹配 fetch_california_housing
housing = housing.rename(columns={
    'median_income': 'MedInc',
    'housing_median_age': 'HouseAge',
    'total_rooms': 'AveRooms',
    'total_bedrooms': 'AveBedrms',
    'population': 'Population',
    'households': 'AveOccup',
    'latitude': 'Latitude',
    'longitude': 'Longitude',
    'median_house_value': 'MedHouseVal'
})
# 数据计算成平均值
housing['AveRooms'] = housing['AveRooms'] / housing['AveOccup']  # total_rooms / households
housing['AveBedrms'] = housing['AveBedrms'] / housing['AveOccup'] # total_bedrooms / households
housing['AveOccup'] = housing['Population'] / housing['AveOccup'] # population / households
# 分离 data 和 target
data = housing.drop(columns=['MedHouseVal'])
target = housing['MedHouseVal']
# 合并并抽样
housing = data.join(target).sample(1000, random_state=52)
housing.head()

In [None]:
# 用 seaborn 绘制散点图矩阵
np.set_printoptions(precision=2, suppress=True)
# 这个是计算相关系数，相关系数是一个介于-1和1之间的值，-1表示完全负相关，1表示完全正相关，0表示没有相关性
# 通俗来说，就是展示两个变量之间关系，是一起变大，还是一个变大另一个就变小
np.corrcoef(housing, rowvar=False)

In [None]:
np.corrcoef(housing.MedInc,housing.MedHouseVal)

In [None]:
def corr(x,y):return np.corrcoef(x,y)[0][1]

corr(housing.MedInc,housing.MedHouseVal)

In [None]:
def show_corr(df,a,b):
    x,y = df[a],df[b]
    plt.scatter(x,y,alpha=0.5,s=4)
    plt.title(f'{a} vs {b} ;r:{corr(x,y):.2f}')

In [None]:
show_corr(housing,'MedInc','MedHouseVal')

In [None]:
show_corr(housing,'MedInc','AveRooms')

In [None]:
def corr_d(eval_pred): return {'pearson': corr(*eval_pred)}

## 开始训练 相关性识别模型

### 引入各种依赖

In [None]:
import kagglehub
from pathlib import Path
from transformers import TrainingArguments,DataCollatorWithPadding,Trainer, EarlyStoppingCallback,AutoTokenizer,AutoModelForSequenceClassification
import pandas as pd
from datasets import Dataset,concatenate_datasets
import torch.nn.functional as F

### 预先配置各种数据和东西

In [None]:
# 数据集
# 下载中文 文本相似度数据集
path = Path(kagglehub.dataset_download("terrychanorg/lcqmcdata"))

# 读取数据
df_tr = pd.read_csv(path / "train.txt", sep="\t", header=None, names=["text1", "text2", "labels"])
df_ts = pd.read_csv(path / "test.txt", sep="\t", header=None, names=["text1", "text2", "labels"])

In [None]:

# 选择使用的模型
model_nm = "chinese-roberta-wwm-ext"
# 选择本地模型路径
local_path = Path(f"../model/{model_nm}")
# 读取模型的tokenizer
tokz  = AutoTokenizer.from_pretrained(local_path,local_files_only=True)
# 把数据转换成dataset
ds_tr = Dataset.from_pandas(df_tr)
ds_ts = Dataset.from_pandas(df_ts)
# 定义tokenize方法
def tok_func(x):
    return tokz(x['text1'], x['text2'], padding=True, truncation=True, max_length=48)
# 对数据集进行tokenize
tok_ds_tr = ds_tr.map(tok_func, batched=True)
tok_ds_ts = ds_ts.map(tok_func, batched=True)
# # 重命名列
# tok_ds_tr = tok_ds_tr.rename_columns({'score':'labels'})
# tok_ds_ts = tok_ds_ts.rename_columns({'score':'labels'})

# 划分数据集

# dds = tok_ds.train_test_split(test_size=0.25,seed=42)

Map:   0%|          | 0/238766 [00:00<?, ? examples/s]

Map:   0%|          | 0/12500 [00:00<?, ? examples/s]

In [30]:
tok_ds.to_pandas().head()

Unnamed: 0,text1,text2,labels,input_ids,token_type_ids,attention_mask
0,喜欢打篮球的男生喜欢什么样的女生,爱打篮球的男生喜欢什么样的女生,1,"[101, 1599, 3614, 2802, 5074, 4413, 4638, 4511...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,我手机丢了，我想换个手机,我想买个新手机，求推荐,1,"[101, 2769, 2797, 3322, 696, 749, 8024, 2769, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,大家觉得她好看吗,大家觉得跑男好看吗？,0,"[101, 1920, 2157, 6230, 2533, 1961, 1962, 4692...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,求秋色之空漫画全集,求秋色之空全集漫画,1,"[101, 3724, 4904, 5682, 722, 4958, 4035, 4514,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,晚上睡觉带着耳机听音乐有什么害处吗？,孕妇可以戴耳机听音乐吗?,0,"[101, 3241, 677, 4717, 6230, 2372, 4708, 5455,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [31]:
from collections import Counter
print(Counter(dds['train']['labels']))
print(Counter(dds['test']['labels']))

Counter({1: 103786, 0: 75288})
Counter({1: 34788, 0: 24904})


In [36]:
df_tr['len1'] = df_tr['text1'].str.len()
df_tr['len2'] = df_tr['text2'].str.len()
print(df_tr[['len1', 'len2']].describe())  # 查看平均长度和最大长度

                len1           len2
count  238766.000000  238766.000000
mean       10.668177      11.209586
std         4.087534       4.813823
min         2.000000       2.000000
25%         8.000000       8.000000
50%        10.000000      10.000000
75%        12.000000      13.000000
max        49.000000     131.000000


In [54]:
bs = 521 # 预设的batch size，可以根据显存大小调整，作用是每次训练多少数据
epochs = 4 # 训练的轮数
lr = 3e-5 # 学习率

In [55]:
args = TrainingArguments('NLP1',learning_rate=lr,warmup_ratio=0.1,lr_scheduler_type='cosine', fp16=True,
    evaluation_strategy="epoch", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs, weight_decay=0.25, report_to='none',load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_strategy="epoch")




In [None]:
data_collator = DataCollatorWithPadding(tokenizer=tokz)
model = AutoModelForSequenceClassification.from_pretrained(
    local_path,
    num_labels=2,
    weights_only=False,
    use_safetensors=False,
)
# trainer = Trainer(model, args, train_dataset=dds['train'], eval_dataset=dds['test'],
#                   tokenizer=tokz, compute_metrics=corr_d)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tok_ds_tr,
    eval_dataset=tok_ds_ts,
    data_collator=data_collator,
    compute_metrics=lambda p: {"accuracy": (p.predictions.argmax(-1) == p.label_ids).mean()},
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.0)]
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ../model/chinese-roberta-wwm-ext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
trainer.train();

In [194]:
tokz

BertTokenizerFast(name_or_path='../model/chinese-roberta-wwm-ext', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

### 测试成果

In [195]:
import torch
import numpy as np

# 获取模型和 tokenizer
model = trainer.model
# tokz = trainer.processing_class  # 新接口，替代 trainer.tokenizer

# 确保模型在 GPU 上（如果可用）
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

def test_similarity(text1, text2):
    # 编码输入
    inputs = tokz(text1, text2, padding=True, truncation=True, max_length=128, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # 推理
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    # 转为概率和预测标签
    probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]
    pred_label = logits.argmax(-1).item()

    # 输出结果
    print(f"Text1: {text1}")
    print(f"Text2: {text2}")
    print(f"Predicted Label: {pred_label} ({'相似' if pred_label == 1 else '不相似'})")
    print(f"Similarity Probability: {probs[1]:.4f}")
    print()

# 交互式测试
while True:
    text1 = input("请输入第一句话（输入 'exit' 退出）：")
    if text1.lower() == 'exit':
        break
    text2 = input("请输入第二句话：")
    test_similarity(text1, text2)

Text1: 吃饭了吧
Text2: 吃饭不了？
Predicted Label: 0 (不相似)
Similarity Probability: 0.1242

Text1: 
Text2: 我爱你
Predicted Label: 0 (不相似)
Similarity Probability: 0.4211

Text1: 你爱我
Text2: 
Predicted Label: 0 (不相似)
Similarity Probability: 0.0153

Text1: 我爱你
Text2: 你爱我
Predicted Label: 1 (相似)
Similarity Probability: 0.6672



In [2]:
# import streamlit
import transformers
import torch
# print(streamlit.__version__)
print(transformers.__version__)
print(torch.__version__)

4.49.0
2.6.0+cu124


In [196]:
# 保存模型和 tokenizer
output_dir = "./my_trained_model"
trainer.save_model(output_dir)  # 保存模型
tokz.save_pretrained(output_dir)  # 保存 tokenizer

('./my_trained_model/tokenizer_config.json',
 './my_trained_model/special_tokens_map.json',
 './my_trained_model/vocab.txt',
 './my_trained_model/added_tokens.json',
 './my_trained_model/tokenizer.json')