In [1]:
import json

from llama_index import SimpleDirectoryReader
from llama_index.node_parser import SentenceSplitter
from llama_index.schema import MetadataMode

  import pkg_resources


In [2]:
TRAIN_FILES = ["train.txt"]
VAL_FILES = ["test.txt"]

TRAIN_CORPUS_FPATH = "train_corpus.json"
VAL_CORPUS_FPATH = "val_corpus.json"

In [3]:
def load_corpus(files, verbose=False):
    if verbose:
        print(f"Loading files {files}")

    reader = SimpleDirectoryReader(input_files=files)
    docs = reader.load_data()
    if verbose:
        print(f"Loaded {len(docs)} docs")

    parser = SentenceSplitter(chunk_size=250, chunk_overlap=0)
    nodes = parser.get_nodes_from_documents(docs, show_progress=verbose)

    if verbose:
        print(f"Parsed {len(nodes)} nodes")

    return nodes

In [4]:
train_nodes = load_corpus(TRAIN_FILES, verbose=True)
val_nodes = load_corpus(VAL_FILES, verbose=True)

Loading files ['train.txt']
Loaded 1 docs


Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsed 129 nodes
Loading files ['test.txt']
Loaded 1 docs


Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsed 107 nodes


In [5]:
train_nodes[:3]

[TextNode(id_='8dad06d4-1a53-4e6b-bdb3-ff931af1e0c8', embedding=None, metadata={'file_path': 'train.txt', 'file_name': 'train.txt', 'file_type': 'text/plain', 'file_size': 66966, 'creation_date': '2025-07-18', 'last_modified_date': '2025-07-18', 'last_accessed_date': '2025-07-18'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='e38c4e86-aead-4601-9c03-eac2b8b8a5e8', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': 'train.txt', 'file_name': 'train.txt', 'file_type': 'text/plain', 'file_size': 66966, 'creation_date': '2025-07-18', 'last_modified_date': '2025-07-18', 'last_accessed_date': '2025-07-18'}, hash='a7ecd6f710f4578beb465a9fd03a9bb1d57a280c16f514247c2408f4555ec498'), <NodeRelationship.NEX

In [6]:
from llama_index.finetuning import (
    generate_qa_embedding_pairs,
    EmbeddingQAFinetuneDataset,
)
from llama_index.llms.deepseek import DeepSeek
import os

os.environ["DEEPSEEK_API_KEY"] = "见dropbox"
llm = DeepSeek(model="deepseek-reasoner", api_key=os.getenv("DEEPSEEK_API_KEY"))

# 或者直接在代码中传入 api_key

In [10]:
qa_generate_prompt_tmpl = """\
Context information is below.

---------------------
{context_str}
---------------------

Given the context information and not prior knowledge.
generate only questions based on the below query.

You are a Professor. Your task is to setup \
{num_questions_per_chunk} questions for an upcoming \
quiz/examination in Chinese. The questions should be diverse in nature \
across the document in Chinese. The questions should not contain options, not start with Q1/ Q2. \
Restrict the questions to the context information provided.
"""

train_dataset = generate_qa_embedding_pairs(nodes=train_nodes, llm=llm, num_questions_per_chunk=1, qa_generate_prompt_tmpl=qa_generate_prompt_tmpl)
val_dataset = generate_qa_embedding_pairs(nodes=val_nodes, llm=llm, num_questions_per_chunk=1, qa_generate_prompt_tmpl=qa_generate_prompt_tmpl)

train_dataset.save_json("train_dataset.json")
val_dataset.save_json("val_dataset.json")

100%|███████████████████████████████████████| 129/129 [1:11:53<00:00, 33.44s/it]
100%|█████████████████████████████████████████| 107/107 [57:15<00:00, 32.11s/it]


In [7]:
from llama_index.finetuning import SentenceTransformersFinetuneEngine
from sentence_transformers import SentenceTransformer, models
import json
from datasets import Dataset
train_dataset = EmbeddingQAFinetuneDataset.from_json("train_dataset.json")
val_dataset= EmbeddingQAFinetuneDataset.from_json("val_dataset.json")

# 构造Transformer + Pooling模块
word_model = models.Transformer("/home/sheng/model/BAAI/bge-base-zh-v1.5")
pooling_model = models.Pooling(word_model.get_word_embedding_dimension())
st_model = SentenceTransformer(modules=[word_model, pooling_model])
st_model.save("/tmp/temp_sbert_model")

# ✅ 使用关键字传入 model
finetune_engine = SentenceTransformersFinetuneEngine(
    train_dataset,                              # 位置参数 #1：训练集
    "/tmp/temp_sbert_model",                            # 关键字参数，传入模型对象
    model_output_path="/home/sheng/model/bge-base-ft-001",  # 关键字参数，保存路径
    val_dataset=val_dataset,                    # 验证集（可省略或写 None）
    batch_size=8,
    epochs=10,
    evaluation_steps=100,
)



INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cuda:0
Use pytorch device_name: cuda:0
INFO:sentence_transformers.SentenceTransformer:Save model to /tmp/temp_sbert_model
Save model to /tmp/temp_sbert_model
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cuda:0
Use pytorch device_name: cuda:0
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: /tmp/temp_sbert_model
Load pretrained SentenceTransformer: /tmp/temp_sbert_model


In [8]:

finetune_engine.finetune()

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Cosine Accuracy@1,Cosine Accuracy@3,Cosine Accuracy@5,Cosine Accuracy@10,Cosine Precision@1,Cosine Precision@3,Cosine Precision@5,Cosine Precision@10,Cosine Recall@1,Cosine Recall@3,Cosine Recall@5,Cosine Recall@10,Cosine Ndcg@10,Cosine Mrr@10,Cosine Map@100
30,No log,No log,0.584211,0.663158,0.684211,0.721053,0.584211,0.221053,0.136842,0.072105,0.584211,0.663158,0.684211,0.721053,0.652524,0.630787,0.637102
60,No log,No log,0.589474,0.668421,0.694737,0.726316,0.589474,0.222807,0.138947,0.072632,0.589474,0.668421,0.694737,0.726316,0.657995,0.636241,0.642581
90,No log,No log,0.584211,0.673684,0.694737,0.721053,0.584211,0.224561,0.138947,0.072105,0.584211,0.673684,0.694737,0.721053,0.655596,0.634451,0.641205


INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Information Retrieval Evaluation of the model on the  dataset in epoch 1.0 after 30 steps:
Information Retrieval Evaluation of the model on the  dataset in epoch 1.0 after 30 steps:
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Queries: 190
Queries: 190
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Corpus: 107

Corpus: 107

INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Score-Function: cosine
Score-Function: cosine
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Accuracy@1: 58.42%
Accuracy@1: 58.42%
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Accuracy@3: 66.32%
Accuracy@3: 66.32%
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Accuracy@5: 68.42%
Accuracy@5: 68.42%
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Accuracy@10: 72.11%
Accuracy@10: 72.11%
INFO:sentence_transfor

In [10]:
finetune_engine.loss

MultipleNegativesRankingLoss(
  (model): SentenceTransformer(
    (0): Transformer({'max_seq_length': 512, 'do_lower_case': False, 'architecture': 'BertModel'})
    (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  )
  (cross_entropy_loss): CrossEntropyLoss()
)