In [27]:
%%writefile conf_bge.yaml
# Shared configuration loaded with OmegaConf by the scripts in this notebook.

# Run name handed to the trainer (run_name) in bge_train.py.
EXP_NAME :  "fine-tuning-bge-version2"
# Directory bge_data_prepare.py reads train.csv, misconception_mapping.csv and
# output.parquet from, and writes df_process.parquet to.
DATA_PATH :  "../competitions_data"
# Embedding model identifier passed to SentenceTransformer.
MODEL_NAME : "BAAI/bge-large-en-v1.5"
# Model passed to vllm.LLM (loaded with quantization="awq") in generate_response.py.
LLM_MODEL: "Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
# Not read by the scripts shown in this notebook.
COMPETITION_NAME :  "eedi-mining-misconceptions-in-mathematics"
# Trainer output_dir (checkpoints) in bge_train.py.
OUTPUT_PATH :  "../model_output"
# Destination of model.save_pretrained() after training.
MODEL_OUTPUT_PATH :  "../model_output/trained_model"
# Directory holding train_exploded.parquet (written by bge_data_prepare.py, read by bge_train.py).
DATA_OUTPUT_PATH : '../competitions_data'
# Number of top-ranked misconception candidates kept per training row.
RETRIEVE_NUM :  25
# Passed to the training arguments as weight_decay / warmup_ratio.
WEIGHT_DECAY: 0.01
WARMUP_RATIO: 0.1

# num_train_epochs, learning_rate, per-device batch size and
# gradient_accumulation_steps used by bge_train.py.
EPOCH : 4
LR :  2e-05
BS :  16
GRAD_ACC_STEP :  8

# Flags. TRAINING and DEBUG are not read by the scripts shown in this notebook.
# WANDB is presumably the Weights & Biases logging toggle - confirm it is wired
# into report_to in bge_train.py.
TRAINING :  True
DEBUG :  False
WANDB :  True

Overwriting conf_bge.yaml


In [22]:
%%writefile generate_response.py
import argparse
from omegaconf import OmegaConf
import re
import vllm
import pandas as pd
def generate_prompt(row):
    """Build the LLM prompt asking for the misconception behind one wrong answer.

    Args:
        row: Mapping-like record (dict or pandas Series) providing the keys
            ``QuestionText``, ``AnswerText``, ``ConstructName``,
            ``SubjectName`` and ``CorrectAnswerText`` (the column name that
            ``data_process`` in bge_data_prepare.py produces). The older key
            ``Correct Answer`` is accepted as a fallback when
            ``CorrectAnswerText`` is absent.

    Returns:
        str: The question context followed by the task instructions.

    Raises:
        KeyError: If a required key is missing from ``row``.
    """
    sp="Your task: Identify the misconception behind Incorrect Answer. Answer concisely and generically inside <response>$$INSERT TEXT HERE$$</response>.\nBefore answering the question think step by step concisely in 1-2 sentence inside <thinking>$$INSERT TEXT HERE$$</thinking> tag and respond your final misconception inside <response>$$INSERT TEXT HERE$$</response> tag."
    # df_process.parquet stores the correct option under "CorrectAnswerText";
    # keep the previous key as a fallback so older frames still work.
    if "CorrectAnswerText" in row:
        correct_answer = row["CorrectAnswerText"]
    else:
        correct_answer = row["Correct Answer"]
    prompt = (
        f"Question:{row['QuestionText']}\n"
        f"Incorrect Answer:{row['AnswerText']}\n"
        f"Correct Answer:{correct_answer}\n"
        f"Construct Name:{row['ConstructName']}\n"
        # Previously this field repeated ConstructName.
        f"Subject Name:{row['SubjectName']}\n"
        f"{sp}"
    )
    return prompt
def extract_response(text):
    """Return the contents of every ``<response>...</response>`` tag in ``text``.

    Args:
        text: Raw text generated by the LLM.

    Returns:
        str: The tag bodies, each stripped of surrounding whitespace and
        joined with ``","``; an empty string when no tag is present.
    """
    # DOTALL lets "." span newlines, so multi-line answers are captured
    # instead of being silently dropped.
    bodies = re.findall(r"<response>(.*?)</response>", text, flags=re.DOTALL)
    return ",".join(body.strip() for body in bodies)
if __name__ == "__main__":
    # Script entry point: build one prompt per row of df_process.parquet, run
    # them through the vLLM model named in the config, and save the raw and
    # extracted answers to output.parquet.

    # Local import: this script has no module-level `import os`.
    import os

    ap = argparse.ArgumentParser()
    ap.add_argument('--config_path', type=str, required=True)
    args = ap.parse_args()
    cfg = OmegaConf.load(args.config_path)
    print(cfg)

    # Resolve data files from the config (bge_data_prepare.py uses the same
    # cfg.DATA_PATH directory) instead of hard-coding the location.
    input_path = os.path.join(cfg.DATA_PATH, "df_process.parquet")
    output_path = os.path.join(cfg.DATA_PATH, "output.parquet")

    df = pd.read_parquet(input_path)
    # Apply the prompt builder row-wise and store the result as a new column.
    df['Prompt'] = df.apply(generate_prompt, axis=1)

    llm = vllm.LLM(
        cfg.LLM_MODEL,
        quantization="awq",
        tensor_parallel_size=1,
        gpu_memory_utilization=0.95,
        trust_remote_code=True,
        dtype="half",
        enforce_eager=True,
        max_model_len=8192,
        disable_log_stats=True
    )

    # NOTE(review): prompts are sent as plain text, without applying the
    # tokenizer's chat template - confirm this is intended for an Instruct model.
    outputs = llm.generate(
        df["Prompt"].tolist(),  # plain list of str rather than a numpy object array
        vllm.SamplingParams(
            n=1,  # Number of output sequences to return for each prompt.
            top_p=0.9,  # Float that controls the cumulative probability of the top tokens to consider.
            temperature=0,  # Randomness of the sampling.
            seed=777,  # Seed for reproducibility.
            skip_special_tokens=False,  # Whether to skip special tokens in the output.
            max_tokens=2048,  # Maximum number of tokens to generate per output sequence.
        ),
        use_tqdm=True
    )

    # Keep the full generation alongside the extracted <response> contents.
    full_responses = [x.outputs[0].text for x in outputs]
    df["FullResponse"] = full_responses
    df["Misconception"] = [extract_response(x) for x in full_responses]
    df.to_parquet(output_path, index=False)

Overwriting generate_response.py


In [23]:
%%writefile bge_data_prepare.py
import pandas as pd
from copy import deepcopy
import os
import numpy as np
import argparse
from omegaconf import OmegaConf
from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)

def data_process(df):
    """Expand each question into one row per incorrect answer option.

    Args:
        df: DataFrame with the columns ``QuestionId``, ``CorrectAnswer``,
            ``SubjectName``, ``ConstructName``, ``QuestionText`` and, for each
            option X in A-D, ``Answer{X}Text`` and ``Misconception{X}Id``.

    Returns:
        pandas.DataFrame: One row per (question, wrong option) pair, with rows
        containing any missing value dropped. Only the first row of each
        ``QuestionId`` group is used.
    """
    option_keys = ["A", "B", "C", "D"]
    shared_columns = ["SubjectName", "ConstructName", "QuestionText"]
    frame = deepcopy(df)

    # First record of every question, keyed by its id, without the id field.
    first_records = {
        qid: {
            key: value
            for key, value in rows.to_dict(orient="records")[0].items()
            if key != "QuestionId"
        }
        for qid, rows in frame.groupby("QuestionId")
    }

    examples = []
    for qid, info in first_records.items():
        for option in option_keys:
            # The correct option carries no misconception; skip it.
            if info["CorrectAnswer"] == option:
                continue
            examples.append(
                {
                    "QuestionId_Answer": f"{qid}_{option}",
                    "Option": option,
                    **{col: info[col] for col in shared_columns},
                    "CorrectAnswerText": info[f"Answer{info['CorrectAnswer']}Text"],
                    "AnswerText": info[f"Answer{option}Text"],
                    "MisconceptionID": info[f"Misconception{option}Id"],
                    # Bullet list of all four option texts, each prefixed by "\n- ".
                    "AllOptionText": "\n- "
                    + "\n- ".join(info[f"Answer{x}Text"] for x in option_keys),
                }
            )

    return pd.DataFrame(examples).dropna()

def create_training_text(row):
    """Format one row as the multi-line text that gets embedded.

    Args:
        row: Mapping-like record with the keys ``ConstructName``,
            ``QuestionText``, ``AnswerText`` and ``Misconception``.

    Returns:
        str: A leading newline, four lines indented by four spaces
        (construct, question, answer, misconception) and a trailing
        four-space line.
    """
    indent = "    "
    body_lines = [
        f"{row['ConstructName']}",
        f"{row['QuestionText']}",
        f"Answer: {row['AnswerText']}",
        f"Misconception: {row['Misconception']}",
    ]
    # The leading "\n" and trailing indent are part of the emitted text.
    return "\n" + "".join(f"{indent}{line}\n" for line in body_lines) + indent
def get_model(cfg):
    """Instantiate the SentenceTransformer named by ``cfg.MODEL_NAME``.

    Args:
        cfg: Configuration object exposing a ``MODEL_NAME`` attribute.

    Returns:
        SentenceTransformer: The loaded model.
    """
    return SentenceTransformer(cfg.MODEL_NAME)
def retrieve(model,train,label_df):
    """Rank every label for every training text by cosine similarity.

    Args:
        model: Object whose ``encode(texts, normalize_embeddings=True)``
            returns one embedding per text.
        train: DataFrame with a ``FullText`` column (the queries).
        label_df: DataFrame with a ``MisconceptionName`` column (the candidates).

    Returns:
        numpy.ndarray: For each row of ``train``, the row positions of
        ``label_df`` ordered from most to least similar.
    """
    def _embed(texts):
        # Both sides are encoded the same way, with normalized embeddings.
        return model.encode(texts, normalize_embeddings=True)

    query_vectors = _embed(train["FullText"].values)
    label_vectors = _embed(label_df["MisconceptionName"].values)

    similarity = cosine_similarity(query_vectors, label_vectors)
    # Negate so that an ascending argsort yields descending similarity.
    ranking = np.argsort(-similarity, axis=1)
    print('train_sorted_indices.shape',ranking.shape)
    return ranking
if __name__ == "__main__":
    # Script entry point. Stage 1 expands train.csv into df_process.parquet
    # (the input of generate_response.py). Stage 2 joins the LLM output
    # (output.parquet) with the ground truth, retrieves the top
    # cfg.RETRIEVE_NUM candidate misconceptions per row and writes
    # train_exploded.parquet.
    ap = argparse.ArgumentParser()
    ap.add_argument('--config_path', type=str, required=True)
    args = ap.parse_args()
    cfg = OmegaConf.load(args.config_path)
    print(cfg)

    # Stage 1: write df_process.parquet first. generate_response.py needs it
    # to produce output.parquet, so it must exist before output.parquet is
    # required below; otherwise a fresh run can never get started.
    df_train = pd.read_csv(os.path.join(cfg.DATA_PATH, "train.csv"))
    df = data_process(df_train)
    df.to_parquet(os.path.join(cfg.DATA_PATH, "df_process.parquet"))

    # Stage 2: requires the LLM-generated misconceptions.
    train_dir = os.path.join(cfg.DATA_PATH, "output.parquet")
    if not os.path.exists(train_dir):
        raise FileNotFoundError(
            f"{train_dir} not found. df_process.parquet has been written; "
            "run generate_response.py to create output.parquet, then rerun this script."
        )
    train = pd.read_parquet(train_dir)
    # The ground-truth ids are attached positionally, so both frames must
    # describe the same rows.
    if len(train) != len(df):
        raise ValueError(
            f"output.parquet has {len(train)} rows but df_process.parquet has {len(df)}; "
            "regenerate output.parquet with generate_response.py."
        )

    misconception_mapping = pd.read_csv(os.path.join(cfg.DATA_PATH, "misconception_mapping.csv"))
    misconception_ids = misconception_mapping["MisconceptionId"].values
    # MisconceptionId -> MisconceptionName lookup.
    mapping = dict(zip(misconception_ids, misconception_mapping["MisconceptionName"].values))

    train["MisconceptionID"] = df["MisconceptionID"].values.astype(int)
    train["GroundTruthMisconception"] = train["MisconceptionID"].apply(lambda x: mapping[x])
    train["FullText"] = train.apply(lambda row: create_training_text(row), axis=1)

    model = get_model(cfg)
    train_sorted_indices = retrieve(model, train, misconception_mapping)
    # retrieve() returns row positions within misconception_mapping; translate
    # them to MisconceptionId values instead of assuming id == row position.
    top_positions = train_sorted_indices[:, :cfg.RETRIEVE_NUM]
    train["PredictMisconceptionId"] = misconception_ids[top_positions].tolist()
    # One output row per (training row, retrieved candidate) pair.
    train_exploded = train.explode("PredictMisconceptionId")
    train_exploded["PredictMisconception"] = train_exploded["PredictMisconceptionId"].apply(lambda x: mapping[x])

    output_dir = os.path.join(cfg.DATA_OUTPUT_PATH, "train_exploded.parquet")
    train_exploded.to_parquet(output_dir)

Overwriting bge_data_prepare.py


In [19]:
!python bge_data_prepare.py --config_path conf_bge.yaml

{'EXP_NAME': 'fine-tuning-bge-version2', 'DATA_PATH': '../competitions_data', 'MODEL_NAME': 'BAAI/bge-large-en-v1.5', 'COMPETITION_NAME': 'eedi-mining-misconceptions-in-mathematics', 'OUTPUT_PATH': '../model_output', 'MODEL_OUTPUT_PATH': '../model_output/trained_model', 'DATA_OUTPUT_PATH': '../competitions_data', 'RETRIEVE_NUM': 25, 'WEIGHT_DECAY': 0.01, 'WARMUP_RATIO': 0.1, 'EPOCH': 4, 'LR': 2e-05, 'BS': 16, 'GRAD_ACC_STEP': 8, 'TRAINING': True, 'DEBUG': False, 'WANDB': True}
train_sorted_indices.shape (4370, 2587)


  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [24]:
%%writefile bge_dataset.py
from datasets import load_dataset, Dataset
import polars as pl
import os

def dataset_from_polars(polars_df, num_proc=4):
    """Convert a polars frame to a ``Dataset`` and drop self-matching rows.

    Args:
        polars_df: polars DataFrame with the columns ``MisconceptionID`` and
            ``PredictMisconceptionId``.
        num_proc: Number of worker processes used by the filter step.
            Defaults to 4, the value that was previously hard-coded.

    Returns:
        Dataset: Rows whose ``MisconceptionID`` differs from
        ``PredictMisconceptionId``.
    """
    return Dataset.from_polars(polars_df).filter(
        # To create an anchor, positive, and negative structure, delete rows
        # where the positive and negative are identical.
        lambda example: example["MisconceptionID"] != example["PredictMisconceptionId"],
        num_proc=num_proc,
    )

Overwriting bge_dataset.py


In [25]:
%%writefile bge_model.py
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
def get_model(cfg):
    """Build the embedding model selected in the configuration.

    Args:
        cfg: Configuration object exposing a ``MODEL_NAME`` attribute.

    Returns:
        SentenceTransformer: Model constructed from ``cfg.MODEL_NAME``.
    """
    return SentenceTransformer(cfg.MODEL_NAME)

Overwriting bge_model.py


In [26]:
%%writefile bge_train.py
import polars as pl
import re
import pandas as pd
import os
import numpy as np
import argparse
from omegaconf import OmegaConf
from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import TripletEvaluator
from bge_dataset import dataset_from_polars
from bge_model import get_model
if __name__ == "__main__":
    # Script entry point: fine-tune the embedding model on
    # (FullText, GroundTruthMisconception, PredictMisconception) rows with
    # MultipleNegativesRankingLoss and save the result to cfg.MODEL_OUTPUT_PATH.
    ap = argparse.ArgumentParser()
    ap.add_argument('--config_path', type=str, required=True)
    cli_args = ap.parse_args()
    cfg = OmegaConf.load(cli_args.config_path)

    train_exploded_dir = os.path.join(cfg.DATA_OUTPUT_PATH, "train_exploded.parquet")
    train_exploded = pd.read_parquet(train_exploded_dir)
    final_train = pl.from_pandas(train_exploded)

    train = dataset_from_polars(final_train)

    # Keep only the three text columns handed to the loss, in this order.
    train = train.select_columns(["FullText", "GroundTruthMisconception", "PredictMisconception"])

    model = get_model(cfg)
    loss = MultipleNegativesRankingLoss(model)

    # `REPORT_TO` was never defined (NameError); derive the reporting target
    # from the WANDB flag in the config instead.
    report_to = "wandb" if cfg.get("WANDB", False) else "none"

    training_args = SentenceTransformerTrainingArguments(
        # Required parameter:
        output_dir=cfg.OUTPUT_PATH,
        # Optional training parameters:
        num_train_epochs=cfg.EPOCH,
        per_device_train_batch_size=cfg.BS,
        gradient_accumulation_steps=cfg.GRAD_ACC_STEP,
        per_device_eval_batch_size=cfg.BS,
        eval_accumulation_steps=cfg.GRAD_ACC_STEP,
        learning_rate=cfg.LR,
        weight_decay=cfg.WEIGHT_DECAY,
        warmup_ratio=cfg.WARMUP_RATIO,
        fp16=True,  # Set to False if you get an error that your GPU can't run on FP16
        bf16=False,  # Set to True if you have a GPU that supports BF16
        batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
        # Optional tracking/debugging parameters:
        lr_scheduler_type="cosine_with_restarts",
        save_strategy="steps",
        save_steps=0.1,
        save_total_limit=2,
        logging_steps=100,
        report_to=report_to,  # "wandb" when cfg.WANDB is true, otherwise "none"
        run_name=cfg.EXP_NAME,
        do_eval=False
    )
    trainer = SentenceTransformerTrainer(
        model=model,
        args=training_args,
        train_dataset=train,
        loss=loss
    )

    trainer.train()
    model.save_pretrained(cfg.MODEL_OUTPUT_PATH)

Overwriting bge_train.py


In [15]:
!python bge_train.py --config_path conf_bge.yaml

Traceback (most recent call last):
  File "C:\Users\birdi\Desktop\bge小项目\code\bge_train.py", line 18, in <module>
    from bge_dataset import dataset_from_polars
ModuleNotFoundError: No module named 'bge_dataset'
