In [None]:
! pip install datasets
! pip install transformers==4.45.2 sentence-transformers==3.1.1

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xx

In [None]:
# Mount Google Drive into the Colab filesystem so files stored there become readable.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
from transformers import AutoModelForSequenceClassification,AutoModelForCausalLM, AutoTokenizer,TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
import torch

# Base checkpoint: DeepSeek-R1 distilled Qwen 1.5B, loaded with a 2-class
# sequence-classification head on top.
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

lora_config = LoraConfig(
    r=16,  # LoRA rank
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    # The task type must match the head loaded above. With "SEQ_CLS", PEFT keeps the
    # freshly initialised classification head trainable; with "CAUSAL_LM" the head is
    # frozen together with the base weights, so the classifier can never learn.
    task_type="SEQ_CLS",
)
model = get_peft_model(model, lora_config)  # wrap the base model with LoRA adapters
tokenizer = AutoTokenizer.from_pretrained(model_name)




In [None]:
# Pad with the EOS token, and give the model the exact pad id the tokenizer will emit.
# Reading the id from the tokenizer (instead of model.config.eos_token_id) keeps the two
# in sync even if the config's EOS entry differs from, or is a list unlike, the tokenizer's.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# LoRA adapters were already attached when the model was created, so calling
# get_peft_model here again would re-wrap an already wrapped model.
# Report how many parameters are trainable instead.
model.print_trainable_parameters()

In [None]:
# Keep a handle on the device reported by the (LoRA-wrapped) model.
device = model.device

In [None]:
# List the contents of the current working directory.
! ls

In [None]:


from sklearn.metrics import accuracy_score, classification_report
# Step 3: preprocess the data
def preprocess_function(examples):
    """Tokenize a batch of examples and attach integer class labels.

    Args:
        examples: A batch mapping with a "text" column and a "label" column,
            as supplied by ``dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output (truncated/padded to 256 tokens) with an extra
        "labels" entry holding one int per example.
    """
    inputs = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)
    # The model has a 2-class head and the metric takes an arg-max over classes, so the
    # labels are class indices and must be integers. Float labels would steer the
    # classification loss away from single-label cross-entropy.
    inputs["labels"] = [int(label) for label in examples["label"]]
    return inputs

# Evaluating on the very file used for training reports accuracy on already-seen
# examples. Load the labelled file once and hold out 20% of it (fixed seed for
# reproducibility); the result still exposes "train" and "test" splits.
DATA_FILE = "drive/MyDrive/Colab/llm_demo/train.json"
raw_dataset = load_dataset("json", data_files={"train": DATA_FILE})["train"]
dataset = raw_dataset.train_test_split(test_size=0.2, seed=42)
tokenized_datasets = dataset.map(preprocess_function, batched=True)


def compute_metrics(pred):
    """Compute accuracy of the arg-max class predictions.

    Args:
        pred: Evaluation result exposing ``predictions`` (per-class scores)
            and ``label_ids`` (reference labels).

    Returns:
        A dict with a single "accuracy" entry.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {"accuracy": accuracy_score(pred.label_ids, predicted_classes)}

# Step 4: training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated in the pinned transformers release.
    eval_strategy="epoch",
    save_strategy="epoch",  # checkpoint on the same schedule as evaluation
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=20,
    logging_dir="./logs",
    learning_rate=2e-5
)



# Step 6: build the Trainer from the model, arguments, tokenized splits and metric.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)



In [None]:
# Launch fine-tuning using the trainer configured in the previous cell.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mhaoyuh[0m ([33mhaoyuh-university-of-michigan[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.58014,0.5
2,No log,1.47159,0.5
3,No log,1.384634,0.5
4,No log,1.331336,0.5
5,No log,1.308993,0.5


TrainOutput(global_step=65, training_loss=1.4566213754507211, metrics={'train_runtime': 542.2231, 'train_samples_per_second': 0.24, 'train_steps_per_second': 0.12, 'total_flos': 16380283944960.0, 'train_loss': 1.4566213754507211, 'epoch': 5.0})

In [None]:
import torch
import torch.nn.functional as F

def predict(text):
    """Score text with the fine-tuned classifier.

    Args:
        text: A string (or list of strings) to classify.

    Returns:
        A NumPy array with one entry per input text holding the softmax
        probability of class 1. Index with ``[0]`` for a single text.
    """
    model.eval()
    # Follow the model's actual device instead of guessing from CUDA availability,
    # so inputs and weights are always on the same device.
    device = model.device

    # Tokenize with the same truncation/padding settings used for training.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=256).to(device)

    # Raw class scores; one row per input, one column per class.
    with torch.no_grad():
        logits = model(**inputs).logits

    probs = torch.softmax(logits, dim=-1)
    # Move to CPU (and float32) before converting: NumPy cannot read CUDA tensors.
    return probs[:, 1].float().cpu().numpy()




In [None]:
# Try the predictor on three hand-written credit descriptions.
text1 = "用户过去一年内信用评分保持稳定，无不良记录。"
text2 = "用户近期信用卡账单连续三个月未全额还款，负债率较高。"
text3 = "用户最近六个月内按时还款，无逾期记录。"

# Print the class-1 probability for each sample, numbered from 1.
for idx, sample in enumerate((text1, text2, text3), start=1):
    print(f"Text{idx} 逾期概率: {predict(sample)[0]:.4f}")

Text1 逾期概率: 0.8737
Text2 逾期概率: 0.9268
Text3 逾期概率: 0.9017


In [None]:
# `prob1` was never assigned anywhere in the notebook (hence the NameError);
# compute the score for the first sample explicitly before inspecting it.
prob1 = predict(text1)
prob1[0]

NameError: name 'prob1' is not defined