In [1]:
from datasets import load_dataset

data_files = {
    "train": "./drugsComTest_raw.tsv",
    "test": "./drugsComTrain_raw.tsv"
}

drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")

drug_dataset

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
})

In [2]:
drug_sample = drug_dataset["train"].shuffle(seed=42).select(range(1000))
drug_sample[:3]


{'Unnamed: 0': [184648, 25268, 172019],
 'drugName': ['Efudex', 'Flector Patch', 'Amitiza'],
 'condition': ['Basal Cell Carcinoma', 'Pain', 'Irritable Bowel Syndrome'],
 'review': ['"I have BCC on my upper arm and SCC on upper left hand. Unfortunately after 6wks of treatment twice a day the cream didnt work. So disappointed and im now scheduled to have both surgically removed."',
  '"I tore my shoulder labrum and the pain can be off the chart.  Hydrocodone and ibuprofen and ice helped some. After my doctor gave me the Flector Patch I noticed major relief in my shoulder within an hour. These work very well. These truly work."',
  '"Amitiza is the best if you have ibs!"'],
 'rating': [1.0, 8.0, 10.0],
 'date': ['August 30, 2016', 'May 29, 2014', 'July 13, 2016'],
 'usefulCount': [16, 40, 9]}

In [3]:
drug_dataset = drug_dataset.rename_column("Unnamed: 0", "patient_id")

In [4]:
drug_dataset = drug_dataset.filter(lambda x: x["condition"] is not None)

In [5]:
def lowercase_condition(examples):
    return {"condition": examples["condition"].lower()}


drug_dataset = drug_dataset.map(lowercase_condition)

In [6]:
def compute_review_length(examples):
    return {"review_length": len(examples["review"].split())}


drug_dataset = drug_dataset.map(compute_review_length)

In [7]:
drug_dataset = drug_dataset.filter(lambda x: x["review_length"] > 30)
print(drug_dataset.num_rows)

{'train': 46108, 'test': 138514}


In [8]:
import html

drug_dataset = drug_dataset.map(lambda x: {"review": [html.unescape(o) for o in x["review"]]}, batched=True)

In [9]:
# 更改数据格式
drug_dataset.set_format(type="pandas")

In [10]:
type(drug_dataset)

datasets.dataset_dict.DatasetDict

In [11]:
drug_dataset["train"].set_format(type="pandas")

In [12]:
import numpy as np

all_labels = list(np.unique(drug_dataset["train"]["condition"]))
labels = [i for i in all_labels if "span" not in i]
filter_labels = [i for i in all_labels if "span" in i]
len(labels)

612

In [13]:
drug_dataset.reset_format()
type(drug_dataset["test"])

datasets.arrow_dataset.Dataset

In [14]:
shuffle_dataset = drug_dataset["train"].train_test_split(test_size=0.2, train_size=0.8, seed=42)


def filter_condition(example):
    if example["condition"] in filter_labels:
        return False
    return True


shuffle_dataset = shuffle_dataset.filter(filter_condition)

In [15]:
label2Id = {label: i for i, label in enumerate(labels)}
id2Label = {i: label for i, label in enumerate(labels)}

In [16]:
from transformers import AutoTokenizer, DataCollatorWithPadding

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(examples):
    tokenized_inputs = tokenizer(examples["review"], truncation=True)
    tokenized_inputs["labels"] = [label2Id[i] for i in examples["condition"]]

    return tokenized_inputs


tokenized_datasets = shuffle_dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns([o for o in tokenized_datasets.column_names["train"] if o not in ["labels", "input_ids","attention_mask", "token_type_ids"]])

In [17]:
# 提取样本列表（如取前 4 个样本）
samples = tokenized_datasets["train"][: 4]

# 生成 Batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
batch = data_collator(samples)

In [18]:
# 💪 健壮的模型加载 - 处理导入问题
import torch
import importlib

# 智能设备检测 - 最佳实践！
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"✅ 使用GPU训练: {torch.cuda.get_device_name(0)}")
elif torch.backends.mps.is_available():  # Apple Silicon Mac
    device = torch.device("mps") 
    print("✅ 使用Apple Silicon GPU训练")
else:
    device = torch.device("cpu")
    print("⚠️  使用CPU训练（会比较慢，但完全可行）")

# 健壮的transformers导入
try:
    from transformers import AutoModelForSequenceClassification
    print("✅ AutoModelForSequenceClassification 导入成功")
except ImportError as e:
    print(f"❌ 导入失败: {e}")
    print("🔄 尝试重新加载transformers...")
    
    # 强制重新加载
    import transformers
    importlib.reload(transformers)
    from transformers import AutoModelForSequenceClassification
    print("✅ 重新加载后导入成功")

# 加载模型
print("🔄 正在加载BERT模型...")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=len(labels))
model.to(device)

print(f"模型已加载到设备: {device}")
print(f"模型参数数量: {model.num_parameters():,}")

# 测试模型前向传播
print("🧪 测试前向传播...")
batch_on_device = {k: v.to(device) for k, v in batch.items()}
with torch.no_grad():
    output = model(**batch_on_device)
    print(f"输出logits形状: {output.logits.shape}")
    print("🎉 模型测试成功！")

✅ 使用GPU训练: NVIDIA GeForce RTX 4070 SUPER
✅ AutoModelForSequenceClassification 导入成功
🔄 正在加载BERT模型...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


模型已加载到设备: cuda
模型参数数量: 109,952,868
🧪 测试前向传播...
输出logits形状: torch.Size([4, 612])
🎉 模型测试成功！


  return forward_call(*args, **kwargs)


In [19]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(output_dir="./train-test", eval_strategy="epoch")

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"]
)

In [20]:
trainer.train()

  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,1.7402,1.647418
2,1.2345,1.334517
3,0.8663,1.259497


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*ar

TrainOutput(global_step=13767, training_loss=1.5380363372915786, metrics={'train_runtime': 1202.0341, 'train_samples_per_second': 91.625, 'train_steps_per_second': 11.453, 'total_flos': 1.057656146851584e+16, 'train_loss': 1.5380363372915786, 'epoch': 3.0})

In [25]:
# 🎯 实际使用训练好的模型进行预测！
print("=== 🚀 从checkpoint加载模型进行预测 ===")
print()

# 使用最终的checkpoint（训练完成的模型）
checkpoint_path = "./train-test/checkpoint-13767"  # 使用最后一个checkpoint

print(f"📁 从 {checkpoint_path} 加载训练好的模型...")

# 加载模型和tokenizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# 设备检测
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🔧 使用设备: {device}")

# 加载模型
loaded_model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)
loaded_tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)

# 移动到设备并设置评估模式
loaded_model.to(device)
loaded_model.eval()

print(f"✅ 模型加载成功！")
print(f"📊 模型参数: {loaded_model.num_parameters():,}")
print(f"🏷️  支持类别数: {loaded_model.config.num_labels}")

=== 🚀 从checkpoint加载模型进行预测 ===

📁 从 ./train-test/checkpoint-13767 加载训练好的模型...
🔧 使用设备: cuda
✅ 模型加载成功！
📊 模型参数: 109,952,868
🏷️  支持类别数: 612


In [26]:
# 🔮 创建预测函数
print("=== 🔮 创建智能预测函数 ===")
print()

def predict_drug_condition(review_text, model, tokenizer, id2label_mapping, device, top_k=5):
    """
    预测药物评论对应的疾病类别
    
    Args:
        review_text: 评论文本
        model: 训练好的模型
        tokenizer: tokenizer
        id2label_mapping: ID到标签的映射
        device: 计算设备
        top_k: 返回前k个最可能的预测
    
    Returns:
        dict: 预测结果
    """
    # 1. 文本预处理
    inputs = tokenizer(
        review_text,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt"
    )
    
    # 2. 移动到正确设备
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # 3. 模型推理
    with torch.no_grad():
        outputs = model(**inputs)
        # 获得概率分布
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
        
    # 4. 获取预测结果
    probs_cpu = probabilities.cpu().numpy()[0]
    
    # 5. 找到top-k预测
    top_indices = probs_cpu.argsort()[-top_k:][::-1]  # 降序排列
    
    results = {
        'predicted_condition': id2label_mapping[top_indices[0]],
        'confidence': float(probs_cpu[top_indices[0]]),
        'top_predictions': [
            {
                'condition': id2label_mapping[idx],
                'probability': float(probs_cpu[idx])
            }
            for idx in top_indices
        ]
    }
    
    return results

# 重建标签映射（从之前的训练中获取）
print("🏷️  重建标签映射...")
print(f"✅ 预测函数创建完成！支持top-k预测和置信度评估")


=== 🔮 创建智能预测函数 ===

🏷️  重建标签映射...
✅ 预测函数创建完成！支持top-k预测和置信度评估


In [28]:
# 🧪 实际预测测试 - 让我们看看模型学会了什么！
print("=== 🧪 实际预测测试 ===")
print()

# 准备测试用例 - 不同类型的药物评论
test_reviews = [
    {
        "text": "This medication really helped with my high blood pressure. After taking it for a month, my readings went from 150/90 to 120/80. Very satisfied with the results.",
        "expected": "高血压相关"
    },
    {
        "text": "I've been struggling with depression for years. This antidepressant finally gave me relief. I feel more hopeful and energetic. The side effects were minimal.",
        "expected": "抑郁症相关"  
    },
    {
        "text": "Great pain relief for my chronic back pain. I can finally sleep through the night without waking up in agony. Highly recommend for pain management.",
        "expected": "疼痛相关"
    },
    {
        "text": "This diabetes medication has been a game changer. My blood sugar levels are now stable and I feel much better overall. Good for glucose control.",
        "expected": "糖尿病相关"
    }
]

print("🔍 测试样例准备完成！")
print("🚀 开始预测测试...")
print("="*60)

# 执行预测
for i, test_case in enumerate(test_reviews, 1):
    print(f"\n📝 测试样例 {i}:")
    print(f"评论: {test_case['text'][:100]}...")
    print(f"期望类别: {test_case['expected']}")
    
    # 进行预测
    try:
        result = predict_drug_condition(
            test_case['text'], 
            loaded_model, 
            loaded_tokenizer, 
            id2Label,  # 使用之前创建的标签映射
            device,
            top_k=3
        )
        
        print(f"🎯 预测结果: {result['predicted_condition']}")
        print(f"📊 置信度: {result['confidence']:.3f}")
        print(f"🏆 前3预测:")
        for j, pred in enumerate(result['top_predictions'], 1):
            print(f"   {j}. {pred['condition']}: {pred['probability']:.3f}")
            
    except Exception as e:
        print(f"❌ 预测出错: {e}")
    
    print("-" * 40)

print("\n✅ 预测测试完成！")


=== 🧪 实际预测测试 ===

🔍 测试样例准备完成！
🚀 开始预测测试...

📝 测试样例 1:
评论: This medication really helped with my high blood pressure. After taking it for a month, my readings ...
期望类别: 高血压相关
🎯 预测结果: high blood pressure
📊 置信度: 0.961
🏆 前3预测:
   1. high blood pressure: 0.961
   2. diabetes, type 2: 0.007
   3. weight loss: 0.002
----------------------------------------

📝 测试样例 2:
评论: I've been struggling with depression for years. This antidepressant finally gave me relief. I feel m...
期望类别: 抑郁症相关
🎯 预测结果: depression
📊 置信度: 0.951
🏆 前3预测:
   1. depression: 0.951
   2. major depressive disorde: 0.029
   3. anxiety and stress: 0.004
----------------------------------------

📝 测试样例 3:
评论: Great pain relief for my chronic back pain. I can finally sleep through the night without waking up ...
期望类别: 疼痛相关
🎯 预测结果: pain
📊 置信度: 0.789
🏆 前3预测:
   1. pain: 0.789
   2. chronic pain: 0.147
   3. back pain: 0.032
----------------------------------------

📝 测试样例 4:
评论: This diabetes medication has been a game changer. My blo