In [1]:
from datasets import load_dataset
from mpmath.identification import transforms

data_files = {
    "train": "./drugsComTest_raw.tsv",
    "test": "./drugsComTrain_raw.tsv"
}

drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")

drug_dataset



DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
})

In [2]:
drug_sample = drug_dataset["train"].shuffle(seed=42).select(range(1000))
drug_sample[:3]


{'Unnamed: 0': [184648, 25268, 172019],
 'drugName': ['Efudex', 'Flector Patch', 'Amitiza'],
 'condition': ['Basal Cell Carcinoma', 'Pain', 'Irritable Bowel Syndrome'],
 'review': ['"I have BCC on my upper arm and SCC on upper left hand. Unfortunately after 6wks of treatment twice a day the cream didnt work. So disappointed and im now scheduled to have both surgically removed."',
  '"I tore my shoulder labrum and the pain can be off the chart.  Hydrocodone and ibuprofen and ice helped some. After my doctor gave me the Flector Patch I noticed major relief in my shoulder within an hour. These work very well. These truly work."',
  '"Amitiza is the best if you have ibs!"'],
 'rating': [1.0, 8.0, 10.0],
 'date': ['August 30, 2016', 'May 29, 2014', 'July 13, 2016'],
 'usefulCount': [16, 40, 9]}

In [3]:
drug_dataset = drug_dataset.rename_column("Unnamed: 0", "patient_id")

In [4]:
drug_dataset = drug_dataset.filter(lambda x: x["condition"] is not None)

In [5]:
def lowercase_condition(examples):
    return {"condition": examples["condition"].lower()}


drug_dataset = drug_dataset.map(lowercase_condition)

In [6]:
def compute_review_length(examples):
    return {"review_length": len(examples["review"].split())}


drug_dataset = drug_dataset.map(compute_review_length)

Map:   0%|          | 0/53471 [00:00<?, ? examples/s]

Map:   0%|          | 0/160398 [00:00<?, ? examples/s]

In [7]:
drug_dataset = drug_dataset.filter(lambda x: x["review_length"] > 30)
print(drug_dataset.num_rows)

Filter:   0%|          | 0/53471 [00:00<?, ? examples/s]

Filter:   0%|          | 0/160398 [00:00<?, ? examples/s]

{'train': 46108, 'test': 138514}


In [8]:
import html

drug_dataset = drug_dataset.map(lambda x: {"review": [html.unescape(o) for o in x["review"]]}, batched=True)

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

In [9]:
# 更改数据格式
drug_dataset.set_format(type="pandas")

In [10]:
type(drug_dataset)

datasets.dataset_dict.DatasetDict

In [11]:
drug_dataset["train"].set_format(type="pandas")

In [21]:
import numpy as np

all_labels = list(np.unique(drug_dataset["train"]["condition"]))
labels = [i for i in all_labels if "span" not in i]
filter_labels = [i for i in all_labels if "span" in i]
len(labels)

612

In [13]:
drug_dataset.reset_format()
type(drug_dataset["test"])

datasets.arrow_dataset.Dataset

In [23]:
shuffle_dataset = drug_dataset["train"].train_test_split(test_size=0.2, train_size=0.8, seed=42)


def filter_condition(example):
    if example["condition"] in filter_labels:
        return False
    return True


shuffle_dataset = shuffle_dataset.filter(filter_condition)

Filter:   0%|          | 0/36886 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9222 [00:00<?, ? examples/s]

In [24]:
label2Id = {label: i for i, label in enumerate(labels)}
id2Label = {i: label for i, label in enumerate(labels)}

In [35]:
from transformers import AutoTokenizer, DataCollatorWithPadding

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(examples):
    tokenized_inputs = tokenizer(examples["review"], truncation=True)
    tokenized_inputs["labels"] = [label2Id[i] for i in examples["condition"]]

    return tokenized_inputs


tokenized_datasets = shuffle_dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns([o for o in tokenized_datasets.column_names["train"] if o not in ["labels", "review", "patient_id", "input_ids","attention_mask", "token_type_ids"]])

In [37]:
# 提取样本列表（如取前 4 个样本）
samples = tokenized_datasets["train"][: 4]

# 生成 Batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
batch = data_collator(samples)
print(batch)

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`review` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [100]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=len(labels))

data_collator(tokenized_datasets["train"][:10])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AttributeError: 'Dataset' object has no attribute 'keys'

In [81]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(output_dir="./train-test", eval_strategy="epoch")

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"]
)

  return forward_call(*args, **kwargs)


ValueError: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_ids,token_type_ids,attention_mask.