In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install peft

Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.13.2-py3-none-any.whl (320 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.7/320.7 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.13.2
Note: you may need to restart the kernel to use updated packages.


导入python库

In [4]:
import os
import sys
import logging
import datasets
import evaluate

import pandas as pd
import numpy as np

from transformers import DebertaV2ForSequenceClassification, DebertaV2Tokenizer, DataCollatorWithPadding
from transformers import Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.model_selection import train_test_split

In [5]:
! unzip /kaggle/input/word2vec-nlp-tutorial/labeledTrainData.tsv.zip
! unzip /kaggle/input/word2vec-nlp-tutorial/testData.tsv.zip
! unzip /kaggle/input/word2vec-nlp-tutorial/unlabeledTrainData.tsv.zip

  pid, fd = os.forkpty()


Archive:  /kaggle/input/word2vec-nlp-tutorial/labeledTrainData.tsv.zip
  inflating: labeledTrainData.tsv    
Archive:  /kaggle/input/word2vec-nlp-tutorial/testData.tsv.zip
  inflating: testData.tsv            
Archive:  /kaggle/input/word2vec-nlp-tutorial/unlabeledTrainData.tsv.zip
  inflating: unlabeledTrainData.tsv  


读取训练和测试数据集

In [6]:
train = pd.read_csv("/kaggle/working/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
test = pd.read_csv("/kaggle/working/testData.tsv", header=0, delimiter="\t", quoting=3)

设置日志记录的基本配置，并记录程序运行的开始

In [7]:
program = os.path.basename(sys.argv[0])
logger = logging.getLogger(program)

logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
logging.root.setLevel(level=logging.INFO)
logger.info(r"running %s" % ''.join(sys.argv))

将训练数据集分割为训练集和验证集，其中20%作为验证集

In [8]:
train, val = train_test_split(train, test_size=.2)

创建字典，将标签和文本分别存储，以适配模型的输入格式，并将字典转换为datasets库的数据集对象

In [9]:
train_dict = {'label': train["sentiment"], 'text': train['review']}
val_dict = {'label': val["sentiment"], 'text': val['review']}
test_dict = {"text": test['review']}

train_dataset = datasets.Dataset.from_dict(train_dict)
val_dataset = datasets.Dataset.from_dict(val_dict)
test_dataset = datasets.Dataset.from_dict(test_dict)

指定了预训练模型的ID并从指定的模型ID加载分词器。

In [10]:
model_id = "microsoft/deberta-v3-large"

tokenizer = DebertaV2Tokenizer.from_pretrained(model_id)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]



定义预处理函数，对文本进行分词处理，设置截断和填充，并对训练集、验证集和测试集应用预处理函数。

In [11]:
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True,padding='max_length', max_length=510)

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

创建一个数据填充器，用于在训练时自动填充批次。并从指定的模型ID加载预训练模型

In [12]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = DebertaV2ForSequenceClassification.from_pretrained(
    model_id,
    # device_map="auto",
    # load_in_8bit=True
)

pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


定义LoRA配置，并使用peft库的函数准备模型，打印可训练参数。

In [13]:
# Define LoRA Config
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    #target_modules=['q_proj', 'v_proj'],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_CLS
)
# prepare int-8 model for training
# model = prepare_model_for_kbit_training(model)

# add LoRA adaptor
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 1,574,914 || all params: 436,638,724 || trainable%: 0.3607


加载准确率作为评估指标，并定义函数计算模型预测的准确率。

In [14]:
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

定义了训练参数，包括输出目录、训练轮数、批次大小等。

In [15]:
training_args = TrainingArguments(
    output_dir='./checkpoint',  # output directory
    num_train_epochs=3,  # total number of training epochs
    per_device_train_batch_size=2,  # batch size per device during training
    per_device_eval_batch_size=4,  # batch size for evaluation
    warmup_steps=500,  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # strength of weight decay
    logging_dir='./logs',  # directory for storing logs
    logging_steps=100,
    save_strategy="no",
    evaluation_strategy="epoch"
)



创建一个训练器对象，传入模型、训练参数、数据集等。

In [16]:
trainer = Trainer(
    model=model,  # the instantiated 🤗 Transformers model to be trained
    args=training_args,  # training arguments, defined above
    train_dataset=tokenized_train,  # training dataset
    eval_dataset=tokenized_val,  # evaluation dataset
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [17]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113616922221405, max=1.0…

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2528,0.167299,0.9604
2,0.1107,0.197795,0.9624
3,0.0492,0.191696,0.965


TrainOutput(global_step=30000, training_loss=0.18414618113438289, metrics={'train_runtime': 14556.6544, 'train_samples_per_second': 4.122, 'train_steps_per_second': 2.061, 'total_flos': 5.59869906096e+16, 'train_loss': 0.18414618113438289, 'epoch': 3.0})

开始训练模型。

In [18]:
prediction_outputs = trainer.predict(tokenized_test)
test_pred = np.argmax(prediction_outputs[0], axis=-1).flatten()
print(test_pred)

[1 0 0 ... 0 1 1]


In [20]:
result_output = pd.DataFrame(data={"id": test["id"], "sentiment": test_pred})
result_output.to_csv("deb0erta_lora_int8.csv", index=False, quoting=3)
logging.info('result saved!')