In [1]:
import os#环境代理设置
os.environ["http_proxy"] = "http://127.0.0.1:7890"
os.environ["https_proxy"] = "http://127.0.0.1:7890"

In [2]:
from transformers import TrainingArguments, Trainer
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np

  warn(f"Failed to load image Python extension: {e}")


In [3]:
class DistillTrainingArguments(TrainingArguments):
    def __init__(self, *args, alpha=0.5, temperature=2., **kwargs):
        super().__init__(*args, **kwargs)
        self.alpha = alpha
        self.temperature = temperature

In [4]:
class DistillTrainer(Trainer):
    def __init__(self, *args, teacher_model=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher_model = teacher_model
    
    # def compute_loss(self, model, inputs, return_outputs=False):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        s_output = model(**inputs)
        s_ce = s_output.loss #交叉熵损失
        s_logits = s_output.logits
        
        with torch.no_grad():
            t_output = self.teacher_model(**inputs)
            t_logits = t_output.logits
        
        loss_kl_fct = nn.KLDivLoss(reduction='batchmean')
        loss_kd = self.args.temperature**2 * loss_kl_fct(F.log_softmax(s_logits/self.args.temperature, dim=-1), 
                                                        F.softmax(t_logits/self.args.temperature, dim=-1))
        loss = self.args.alpha * s_ce + (1-self.args.alpha) * loss_kd
        return (loss, s_output) if return_outputs else loss

In [5]:
from datasets import load_dataset
clinc = load_dataset("clinc_oos", "plus")

In [6]:
clinc

DatasetDict({
    train: Dataset({
        features: ['text', 'intent'],
        num_rows: 15250
    })
    validation: Dataset({
        features: ['text', 'intent'],
        num_rows: 3100
    })
    test: Dataset({
        features: ['text', 'intent'],
        num_rows: 5500
    })
})

In [7]:
clinc['train']

Dataset({
    features: ['text', 'intent'],
    num_rows: 15250
})

In [8]:
clinc['train'][:10]

{'text': ['what expression would i use to say i love you if i were an italian',
  "can you tell me how to say 'i do not speak much spanish', in spanish",
  "what is the equivalent of, 'life is good' in french",
  "tell me how to say, 'it is a beautiful morning' in italian",
  'if i were mongolian, how would i say that i am a tourist',
  "how do i say 'hotel' in finnish",
  "i need you to translate the sentence, 'we will be there soon' into portuguese",
  'please tell me how to ask for a taxi in french',
  "can you tell me how i would say, 'more bread please' in french",
  "what is the correct way to say 'i am a visitor' in french"],
 'intent': [61, 61, 61, 61, 61, 61, 61, 61, 61, 61]}

In [9]:
intents = clinc['train'].features['intent']
num_labels = intents.num_classes
num_labels

151

### Student model 初始化

In [10]:
from transformers import AutoConfig, AutoTokenizer
from transformers import AutoModelForSequenceClassification

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

s_ckpt = 'distilbert-base-uncased'
s_tokenizer = AutoTokenizer.from_pretrained(s_ckpt)

t_ckpt = 'transformersbook/bert-base-uncased-finetuned-clinc'
t_model = AutoModelForSequenceClassification.from_pretrained(t_ckpt, num_labels=num_labels).to(device)

In [12]:
def tokenizer_func(dataset):
    return s_tokenizer(dataset['text'], truncation=True)

In [13]:
from datasets import DatasetDict
# 创建新的数据集字典
myclinc = DatasetDict()

# 采样比例为 1%
sampling_rate = 0.01

# 对每个子集进行采样
for split in clinc.keys():
    dataset = clinc[split]
    # 使用 train_test_split 方法进行采样，只保留测试集部分
    sampled_dataset = dataset.train_test_split(test_size=sampling_rate, seed=42)['test']
    myclinc[split] = sampled_dataset

In [14]:
myclinc

DatasetDict({
    train: Dataset({
        features: ['text', 'intent'],
        num_rows: 153
    })
    validation: Dataset({
        features: ['text', 'intent'],
        num_rows: 31
    })
    test: Dataset({
        features: ['text', 'intent'],
        num_rows: 55
    })
})

In [15]:
clinc_enc = clinc.map(function=lambda batch:s_tokenizer(batch['text'], truncation=True),
                      batched=True,
                      remove_columns=['text'])
clinc_enc = clinc_enc.rename_columns({'intent': 'labels'})
clinc_enc

Map:   0%|          | 0/3100 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 15250
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 3100
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 5500
    })
})

In [24]:
batch_size = 32
s_training_args = DistillTrainingArguments(output_dir='distilbert-base-uncased-ft-clinc', 
                                           evaluation_strategy='epoch', num_train_epochs=5, 
                                           learning_rate=3e-4, 
                                           per_device_train_batch_size=batch_size, 
                                           per_device_eval_batch_size=batch_size, 
                                           alpha=0.5, weight_decay=0.01, 
                                           logging_strategy='epoch',
                                           push_to_hub=False)
s_config = AutoConfig.from_pretrained(s_ckpt, num_labels=num_labels, 
                                      id2label=t_model.config.id2label, label2id=t_model.config.label2id)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [17]:
s_config

DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "restaurant_reviews",
    "1": "nutrition_info",
    "2": "account_blocked",
    "3": "oil_change_how",
    "4": "time",
    "5": "weather",
    "6": "redeem_rewards",
    "7": "interest_rate",
    "8": "gas_type",
    "9": "accept_reservations",
    "10": "smart_home",
    "11": "user_name",
    "12": "report_lost_card",
    "13": "repeat",
    "14": "whisper_mode",
    "15": "what_are_your_hobbies",
    "16": "order",
    "17": "jump_start",
    "18": "schedule_meeting",
    "19": "meeting_schedule",
    "20": "freeze_account",
    "21": "what_song",
    "22": "meaning_of_life",
    "23": "restaurant_reservation",
    "24": "traffic",
    "25": "make_call",
    "26": "text",
    "27": "bill_balance",
    "28": "improve_credit_score",
    

In [18]:
def student_init():    
    return AutoModelForSequenceClassification.from_pretrained(s_ckpt, config=s_config).to(device)

In [19]:
import evaluate
accuracy_score = evaluate.load('accuracy')

In [20]:
# trainer 重要的回调函数，非成员函数
def compute_metrics(pred):
    predictions, labels = pred
    predictions = np.argmax(predictions, axis=-1)
    return accuracy_score.compute(references=labels, predictions=predictions)

In [25]:
distill_trainer = DistillTrainer(model_init=student_init, teacher_model=t_model, args=s_training_args, 
                                 train_dataset=clinc_enc['train'], eval_dataset=clinc_enc['validation'], 
                                 compute_metrics=compute_metrics, tokenizer=s_tokenizer)
distill_trainer.train()

  super().__init__(*args, **kwargs)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.6253,0.494599,0.911613
2,0.3359,0.321383,0.943226
3,0.2299,0.288618,0.949032
4,0.2045,0.279677,0.950323
5,0.1963,0.275943,0.951613




TrainOutput(global_step=265, training_loss=0.518394628560768, metrics={'train_runtime': 129.9044, 'train_samples_per_second': 586.97, 'train_steps_per_second': 2.04, 'total_flos': 490522246969356.0, 'train_loss': 0.518394628560768, 'epoch': 5.0})

In [23]:
print("n_gpu =", s_training_args.n_gpu)
print("local_rank =", s_training_args.local_rank)

n_gpu = 9
local_rank = 0


In [29]:
import math
math.ceil(15250/(32*9))*5

265

In [30]:
ft_ckpt = 'distilbert-base-uncased-ft-clinc'
distill_trainer.push_to_hub('finetune completed')


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...-uncased-ft-clinc/model.safetensors:   1%|          | 2.30MB /  268MB            

  ...-uncased-ft-clinc/training_args.bin:   8%|8         |   437B / 5.30kB            

CommitInfo(commit_url='https://huggingface.co/jzr778/distilbert-base-uncased-ft-clinc/commit/25ee841fe3713d9887845368246ddb2030a38441', commit_message='finetune completed', commit_description='', oid='25ee841fe3713d9887845368246ddb2030a38441', pr_url=None, repo_url=RepoUrl('https://huggingface.co/jzr778/distilbert-base-uncased-ft-clinc', endpoint='https://huggingface.co', repo_type='model', repo_id='jzr778/distilbert-base-uncased-ft-clinc'), pr_revision=None, pr_num=None)

In [32]:
from transformers import pipeline
clf = pipeline("text-classification",
               model="distilbert-base-uncased-finetuned-sst-2-english",
               device=device,
               batch_size=32)

In [34]:
clinc_ft = myclinc.map(function=lambda batch:s_tokenizer(batch['text'], truncation=True),
                      batched=True,
                      remove_columns=['text'])

In [44]:
clinc['test']

Dataset({
    features: ['text', 'intent'],
    num_rows: 5500
})

In [45]:
clf(clinc['test']['text'][5:10])

[{'label': 'POSITIVE', 'score': 0.9998599290847778},
 {'label': 'POSITIVE', 'score': 0.5255300998687744},
 {'label': 'NEGATIVE', 'score': 0.9657660126686096},
 {'label': 'NEGATIVE', 'score': 0.9832302927970886},
 {'label': 'NEGATIVE', 'score': 0.9788899421691895}]