In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

import sys
import logging
import datasets
import evaluate
import torch
import torch.nn as nn
import torch.nn.functional as F

import pandas as pd
import numpy as np

from transformers import BertTokenizerFast, DataCollatorWithPadding
from transformers import Trainer, TrainingArguments
from transformers import BertPreTrainedModel, BertModel
from transformers.modeling_outputs import SequenceClassifierOutput

from sklearn.model_selection import train_test_split

In [3]:
! unzip /kaggle/input/word2vec-nlp-tutorial/labeledTrainData.tsv.zip
! unzip /kaggle/input/word2vec-nlp-tutorial/testData.tsv.zip
! unzip /kaggle/input/word2vec-nlp-tutorial/unlabeledTrainData.tsv.zip

  pid, fd = os.forkpty()


Archive:  /kaggle/input/word2vec-nlp-tutorial/labeledTrainData.tsv.zip
  inflating: labeledTrainData.tsv    
Archive:  /kaggle/input/word2vec-nlp-tutorial/testData.tsv.zip
  inflating: testData.tsv            
Archive:  /kaggle/input/word2vec-nlp-tutorial/unlabeledTrainData.tsv.zip
  inflating: unlabeledTrainData.tsv  


In [4]:
# Both files are tab-separated with a header row; quoting=3 (csv.QUOTE_NONE)
# tells the parser to treat quote characters as ordinary text.
tsv_options = dict(header=0, delimiter="\t", quoting=3)

train = pd.read_csv("/kaggle/working/labeledTrainData.tsv", **tsv_options)
test = pd.read_csv("/kaggle/working/testData.tsv", **tsv_options)

In [5]:
def KL(input, target, reduction="sum"):
    """KL divergence between two sets of logits, taken over the last dimension.

    Both arguments are raw (unnormalised) logits. ``target`` is turned into a
    probability distribution with softmax and ``input`` into log-probabilities
    with log-softmax, then ``KL(softmax(target) || softmax(input))`` is computed
    with ``F.kl_div``.

    Args:
        input: Logits of the approximating distribution.
        target: Logits of the reference distribution, same shape as ``input``.
        reduction: Reduction mode forwarded to ``F.kl_div`` (default ``"sum"``).

    Returns:
        The reduced KL divergence as a float32 tensor.
    """
    # Compute in float32 regardless of the incoming dtype.
    log_probs = F.log_softmax(input.float(), dim=-1)
    # dim must be explicit: without it softmax falls back to a deprecated
    # implicit dimension (dim 0 for 3-D inputs), which would not match the
    # dim=-1 normalisation used for `input` above.
    target_probs = F.softmax(target.float(), dim=-1)
    return F.kl_div(log_probs, target_probs, reduction=reduction)

In [6]:
class BertScratch(BertPreTrainedModel):
    """BERT sequence classifier trained with a two-pass consistency loss.

    During training the same batch is pushed through the encoder, dropout and
    classifier twice. Because dropout is active, the two passes produce
    different logits. The training loss is the sum of the cross-entropy of each
    pass plus the symmetric KL divergence between the two sets of logits.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.bert = BertModel(config)
        # Fall back to the encoder's hidden dropout when no classifier-specific rate is set.
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.post_init()

    def _encode_and_classify(self, input_ids, attention_mask, token_type_ids):
        """Run encoder -> dropout -> classifier once; return (encoder outputs, logits)."""
        outputs = self.bert(input_ids, attention_mask, token_type_ids)
        pooled_output = self.dropout(outputs[1])
        return outputs, self.classifier(pooled_output)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None):
        """Compute logits and, when ``labels`` is given, the combined loss.

        Args:
            input_ids, attention_mask, token_type_ids: Passed positionally to the
                BERT encoder.
            labels: Optional class indices. When ``None`` no loss is computed.

        Returns:
            SequenceClassifierOutput with ``loss`` (or ``None``), the first-pass
            ``logits``, and the first pass's ``hidden_states`` / ``attentions``.
        """
        outputs, logits = self._encode_and_classify(input_ids, attention_mask, token_type_ids)

        total_loss = None
        if labels is not None:
            if self.training:
                # Second stochastic pass: dropout makes it differ from the first.
                _, kl_logits = self._encode_and_classify(input_ids, attention_mask, token_type_ids)
            else:
                # With dropout disabled a second pass would just repeat the first,
                # so reuse its logits instead of paying for another encoder run.
                # The loss formula below is unchanged.
                kl_logits = logits

            # Instantiate the loss function.
            loss_fct = nn.CrossEntropyLoss()
            # Cross-entropy between the first-pass logits and the labels.
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            # Cross-entropy between the second-pass logits and the labels.
            ce_loss = loss_fct(kl_logits.view(-1, self.num_labels), labels.view(-1))
            # Symmetric KL divergence between the two passes.
            kl_loss = (KL(logits, kl_logits, "sum") + KL(kl_logits, logits, "sum")) / 2.
            # Total loss.
            total_loss = loss + ce_loss + kl_loss

        return SequenceClassifierOutput(
            loss=total_loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions
        )


In [7]:
# Name the logger after the running program.
program = os.path.basename(sys.argv[0])
logger = logging.getLogger(program)

# Timestamped log lines; INFO on the root logger so library messages show too.
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
logging.root.setLevel(level=logging.INFO)
# Join argv with spaces so the logged command line stays readable (an empty
# separator ran the arguments together), and let the logger do the formatting.
logger.info("running %s", ' '.join(sys.argv))

In [8]:
# Hold out 20% of the labelled data for validation. The fixed random_state makes
# the split identical on every fresh run, so accuracy numbers are comparable.
train, val = train_test_split(train, test_size=.2, random_state=42)

In [9]:
# Rename the columns to the 'label' / 'text' names used in the rest of the notebook.
train_dict = dict(label=train["sentiment"], text=train["review"])
val_dict = dict(label=val["sentiment"], text=val["review"])
test_dict = dict(text=test["review"])  # the test split has no labels

train_dataset, val_dataset, test_dataset = map(
    datasets.Dataset.from_dict, (train_dict, val_dict, test_dict)
)

In [10]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')


def preprocess_function(examples):
    """Tokenize the 'text' entries of a batch with truncation enabled."""
    texts = examples['text']
    return tokenizer(texts, truncation=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/339 [00:00<?, ?B/s]

In [11]:
# Tokenize the three splits in order (train, validation, test), in batches.
tokenized_train, tokenized_val, tokenized_test = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

# Collator built from the same tokenizer used for preprocessing.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load the pretrained encoder weights into the custom classifier.
model = BertScratch.from_pretrained('bert-base-uncased')

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertScratch were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Return the accuracy metric for a (logits, labels) pair."""
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit.
    predicted_classes = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predicted_classes, references=labels)

Downloading builder script:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

In [13]:
training_args = TrainingArguments(
    # Where things are written
    output_dir='./bert_rdrop',
    logging_dir='./logs',
    # Schedule
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes (per device)
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    # Reporting: log every 100 steps, evaluate once per epoch, keep no checkpoints
    logging_steps=100,
    evaluation_strategy="epoch",
    save_strategy="no",
)



In [14]:
trainer = Trainer(
    # Model and its training configuration
    model=model,
    args=training_args,
    # Data: training split and the held-out validation split
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    # Batching and tokenization helpers
    data_collator=data_collator,
    tokenizer=tokenizer,
    # Accuracy computed on the evaluation set
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [None]:
# Run fine-tuning with the arguments configured above (3 epochs, with an
# evaluation pass on the validation split after each epoch).
trainer.train()

In [None]:
# Score the unlabeled test split; the first field of the result holds the logits.
prediction_outputs = trainer.predict(tokenized_test)
test_logits = prediction_outputs[0]
# Most likely class per review, as a flat 1-D array.
test_pred = np.argmax(test_logits, axis=-1).flatten()
print(test_pred)

In [None]:
output_path = "../result/bert_rdrop.csv"
# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing. Otherwise the trained predictions would be lost
# to an OSError at the very last step.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# One row per test review: its id and the predicted sentiment class.
result_output = pd.DataFrame(data={"id": test["id"], "sentiment": test_pred})
# quoting=3 (csv.QUOTE_NONE) writes the ids exactly as they were read in.
result_output.to_csv(output_path, index=False, quoting=3)
logging.info('result saved!')