In [1]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader 

In [2]:
data = pd.read_csv('data/ChnSentiCorp_htl_all_copy.csv')

In [3]:
data.head(2)

Unnamed: 0,label,review
0,1,"距离川沙公路较近,但是公交指示不对,如果是""蔡陆线""的话,会非常麻烦.建议用别的路线.房间较..."
1,1,商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!


In [4]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification

In [5]:
tokenizer = BertTokenizer.from_pretrained('chinese-roberta-wwm-ext')
model = BertForSequenceClassification.from_pretrained('chinese-roberta-wwm-ext', num_labels=2)

Some weights of the model checkpoint at chinese-roberta-wwm-ext were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpo

In [6]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [7]:
model = model.to('cuda')

In [8]:
X = list(data['review'].astype(str))
y = list(data['label'])

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y)

X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

In [9]:
X_train_tokenized.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [10]:
# Create torch dataset
class MyDataset(Dataset):
    def __init__(self, encodings, labels=None):
    
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        
        if self.labels:
            item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.encodings['input_ids'])
    

In [11]:
train_dataset = MyDataset(X_train_tokenized, y_train)
val_dataset = MyDataset(X_val_tokenized, y_val)

In [12]:
train_dataset[5]

{'input_ids': tensor([ 101, 1762, 6205, 2123,  857, 6814, 1126,  702, 6983, 2421, 8024, 3634,
         6983, 2421, 6006, 4197, 3683, 6629, 1079, 1765, 4638, 1724, 3215, 5277,
         2345,  671,  763, 8024,  852, 1762, 6205, 2123,  738, 5050, 3221,  679,
         7231, 4638,  749, 8024,  817, 3419,  738,  679, 7770,  511, 2791, 7313,
         7027,  691, 6205,  948, 3221, 2397, 1112, 8024, 1765, 3691, 3300, 4157,
         5552,  511, 4500, 4638, 3221, 1765, 3265, 8024, 2697, 6230, 3683, 4958,
         6444, 5653, 3302, 1914,  749, 8024,  738, 3766, 3300, 1692, 7509,  511,
         2128, 1059, 8024, 1453, 1741, 4384, 1862, 2213, 1377,  511,  102,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0, 

In [13]:
def compute_metrics(p):
    print(type(p))
    pred, labels = p
    pred = np.argmax(pred, axis=1)
    
    accuracy = accuracy_score(y_true=labels, y_pred = pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)
    
    return {'accuracy':accuracy, 'precision':precision, 'recall':recall, 'f1':f1}

In [17]:
args = TrainingArguments(output_dir='output',
                        num_train_epochs=4,
                        per_device_eval_batch_size=16)

trainer = Trainer(model=model,
                 args=args,
                 train_dataset=train_dataset,
                 eval_dataset=val_dataset,
                 compute_metrics=compute_metrics)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [18]:
trainer.train()

***** Running training *****
  Num examples = 6212
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3108


Step,Training Loss
500,0.1782
1000,0.1095
1500,0.0829
2000,0.0451
2500,0.0399
3000,0.0231


Saving model checkpoint to output/checkpoint-500
Configuration saved in output/checkpoint-500/config.json
Model weights saved in output/checkpoint-500/pytorch_model.bin
Saving model checkpoint to output/checkpoint-1000
Configuration saved in output/checkpoint-1000/config.json
Model weights saved in output/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to output/checkpoint-1500
Configuration saved in output/checkpoint-1500/config.json
Model weights saved in output/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to output/checkpoint-2000
Configuration saved in output/checkpoint-2000/config.json
Model weights saved in output/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to output/checkpoint-2500
Configuration saved in output/checkpoint-2500/config.json
Model weights saved in output/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to output/checkpoint-3000
Configuration saved in output/checkpoint-3000/config.json
Model weights saved in output/check

TrainOutput(global_step=3108, training_loss=0.07763952688053921, metrics={'train_runtime': 896.1018, 'train_samples_per_second': 27.729, 'train_steps_per_second': 3.468, 'total_flos': 6537783503585280.0, 'train_loss': 0.07763952688053921, 'epoch': 4.0})

In [19]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 1554
  Batch size = 16


<class 'transformers.trainer_utils.EvalPrediction'>


{'eval_loss': 0.592217206954956,
 'eval_accuracy': 0.9124839124839125,
 'eval_precision': 0.936150234741784,
 'eval_recall': 0.936150234741784,
 'eval_f1': 0.936150234741784,
 'eval_runtime': 16.0474,
 'eval_samples_per_second': 96.838,
 'eval_steps_per_second': 6.107,
 'epoch': 4.0}