In [1]:
from transformers import BertConfig, BertModel, BertForSequenceClassification, BertTokenizer
from special_datasets import *
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from tqdm import tqdm
from train import to_device
import torch

In [2]:
MODEL_TYPE = "bert-base-uncased"

In [3]:
model = BertForSequenceClassification.from_pretrained(MODEL_TYPE)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [4]:
tokenizer = BertTokenizer.from_pretrained(MODEL_TYPE)

In [5]:
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": 2e-5,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0},
]

In [6]:
optimizer = torch.optim.Adam(optimizer_grouped_parameters,lr=2e-5,weight_decay=2e-5)

In [7]:
dataset = TxtDataset("./dataset/sst2/train.txt")
data_loader = DataLoader(dataset,batch_size=4,shuffle=True,drop_last=True)

In [12]:
model.classifier

Linear(in_features=768, out_features=2, bias=True)

In [13]:
model.named_parameters()

<generator object Module.named_parameters at 0x000001899231A970>

In [15]:
model.classifier.named_parameters()

<generator object Module.named_parameters at 0x00000189905F0510>

In [11]:
model.to("cuda")
loss_func = nn.CrossEntropyLoss()
for epoch in range(4):
    for step, batch in enumerate(tqdm(data_loader)):
        b_y = batch[1].to("cuda")
        input_dict = tokenizer(batch[0], return_tensors='pt', padding=True, truncation=True, max_length=128)
        to_device(input_dict, "cuda")
        output = model(**input_dict)
        loss = loss_func(output.logits, b_y)
        if step%10==0:
            print(loss)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

  0%|          | 1/16837 [00:01<4:46:17,  1.02s/it]

tensor(0.5736, device='cuda:0', grad_fn=<NllLossBackward>)


  0%|          | 11/16837 [00:02<51:04,  5.49it/s] 

tensor(0.7220, device='cuda:0', grad_fn=<NllLossBackward>)


  0%|          | 21/16837 [00:04<46:30,  6.03it/s]

tensor(0.6617, device='cuda:0', grad_fn=<NllLossBackward>)


  0%|          | 32/16837 [00:05<41:37,  6.73it/s]

tensor(0.6497, device='cuda:0', grad_fn=<NllLossBackward>)


  0%|          | 41/16837 [00:07<44:22,  6.31it/s]

tensor(0.6873, device='cuda:0', grad_fn=<NllLossBackward>)


  0%|          | 52/16837 [00:08<41:45,  6.70it/s]

tensor(0.5629, device='cuda:0', grad_fn=<NllLossBackward>)


  0%|          | 62/16837 [00:10<41:48,  6.69it/s]

tensor(0.2406, device='cuda:0', grad_fn=<NllLossBackward>)


  0%|          | 72/16837 [00:11<41:10,  6.79it/s]

tensor(0.8042, device='cuda:0', grad_fn=<NllLossBackward>)


  0%|          | 82/16837 [00:13<42:35,  6.56it/s]

tensor(0.2334, device='cuda:0', grad_fn=<NllLossBackward>)


  1%|          | 91/16837 [00:14<43:31,  6.41it/s]

tensor(0.7011, device='cuda:0', grad_fn=<NllLossBackward>)


  1%|          | 102/16837 [00:16<42:10,  6.61it/s]

tensor(0.0678, device='cuda:0', grad_fn=<NllLossBackward>)


  1%|          | 112/16837 [00:18<40:33,  6.87it/s]

tensor(0.7394, device='cuda:0', grad_fn=<NllLossBackward>)


  1%|          | 122/16837 [00:19<41:40,  6.68it/s]

tensor(0.2103, device='cuda:0', grad_fn=<NllLossBackward>)


  1%|          | 132/16837 [00:21<41:59,  6.63it/s]

tensor(0.1847, device='cuda:0', grad_fn=<NllLossBackward>)


  1%|          | 142/16837 [00:22<40:46,  6.82it/s]

tensor(0.3781, device='cuda:0', grad_fn=<NllLossBackward>)


  1%|          | 152/16837 [00:23<40:43,  6.83it/s]

tensor(0.1215, device='cuda:0', grad_fn=<NllLossBackward>)


  1%|          | 161/16837 [00:25<44:24,  6.26it/s]

tensor(0.2443, device='cuda:0', grad_fn=<NllLossBackward>)


  1%|          | 172/16837 [00:26<39:59,  6.95it/s]

tensor(0.4908, device='cuda:0', grad_fn=<NllLossBackward>)


  1%|          | 182/16837 [00:28<41:31,  6.69it/s]

tensor(0.8613, device='cuda:0', grad_fn=<NllLossBackward>)


  1%|          | 192/16837 [00:29<41:21,  6.71it/s]

tensor(1.6463, device='cuda:0', grad_fn=<NllLossBackward>)


  1%|          | 202/16837 [00:31<43:37,  6.36it/s]

tensor(0.0675, device='cuda:0', grad_fn=<NllLossBackward>)


  1%|▏         | 211/16837 [00:32<42:11,  6.57it/s]

tensor(0.1829, device='cuda:0', grad_fn=<NllLossBackward>)


  1%|▏         | 222/16837 [00:34<41:04,  6.74it/s]

tensor(0.0921, device='cuda:0', grad_fn=<NllLossBackward>)


  1%|▏         | 232/16837 [00:35<40:06,  6.90it/s]

tensor(0.2617, device='cuda:0', grad_fn=<NllLossBackward>)


  1%|▏         | 241/16837 [00:37<42:59,  6.43it/s]

tensor(0.2534, device='cuda:0', grad_fn=<NllLossBackward>)


  1%|▏         | 243/16837 [00:37<42:59,  6.43it/s]


KeyboardInterrupt: 