In [1]:
import numpy as np
import pandas as pd
import random
import torch
import matplotlib.pyplot as plt
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler 
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup

In [2]:
def txt_reading(file):
    txt = []
    with open(file,"r",encoding="utf-8") as f:
        for line in f.readlines():
            txt.append(line)
    target_,sentence_ = [],[]
    for i in txt[:100000]:
        if i[:10]=='__label__1':
            target_.append(0)
        else:
            target_.append(1)
        sentence_.append(i[11:-2])
    return target_,sentence_

In [3]:
labels,sentence = txt_reading('C:\\Users\\happytutu\\Desktop\\amazon\\test.ft.txt')

In [4]:
def lower(input_list):
    output_list = []
    for i in input_list:
        output_list.append(i.lower())
    return output_list

In [5]:
sentences = lower(sentence)

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [7]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in sentences:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 80,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Original:  great cd: my lovely pat has one of the great voices of her generation. i have listened to this cd for years and i still love it. when i'm in a good mood it makes me feel better. a bad mood just evaporates like sugar in the rain. this cd just oozes life. vocals are jusat stuunning and lyrics just kill. one of life's hidden gems. this is a desert isle cd in my book. why she never made it big is just beyond me. everytime i play this, no matter black, white, young, old, male, female everybody says one thing "who was that singing ?
Token IDs: tensor([  101,  2307,  3729,  1024,  2026,  8403,  6986,  2038,  2028,  1997,
         1996,  2307,  5755,  1997,  2014,  4245,  1012,  1045,  2031,  7791,
         2000,  2023,  3729,  2005,  2086,  1998,  1045,  2145,  2293,  2009,
         1012,  2043,  1045,  1005,  1049,  1999,  1037,  2204,  6888,  2009,
         3084,  2033,  2514,  2488,  1012,  1037,  2919,  6888,  2074,  9345,
        17822,  8520,  2066,  5699,  1999,  1996,  4542

In [8]:
from torch.utils.data import TensorDataset, random_split

# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# Create a 90-10 train-validation split.

# Calculate the number of samples to include in each set.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# Divide the dataset by randomly selecting samples.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

90,000 training samples
10,000 validation samples


In [9]:
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=64)

test_sampler = SequentialSampler(val_dataset)
test_dataloader = DataLoader(val_dataset, sampler=test_sampler, batch_size=64)

In [10]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 2)     #num_labels表示2个分类，好评和差评
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [11]:
optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)

In [12]:
epochs = 5
# training steps 的数量: [number of batches] x [number of epochs]. 
total_steps = len(train_dataloader) * epochs
# 设计 learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)


In [13]:
def binary_acc(preds, labels):      #preds.shape=(16, 2) labels.shape=torch.Size([16, 1])
    correct = torch.eq(torch.max(preds, dim=1)[1], labels.flatten()).float()      #eq里面的两个参数的shape=torch.Size([16])    
    acc = correct.sum().item() / len(correct)
    return acc

In [14]:
import time
import datetime
def format_time(elapsed):    
    elapsed_rounded = int(round((elapsed)))    
    return str(datetime.timedelta(seconds=elapsed_rounded))   #返回 hh:mm:ss 形式的时间

In [15]:
def train1(model, optimizer):
    t0 = time.time()
    avg_loss, avg_acc = [],[]   
    model.train()
    for step, batch in enumerate(train_dataloader):
        # 每隔40个batch 输出一下所用时间.
        if step % 40 == 0 and not step == 0:            
            elapsed = format_time(time.time() - t0)             
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
            print(np.array(avg_loss).mean())

        b_input_ids, b_input_mask, b_labels = batch[0].long().to(device), batch[1].long().to(device), batch[2].long().to(device)
        
        output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss, logits = output[0], output[1] 
    
        avg_loss.append(loss.item())
       
        acc = binary_acc(logits, b_labels)
        avg_acc.append(acc)
        
        optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(model.parameters(), 1.0)      #大于1的梯度将其设为1.0, 以防梯度爆炸
        optimizer.step()              #更新模型参数
        scheduler.step()              #更新learning rate
        
    avg_acc = np.array(avg_acc).mean()
    avg_loss = np.array(avg_loss).mean()
    return avg_loss, avg_acc

In [16]:
train1(model,optimizer)

  Batch    40  of  1,407.    Elapsed: 0:00:33.
0.5628033988177776
  Batch    80  of  1,407.    Elapsed: 0:01:03.
0.4327496176585555
  Batch   120  of  1,407.    Elapsed: 0:01:33.
0.3643552548562487
  Batch   160  of  1,407.    Elapsed: 0:02:02.
0.3291768059600145
  Batch   200  of  1,407.    Elapsed: 0:02:38.
0.30834699105471375
  Batch   240  of  1,407.    Elapsed: 0:03:08.
0.2897282368813952
  Batch   280  of  1,407.    Elapsed: 0:03:37.
0.2761112390884331
  Batch   320  of  1,407.    Elapsed: 0:04:07.
0.26607505390420555
  Batch   360  of  1,407.    Elapsed: 0:04:36.
0.2569384680233068
  Batch   400  of  1,407.    Elapsed: 0:05:06.
0.25031334944069383
  Batch   440  of  1,407.    Elapsed: 0:05:35.
0.24569105052135207
  Batch   480  of  1,407.    Elapsed: 0:06:04.
0.24073336442622045
  Batch   520  of  1,407.    Elapsed: 0:06:34.
0.23618433414600215
  Batch   560  of  1,407.    Elapsed: 0:07:03.
0.23226805157028138
  Batch   600  of  1,407.    Elapsed: 0:07:33.
0.22815410173187653
  

(0.19495485251731032, 0.9241626687988628)

In [17]:
def evaluate(model):    
    avg_acc = []    
    model.eval()         #表示进入测试模式
      
    with torch.no_grad():
        for batch in test_dataloader:
            b_input_ids, b_input_mask, b_labels = batch[0].long().to(device), batch[1].long().to(device), batch[2].long().to(device)
        
            output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            
            acc = binary_acc(output[0], b_labels)
            avg_acc.append(acc)
    avg_acc = np.array(avg_acc).mean()
    return avg_acc

In [18]:
test_acc = evaluate(model)
print("epoch={},测试准确率={}".format(1, test_acc))

epoch=1,测试准确率=0.9437699044585988
