In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Lazily flatten the directory walk into one stream of full file paths,
# then echo each path on its own line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/bert-qa-m-for-absa/test_QA_M.tsv
/kaggle/input/bert-qa-m-for-absa/dev_QA_M.tsv
/kaggle/input/bert-qa-m-for-absa/train_QA_M.tsv


In [2]:
!pip install transformers

[0m

In [3]:
import torch
import random
import numpy as np

# Pick the GPU when one is visible, otherwise fall back to the CPU; batches and
# the model are moved onto this device later in the notebook.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Only query the device name when CUDA is actually usable: on a CPU-only machine
# torch.cuda.get_device_name(0) raises, which defeated the CPU fallback above.
gpu_name = torch.cuda.get_device_name(0) if device.type == "cuda" else None

SEED = 19

# Seed every random number generator used here so repeated runs are comparable.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if device.type == "cuda":
    torch.cuda.manual_seed_all(SEED)


print(device)

cuda


# Loading and preprocessing data

In [4]:
import pandas as pd

# Location and column layout shared by the three QA-M split files.
QA_M_DIR = "/kaggle/input/bert-qa-m-for-absa"
QA_M_COLUMNS = ['id', 'sentence1', 'sentence2', 'label']


def read_qa_m_split(split):
    """Read one `<split>_QA_M.tsv` file into a DataFrame.

    The first row of the file is skipped and the column names are supplied
    explicitly. `keep_default_na=False` is essential: the label column contains
    the literal text "None" as a genuine class, and pandas >= 2.0 would
    otherwise parse that text as a missing value (NaN), which breaks label
    encoding downstream. As a side effect, empty fields are read as empty
    strings instead of NaN.

    Parameters
    ----------
    split : str
        File-name prefix of the split: "train", "test" or "dev".

    Returns
    -------
    pandas.DataFrame with columns id, sentence1, sentence2, label.
    """
    return pd.read_csv(
        f"{QA_M_DIR}/{split}_QA_M.tsv",
        skiprows=1,
        delimiter='\t',
        header=None,
        names=QA_M_COLUMNS,
        keep_default_na=False,
    )


df_train = read_qa_m_split("train")
df_test = read_qa_m_split("test")
df_val = read_qa_m_split("dev")

print(f"train data len: {len(df_train)}")
print(f"test data len: {len(df_test)}")
print(f"val data len: {len(df_val)}")

train data len: 15008
test data len: 7516
val data len: 3748


In [5]:
# Distinct label values of the training split; the model cell later uses the
# number of entries to size the classification head.
unique_labels = df_train['label'].unique()

# Preview: first ten training rows, a blank gap, then the label set.
print(df_train.head(10))
print("\n")
print(unique_labels)

     id                                          sentence1  \
0     0                        location - 2 , location - 1   
1     0                        location - 2 , location - 1   
2     0                        location - 2 , location - 1   
3     0                        location - 2 , location - 1   
4  1000  location - 1 is one of the most expensive area...   
5  1000  location - 1 is one of the most expensive area...   
6  1000  location - 1 is one of the most expensive area...   
7  1000  location - 1 is one of the most expensive area...   
8  1001  the hard rock cafe is close by , just by locat...   
9  1001  the hard rock cafe is close by , just by locat...   

                                           sentence2     label  
0  what do you think of the general of location -...      None  
1   what do you think of the price of location - 1 ?      None  
2  what do you think of the safety of location - 1 ?      None  
3  what do you think of the transit location of l...     

In [6]:
from sklearn.preprocessing import LabelEncoder

# Map the string labels to integer class ids.
def train_str_2_int(df, label_encoder=None):
    """Encode the string `label` column of `df` as integers, in place.

    After the call the original strings live in `label_desc` and the integer
    ids live in `label`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string `label` column. It is modified in place.
    label_encoder : LabelEncoder, optional
        An already fitted encoder to reuse. Passing the encoder fitted on the
        training split guarantees that every split uses the same
        string -> integer mapping; fitting a fresh encoder per split only
        gives matching ids when all splits happen to contain the same label
        set. When omitted, a new encoder is fitted on this frame.

    Returns
    -------
    LabelEncoder
        The encoder that was used, so it can be passed to the other splits.

    Raises
    ------
    ValueError
        Raised by `LabelEncoder.transform` when `df` contains a label the
        supplied encoder has not seen.
    """
    # A frame that already has `label_desc` was encoded by an earlier run of this
    # cell; encoding it again would re-encode integers and duplicate columns.
    already_encoded = 'label_desc' in df.columns
    text_column = 'label_desc' if already_encoded else 'label'

    if label_encoder is None:
        label_encoder = LabelEncoder().fit(df[text_column])

    if not already_encoded:
        df['label_enc'] = label_encoder.transform(df['label'])
        # Rename in place: keep the strings as `label_desc`, expose the ids as `label`.
        df.rename(columns = {'label': 'label_desc', 'label_enc': 'label'}, inplace = True)

    return label_encoder


# Fit on the training split only, then apply that same mapping to test and val.
label_encoder = train_str_2_int(df_train)
train_str_2_int(df_test, label_encoder)
train_str_2_int(df_val, label_encoder)

In [7]:
# Print the first ten rows of each split in turn: train, test, val.
for split_frame in (df_train, df_test, df_val):
    print(split_frame.head(10))

     id                                          sentence1  \
0     0                        location - 2 , location - 1   
1     0                        location - 2 , location - 1   
2     0                        location - 2 , location - 1   
3     0                        location - 2 , location - 1   
4  1000  location - 1 is one of the most expensive area...   
5  1000  location - 1 is one of the most expensive area...   
6  1000  location - 1 is one of the most expensive area...   
7  1000  location - 1 is one of the most expensive area...   
8  1001  the hard rock cafe is close by , just by locat...   
9  1001  the hard rock cafe is close by , just by locat...   

                                           sentence2 label_desc  label  
0  what do you think of the general of location -...       None      1  
1   what do you think of the price of location - 1 ?       None      1  
2  what do you think of the safety of location - 1 ?       None      1  
3  what do you think of t

# Encode sentence pairs

In [8]:
from transformers import BertTokenizer

# Checkpoint name used for the tokenizer here and for the model weights later.
model_ckpt = "bert-base-uncased"

# Load the matching vocabulary; lower-casing is requested explicitly.
tokenizer = BertTokenizer.from_pretrained(
    model_ckpt,
    do_lower_case=True,
)

def tokenizing(df, tokenizer, MAX_LEN = 256):
    """Encode every (sentence1, sentence2) row of `df` as a fixed-length id sequence.

    Each row is passed to the tokenizer as a text pair, so the tokenizer itself
    inserts the special tokens around and between the two sentences instead of
    relying on a literal "[SEP]" glued into the text. Sequences are explicitly
    truncated and padded to `MAX_LEN` (the former `pad_to_max_length` flag is
    deprecated and truncation was only happening implicitly, with a warning).

    Parameters
    ----------
    df : DataFrame-like
        Must expose `sentence1`, `sentence2` and `label` columns via attribute
        access, each with a `.values` sequence.
    tokenizer : tokenizer object
        Must provide `encode(text, text_pair=..., ...)` and `pad_token_id`.
    MAX_LEN : int, default 256
        Length every encoded sequence is truncated / padded to.

    Returns
    -------
    input_ids : list of list of int
        One id sequence per row.
    attention_masks : list of list of float
        1.0 where the id is a real token, 0.0 where it is padding.
    labels : sequence
        `df.label.values`, unchanged.
    """
    labels = df.label.values
    pad_id = tokenizer.pad_token_id

    input_ids = [
        tokenizer.encode(
            first,
            text_pair=second,
            add_special_tokens=True,
            max_length=MAX_LEN,
            truncation=True,
            padding='max_length',
        )
        for first, second in zip(df.sentence1.values, df.sentence2.values)
    ]

    # Mask out padding by comparing against the tokenizer's own pad id rather
    # than assuming that padding is always id 0.
    attention_masks = [[float(token_id != pad_id) for token_id in ids] for ids in input_ids]

    return input_ids, attention_masks, labels


# Encode each split with the shared tokenizer.
train_input_ids, train_attention_masks, train_labels = tokenizing(df_train, tokenizer)
test_input_ids, test_attention_masks, test_labels = tokenizing(df_test, tokenizer)
val_input_ids, val_attention_masks, val_labels = tokenizing(df_val, tokenizer)

# Report how many encoded examples each split produced.
encoded_sizes = {
    "train": len(train_input_ids),
    "test": len(test_input_ids),
    "val": len(val_input_ids),
}
for split_name, n_rows in encoded_sizes.items():
    print(f"{split_name} data len: {n_rows}")

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


train data len: 15008
test data len: 7516
val data len: 3748


In [9]:
# Inspect the first encoded training example: its ids, its attention mask,
# its label, and finally the ids converted back to tokens.
first_ids = train_input_ids[0]
print(first_ids)
print(train_attention_masks[0])
print(train_labels[0])
print(tokenizer.convert_ids_to_tokens(first_ids))

[101, 3295, 1011, 1016, 1010, 3295, 1011, 1015, 102, 2054, 2079, 2017, 2228, 1997, 1996, 2236, 1997, 3295, 1011, 1015, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.

In [10]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Convert the encoded ids, masks and labels into torch tensors, the input type the model expects.
train_input_ids = torch.tensor(train_input_ids)
train_attention_masks = torch.tensor(train_attention_masks)
train_labels = torch.tensor(train_labels)

test_input_ids = torch.tensor(test_input_ids)
test_attention_masks = torch.tensor(test_attention_masks)
test_labels = torch.tensor(test_labels)

val_input_ids = torch.tensor(val_input_ids)
val_attention_masks = torch.tensor(val_attention_masks)
val_labels = torch.tensor(val_labels)

# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 32

# Wrap each split in a TensorDataset and a DataLoader that serves it in batches of
# `batch_size`. The tensors themselves are already fully in memory at this point.
# Training batches are drawn in random order.
train_data = TensorDataset(train_input_ids,train_attention_masks,train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Evaluation splits are iterated in their stored order: shuffling brings no benefit
# when no gradients are taken, and a fixed order keeps batch composition (and any
# per-batch statistic) identical from one evaluation pass to the next.
test_data = TensorDataset(test_input_ids,test_attention_masks,test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

val_data = TensorDataset(val_input_ids,val_attention_masks,val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

In [11]:
# Number of batches per split, printed in the order train, test, val.
for loader in (train_dataloader, test_dataloader, val_dataloader):
    print(len(loader))

469
235
118


# Define model, hyperparameters, and optimizer

In [12]:
from transformers import BertConfig,AdamW, BertForSequenceClassification,get_linear_schedule_with_warmup



# Sequence-classification model with one output per distinct training label,
# loaded from the same checkpoint as the tokenizer and moved to the chosen device.
model = BertForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=len(unique_labels),
)
model = model.to(device)

# Optimisation hyperparameters.
lr = 2e-5
adam_epsilon = 1e-8
epochs = 3

# The scheduler is stepped once per training batch, so the schedule spans
# (batches per epoch) x (epochs) steps, with no warm-up phase.
num_warmup_steps = 0
num_training_steps = epochs * len(train_dataloader)

# NOTE(review): newer transformers releases mark transformers.AdamW as deprecated —
# confirm the installed version still provides it.
optimizer = AdamW(
    model.parameters(),
    lr=lr,
    eps=adam_epsilon,
    correct_bias=False,
)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)


Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

# Train the model and evaluate

In [None]:
from tqdm import tqdm, trange,notebook,tqdm_notebook
import time

from sklearn.metrics import confusion_matrix,classification_report
# Import and evaluate each test batch using Matthew's correlation coefficient
from sklearn.metrics import accuracy_score,matthews_corrcoef

# ================= Validation or Test ================== #
def testOrVal(dataloader, mode='test'):
    """Evaluate the global `model` on `dataloader` and print accuracy and MCC.

    Predictions and labels are collected over the whole dataloader and each
    metric is computed once on the full set. Averaging per-batch values is not
    equivalent: MCC is not decomposable over batches, and a smaller final batch
    would be weighted as heavily as a full one.

    Parameters
    ----------
    dataloader : iterable
        Yields (input_ids, attention_mask, labels) tensor batches.
    mode : str, default 'test'
        'val' prints the metrics under a "Validation" heading; any other value
        prints them under a "Test" heading.

    Returns
    -------
    tuple
        (accuracy, mcc) computed over every example in `dataloader`.

    Raises
    ------
    ValueError
        If `dataloader` yields no batches.
    """
    model.eval()

    collected_preds, collected_labels = [], []

    for batch in dataloader:
        batch = tuple(t.to(device) for t in batch)

        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        # No labels are passed here, so the first output holds the per-class scores.
        pred = outputs[0].to('cpu').numpy()
        true_label = b_labels.to('cpu').numpy()

        collected_preds.append(np.argmax(pred, axis=1).flatten())
        collected_labels.append(true_label.flatten())

    if not collected_preds:
        raise ValueError("dataloader yielded no batches; nothing to evaluate")

    pred_flat = np.concatenate(collected_preds)
    labels_flat = np.concatenate(collected_labels)

    eval_acc = accuracy_score(labels_flat, pred_flat)
    eval_mcc = matthews_corrcoef(labels_flat, pred_flat)

    if mode == 'val':
        print(F'\n\tValidation Accuracy: {eval_acc}')
        print(F'\n\tValidation MCC Accuracy: {eval_mcc}')
    else:
        print(F'\n\tTest Accuracy: {eval_acc}')
        print(F'\n\tTest MCC Accuracy: {eval_mcc}')

    return eval_acc, eval_mcc


model.zero_grad()

# Mean training loss of each epoch, kept for plotting.
train_loss_list = []

# Learning rate recorded at the end of each epoch, to inspect how it changes during training.
learning_rate = []

for epoch in notebook.tnrange(1, epochs+1, desc = 'Epoch'):
    start = time.time()
    print("<" + "="*22 + f"Epoch{epoch}, Batch{len(train_dataloader)}" + "="*22 + ">")

    all_loss = 0

    # Running counters for the training accuracy shown in the progress line.
    curSample = 0.0
    curRight = 0

    # testOrVal leaves the model in eval mode at the end of the previous epoch,
    # so switch back to training mode once per epoch.
    model.train()

    # Start training.
    for step, batch in enumerate(train_dataloader):

        # Move the batch onto the selected device.
        batch = tuple(t.to(device) for t in batch)

        b_input_ids, b_input_mask, b_labels = batch

        # With labels supplied, the first output is the loss and the second
        # holds the per-class scores, one row per example.
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels = b_labels)
        loss = outputs[0]
        label_prob = outputs[1]

        # Predicted class per example, compared against the gold labels on the CPU.
        label_prob = label_prob.to('cpu').detach().numpy()
        label_prob = np.argmax(label_prob, axis=1).flatten()
        b_labels = b_labels.to('cpu').detach().numpy().flatten()
        curSample += len(b_labels)
        curRight += (label_prob == b_labels).sum().item()

        loss.backward()

        optimizer.step()

        # The learning-rate schedule advances once per batch.
        scheduler.step()

        optimizer.zero_grad()

        all_loss += loss.item()

        if (step+1) % 50 == 0:
            print(f"step: {step+1} loss:{all_loss / (step+1)} time: {time.time() - start} cur acc:{curRight / curSample}")

    # Store the current learning rate.
    for param_group in optimizer.param_groups:
        print("\n\tCurrent Learning rate: ",param_group['lr'])
        learning_rate.append(param_group['lr'])

    train_loss_list.append(all_loss / len(train_dataloader))
    print(F'\n\tAverage Training loss: {train_loss_list[-1]}')

    # Evaluate on the validation split and then the test split after every epoch.
    testOrVal(val_dataloader, "val")
    testOrVal(test_dataloader, "test")

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

step: 50 loss:0.5660203444957733 time: 37.501582860946655 cur acc:0.825
step: 100 loss:0.5438286039233208 time: 73.90973949432373 cur acc:0.831875
