In [2]:
import pandas as pd
import numpy as np
import codecs
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from transformers import BertForNextSentencePrediction, AdamW, get_linear_schedule_with_warmup

import torch
from torch.utils.data import Dataset, DataLoader

In [3]:
train_df = pd.read_csv('./data/train.csv')
train_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
train_df = train_df[train_df['question2'].apply(lambda x: isinstance(x, str))]
train_df = train_df[train_df['question1'].apply(lambda x: isinstance(x, str))]

In [5]:
len(train_df)

404287

In [6]:
q1_train, q1_val, q2_train, q2_val, train_label, test_label = train_test_split(
    train_df['question1'].iloc[:],
    train_df['question2'].iloc[:],
    train_df['is_duplicate'].iloc[:],
    test_size = 0.2,
    stratify = train_df['is_duplicate'].iloc[:]
)

In [7]:
train_label

382257    0
41064     0
77724     0
216289    1
85568     0
         ..
52298     1
282039    1
326558    0
102142    1
229865    0
Name: is_duplicate, Length: 323429, dtype: int64

In [8]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [9]:
tokenizer(list(q1_val)[0])

{'input_ids': [101, 2054, 2003, 1996, 2190, 6040, 2017, 2031, 2445, 2006, 22035, 2527, 1029, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [10]:
tokenizer(list(q1_val)[0], list(q2_val)[0])

{'input_ids': [101, 2054, 2003, 1996, 2190, 6040, 2017, 2031, 2445, 2006, 22035, 2527, 1029, 102, 2054, 2003, 1996, 2190, 6040, 2017, 2288, 2006, 22035, 2527, 1029, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [11]:
tokenizer(list(q1_val)[0], list(q2_val)[0], truncation = True, padding = True, max_length = 100)

{'input_ids': [101, 2054, 2003, 1996, 2190, 6040, 2017, 2031, 2445, 2006, 22035, 2527, 1029, 102, 2054, 2003, 1996, 2190, 6040, 2017, 2288, 2006, 22035, 2527, 1029, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [12]:
tokenizer(list(q1_val)[:2], list(q2_val)[:2], truncation = True, padding = True, max_length = 100)

{'input_ids': [[101, 2054, 2003, 1996, 2190, 6040, 2017, 2031, 2445, 2006, 22035, 2527, 1029, 102, 2054, 2003, 1996, 2190, 6040, 2017, 2288, 2006, 22035, 2527, 1029, 102], [101, 2054, 2515, 22157, 19057, 24345, 1999, 1029, 102, 2003, 22157, 19057, 24345, 1999, 6544, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}

In [13]:
train_encoding = tokenizer(list(q1_train)[:50], list(q2_train)[:50], truncation = True, padding = True, max_length = 100)
test_encoding = tokenizer(list(q1_val)[:5], list(q2_val)[:5], truncation = True, padding = True, max_length = 100)
train_label = train_label[:50]
test_label = test_label[:5]

In [14]:
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForNextSentencePrediction: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForNextSentencePrediction(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [15]:
class QuoraDataset(Dataset):
    
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(int(self.labels[idx]))
        return item
    
    def __len__(self):
        return len(self.labels)

In [16]:
train_dataset = QuoraDataset(train_encoding, list(train_label))
test_dataset = QuoraDataset(test_encoding, list(test_label))

train_dataloader = DataLoader(train_dataset, batch_size = 64, shuffle = True)
test_dataloader = DataLoader(test_dataset, batch_size = 64, shuffle = True)

In [17]:
len(train_dataset), len(test_dataset)

(50, 5)

In [18]:
train_dataset[0]

{'input_ids': tensor([  101,  2064,  2017, 12200,  1996, 20710,  9766, 14255,  1005,  1055,
         14246,  2226,  1029,   102,  2071,  1996,  2355,  2602,  2191,  3822,
          4243,  2062,  2759,  1998,  3928,  1029,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0

In [19]:
lr = 1e-3
batch_size = 16
epoches = 2#10
len_dataset = len(train_dataset)

# 每个epoch有多少个step
total_steps =  (len_dataset // batch_size) * epoches if len_dataset % batch_size == 0 else (len_dataset // batch_size + 1) * epoches

# 要预热的steps
warm_up_ratio = 0.1

optimizer = AdamW(model.parameters(), lr = lr, correct_bias = False)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = warm_up_ratio * total_steps, num_training_steps = total_steps)



In [20]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis = 1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [21]:
def train(optimizer, scheduler, epoch, model):
    model.train()
    total_train_loss = 0
    iter_num = 0
    total_iter = len(train_dataloader)

    for batch in train_dataloader:
        optimizer.zero_grad()
        
        print(f'batch: {batch}')
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask = attention_mask, labels = labels)
        print(f'model output: {outputs}')
        
        loss = outputs[0]
        total_train_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        iter_num += 1
        if iter_num % 100 == 0:
            print(f'epoch: {epoch}, iter_num: {iter_num}, loss: {loss:.4f}, {iter_num / total_iter * 100:.2f}%')
    print(f'Epoch: {epoch}. Average train loss: {total_train_loss / total_iter:.4f}')

In [22]:
def validation(model):
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    
    for batch in test_dataloader:
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask = attention_mask, labels = labels)
        
        loss = outputs[0]
        logits = outputs[1]
        
        total_eval_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = labels.cpu().numpy()
        total_eval_accuracy += flat_accuracy(logits, label_ids)
        
    avg_val_accuracy = total_eval_accuracy / len(test_dataloader)
    print(f'Accuracy: {avg_val_accuracy:.4f}')
    print(f'Average testting loss: {total_eval_loss / len(test_dataloader):.4f}')
    print('-----------------------------------------------------')

In [23]:
for epoch in range(epoches):
    print(f'-------------------- Epoch: {epoch} --------------------')
    train(optimizer, scheduler, epoch, model)
    validation(model)

-------------------- Epoch: 0 --------------------
batch: {'input_ids': tensor([[ 101, 2129, 4621,  ...,    0,    0,    0],
        [ 101, 2064, 1037,  ...,    0,    0,    0],
        [ 101, 2129, 3376,  ...,    0,    0,    0],
        ...,
        [ 101, 2129, 2106,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
        0, 0, 0, 1, 0, 0, 1, 1, 0, 0,