In [1]:
# Extract the Quora Question Pairs archive into /home/aistudio/data.
!unzip /home/aistudio/data/data135072/quora-question-pairs.zip -d /home/aistudio/data

Archive:  /home/aistudio/data/data135072/quora-question-pairs.zip
   creating: /home/aistudio/data/quora-question-pairs/
  inflating: /home/aistudio/data/__MACOSX/._quora-question-pairs  
  inflating: /home/aistudio/data/quora-question-pairs/.DS_Store  
  inflating: /home/aistudio/data/__MACOSX/quora-question-pairs/._.DS_Store  
  inflating: /home/aistudio/data/quora-question-pairs/test.csv  
  inflating: /home/aistudio/data/__MACOSX/quora-question-pairs/._test.csv  
  inflating: /home/aistudio/data/quora-question-pairs/train.csv  
  inflating: /home/aistudio/data/__MACOSX/quora-question-pairs/._train.csv  


## 1. Load dataset

In [2]:
import pandas as pd


def load_dataset(ds_dir='./data/quora-question-pairs/'):
    """Load the train and test splits of the Quora Question Pairs data.

    Args:
        ds_dir: Directory containing ``train.csv`` and ``test.csv``. A
            trailing path separator is optional.

    Returns:
        A ``(train, test)`` tuple. ``train`` is a list of
        ``[question1, question2, is_duplicate]`` rows and ``test`` is a list
        of ``[question1, question2]`` rows. Missing values are replaced by
        empty strings.
    """
    import os  # local import so this cell does not depend on another cell

    def read(fp, is_test=False):
        """Read one CSV file and return the selected columns as list rows."""
        columns = ['question1', 'question2']
        if not is_test:
            columns.append('is_duplicate')
        # Empty cells are parsed as NaN by pandas; replace them with ''.
        df = pd.read_csv(fp).fillna('')[columns]
        return df.values.tolist()

    # os.path.join works whether or not ds_dir ends with a separator.
    train = read(os.path.join(ds_dir, 'train.csv'))
    test = read(os.path.join(ds_dir, 'test.csv'), is_test=True)
    return train, test


In [3]:
# Load both splits once and print their sizes plus the first two rows of each.
train_set, test_set = load_dataset()
print("Train set size:", len(train_set))
print("Train set examples:", train_set[:2])

# The leading "\n" puts a blank line between the train and test summaries.
print("\nTest set size:", len(test_set))
print("Test set examples:", test_set[:2])

Train set size: 404290
Train set examples: [['What is the step by step guide to invest in share market in india?', 'What is the step by step guide to invest in share market?', 0], ['What is the story of Kohinoor (Koh-i-Noor) Diamond?', 'What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?', 0]]

Test set size: 2345796
nTest set examples: [['How does the Surface Pro himself 4 compare with iPad Pro?', 'Why did Microsoft choose core m3 and not core i3 home Surface Pro 4?'], ['Should I have a hair transplant at age 24? How much would it cost?', 'How much cost does hair transplant require?']]


## 2. Transform text

In [4]:
from paddlenlp.datasets import MapDataset
from paddle.io import BatchSampler, DataLoader
from paddlenlp.data import Pad, Stack, Tuple
from paddlenlp.transformers import BertModel as SeqClfModel
from paddlenlp.transformers import BertTokenizer as PTMTokenizer


# Name of the pretrained checkpoint; also used later to load the model weights.
MODEL_NAME = "bert-base-uncased"
# Tokenizer for MODEL_NAME, shared by the conversion and batching helpers below.
tokenizer = PTMTokenizer.from_pretrained(MODEL_NAME)


def example_converter(example, max_seq_length, tokenizer):
    """Tokenize one ``(text_a, text_b, label)`` example as a sentence pair.

    Args:
        example: Sequence of exactly three items: first text, second text, label.
        max_seq_length: Forwarded to the tokenizer as ``max_seq_len``.
        tokenizer: Callable accepting ``text``, ``text_pair`` and ``max_seq_len``
            keywords and returning a mapping with ``"input_ids"`` and
            ``"token_type_ids"`` entries.

    Returns:
        Tuple ``(input_ids, token_type_ids, label)``; the label is passed
        through unchanged.
    """
    first_text, second_text, target = example
    features = tokenizer(text=first_text,
                         text_pair=second_text,
                         max_seq_len=max_seq_length)
    return features["input_ids"], features["token_type_ids"], target


def get_trans_fn(max_seq_length=128, tokenizer=tokenizer):
    """Build a single-argument transform around ``example_converter``.

    Args:
        max_seq_length: Sequence length limit bound into the transform.
        tokenizer: Tokenizer bound into the transform; defaults to the
            module-level ``tokenizer``.

    Returns:
        A callable taking one example and returning whatever
        ``example_converter`` returns for it.
    """
    def trans_fn(ex):
        return example_converter(ex, max_seq_length, tokenizer)

    return trans_fn


def batchify_fn(samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
        Stack(dtype="int64"))):
    """Collate a list of converted samples into one batch.

    The default ``fn`` is built once, at definition time. It applies one
    collator per sample field: ``Pad`` with the tokenizer's pad token id,
    ``Pad`` with the tokenizer's pad token type id, and ``Stack`` as int64.
    This matches the ``(input_ids, token_type_ids, label)`` order returned by
    ``example_converter``.
    """
    return fn(samples)


def create_dataloader(dataset,
                      trans_fn,
                      batchify_fn,
                      test=False,
                      batch_size=128,
                      shuffle=True,
                      sampler=BatchSampler):
    """Wrap raw examples in a ``DataLoader``.

    Args:
        dataset: List of examples, or an existing ``MapDataset``.
        trans_fn: Per-example transform registered through ``MapDataset.map``.
        batchify_fn: Collate function handed to the ``DataLoader``.
        test: When true, a dummy label ``0`` is appended to every example so
            test rows have the same three-field layout as training rows.
        batch_size: Number of examples per batch.
        shuffle: Whether the batch sampler shuffles the examples.
        sampler: Batch sampler class, called as
            ``sampler(dataset, shuffle=..., batch_size=...)``.

    Returns:
        The constructed ``DataLoader``.
    """
    if test:
        dataset = [sample + [0] for sample in dataset]

    mapped = dataset if isinstance(dataset, MapDataset) else MapDataset(dataset)
    # The return value of map() is ignored, so the transform is expected to be
    # registered on the dataset object itself.
    mapped.map(trans_fn)

    return DataLoader(
        mapped,
        batch_sampler=sampler(mapped, shuffle=shuffle, batch_size=batch_size),
        collate_fn=batchify_fn,
    )

[2022-03-27 03:58:42,537] [    INFO] - Downloading https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-uncased-vocab.txt and saved to /home/aistudio/.paddlenlp/models/bert-base-uncased
[2022-03-27 03:58:42,540] [    INFO] - Downloading bert-base-uncased-vocab.txt from https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-uncased-vocab.txt
100%|██████████| 227/227 [00:00<00:00, 4621.41it/s]


In [5]:
max_seq_length = 64
batch_size = 256
trans_fn = get_trans_fn(max_seq_length)

train_loader = create_dataloader(
    train_set, trans_fn, batchify_fn, batch_size=batch_size)
# dev_loader = create_dataloader(dev_set, trans_fn, batchify_fn, batch_size=batch_size)
# The test loader is not shuffled, so its batches follow the order of test_set.
test_loader = create_dataloader(
    test_set, trans_fn, batchify_fn,
    test=True, shuffle=False, batch_size=batch_size)

## 3. Model building

In [6]:
from paddle import nn
import paddle


class PTM(nn.Layer):
    """Pretrained encoder followed by a two-layer classification head.

    The head is: dropout -> Linear(hidden, hidden // 2) -> ReLU ->
    Linear(hidden // 2, num_class).
    """

    def __init__(self, pretrained_model, dropout=0.1, num_class=2):
        """Attach the classification head to ``pretrained_model``.

        Args:
            pretrained_model: Backbone layer. It must expose
                ``config["hidden_size"]`` and return two outputs when called
                with ``(input_ids, token_type_ids)``.
            dropout: Dropout probability applied to the encoder output.
            num_class: Number of output logits.
        """
        super().__init__()

        self.ptm = pretrained_model
        hidden_size = self.ptm.config["hidden_size"]
        bottleneck = hidden_size // 2
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(hidden_size, bottleneck)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(bottleneck, num_class)

    def encoder(self, input_ids, token_type_ids):
        """Return the backbone's second output after dropout.

        NOTE(review): for a BERT backbone the second output is presumably the
        pooled sentence representation; confirm against the backbone's docs.
        """
        _, embedding = self.ptm(input_ids, token_type_ids)
        return self.dropout(embedding)

    def forward(self, input_ids, token_type_ids):
        """Compute classification logits for a batch of encoded pairs."""
        features = self.encoder(input_ids, token_type_ids)
        return self.fc2(self.relu(self.fc1(features)))

In [7]:
from paddlenlp.transformers import LinearDecayWithWarmup

# Training hyperparameters.
epoch = 3
weight_decay = 0.01
warmup_proportion = 0.0

# Total number of training batches across all epochs.
total_steps = len(train_loader) * epoch
lr_scheduler = LinearDecayWithWarmup(4e-5, total_steps, warmup_proportion)

def get_model(model):
    """Wrap a network in ``paddle.Model`` ready for training.

    Builds an AdamW optimizer, a cross-entropy loss and an accuracy metric,
    then prepares the wrapper with them. Weight decay is applied only to
    parameters whose structured name contains neither "bias" nor "norm".

    Reads the module-level ``lr_scheduler`` and ``weight_decay``.

    Args:
        model: The network (``nn.Layer``) to train.

    Returns:
        A prepared ``paddle.Model`` wrapping ``model``.
    """
    # A frozenset gives constant-time lookups in the per-parameter callback
    # below, instead of scanning a list on every call.
    decay_params = frozenset(
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ("bias", "norm"))
    )
    optimizer = paddle.optimizer.AdamW(
        parameters=model.parameters(),
        learning_rate=lr_scheduler,
        weight_decay=weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    criterion = paddle.nn.CrossEntropyLoss()
    metric = paddle.metric.Accuracy()

    # Use a separate name so the `model` argument is not rebound to a
    # different kind of object.
    wrapped = paddle.Model(model)
    wrapped.prepare(optimizer, criterion, metric)
    return wrapped

In [8]:
# Pretrained backbone -> PTM classification head -> prepared paddle.Model wrapper.
backbone = SeqClfModel.from_pretrained(MODEL_NAME)
model = get_model(PTM(backbone))

[2022-03-27 03:58:48,025] [    INFO] - Downloading https://paddlenlp.bj.bcebos.com/models/transformers/bert-base-uncased.pdparams and saved to /home/aistudio/.paddlenlp/models/bert-base-uncased
[2022-03-27 03:58:48,028] [    INFO] - Downloading bert-base-uncased.pdparams from https://paddlenlp.bj.bcebos.com/models/transformers/bert-base-uncased.pdparams
100%|██████████| 793257/793257 [00:19<00:00, 41075.00it/s]
W0327 03:59:07.499900   143 device_context.cc:447] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 10.1, Runtime API Version: 10.1
W0327 03:59:07.504976   143 device_context.cc:465] device: 0, cuDNN Version: 7.6.
[2022-03-27 03:59:18,077] [    INFO] - Weights from pretrained model not used in BertModel: ['cls.predictions.decoder_weight', 'cls.predictions.decoder_bias', 'cls.predictions.transform.weight', 'cls.predictions.transform.bias', 'cls.predictions.layer_norm.weight', 'cls.predictions.layer_norm.bias', 'cls.seq_relationship.weight', 'cls.seq_relati

## 4. Model training

In [9]:
# Train on train_loader for `epoch` epochs, logging every 100 steps.
model.fit(train_loader, epochs=epoch, verbose=2, log_freq=100)

The loss value printed in the log is the current step, and the metric is the average value of previous steps.
Epoch 1/3
step  100/1580 - loss: 0.3890 - acc: 0.7434 - 837ms/step
step  200/1580 - loss: 0.3007 - acc: 0.7871 - 840ms/step
step  300/1580 - loss: 0.3287 - acc: 0.8088 - 841ms/step
step  400/1580 - loss: 0.3036 - acc: 0.8216 - 841ms/step
step  500/1580 - loss: 0.2715 - acc: 0.8307 - 844ms/step
step  600/1580 - loss: 0.3190 - acc: 0.8373 - 845ms/step
step  700/1580 - loss: 0.3348 - acc: 0.8424 - 846ms/step
step  800/1580 - loss: 0.2716 - acc: 0.8464 - 847ms/step
step  900/1580 - loss: 0.2783 - acc: 0.8500 - 847ms/step
step 1000/1580 - loss: 0.2876 - acc: 0.8531 - 848ms/step
step 1100/1580 - loss: 0.2597 - acc: 0.8559 - 848ms/step
step 1200/1580 - loss: 0.2745 - acc: 0.8583 - 847ms/step
step 1300/1580 - loss: 0.2550 - acc: 0.8607 - 847ms/step
step 1400/1580 - loss: 0.2401 - acc: 0.8624 - 847ms/step
step 1500/1580 - loss: 0.2375 - acc: 0.8643 - 848ms/step
step 1580/1580 - loss: 0.

## 5. Prediction

In [10]:

import paddle.nn.functional as F
from tqdm import tqdm

# Run inference over the whole test set; logits[0] is iterated batch by batch.
logits = model.predict(test_loader)

predictions = []
for batch_logits in tqdm(logits[0]):
    class_probs = F.softmax(paddle.to_tensor(batch_logits), axis=1)
    # Index of the highest-probability class for each row of the batch.
    predictions += paddle.argmax(class_probs, axis=1).numpy().tolist()

Predict begin...
Predict samples: 2345796


100%|██████████| 9164/9164 [00:01<00:00, 7426.49it/s]


In [11]:
# test_id is the position of each prediction in `predictions`.
submission = pd.DataFrame({
    'test_id': range(len(predictions)),
    'is_duplicate': predictions,
})
submission.to_csv('results.csv', index=False)