## 1. Load dataset

In [1]:
def relabel(g, reverse=False):
    """Convert a gender tag to its integer class id, or back again.

    Args:
        g: A tag in {'M', 'F', 'U'} when ``reverse`` is falsy, or a class id
            in {0, 1, 2} when ``reverse`` is truthy.
        reverse: When truthy, map class id -> tag instead of tag -> class id.

    Returns:
        The integer class id (forward direction) or the one-letter tag
        (reverse direction).

    Raises:
        KeyError: If ``g`` is not a key of the selected mapping.
    """
    tags = ('M', 'F', 'U')
    if reverse:
        lookup = dict(enumerate(tags))
    else:
        lookup = {tag: idx for idx, tag in enumerate(tags)}
    return lookup[g]


def data_loader(filepath, num_row_skip=1, exclude_u=False, encoding='utf-8'):
    """Read one or several tab-separated name/gender files.

    Each data line is split on tabs; the second-to-last field is taken as the
    name and the last field (stripped) as the gender tag, which is converted
    to an integer class id with ``relabel``.

    Args:
        filepath: A single path (``str``) or a ``list`` of paths.
        num_row_skip: Number of leading lines (e.g. a header) to skip in each
            file. Files shorter than this simply yield no rows.
        exclude_u: When truthy, rows whose raw tag is ``'U'`` are dropped.
        encoding: Text encoding used to open the files.

    Returns:
        For a ``str`` path: a list of ``[name, class_id]`` rows.
        For a ``list`` of paths: a list with one such row list per path.

    Raises:
        TypeError: If ``filepath`` is neither a ``str`` nor a ``list``.
        KeyError: If a kept row carries a tag ``relabel`` does not know.
        IndexError: If a non-blank line has fewer than two tab-separated fields.
    """
    def readFile(path):
        rows = []
        # Context manager guarantees the handle is closed even on errors.
        with open(path, 'r', encoding=encoding) as f:
            for _ in range(num_row_skip):
                # Tolerate files with fewer lines than num_row_skip.
                if next(f, None) is None:
                    break
            for line in f:
                if not line.strip():
                    # Blank lines (typically a trailing newline) carry no record.
                    continue
                fields = line.split('\t')
                name, tag = fields[-2], fields[-1].strip()
                # The filter must look at the raw tag: after relabel() the
                # value is an int and would never compare equal to 'U'.
                if exclude_u and tag == 'U':
                    continue
                rows.append([name, relabel(tag)])
        return rows

    if isinstance(filepath, str):
        return readFile(filepath)
    if isinstance(filepath, list):
        return [readFile(path) for path in filepath]
    raise TypeError('filepath must be either a str or a list.')

In [2]:
# Load the name/gender corpus with the default settings of data_loader
# (one skipped leading row, 'U' rows kept). The path is relative to the
# working directory of the environment the notebook was run in.
ccnc = data_loader('data/data96861/ccnc.txt')
# Preview: total number of rows and the first three [name, class_id] pairs.
len(ccnc), ccnc[:3]

(3658109, [['陈品如', 0], ['陈祥旭', 0], ['陈晓', 0]])

In [3]:
from random import seed, shuffle 


def train_dev_test_split(data, train=0.6, dev=0.2, test=0.2, seed_idx=5):
    """Shuffle ``data`` reproducibly and cut it into train/dev/test parts.

    The input sequence is left untouched: a shallow copy is shuffled with a
    private ``random.Random(seed_idx)`` generator, so neither the caller's
    list nor the global ``random`` state is modified. The permutation is the
    same one ``random.seed(seed_idx); random.shuffle(...)`` would produce.

    Args:
        data: Sequence of examples.
        train: Fraction of examples assigned to the training split.
        dev: Fraction of examples assigned to the development split.
        test: Kept for interface compatibility; it is not used in the
            computation. The test split is whatever remains after the
            train and dev splits.
        seed_idx: Seed for the shuffle.

    Returns:
        A tuple ``(train_part, dev_part, test_part)`` of lists.
    """
    # Local import: the notebook cell only imports `seed` and `shuffle`.
    from random import Random

    shuffled = list(data)
    Random(seed_idx).shuffle(shuffled)

    total = len(shuffled)
    train_end = round(total * train)
    dev_end = round(total * (train + dev))
    return shuffled[:train_end], shuffled[train_end:dev_end], shuffled[dev_end:]

In [4]:
# Split the corpus with the default fractions and seed of train_dev_test_split.
train_set, dev_set, test_set = train_dev_test_split(ccnc)
# Sanity check: sizes of the three splits.
len(train_set), len(dev_set), len(test_set)

(2194865, 731622, 731622)

## 2. Transform text

In [5]:
from paddlenlp.datasets import MapDataset
from paddle.io import BatchSampler, DataLoader
from paddlenlp.data import Pad, Stack, Tuple
from paddlenlp.data import Vocab


class TextVectorizer:
    """Turn raw text into lists of vocabulary indices.

    The instance wraps a tokenizer callable and, once ``build_vocab`` has
    been called, a token -> index mapping obtained from a ``Vocab`` object.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, tokenizer=None):
        """Store the tokenizer; no vocabulary exists until ``build_vocab``.

        Args:
            tokenizer: Callable mapping one text to an iterable of tokens.
        """
        # Callable applied to every text before index lookup.
        self.tokenize = tokenizer
        # Token -> index mapping; None until build_vocab() is called.
        self.vocab_to_idx = None
        # The underlying Vocab object; None until build_vocab() is called.
        self._V = None

    def build_vocab(self, text):
        """Tokenize every item of ``text`` and build the vocabulary from it.

        The vocabulary is created with '[UNK]' and '[PAD]' as its unknown and
        padding tokens.

        Args:
            text: Iterable of raw texts.
        """
        tokenized = [self.tokenize(item) for item in text]
        vocab = Vocab.build_vocab(tokenized, unk_token='[UNK]', pad_token='[PAD]')
        self._V = vocab
        self.vocab_to_idx = vocab.token_to_idx

    def text_encoder(self, text):
        """Encode one text, or each text of a list, as vocabulary indices.

        Args:
            text: A single text, or a ``list`` of texts (handled element by
                element through ``__call__``).

        Returns:
            A list of indices for a single text, or a list of such lists.
        """
        # NOTE(review): how tokens missing from the mapping are treated depends
        # on the mapping object returned by Vocab — confirm before relying on it.
        if not isinstance(text, list):
            indices = []
            for token in self.tokenize(text):
                indices.append(self.vocab_to_idx[token])
            return indices
        return [self(item) for item in text]

    def __len__(self):
        """Return the number of entries in the token -> index mapping."""
        return len(self.vocab_to_idx)

    def __getitem__(self, w):
        """Return the index stored for token ``w``."""
        return self.vocab_to_idx[w]

    def __call__(self, text):
        """Encode ``text``; see ``text_encoder``.

        Raises:
            ValueError: If the vocabulary is missing or empty.
        """
        if not self.vocab_to_idx:
            raise ValueError("No vocab is built!")
        return self.text_encoder(text)


def example_converter(example, text_encoder, include_seq_len):
    """Encode the text of one ``(text, label)`` example.

    Args:
        example: A two-item sequence ``(text, label)``.
        text_encoder: Callable that turns the text into its encoded form.
        include_seq_len: When truthy, also return the length of the encoding.

    Returns:
        ``(encoded, label)`` or, when ``include_seq_len`` is truthy,
        ``(encoded, len(encoded), label)``.
    """
    raw_text, target = example
    ids = text_encoder(raw_text)
    if not include_seq_len:
        return ids, target
    return ids, len(ids), target


def get_trans_fn(text_encoder, include_seq_len):
    """Build a one-argument transform around ``example_converter``.

    Args:
        text_encoder: Encoder forwarded to ``example_converter``.
        include_seq_len: Flag forwarded to ``example_converter``.

    Returns:
        A callable taking a single example and returning the converted example.
    """
    def trans_fn(ex):
        return example_converter(ex, text_encoder, include_seq_len)

    return trans_fn


def get_batchify_fn(include_seq_len):
    """Build the collate function that assembles a list of samples into a batch.

    The first field of every sample goes through ``Pad(axis=0, pad_val=0)``;
    each remaining field goes through ``Stack(dtype="int64")``. One stacked
    field is expected without sequence lengths, two with them.

    Args:
        include_seq_len: Whether samples carry a sequence-length field.

    Returns:
        A callable ``batchify_fn(samples)`` applying the combined ``Tuple``
        collator to ``samples``.
    """
    stacked_fields = 2 if include_seq_len else 1
    # The same Stack object is reused for every stacked field.
    stackers = [Stack(dtype="int64")] * stacked_fields
    collate = Tuple(Pad(axis=0, pad_val=0), *stackers)

    def batchify_fn(samples, fn=collate):
        return fn(samples)

    return batchify_fn


def create_dataloader(dataset,
                      trans_fn,
                      batchify_fn,
                      batch_size=128,
                      shuffle=True,
                      sampler=BatchSampler):
    """Wrap ``dataset`` in a ``DataLoader`` with a per-example transform.

    Args:
        dataset: A ``MapDataset`` or any object accepted by ``MapDataset(...)``.
        trans_fn: Per-example transform registered through ``dataset.map``.
        batchify_fn: Collate function handed to the ``DataLoader``.
        batch_size: Number of examples per batch.
        shuffle: Forwarded to the batch sampler.
        sampler: Batch-sampler class (or factory) called as
            ``sampler(dataset, shuffle=..., batch_size=...)``.

    Returns:
        The configured ``DataLoader``.
    """
    mapped = dataset if isinstance(dataset, MapDataset) else MapDataset(dataset)

    # The return value of map() is not used, so this relies on map()
    # registering the transform on `mapped` itself.
    mapped.map(trans_fn)

    return DataLoader(
        mapped,
        batch_sampler=sampler(mapped, shuffle=shuffle, batch_size=batch_size),
        collate_fn=batchify_fn,
    )

In [6]:
# Build the vocabulary from the training names only (first item of each row).
text = [t[0] for t in train_set]
# `list` as tokenizer splits a string into its individual characters.
V = TextVectorizer(list)
V.build_vocab(text)
print("Number of vocab (char):", len(V))

Number of vocab (char): 6255


In [7]:
# The CNN below consumes token ids only, so no sequence-length field is emitted.
include_seq_len = False
batch_size = 1024

trans_fn = get_trans_fn(V, include_seq_len=include_seq_len)
batchify_fn = get_batchify_fn(include_seq_len=include_seq_len)

# Only the training data is shuffled. The dev and test loaders iterate in a
# fixed order so that evaluation passes see the same batches every time and
# per-step evaluation logs are comparable between epochs.
train_loader = create_dataloader(train_set, trans_fn, batchify_fn, batch_size=batch_size)
dev_loader = create_dataloader(dev_set, trans_fn, batchify_fn, shuffle=False, batch_size=batch_size)
test_loader = create_dataloader(test_set, trans_fn, batchify_fn, shuffle=False, batch_size=batch_size)

## 3. Model building

In [8]:
import paddle 
import paddle.nn as nn
import paddle.nn.functional as F


class CNN(nn.Layer):
    """Convolutional text classifier over token-id sequences.

    Pipeline: embedding -> one Conv1D per filter size -> activation ->
    max-pool over the sequence axis -> concatenation -> dense -> activation
    -> output dense layer producing logits.

    Args:
        vocab_size: Number of rows of the embedding table.
        output_dim: Number of output logits.
        embedding_dim: Size of each embedding vector.
        padding_idx: Index passed to ``nn.Embedding`` as ``padding_idx``.
        num_filter: Output channels of every convolution.
        filter_sizes: Iterable of integer kernel sizes, one convolution each.
        hidden_dim: Width of the hidden dense layer.
        activation: Activation layer used after the convolutions and the
            hidden dense layer. ``None`` (default) creates a fresh
            ``nn.ReLU()`` for this instance, so instances never share a
            default layer object.

    Raises:
        ValueError: If ``filter_sizes`` is empty.
    """

    def __init__(self,
                 vocab_size,
                 output_dim,
                 embedding_dim=30,
                 padding_idx=0,
                 num_filter=256,
                 filter_sizes=(3,),
                 hidden_dim=15,
                 activation=None):

        super().__init__()

        filter_sizes = tuple(filter_sizes)
        if not filter_sizes:
            raise ValueError('filter_sizes must contain at least one kernel size.')
        # Shortest sequence length for which every convolution still has at
        # least one valid window (the convolutions use no padding).
        self._min_seq_len = max(filter_sizes)

        self.embedding = nn.Embedding(
            vocab_size, embedding_dim, padding_idx=padding_idx)

        self.convs = nn.LayerList([
            nn.Conv1D(
                in_channels=embedding_dim,
                out_channels=num_filter,
                kernel_size=fz
            ) for fz in filter_sizes
        ])
        self.dense = nn.Linear(len(filter_sizes) * num_filter, hidden_dim)
        self.activation = activation if activation is not None else nn.ReLU()
        self.dense_out = nn.Linear(hidden_dim, output_dim)

    def encoder(self, embd):
        """Encode embedded sequences into one fixed-size vector per example.

        Args:
            embd: Embedded input whose axes 1 and 2 are swapped before the
                convolutions, i.e. the convolutions run along original axis 1.

        Returns:
            Concatenation (axis 1) of the max-pooled feature maps of all
            convolutions.
        """
        embd = embd.transpose((0, 2, 1))

        # Inputs shorter than the largest kernel (e.g. a single two-character
        # name with kernel size 3) would leave a convolution without any
        # valid window. Right-pad such inputs with zeros along the sequence
        # axis. The `0 <=` guard skips unknown (negative) static dimensions.
        seq_len = embd.shape[2]
        if 0 <= seq_len < self._min_seq_len:
            embd = F.pad(embd,
                         [0, self._min_seq_len - seq_len],
                         mode='constant',
                         value=0.0,
                         data_format='NCL')

        conved = [self.activation(conv(embd)) for conv in self.convs]
        max_pooled = [F.adaptive_max_pool1d(conv, output_size=1).squeeze(2) for conv in conved]
        pooled_concat = paddle.concat(max_pooled, axis=1)
        return pooled_concat

    def forward(self, text_ids):
        """Compute class logits for a batch of token-id sequences.

        Args:
            text_ids: Token ids fed to the embedding layer.

        Returns:
            The output of the final dense layer (unnormalized logits).
        """
        text_embd = self.embedding(text_ids)
        encoded = self.encoder(text_embd)
        hidden_out = self.activation(self.dense(encoded))
        out_logits = self.dense_out(hidden_out)
        return out_logits

In [9]:
from paddlenlp.transformers import LinearDecayWithWarmup

# Training hyper-parameters. `weight_decay` and `lr_scheduler` are read as
# module-level globals inside get_model(), so this cell must run before it.
epoch = 20
weight_decay = 0.0
warmup_proportion = 0.0
# Arguments: 5e-3 as the learning rate, (batches per epoch * epochs) as the
# total number of steps, and the warmup setting defined above.
lr_scheduler = LinearDecayWithWarmup(5e-3, len(train_loader) * epoch, warmup_proportion)

def get_model(model):
    """Wrap a network in ``paddle.Model`` prepared for training.

    Reads the module-level ``lr_scheduler`` and ``weight_decay`` globals.
    Weight decay is applied only to parameters whose structured name contains
    neither "bias" nor "norm".

    Args:
        model: The network to train.

    Returns:
        A ``paddle.Model`` prepared with an AdamW optimizer, a cross-entropy
        loss and an accuracy metric.
    """
    excluded_markers = ("bias", "norm")
    decay_params = []
    for param_path, param in model.named_parameters():
        if any(marker in param_path for marker in excluded_markers):
            continue
        decay_params.append(param.name)

    def should_decay(name):
        return name in decay_params

    optimizer = paddle.optimizer.AdamW(
        parameters=model.parameters(),
        learning_rate=lr_scheduler,
        weight_decay=weight_decay,
        apply_decay_param_fun=should_decay)

    criterion = paddle.nn.CrossEntropyLoss()

    wrapped = paddle.Model(model)
    accuracy = paddle.metric.Accuracy()
    wrapped.prepare(optimizer, criterion, accuracy)
    return wrapped

In [10]:
# Embedding table sized to the vocabulary; 3 output logits, one per class id
# produced by relabel() (M, F, U).
model = CNN(len(V), 3)
# NOTE: `model` is rebound here from the raw network to the prepared
# paddle.Model wrapper returned by get_model().
model = get_model(model)

W0328 03:31:11.381294 12201 device_context.cc:447] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 10.1, Runtime API Version: 10.1
W0328 03:31:11.385860 12201 device_context.cc:465] device: 0, cuDNN Version: 7.6.


## 4. Model training

In [11]:
from paddle.callbacks import EarlyStopping

# Early-stopping callback with a patience of 5; all other options are left at
# the library defaults.
earlystop = EarlyStopping(patience=5)

# Train on train_loader for at most `epoch` epochs, evaluating on dev_loader,
# with a progress line every 1000 steps.
model.fit(train_loader, dev_loader, epochs=epoch, verbose=2, log_freq=1000, callbacks=[earlystop])

The loss value printed in the log is the current step, and the metric is the average value of previous steps.
Epoch 1/20
step 1000/2144 - loss: 0.0964 - acc: 0.9493 - 16ms/step
step 2000/2144 - loss: 0.0662 - acc: 0.9573 - 16ms/step
step 2144/2144 - loss: 0.0611 - acc: 0.9579 - 16ms/step
Eval begin...
step 715/715 - loss: 0.0663 - acc: 0.9682 - 14ms/step
Eval samples: 731622
Epoch 2/20
step 1000/2144 - loss: 0.0667 - acc: 0.9700 - 16ms/step
step 2000/2144 - loss: 0.0511 - acc: 0.9705 - 16ms/step
step 2144/2144 - loss: 0.0870 - acc: 0.9706 - 16ms/step
Eval begin...
step 715/715 - loss: 0.0591 - acc: 0.9711 - 14ms/step
Eval samples: 731622
Epoch 3/20
step 1000/2144 - loss: 0.0658 - acc: 0.9732 - 16ms/step
step 2000/2144 - loss: 0.0503 - acc: 0.9734 - 16ms/step
step 2144/2144 - loss: 0.0597 - acc: 0.9735 - 16ms/step
Eval begin...
step 715/715 - loss: 0.0405 - acc: 0.9728 - 14ms/step
Eval samples: 731622
Epoch 4/20
step 1000/2144 - loss: 0.0535 - acc: 0.9751 - 16ms/step
step 2000/2144 - lo

In [12]:
# Persist the trained model under the path prefix "ckpt".
model.save("ckpt")

## 5. Evaluation

In [13]:
# Final evaluation on the held-out test split; the cell output is the dict of
# loss and accuracy returned by evaluate().
model.evaluate(test_loader)

Eval begin...
step  10/715 - loss: 0.0527 - acc: 0.9732 - 18ms/step
step  20/715 - loss: 0.0581 - acc: 0.9744 - 16ms/step
step  30/715 - loss: 0.0754 - acc: 0.9741 - 99ms/step
step  40/715 - loss: 0.0559 - acc: 0.9744 - 78ms/step
step  50/715 - loss: 0.0671 - acc: 0.9748 - 65ms/step
step  60/715 - loss: 0.0637 - acc: 0.9745 - 57ms/step
step  70/715 - loss: 0.0542 - acc: 0.9747 - 50ms/step
step  80/715 - loss: 0.0755 - acc: 0.9747 - 46ms/step
step  90/715 - loss: 0.0434 - acc: 0.9747 - 42ms/step
step 100/715 - loss: 0.0469 - acc: 0.9748 - 39ms/step
step 110/715 - loss: 0.0572 - acc: 0.9748 - 37ms/step
step 120/715 - loss: 0.0514 - acc: 0.9747 - 35ms/step
step 130/715 - loss: 0.0958 - acc: 0.9747 - 34ms/step
step 140/715 - loss: 0.0526 - acc: 0.9749 - 32ms/step
step 150/715 - loss: 0.0452 - acc: 0.9751 - 31ms/step
step 160/715 - loss: 0.0359 - acc: 0.9750 - 30ms/step
step 170/715 - loss: 0.0379 - acc: 0.9750 - 29ms/step
step 180/715 - loss: 0.0759 - acc: 0.9749 - 28ms/step
step 190/715 -

{'loss': [0.045631748], 'acc': 0.9749488123648551}