## 1. Load dataset

In [1]:
def relabel(g, reverse=False):
    """Convert between gender tags and their integer class ids.

    Args:
        g: A tag ('M', 'F' or 'U') when ``reverse`` is False, otherwise an
            integer id (0, 1 or 2).
        reverse: When True, map an id back to its tag instead of tag -> id.

    Returns:
        The integer id for a tag, or the tag for an id.

    Raises:
        KeyError: if ``g`` is not a key of the selected mapping.
    """
    tag_to_id = {'M': 0, 'F': 1, 'U': 2}
    # The reverse table is simply the inverse of the forward one.
    table = {idx: tag for tag, idx in tag_to_id.items()} if reverse else tag_to_id
    return table[g]


def data_loader(filepath, num_row_skip=1):
    """Load ``[text, label]`` pairs from one or several tab-separated files.

    Every data line is split on tabs; the second-to-last field is kept as the
    text and the last field (stripped of surrounding whitespace) is converted
    to an integer label with ``relabel``.

    Args:
        filepath: Path of a single file (str) or a list of such paths.
        num_row_skip: Number of leading lines (e.g. a header row) to discard.

    Returns:
        For a str path, a list of ``[text, label]`` pairs. For a list of
        paths, a list holding one such list per path, in the given order.

    Raises:
        TypeError: if ``filepath`` is neither a str nor a list.
        KeyError: if a label field is not a tag known to ``relabel``.
    """
    def readFile(path):
        # `with` closes the handle even if parsing fails part-way through.
        # NOTE(review): UTF-8 is pinned so decoding no longer depends on the
        # platform default; confirm the data files are UTF-8 encoded.
        with open(path, 'r', encoding='utf-8') as f:
            for _ in range(num_row_skip):
                # A default avoids StopIteration on files shorter than the header.
                next(f, None)
            out = []
            for line in f:
                # Blank lines (typically a trailing newline) carry no record.
                if not line.strip():
                    continue
                fields = line.split('\t')
                out.append([fields[-2], relabel(fields[-1].strip())])
        return out

    if isinstance(filepath, str):
        return readFile(filepath)
    elif isinstance(filepath, list):
        return [readFile(path) for path in filepath]
    else:
        raise TypeError('filepath must be either a str or a list.')

In [2]:
# Read the [text, label] pairs from the tab-separated file; the first line is
# skipped because data_loader defaults to num_row_skip=1.
ccnc = data_loader('data/data96861/ccnc.txt')
# Display the record count together with the first three records.
len(ccnc), ccnc[:3]

(3658109, [['陈品如', 0], ['陈祥旭', 0], ['陈晓', 0]])

In [3]:
from random import seed, shuffle 


def train_dev_test_split(data, train=0.6, dev=0.2, test=0.2, seed_idx=5):
    """Shuffle ``data`` reproducibly and cut it into train/dev/test parts.

    The input sequence is left untouched: a shallow copy is shuffled, so the
    caller's list keeps its order and calling the function again with the
    same arguments yields the same split.

    Args:
        data: Sequence of examples.
        train: Fraction of examples placed in the training part.
        dev: Fraction of examples placed in the development part.
        test: Kept for interface symmetry only; it is not used. The test part
            is whatever remains after the train and dev parts.
        seed_idx: Seed passed to ``random.seed`` before shuffling. Note that
            this reseeds the module-level ``random`` generator.

    Returns:
        A tuple ``(train_part, dev_part, test_part)`` of lists.
    """
    seed(seed_idx)
    # Shuffle a copy so the caller's sequence is not reordered as a side effect.
    shuffled = list(data)
    shuffle(shuffled)
    length = len(shuffled)
    boundary1 = round(length * train)
    boundary2 = round(length * (train + dev))
    return shuffled[:boundary1], shuffled[boundary1:boundary2], shuffled[boundary2:]

In [4]:
# Split with the function defaults (train=0.6, dev=0.2, seed_idx=5).
train_set, dev_set, test_set = train_dev_test_split(ccnc)
# Display the size of each part.
len(train_set), len(dev_set), len(test_set)

(2194865, 731622, 731622)

## 2. Transform text

In [5]:
from paddlenlp.datasets import MapDataset
from paddle.io import BatchSampler, DataLoader
from paddlenlp.data import Pad, Stack, Tuple
from paddlenlp.data import Vocab


class TextVectorizer:
    """Tokenize text and map the tokens to integer ids via a built vocabulary.

    Typical use: construct with a tokenizer, call ``build_vocab`` on the
    training texts, then call the instance on a string (or a list of strings)
    to obtain token ids.

    Args:
        tokenizer: Callable turning one text into a sequence of tokens. When
            omitted, ``list`` is used, i.e. the text is split into characters.
    """

    UNK_TOKEN = '[UNK]'
    PAD_TOKEN = '[PAD]'

    def __init__(self, tokenizer=None):
        # A None tokenizer used to fail on first use (None is not callable);
        # fall back to character-level tokenization instead.
        self.tokenize = list if tokenizer is None else tokenizer
        self.vocab_to_idx = None
        self._V = None

    def build_vocab(self, text):
        """Build the vocabulary from an iterable of texts.

        Each text is tokenized and the token lists are handed to
        ``Vocab.build_vocab`` together with the unknown and padding tokens.
        """
        tokens = list(map(self.tokenize, text))
        self._V = Vocab.build_vocab(
            tokens, unk_token=self.UNK_TOKEN, pad_token=self.PAD_TOKEN)
        self.vocab_to_idx = self._V.token_to_idx

    def text_encoder(self, text):
        """Encode one text into a list of ids, or a list of texts into a list of lists.

        Tokens missing from the vocabulary are mapped to the id of ``[UNK]``
        when the vocabulary contains that token.

        Raises:
            KeyError: for an unknown token when the vocabulary has no ``[UNK]``.
        """
        if isinstance(text, list):
            return [self(t) for t in text]

        vocab = self.vocab_to_idx
        unk_idx = vocab[self.UNK_TOKEN] if self.UNK_TOKEN in vocab else None
        out = []
        for tk in self.tokenize(text):
            # Membership test instead of blind indexing: if the mapping is a
            # defaultdict, indexing a missing key would insert it and inflate
            # len(self). NOTE(review): whether Vocab.token_to_idx is a
            # defaultdict is not visible here - confirm against PaddleNLP.
            if tk in vocab:
                out.append(vocab[tk])
            elif unk_idx is not None:
                out.append(unk_idx)
            else:
                raise KeyError(tk)
        return out

    def __len__(self):
        """Number of entries in the token -> id mapping."""
        return len(self.vocab_to_idx)

    def __getitem__(self, w):
        """Return the id stored for token ``w``."""
        return self.vocab_to_idx[w]

    def __call__(self, text):
        """Encode ``text``; raises ValueError when no vocabulary is available."""
        if self.vocab_to_idx:
            return self.text_encoder(text)
        raise ValueError("No vocab is built!")


def example_converter(example, text_encoder, include_seq_len):
    """Turn one ``(text, label)`` example into model-ready fields.

    Args:
        example: A two-item sequence ``(text, label)``.
        text_encoder: Callable converting the text into a sequence of ids.
        include_seq_len: Whether to also return the encoded length.

    Returns:
        ``(ids, length, label)`` when ``include_seq_len`` is truthy,
        otherwise ``(ids, label)``.
    """
    raw_text, target = example
    token_ids = text_encoder(raw_text)
    if not include_seq_len:
        return token_ids, target
    return token_ids, len(token_ids), target


def get_trans_fn(text_encoder, include_seq_len):
    """Return a one-argument function that applies ``example_converter``.

    The returned callable takes a single example and forwards it together
    with the bound ``text_encoder`` and ``include_seq_len``.
    """
    def trans_fn(ex):
        return example_converter(ex, text_encoder, include_seq_len)

    return trans_fn


def get_batchify_fn(include_seq_len):
    """Build the collate function that turns a list of samples into a batch.

    The first field of every sample goes through ``Pad(axis=0, pad_val=0)``;
    each remaining field goes through an int64 ``Stack``. There are two
    stacked fields when ``include_seq_len`` is truthy and one otherwise,
    matching the tuples produced by ``example_converter``.

    Returns:
        A callable ``batchify_fn(samples, fn=...)`` that returns ``fn(samples)``.
    """
    int_stack = Stack(dtype="int64")
    # The same Stack instance is reused for every stacked field.
    if include_seq_len:
        field_stackers = [int_stack, int_stack]
    else:
        field_stackers = [int_stack]

    collate = Tuple(Pad(axis=0, pad_val=0), *field_stackers)

    def batchify_fn(samples, fn=collate):
        return fn(samples)

    return batchify_fn


def create_dataloader(dataset,
                      trans_fn,
                      batchify_fn,
                      batch_size=128,
                      shuffle=True,
                      sampler=BatchSampler):
    """Wrap ``dataset`` into a ``DataLoader`` with transformation and batching.

    Args:
        dataset: A ``MapDataset`` or any object accepted by ``MapDataset(...)``.
        trans_fn: Per-example transform handed to ``MapDataset.map``.
        batchify_fn: Collate function passed to the loader as ``collate_fn``.
        batch_size: Number of examples per batch.
        shuffle: Forwarded to the sampler.
        sampler: Batch-sampler class (or factory) called as
            ``sampler(dataset, shuffle=..., batch_size=...)``.

    Returns:
        The constructed ``DataLoader``.
    """
    mapped = dataset if isinstance(dataset, MapDataset) else MapDataset(dataset)

    # map() is called for its effect on the dataset object; its return value
    # is not used. NOTE(review): a MapDataset passed in by the caller is
    # therefore presumably transformed in place - confirm before reusing it.
    mapped.map(trans_fn)

    return DataLoader(
        mapped,
        batch_sampler=sampler(mapped, shuffle=shuffle, batch_size=batch_size),
        collate_fn=batchify_fn)

In [6]:
# Collect only the text field of each training example; the vocabulary is
# built from the training split alone.
text = [t[0] for t in train_set]
# `list` as tokenizer splits every string into its individual characters.
V = TextVectorizer(list)
V.build_vocab(text)
print("Number of vocab (char):", len(V))

Number of vocab (char): 6255


In [7]:
# Sequence lengths are included because LSTM.forward takes a seq_len argument.
include_seq_len = True; batch_size = 1024
# The vectorizer instance itself is callable and serves as the text encoder.
trans_fn = get_trans_fn(V, include_seq_len=include_seq_len)
batchify_fn = get_batchify_fn(include_seq_len=include_seq_len)
# Train and dev loaders use create_dataloader's default shuffle=True;
# the test loader keeps the original example order.
train_loader = create_dataloader(train_set, trans_fn, batchify_fn, batch_size=batch_size)
dev_loader = create_dataloader(dev_set, trans_fn, batchify_fn, batch_size=batch_size)
test_loader = create_dataloader(test_set, trans_fn, batchify_fn, shuffle=False, batch_size=batch_size)

## 3. Model building

In [8]:
import paddle, paddlenlp
import paddle.nn as nn
import paddle.nn.functional as F


class LSTM(nn.Layer):
    """Text classifier: embedding -> LSTM encoder -> tanh dense layer -> logits.

    Args:
        vocab_size: Number of rows of the embedding table.
        num_classes: Size of the output (logit) dimension.
        emb_dim: Embedding dimension, also the input size of the LSTM encoder.
        padding_idx: Forwarded to ``nn.Embedding`` as ``padding_idx``.
        lstm_hidden_size: Hidden size passed to the LSTM encoder.
        direction: Forwarded to ``LSTMEncoder`` as ``direction``.
        lstm_layers: Forwarded to ``LSTMEncoder`` as ``num_layers``.
        dropout_rate: Forwarded to ``LSTMEncoder`` as ``dropout``.
        pooling_type: Forwarded to ``LSTMEncoder`` as ``pooling_type``.
        fc_hidden_size: Output size of the dense layer that precedes the
            output layer.
    """
    def __init__(self,
                 vocab_size,
                 num_classes,
                 emb_dim=30,
                 padding_idx=0,
                 lstm_hidden_size=15,
                 direction='forward',
                 lstm_layers=1,
                 dropout_rate=0.0,
                 pooling_type=None,
                 fc_hidden_size=96):
        super().__init__()

        # Token id -> dense vector lookup.
        self.embedder = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=emb_dim,
            padding_idx=padding_idx)

        # Encodes the embedded sequence; its output width is queried below
        # through get_output_dim() rather than computed by hand.
        self.lstm_encoder = paddlenlp.seq2vec.LSTMEncoder(
            emb_dim,
            lstm_hidden_size,
            num_layers=lstm_layers,
            direction=direction,
            dropout=dropout_rate,
            pooling_type=pooling_type)

        # Dense projection followed by the final classification layer.
        self.fc = nn.Linear(self.lstm_encoder.get_output_dim(), fc_hidden_size)
        self.output_layer = nn.Linear(fc_hidden_size, num_classes)

    def forward(self, text, seq_len):
        """Compute class logits for a batch.

        Args:
            text: Token ids fed to the embedding layer
                (presumably shaped (batch, max_len) - TODO confirm).
            seq_len: Passed to the encoder as ``sequence_length``
                (presumably the unpadded length of each row - TODO confirm).

        Returns:
            The output of the final linear layer (unnormalised logits; no
            softmax is applied here).
        """
        embedded_text = self.embedder(text)
        text_repr = self.lstm_encoder(embedded_text, sequence_length=seq_len)
        fc_out = paddle.tanh(self.fc(text_repr))
        logits = self.output_layer(fc_out)
        return logits


def get_model(model, learning_rate=5e-4):
    """Wrap a network in ``paddle.Model`` and prepare it for training.

    The wrapper is configured with an Adam optimizer over the model's
    parameters, a cross-entropy loss and an accuracy metric.

    Args:
        model: The network to wrap.
        learning_rate: Learning rate given to the Adam optimizer. Defaults to
            the previously hard-coded value of 5e-4.

    Returns:
        The prepared ``paddle.Model`` instance.
    """
    wrapped = paddle.Model(model)
    optimizer = paddle.optimizer.Adam(
        parameters=wrapped.parameters(), learning_rate=learning_rate)
    criterion = paddle.nn.CrossEntropyLoss()
    metric = paddle.metric.Accuracy()
    wrapped.prepare(optimizer, criterion, metric)
    return wrapped

In [9]:
# Embedding size follows the vocabulary; 3 output classes match the three
# labels produced by relabel ('M', 'F', 'U').
model = LSTM(len(V), 3)
# `model` is rebound from the raw network to the prepared paddle.Model wrapper.
model = get_model(model)

W0328 01:55:17.914574  4710 device_context.cc:447] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 10.1, Runtime API Version: 10.1
W0328 01:55:17.919216  4710 device_context.cc:465] device: 0, cuDNN Version: 7.6.


## 4. Model training

In [10]:
from paddle.callbacks import EarlyStopping

# Early-stopping callback with patience=3; all other settings are left at the
# library defaults (NOTE(review): confirm which quantity it monitors by default).
earlystop = EarlyStopping(patience=3)

# Train for at most 10 epochs, passing the dev loader as evaluation data;
# progress is logged every 1000 steps.
model.fit(train_loader, dev_loader, epochs=10, verbose=2, log_freq=1000, callbacks=[earlystop])

The loss value printed in the log is the current step, and the metric is the average value of previous steps.
Epoch 1/10
step 1000/2144 - loss: 0.1340 - acc: 0.8981 - 18ms/step
step 2000/2144 - loss: 0.1084 - acc: 0.9249 - 18ms/step
step 2144/2144 - loss: 0.1210 - acc: 0.9268 - 18ms/step
Eval begin...
step 715/715 - loss: 0.1480 - acc: 0.9541 - 16ms/step
Eval samples: 731622
Epoch 2/10
step 1000/2144 - loss: 0.1054 - acc: 0.9557 - 18ms/step
step 2000/2144 - loss: 0.1136 - acc: 0.9565 - 18ms/step
step 2144/2144 - loss: 0.1384 - acc: 0.9567 - 18ms/step
Eval begin...
step 715/715 - loss: 0.0894 - acc: 0.9590 - 15ms/step
Eval samples: 731622
Epoch 3/10
step 1000/2144 - loss: 0.1169 - acc: 0.9603 - 17ms/step
step 2000/2144 - loss: 0.0797 - acc: 0.9609 - 18ms/step
step 2144/2144 - loss: 0.1184 - acc: 0.9610 - 18ms/step
Eval begin...
step 715/715 - loss: 0.1055 - acc: 0.9618 - 15ms/step
Eval samples: 731622
Epoch 4/10
step 1000/2144 - loss: 0.0901 - acc: 0.9630 - 17ms/step
step 2000/2144 - lo

In [11]:
# Save the trained model under the path "ckpt" (the resulting file names are
# decided by paddle.Model.save).
model.save("ckpt")

## 5. Evaluation

In [12]:
# Evaluate on the held-out test loader, which was built with shuffle=False.
model.evaluate(test_loader)

Eval begin...
step  10/715 - loss: 0.0853 - acc: 0.9674 - 21ms/step
step  20/715 - loss: 0.0747 - acc: 0.9680 - 18ms/step
step  30/715 - loss: 0.0853 - acc: 0.9667 - 17ms/step
step  40/715 - loss: 0.0783 - acc: 0.9660 - 16ms/step
step  50/715 - loss: 0.0853 - acc: 0.9663 - 16ms/step
step  60/715 - loss: 0.0791 - acc: 0.9666 - 16ms/step
step  70/715 - loss: 0.0811 - acc: 0.9665 - 16ms/step
step  80/715 - loss: 0.0813 - acc: 0.9661 - 16ms/step
step  90/715 - loss: 0.0643 - acc: 0.9663 - 15ms/step
step 100/715 - loss: 0.0768 - acc: 0.9663 - 15ms/step
step 110/715 - loss: 0.0953 - acc: 0.9662 - 15ms/step
step 120/715 - loss: 0.0723 - acc: 0.9662 - 15ms/step
step 130/715 - loss: 0.0931 - acc: 0.9662 - 15ms/step
step 140/715 - loss: 0.0799 - acc: 0.9663 - 15ms/step
step 150/715 - loss: 0.0676 - acc: 0.9665 - 15ms/step
step 160/715 - loss: 0.0650 - acc: 0.9663 - 15ms/step
step 170/715 - loss: 0.0660 - acc: 0.9665 - 15ms/step
step 180/715 - loss: 0.0825 - acc: 0.9663 - 15ms/step
step 190/715 -

{'loss': [0.075469315], 'acc': 0.9660179710287553}