## 1. Load dataset

In [1]:
def relabel(g, reverse=False):
    """Convert between a gender code and its integer class id.

    Args:
        g: A gender code (``'M'``, ``'F'`` or ``'U'``) or, when ``reverse``
            is true, a class id (``0``, ``1`` or ``2``).
        reverse: When false (default) map code -> id; when true map id -> code.

    Returns:
        The integer id for a code, or the code for an id.

    Raises:
        KeyError: If ``g`` is not one of the known codes / ids.
    """
    code_to_id = {'M': 0, 'F': 1, 'U': 2}
    if not reverse:
        return code_to_id[g]
    # The inverse table is derived from the forward one so the two stay in sync.
    id_to_code = {idx: code for code, idx in code_to_id.items()}
    return id_to_code[g]


def data_loader(filepath, num_row_skip=1, encoding=None):
    """Load ``[text, label_id]`` samples from one or more tab-separated files.

    Each data row is split on tabs; the second-to-last column is kept as the
    text and the last column (stripped) is converted with ``relabel``.

    Args:
        filepath: A single path (``str``) or a ``list`` of paths.
        num_row_skip: Number of leading rows (e.g. a header) to discard in
            every file.
        encoding: Text encoding passed to ``open``. ``None`` (default) keeps
            the platform default, which is what this function always used.

    Returns:
        For a ``str``: a list of ``[text, label_id]`` pairs.
        For a ``list``: one such list per path, in the same order.

    Raises:
        TypeError: If ``filepath`` is neither a ``str`` nor a ``list``.
        KeyError: If a row carries a label that ``relabel`` does not know.
        IndexError: If a non-blank row has fewer than two tab-separated fields.
    """
    def readFile(path):
        # ``with`` guarantees the handle is closed even if a row fails to parse.
        with open(path, 'r', encoding=encoding) as f:
            for _ in range(num_row_skip):
                # A default for ``next`` avoids leaking StopIteration when the
                # file has fewer rows than ``num_row_skip``.
                if next(f, None) is None:
                    break
            out = []
            for line in f:
                # Blank rows (typically a trailing newline) carry no sample.
                if not line.strip():
                    continue
                fields = line.split('\t')
                out.append([fields[-2], relabel(fields[-1].strip())])
        return out

    if isinstance(filepath, str):
        return readFile(filepath)
    elif isinstance(filepath, list):
        return [readFile(path) for path in filepath]
    else:
        raise TypeError('filepath must be either a str or a list.')

In [2]:
# Load every [text, label_id] sample from the CCNC file; data_loader skips
# one leading row by default.
ccnc = data_loader('data/data96861/ccnc.txt')
# Displayed as the cell output: total number of samples and the first three.
len(ccnc), ccnc[:3]

(3658109, [['陈品如', 0], ['陈祥旭', 0], ['陈晓', 0]])

In [3]:
from random import seed, shuffle 


def train_dev_test_split(data, train=0.6, dev=0.2, test=0.2, seed_idx=5):
    """Shuffle ``data`` reproducibly and cut it into train / dev / test parts.

    The input is left untouched: a copy is shuffled with a private random
    generator, so re-running the call yields the same split and neither the
    caller's list nor the global ``random`` state is modified. The permutation
    is the same one ``random.seed(seed_idx); random.shuffle(...)`` produces.

    Args:
        data: Sequence (or any iterable) of samples.
        train: Fraction of samples assigned to the training part.
        dev: Fraction of samples assigned to the development part.
        test: Kept for interface compatibility only; the test part always
            receives whatever remains after ``train`` and ``dev``.
        seed_idx: Seed for the shuffle.

    Returns:
        A ``(train_part, dev_part, test_part)`` tuple of lists.
    """
    # Function-scope import keeps this block self-contained.
    from random import Random

    shuffled = list(data)
    Random(seed_idx).shuffle(shuffled)

    length = len(shuffled)
    boundary1 = round(length * train)
    boundary2 = round(length * (train + dev))
    return shuffled[:boundary1], shuffled[boundary1:boundary2], shuffled[boundary2:]

In [4]:
# Split with the function's defaults: train=0.6, dev=0.2, the remainder as
# the test part, seed_idx=5.
train_set, dev_set, test_set = train_dev_test_split(ccnc)
# Displayed as the cell output: the size of each part.
len(train_set), len(dev_set), len(test_set)

(2194865, 731622, 731622)

## 2. Transform text

In [5]:
from paddlenlp.datasets import MapDataset
from paddle.io import BatchSampler, DataLoader
from paddlenlp.data import Pad, Stack, Tuple
from paddlenlp.data import Vocab


class TextVectorizer:
    """Tokenize text and map the tokens to integer ids from a built vocabulary.

    Usage: construct with a tokenizer callable, call ``build_vocab`` once on
    the training texts, then call the instance (or ``text_encoder``) to encode.
    Every accessor raises ``ValueError("No vocab is built!")`` when used
    before a vocabulary exists.

    Attributes:
        tokenize: Callable turning one text into an iterable of tokens
            (e.g. ``list`` for character-level tokens).
        vocab_to_idx: Token -> id mapping, ``None`` until ``build_vocab`` runs.
    """

    def __init__(self, tokenizer=None):
        """Store the tokenizer; no vocabulary exists yet.

        Args:
            tokenizer: Callable ``text -> tokens``. It must be supplied before
                ``build_vocab`` or any encoding call is made.
        """
        self.tokenize = tokenizer
        self.vocab_to_idx = None
        self._V = None

    def _require_vocab(self):
        """Return the token -> id mapping or raise if none is available."""
        if not self.vocab_to_idx:
            raise ValueError("No vocab is built!")
        return self.vocab_to_idx

    def build_vocab(self, text):
        """Build the vocabulary from an iterable of texts.

        Args:
            text: Iterable of raw texts; each one is tokenized with
                ``self.tokenize``.
        """
        tokens = list(map(self.tokenize, text))
        self._V = Vocab.build_vocab(tokens, unk_token='[UNK]', pad_token='[PAD]')
        self.vocab_to_idx = self._V.token_to_idx

    def text_encoder(self, text):
        """Encode one text, or a list of texts, into token ids.

        Args:
            text: A single text, or a ``list`` of texts (encoded element-wise).

        Returns:
            A list of ids for a single text, or a list of such lists.

        Raises:
            ValueError: If no vocabulary has been built.
        """
        vocab = self._require_vocab()
        if isinstance(text, list):
            return [self.text_encoder(item) for item in text]

        # NOTE(review): plain indexing is used, so whether unseen tokens fall
        # back to '[UNK]' depends on the mapping type returned by
        # Vocab.token_to_idx -- confirm against PaddleNLP.
        return [vocab[tk] for tk in self.tokenize(text)]

    def __len__(self):
        """Return the number of entries in the vocabulary.

        Raises:
            ValueError: If no vocabulary has been built.
        """
        if self.vocab_to_idx is None:
            raise ValueError("No vocab is built!")
        return len(self.vocab_to_idx)

    def __getitem__(self, w):
        """Return the id of token ``w``.

        Raises:
            ValueError: If no vocabulary has been built.
        """
        return self._require_vocab()[w]

    def __call__(self, text):
        """Encode ``text``; shorthand for ``text_encoder``.

        Raises:
            ValueError: If no vocabulary has been built.
        """
        return self.text_encoder(text)


def example_converter(example, text_encoder, include_seq_len):
    """Turn one ``(text, label)`` sample into model-ready fields.

    Args:
        example: A two-element sample ``(text, label)``.
        text_encoder: Callable mapping the text to its encoded form.
        include_seq_len: When true, the encoded length is added as a field.

    Returns:
        ``(encoded, length, label)`` when ``include_seq_len`` is true,
        otherwise ``(encoded, label)``.
    """
    raw_text, target = example
    token_ids = text_encoder(raw_text)
    if not include_seq_len:
        return token_ids, target
    return token_ids, len(token_ids), target


def get_trans_fn(text_encoder, include_seq_len):
    """Bind an encoder and the length flag into a one-argument converter.

    Args:
        text_encoder: Callable used to encode each sample's text.
        include_seq_len: Forwarded unchanged to ``example_converter``.

    Returns:
        A function taking one sample ``ex`` and returning
        ``example_converter(ex, text_encoder, include_seq_len)``.
    """
    def trans_fn(ex):
        return example_converter(ex, text_encoder, include_seq_len)

    return trans_fn


def get_batchify_fn(include_seq_len):
    """Build the collate function that turns a list of samples into a batch.

    The first field of every sample is handled by ``Pad(axis=0, pad_val=0)``;
    each remaining field is handled by an int64 ``Stack``: two such fields
    when ``include_seq_len`` is true, otherwise one.

    Args:
        include_seq_len: Whether samples carry an extra length field.

    Returns:
        A callable ``batchify_fn(samples)`` applying the combined ``Tuple``
        of field-wise functions to ``samples``.
    """
    # One Stack instance is shared by every stacked field.
    stack_fn = Stack(dtype="int64")
    stacked_fields = [stack_fn, stack_fn] if include_seq_len else [stack_fn]
    combined = Tuple(Pad(axis=0, pad_val=0), *stacked_fields)

    def batchify_fn(samples, fn=combined):
        return fn(samples)

    return batchify_fn


def create_dataloader(dataset, 
                      trans_fn, 
                      batchify_fn, 
                      batch_size=128, 
                      shuffle=True, 
                      sampler=BatchSampler):
    """Wrap ``dataset`` in a ``DataLoader`` with a transform and a collate fn.

    Args:
        dataset: A ``MapDataset`` or anything ``MapDataset`` can wrap.
        trans_fn: Per-sample transform registered through ``dataset.map``.
        batchify_fn: Collate function passed to the loader as ``collate_fn``.
        batch_size: Number of samples per batch.
        shuffle: Forwarded to the batch sampler.
        sampler: Batch sampler class (or factory) called as
            ``sampler(dataset, shuffle=..., batch_size=...)``.

    Returns:
        The constructed ``DataLoader``.
    """
    mapped = dataset if isinstance(dataset, MapDataset) else MapDataset(dataset)

    # The return value of map() is ignored, so the transform is presumably
    # attached to ``mapped`` itself -- TODO confirm against PaddleNLP.
    mapped.map(trans_fn)

    return DataLoader(
        mapped,
        batch_sampler=sampler(mapped, shuffle=shuffle, batch_size=batch_size),
        collate_fn=batchify_fn,
    )

In [6]:
# Build a character-level vocabulary (tokenizer=list) from the training texts only.
text = [sample[0] for sample in train_set]
V = TextVectorizer(tokenizer=list)
V.build_vocab(text)
print(f"Number of vocab (char): {len(V)}")

Number of vocab (char): 6255


In [7]:
# Batching configuration shared by all three loaders.
include_seq_len = False
batch_size = 1024

trans_fn = get_trans_fn(V, include_seq_len)
batchify_fn = get_batchify_fn(include_seq_len)

# Train and dev batches are shuffled; the test loader keeps dataset order.
train_loader = create_dataloader(
    train_set, trans_fn, batchify_fn, batch_size=batch_size, shuffle=True)
dev_loader = create_dataloader(
    dev_set, trans_fn, batchify_fn, batch_size=batch_size, shuffle=True)
test_loader = create_dataloader(
    test_set, trans_fn, batchify_fn, batch_size=batch_size, shuffle=False)

## 3. Model building

In [11]:
import paddle 
import paddle.nn as nn
import paddle.nn.functional as F


class LogisticRegression(nn.Layer):
    """Linear classifier on top of summed token embeddings.

    Token ids are embedded, the embeddings are summed along axis 1, and the
    result is projected to ``output_dim`` logits by a single linear layer.
    """

    def __init__(self, vocab_size, output_dim, embedding_dim=100, padding_idx=0):
        """Create the embedding table and the output projection.

        Args:
            vocab_size: Number of rows in the embedding table.
            output_dim: Number of output logits (classes).
            embedding_dim: Width of each embedding vector.
            padding_idx: Id forwarded to ``nn.Embedding`` as ``padding_idx``.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,
                                      embedding_dim,
                                      padding_idx=padding_idx)
        self.dense = nn.Linear(embedding_dim, output_dim)

    def encoder(self, embd):
        """Collapse axis 1 of ``embd`` by summation."""
        # assumes embd is (batch, seq_len, embedding_dim) -- TODO confirm
        return embd.sum(axis=1)

    def forward(self, text_ids):
        """Return unnormalized class logits for a batch of token ids."""
        pooled = self.encoder(self.embedding(text_ids))
        return self.dense(pooled)


def get_model(model):
    """Wrap a network in ``paddle.Model`` and prepare it for training.

    The wrapper is prepared with an Adam optimizer (learning rate 5e-4) over
    the wrapper's parameters, a cross-entropy loss and an accuracy metric.

    Args:
        model: The network to wrap.

    Returns:
        The prepared ``paddle.Model`` wrapper.
    """
    wrapped = paddle.Model(model)
    wrapped.prepare(
        paddle.optimizer.Adam(parameters=wrapped.parameters(),
                              learning_rate=5e-4),
        paddle.nn.CrossEntropyLoss(),
        paddle.metric.Accuracy(),
    )
    return wrapped

In [12]:
# Three output logits, one per class id produced by relabel (M / F / U).
network = LogisticRegression(len(V), 3)
model = get_model(network)

## 4. Model training

In [13]:
from paddle.callbacks import EarlyStopping

# Early-stopping callback; patience=3 is the only setting overridden here.
earlystop = EarlyStopping(patience=3)

# Train for at most 10 epochs with dev_loader as the second (evaluation) input;
# progress is logged every 1000 steps.
model.fit(train_loader, dev_loader, epochs=10, verbose=2, log_freq=1000, callbacks=[earlystop])

The loss value printed in the log is the current step, and the metric is the average value of previous steps.
Epoch 1/10
step 1000/2144 - loss: 0.2021 - acc: 0.9149 - 16ms/step
step 2000/2144 - loss: 0.1605 - acc: 0.9241 - 16ms/step
step 2144/2144 - loss: 0.1666 - acc: 0.9247 - 16ms/step
Eval begin...
step 715/715 - loss: 0.1979 - acc: 0.9343 - 11ms/step
Eval samples: 731622
Epoch 2/10
step 1000/2144 - loss: 0.1668 - acc: 0.9341 - 16ms/step
step 2000/2144 - loss: 0.1985 - acc: 0.9346 - 16ms/step
step 2144/2144 - loss: 0.1399 - acc: 0.9347 - 16ms/step
Eval begin...
step 715/715 - loss: 0.1289 - acc: 0.9351 - 12ms/step
Eval samples: 731622
Epoch 3/10
step 1000/2144 - loss: 0.1950 - acc: 0.9353 - 17ms/step
step 2000/2144 - loss: 0.1659 - acc: 0.9352 - 16ms/step
step 2144/2144 - loss: 0.1240 - acc: 0.9353 - 17ms/step
Eval begin...
step 715/715 - loss: 0.1834 - acc: 0.9354 - 15ms/step
Eval samples: 731622
Epoch 4/10
step 1000/2144 - loss: 0.1649 - acc: 0.9352 - 20ms/step
step 2000/2144 - lo

## 5. Evaluation

In [14]:
# Evaluate on the test loader; the returned metrics dict is the cell output.
model.evaluate(test_loader)

Eval begin...
step  10/715 - loss: 0.1996 - acc: 0.9375 - 21ms/step
step  20/715 - loss: 0.1589 - acc: 0.9361 - 17ms/step
step  30/715 - loss: 0.1965 - acc: 0.9348 - 16ms/step
step  40/715 - loss: 0.1495 - acc: 0.9360 - 16ms/step
step  50/715 - loss: 0.1826 - acc: 0.9363 - 15ms/step
step  60/715 - loss: 0.1599 - acc: 0.9366 - 15ms/step
step  70/715 - loss: 0.1661 - acc: 0.9363 - 15ms/step
step  80/715 - loss: 0.1792 - acc: 0.9359 - 15ms/step
step  90/715 - loss: 0.1491 - acc: 0.9354 - 15ms/step
step 100/715 - loss: 0.1782 - acc: 0.9354 - 15ms/step
step 110/715 - loss: 0.1814 - acc: 0.9354 - 15ms/step
step 120/715 - loss: 0.1928 - acc: 0.9356 - 15ms/step
step 130/715 - loss: 0.1963 - acc: 0.9356 - 15ms/step
step 140/715 - loss: 0.1599 - acc: 0.9358 - 15ms/step
step 150/715 - loss: 0.1554 - acc: 0.9358 - 15ms/step
step 160/715 - loss: 0.1588 - acc: 0.9357 - 15ms/step
step 170/715 - loss: 0.1512 - acc: 0.9358 - 15ms/step
step 180/715 - loss: 0.2028 - acc: 0.9357 - 15ms/step
step 190/715 -

{'loss': [0.17419836], 'acc': 0.9351523054254792}