In [None]:
import os
import pickle
from deepparse.dataset_container import PickleDatasetContainer
from deepparse.parser import AddressParser
import shutil
from poutyne import set_seeds
import poutyne
import timeit
import uk_test_data
import training
from pathlib import Path

# Fix the seed once, up front, so that repeated runs of the notebook are comparable.
# Seeding is delegated to Poutyne's `set_seeds` helper.
seed = 42
set_seeds(seed)

In [None]:
# Retrain an Address Parser for Single Country Uses

In this project, a pre-trained model is retrained to maximize its performance for UK addresses.

## Retrain a Model

First, to retrain the supervised model, we need parsed address examples. The sample dataset comes from Companies House.

In [None]:
# --- Configuration: dataset locations, checkpoint directory and the UK tag set ---
root_dir = "../data/"
pkl_file_extension = ".p"
training_dataset_name = "uk_training_data"
test_dataset_name = "uk_test_data"
checkpoints_dir = "./checkpoints/uk"
# Maps each UK address tag to an integer index; handed to `retrain` as `prediction_tags`.
tag_dictionary = {"CareOf": 0, "Unit": 1, "Floor": 2, "Building": 3, "StreetNumber": 4, "StreetName": 5, "District": 6, "City": 7, "Village": 8, "Estate": 9, "Town": 10, "POBox": 11, "PostCode": 12, "EOS": 13}

pkl_dir = os.path.join(root_dir, "pkl")
test_pkl_path = os.path.join(pkl_dir, test_dataset_name + pkl_file_extension)
training_pkl_path = os.path.join(pkl_dir, training_dataset_name + pkl_file_extension)

# Create the output directory if needed so the cell also works on a fresh checkout
# (otherwise the first `open(..., 'wb')` raises FileNotFoundError).
os.makedirs(pkl_dir, exist_ok=True)

# Serialize the datasets exposed by the `uk_test_data` and `training` modules to pickle files.
with open(test_pkl_path, 'wb') as f:
    pickle.dump(uk_test_data.test, f)

with open(training_pkl_path, 'wb') as f:
    pickle.dump(training.training, f)

# Sanity check: read the training pickle back and show it.
# The file was written by this very cell, so unpickling it is safe; never call
# `pickle.load` on a file from an untrusted source.
# A context manager is used so the file handle is closed deterministically.
with open(training_pkl_path, 'rb') as f:
    read = pickle.load(f)
print(read)

Now, let's load our training and test datasets into memory to retrain our parser model.

In [None]:
# Wrap the two pickle files in deepparse dataset containers.
# `training_container` is later passed to `address_parser.retrain`,
# and `test_container` to `address_parser.test`.
training_container = PickleDatasetContainer(training_pkl_path)
test_container = PickleDatasetContainer(test_pkl_path)

We will use the FastText model as our base pre-trained model since it is faster to retrain.

In [None]:
# Instantiate the pre-trained parser with the FastText model type.
# NOTE(review): `device=0` presumably selects the first CUDA GPU — confirm that a GPU is
# available (or how deepparse behaves without one) on the machine running this notebook.
address_parser = AddressParser(model_type="fasttext", device=0)

But first, let's see what the performance is before retraining.

In [None]:
# Baseline: evaluate the parser on the UK test container before any retraining.
address_parser.test(test_container, batch_size=256)

In [None]:
# Retrain the parser on the UK training container with the UK tag set.
# The settings are gathered in one mapping so they can be read (and tuned) at a glance.
retrain_options = {
    "train_ratio": 0.8,
    "epochs": 5,
    "batch_size": 8,
    "num_workers": 3,
    "learning_rate": 0.001,
    "prediction_tags": tag_dictionary,
    "logging_path": checkpoints_dir,
    "name_of_the_retrain_parser": "UKParser",
}

# The return value is not used afterwards; binding it to `_` keeps it out of the cell output.
_ = address_parser.retrain(training_container, **retrain_options)

In [None]:
# Same evaluation as before retraining, now run after `retrain`, for a before/after comparison.
address_parser.test(test_container, batch_size=256)

To further improve performance, we could train for longer, increase the training dataset size (currently 100,000 addresses), or rework the Seq2Seq hidden sizes. See the [retrain interface documentation](https://deepparse.org/parser.html#deepparse.parser.AddressParser.retrain) for all the training parameters.