## Mount drive, unzip data

In [None]:
# Mount Google Drive at /content/drive (Colab-only); later cells read the dataset archive
# and the checkpoint folder from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Extract the training archive from Drive; the log below shows files being written
# under train_data/train_set/ relative to the working directory.
# NOTE(review): unzip prints one line per file (the output was truncated to 5000 lines);
# consider `unzip -q` to keep the notebook output small.
!unzip /content/drive/MyDrive/fingerprint/train_data.zip

[1;30;43mKết quả truyền trực tuyến bị cắt bớt đến 5000 dòng cuối.[0m
  inflating: train_data/train_set/14_zoom.jpg  
  inflating: train_data/train_set/33_v_shifted.jpg  
  inflating: train_data/train_set/34_h_shifted.jpg  
  inflating: train_data/train_set/46_rotated.jpg  
  inflating: train_data/train_set/58_h_shifted.jpg  
  inflating: train_data/train_set/62_rotated.jpg  
  inflating: train_data/train_set/71_original.jpg  
  inflating: train_data/train_set/100_v_shifted.jpg  
  inflating: train_data/train_set/114_h_shifted.jpg  
  inflating: train_data/train_set/136_noise.jpg  
  inflating: train_data/train_set/139_v_shifted.jpg  
  inflating: train_data/train_set/154_rotated.jpg  
  inflating: train_data/train_set/169_h_shifted.jpg  
  inflating: train_data/train_set/209_noise.jpg  
  inflating: train_data/train_set/232_zoom.jpg  
  inflating: train_data/train_set/257_v_shifted.jpg  
  inflating: train_data/train_set/258_zoom.jpg  
  inflating: train_data/train_set/282_original.j

## Dataset class

In [None]:
import numpy as np
from PIL import Image

from torch.utils.data import Dataset
from torch.utils.data.sampler import BatchSampler

def get_img_label(img_fp):
    """Return the integer class label encoded in an image file name.

    The label is the text before the first underscore of the file name,
    e.g. ``'train_data/train_set/14_zoom.jpg'`` -> ``14``.

    Args:
        img_fp: Path to the image, using ``/`` as the separator.

    Returns:
        The label as an ``int``.

    Raises:
        ValueError: If the leading part of the file name is not an integer.
    """
    # Everything after the last '/' is the file name (the whole string if there is no '/').
    file_name = img_fp.rpartition('/')[2]
    # Everything before the first '_' is the label text.
    label_text = file_name.partition('_')[0]
    return int(label_text)


class FingerprintDataset(Dataset):
    """Map-style dataset over a list of image file paths.

    Each item is ``(image, label)``: the image is opened with PIL and converted
    to single-channel mode ``'L'``; the label is parsed from the file name by
    ``get_img_label``.

    Args:
        imgs_fp: Sequence of image file paths.
        transform: Optional callable applied to the PIL image before it is returned.
    """

    def __init__(self, imgs_fp, transform=None):
        self.imgs_fp = imgs_fp
        self.transform = transform

    def __len__(self):
        """Number of image paths held by the dataset."""
        return len(self.imgs_fp)

    def __getitem__(self, idx):
        """Load the image at position ``idx`` and return ``(image, label)``."""
        path = self.imgs_fp[idx]
        image = Image.open(path).convert('L')
        label = get_img_label(path)
        # The transform is optional; without it the raw PIL image is returned.
        sample = image if self.transform is None else self.transform(image)
        return sample, label


class BalancedBatchSampler(BatchSampler):
    """
    Batch sampler that draws n_classes distinct labels per batch and n_samples indices for each of them.
    Yields lists of dataset indices of size n_classes * n_samples.

    Args:
        labels: Sequence with one label per dataset item (position i is the label of item i).
        n_classes: Number of distinct labels drawn (without replacement) for every batch.
        n_samples: Number of indices taken per drawn label.

    Notes:
        * BatchSampler.__init__ is not called; all state used by __iter__/__len__ is set here.
        * Randomness comes from the global NumPy RNG (np.random), so seed it for reproducible batches.
        * np.random.choice raises ValueError if n_classes exceeds the number of distinct labels.
    """

    def __init__(self, labels, n_classes, n_samples):
        self.labels = labels
        self.labels_set = list(set(self.labels))
        # Convert once instead of once per label inside the comprehension.
        labels_array = np.array(self.labels)
        self.label_to_indices = {label: np.where(labels_array == label)[0]
                                 for label in self.labels_set}
        for l in self.labels_set:
            np.random.shuffle(self.label_to_indices[l])
        # Per-label cursor into the shuffled index array.
        self.used_label_indices_count = {label: 0 for label in self.labels_set}
        self.count = 0
        self.n_classes = n_classes
        self.n_samples = n_samples
        self.n_dataset = len(self.labels)
        self.batch_size = self.n_samples * self.n_classes

    def __iter__(self):
        self.count = 0
        # `<=` (not `<`) so the number of yielded batches equals __len__ even when the
        # dataset size is an exact multiple of the batch size.
        while self.count + self.batch_size <= self.n_dataset:
            classes = np.random.choice(self.labels_set, self.n_classes, replace=False)
            indices = []
            for class_ in classes:
                start = self.used_label_indices_count[class_]
                indices.extend(self.label_to_indices[class_][start:start + self.n_samples])
                self.used_label_indices_count[class_] += self.n_samples
                # Not enough unused indices left for another draw: reshuffle and start over.
                if self.used_label_indices_count[class_] + self.n_samples > len(self.label_to_indices[class_]):
                    np.random.shuffle(self.label_to_indices[class_])
                    self.used_label_indices_count[class_] = 0
            yield indices
            self.count += self.n_classes * self.n_samples

    def __len__(self):
        return self.n_dataset // self.batch_size

## Network


In [None]:
import torch.nn as nn


class EmbeddingNet(nn.Module):
    """Convolutional network that maps a single-channel image to a 128-dimensional embedding.

    Input size = (1, 128, 128): each of the three conv stages keeps the spatial
    size (3x3 kernel, padding 1) and its 2x2 max-pool halves it, which gives the
    128 * 16 * 16 features expected by the first fully connected layer.
    """

    def __init__(self):
        super().__init__()

        # Three identical stages: Conv(3x3) -> PReLU -> MaxPool(2), widening the channels.
        stages = []
        for c_in, c_out in ((1, 32), (32, 64), (64, 128)):
            stages += [
                nn.Conv2d(in_channels=c_in, out_channels=c_out, kernel_size=3, stride=1, padding=(1, 1)),
                nn.PReLU(),
                nn.MaxPool2d(2, stride=2),
            ]
        self.convnet = nn.Sequential(*stages)

        self.fc = nn.Sequential(
            nn.Linear(128 * 16 * 16, 256),
            nn.PReLU(),
            nn.Linear(256, 256),
            nn.PReLU(),
            nn.Linear(256, 128),
        )

    def forward(self, x):
        """Return the embedding for a batch ``x``; the conv features are flattened per sample."""
        features = self.convnet(x)
        flat = features.view(features.size(0), -1)
        return self.fc(flat)

    def get_embedding(self, x):
        """Alias of ``forward``."""
        return self.forward(x)


## Loss

In [None]:
import torch.nn as nn
import torch.nn.functional as F

    
class OnlineTripletLoss(nn.Module):
    """
    Online Triplets loss
    Takes a batch of embeddings and corresponding labels.
    Triplets are generated using triplet_selector object that take embeddings and targets and return indices of
    triplets

    Args:
        margin: Margin added to (d(anchor, positive) - d(anchor, negative)) before the ReLU.
        triplet_selector: Object with a get_triplets(embeddings, target) method returning an
            integer tensor of shape [N_triplets x 3] with (anchor, positive, negative) row indices.
    """

    def __init__(self, margin, triplet_selector):
        super(OnlineTripletLoss, self).__init__()
        self.margin = margin
        self.triplet_selector = triplet_selector

    def forward(self, embeddings, target):
        """Return (mean triplet loss, number of triplets) for one batch.

        Distances are squared Euclidean distances between embedding rows.
        """
        triplets = self.triplet_selector.get_triplets(embeddings, target)

        # Put the index tensor on the same device as the embeddings. Unlike .cuda(), this also
        # works when the embeddings live on a non-default GPU, and it is a no-op on CPU.
        triplets = triplets.to(embeddings.device)

        anchors = embeddings[triplets[:, 0]]
        ap_distances = (anchors - embeddings[triplets[:, 1]]).pow(2).sum(1)  # squared distance, no sqrt
        an_distances = (anchors - embeddings[triplets[:, 2]]).pow(2).sum(1)  # squared distance, no sqrt
        losses = F.relu(ap_distances - an_distances + self.margin)

        return losses.mean(), len(triplets)

In [None]:
from itertools import combinations

import numpy as np
import torch


def pdist(vectors):
    """Pairwise squared Euclidean distances between the rows of a 2-D tensor.

    Uses the expansion ||a - b||^2 = -2 * a.b + ||b||^2 + ||a||^2, so entry (i, j) of the
    returned square matrix is the squared distance between row i and row j.
    """
    gram = vectors.mm(torch.t(vectors))
    # One row of squared norms broadcast across rows, one column broadcast across columns.
    row_norms = vectors.pow(2).sum(dim=1).view(1, -1)
    col_norms = vectors.pow(2).sum(dim=1).view(-1, 1)
    return -2 * gram + row_norms + col_norms


class TripletSelector:
    """
    Base interface for triplet mining strategies.
    Subclasses override get_triplets to return the indices of anchor, positive and negative
    samples as an array of shape [N_triplets x 3].
    """

    def get_triplets(self, embeddings, labels):
        """Select triplets for one batch; must be implemented by subclasses."""
        raise NotImplementedError


def hardest_negative(loss_values):
    """Return the index of the largest loss value, or None when that value is not positive."""
    best = np.argmax(loss_values)
    if loss_values[best] > 0:
        return best
    return None


def semihard_negative(loss_values, margin):
    """Pick a random index whose loss lies strictly between 0 and margin, or None if there is none.

    The choice is drawn from the global NumPy RNG (np.random.choice).
    """
    in_band = np.logical_and(loss_values < margin, loss_values > 0)
    candidates = np.nonzero(in_band)[0]
    if len(candidates) == 0:
        return None
    return np.random.choice(candidates)


class FunctionNegativeTripletSelector(TripletSelector):
    """
    For each anchor-positive pair, asks negative_selection_fn to pick one negative sample to create a triplet
    (e.g. the one with the greatest triplet loss value).
    Margin should match the margin used in triplet loss.
    negative_selection_fn should take array of loss_values for a given anchor-positive pair and all negative samples
    and return a negative index for that pair (or None to skip the pair).

    Args:
        margin: Margin added to (d(anchor, positive) - d(anchor, negative)) when scoring negatives.
        negative_selection_fn: Callable described above.
        cpu: If True, embeddings are moved to the CPU before the distance matrix is computed.
    """

    def __init__(self, margin, negative_selection_fn, cpu=True):
        super(FunctionNegativeTripletSelector, self).__init__()
        self.cpu = cpu
        self.margin = margin
        self.negative_selection_fn = negative_selection_fn

    def get_triplets(self, embeddings, labels):
        """Return a LongTensor of shape [N_triplets x 3] with (anchor, positive, negative) indices.

        If the selection function rejects every pair, a single fallback triplet (last
        anchor-positive pair of the last usable label plus its first negative) is returned so
        that the result is never empty.

        Raises:
            ValueError: If the batch contains no label with at least two samples and at least
                one sample of a different label, i.e. no triplet can be formed at all.
        """
        # Mining only produces indices, so no autograd graph is needed for the distances.
        embeddings = embeddings.detach()
        if self.cpu:
            embeddings = embeddings.cpu()
        # Convert once to NumPy instead of building index tensors for every pair.
        distance_matrix = pdist(embeddings).cpu().numpy()
        labels = labels.detach().cpu().numpy()

        triplets = []
        fallback = None  # explicit fallback instead of relying on leaked loop variables

        for label in set(labels):
            label_mask = (labels == label)
            label_indices = np.where(label_mask)[0]
            negative_indices = np.where(np.logical_not(label_mask))[0]
            if len(label_indices) < 2 or len(negative_indices) == 0:
                continue

            # All anchor-positive pairs within this label
            anchor_positives = np.array(list(combinations(label_indices, 2)))
            ap_distances = distance_matrix[anchor_positives[:, 0], anchor_positives[:, 1]]

            for (anchor, positive), ap_distance in zip(anchor_positives, ap_distances):
                # Triplet loss of this pair against every negative in the batch
                loss_values = ap_distance - distance_matrix[anchor, negative_indices] + self.margin
                chosen = self.negative_selection_fn(loss_values)
                if chosen is not None:
                    triplets.append([anchor, positive, negative_indices[chosen]])

            fallback = [anchor_positives[-1, 0], anchor_positives[-1, 1], negative_indices[0]]

        if len(triplets) == 0:
            if fallback is None:
                raise ValueError('Cannot form any triplet: the batch needs a label with at least two '
                                 'samples and at least one sample of another label')
            triplets.append(fallback)

        return torch.LongTensor(np.array(triplets))


def HardestNegativeTripletSelector(margin, cpu=False):
    """Build a FunctionNegativeTripletSelector that uses hardest_negative to pick the negative."""
    return FunctionNegativeTripletSelector(
        margin=margin,
        negative_selection_fn=hardest_negative,
        cpu=cpu,
    )


def SemihardNegativeTripletSelector(margin, cpu=False):
    """Build a FunctionNegativeTripletSelector that uses semihard_negative with the same margin."""

    def pick_semihard(loss_values):
        # Bind the margin so the selector only has to pass the loss values.
        return semihard_negative(loss_values, margin)

    return FunctionNegativeTripletSelector(
        margin=margin,
        negative_selection_fn=pick_semihard,
        cpu=cpu,
    )

## Trainer

In [None]:
import torch
import numpy as np
from tqdm import tqdm

def fit(train_loader, val_loader, model, loss_fn, optimizer, scheduler, n_epochs, cuda, start_epoch=0):
    """Train and validate `model` for epochs start_epoch..n_epochs-1, saving a checkpoint after each.

    Args:
        train_loader: Iterable of (data, target) training batches.
        val_loader: Iterable of (data, target) validation batches; must support len().
        model: Module being trained.
        loss_fn: Callable applied to the model outputs (plus the target when present).
        optimizer: Optimizer stepped once per training batch.
        scheduler: LR scheduler stepped once per epoch.
        n_epochs: Total number of epochs.
        cuda: If True, batches are moved to the GPU.
        start_epoch: Number of already completed epochs when resuming (default 0).

    Side effects:
        Prints per-epoch losses and writes the state dict to "backup/weight_<epoch>.pth"
        (relative to the working directory) after every epoch.
    """
    # Fast-forward the LR schedule over epochs that were completed in a previous run.
    # (PyTorch may warn here because no optimizer.step() has happened yet; that is expected.)
    for _ in range(start_epoch):
        scheduler.step()

    for epoch in range(start_epoch, n_epochs):
        print('Epoch: {}/{}'.format(epoch + 1, n_epochs))

        # Train stage
        train_loss = train_epoch(train_loader, model, loss_fn, optimizer, cuda)

        # Step the scheduler AFTER this epoch's optimizer updates. Stepping it first (as was done
        # previously) skips the first value of the learning-rate schedule on PyTorch >= 1.1.
        scheduler.step()

        message = '\nAverage training loss: {:.4f}. '.format(train_loss)

        # test_epoch returns the SUM of batch losses; turn it into a per-batch average here.
        val_loss = test_epoch(val_loader, model, loss_fn, cuda)
        val_loss /= len(val_loader)

        message += 'Average validating loss: {:.4f}'.format(val_loss)
        print(message)

        torch.save(model.state_dict(), "backup/weight_{}.pth".format(epoch+1), _use_new_zipfile_serialization=False)
        print('Saving model...')

        print('\n' + '='*80 + '\n')

def train_epoch(train_loader, model, loss_fn, optimizer, cuda):
    """Run one training pass over `train_loader` and return the average per-batch loss.

    Args:
        train_loader: Iterable of (data, target) batches.
        model: Module to train; put into training mode here.
        loss_fn: Callable receiving the model outputs followed by the target (when non-empty).
            It may return the loss alone or a tuple/list whose first element is the loss.
        optimizer: Optimizer whose zero_grad()/step() are called once per batch.
        cuda: If True, data and target are moved to the GPU.

    Returns:
        Sum of the batch losses divided by the number of batches.

    Raises:
        ValueError: If the loader yields no batches (an average would be undefined).
    """
    model.train()
    total_loss = 0.0
    n_batches = 0

    for data, target in tqdm(train_loader, desc="Training epoch", position=0, leave=False):
        # An empty target means "no labels for this batch".
        target = target if len(target) > 0 else None
        if type(data) not in (tuple, list):
            data = (data,)
        if cuda:
            data = tuple(d.cuda() for d in data)
            if target is not None:
                target = target.cuda()

        optimizer.zero_grad()
        outputs = model(*data)

        if type(outputs) not in (tuple, list):
            outputs = (outputs,)

        # The loss receives every model output, followed by the target when there is one.
        loss_inputs = outputs
        if target is not None:
            loss_inputs += (target,)

        loss_outputs = loss_fn(*loss_inputs)
        loss = loss_outputs[0] if type(loss_outputs) in (tuple, list) else loss_outputs
        # Read the scalar once; each .item() call forces a device synchronisation.
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        n_batches += 1

    if n_batches == 0:
        raise ValueError('train_loader yielded no batches; cannot compute an average loss')

    return total_loss / n_batches


def test_epoch(val_loader, model, loss_fn, cuda):
    """Evaluate `model` on `val_loader` without gradients and return the SUM of batch losses.

    The caller is responsible for dividing by the number of batches if an average is wanted.
    The model is put into eval mode; `loss_fn` gets the model outputs followed by the target
    (when non-empty) and may return the loss alone or a tuple/list starting with it.
    """
    running_loss = 0
    with torch.no_grad():
        model.eval()
        progress = tqdm(val_loader, desc="Validating epoch", position=0, leave=False)
        for data, target in progress:
            # An empty target means "no labels for this batch".
            if not len(target) > 0:
                target = None
            if type(data) not in (tuple, list):
                data = (data,)
            if cuda:
                data = tuple(d.cuda() for d in data)
                if target is not None:
                    target = target.cuda()

            outputs = model(*data)
            if type(outputs) not in (tuple, list):
                outputs = (outputs,)

            # Loss arguments: all outputs, then the target when present.
            loss_inputs = outputs
            if target is not None:
                loss_inputs += (target,)

            result = loss_fn(*loss_inputs)
            batch_loss = result[0] if type(result) in (tuple, list) else result
            running_loss += batch_loss.item()

    return running_loss


## Train model

In [None]:
# Recreate /content/backup as a symlink to the checkpoint folder on Drive, removing any
# previous /content/backup first.
# NOTE(review): fit() saves to the relative path "backup/weight_<epoch>.pth"; this presumably
# relies on the working directory being /content - confirm.
# NOTE(review): the unzip cell uses /content/drive/MyDrive while this one uses 'My Drive', and
# the link target must already exist on Drive - verify both in the Colab runtime.
!rm -rf /content/backup
!ln -s /content/drive/'My Drive'/fingerprint/backup /content/

In [None]:
import glob
import torch
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import DataLoader
import torchvision.transforms as transforms


# Device
cuda = torch.cuda.is_available()
device = torch.device('cuda' if cuda else 'cpu')

# Hyperparameters
# NOTE(review): in_channel and batch_size are not referenced in the visible cells; the effective
# batch size comes from the batch samplers below (n_classes * n_samples = 50 * 4 = 200).
# NOTE(review): step_size (50) is larger than num_epochs (40), so a StepLR built with it and
# stepped once per epoch never reaches a decay step during this run - confirm this is intended.
# NOTE(review): no random seed is set although sampling uses np.random.
in_channel = 1
batch_size = 128
learning_rate = 0.001
step_size = 50
num_epochs = 40


# Load Data
# Sorted file lists; labels are parsed from the file names by get_img_label.
train_dir = sorted(glob.glob('train_data/train_set/*.jpg'))
valid_dir = sorted(glob.glob('train_data/valid_set/*.jpg'))

train_label = [get_img_label(img) for img in train_dir]
valid_label = [get_img_label(img) for img in valid_dir]

# train_classes = list(set(train_label))
# valid_classes = list(set(valid_label))

# NOTE(review): this rebinds the name `transforms` from the torchvision.transforms module to the
# Compose instance, so the module is no longer reachable under this name in later cells.
transforms = transforms.Compose([
    transforms.ToTensor()
])
train_set = FingerprintDataset(train_dir, transforms)
valid_set = FingerprintDataset(valid_dir, transforms)

# Each batch holds 4 samples from each of 50 randomly drawn labels.
# test_batch_sampler is built from the validation labels despite its name.
train_batch_sampler = BalancedBatchSampler(train_label, n_classes=50, n_samples=4)
test_batch_sampler = BalancedBatchSampler(valid_label, n_classes=50, n_samples=4)

In [None]:
# Extra DataLoader options are only passed when CUDA is available.
kwargs = {'num_workers': 1, 'pin_memory': True} if cuda else {}

# The batch samplers yield complete index lists, so no batch_size/shuffle is given here.
train_loader = DataLoader(dataset=train_set, batch_sampler=train_batch_sampler, **kwargs)
valid_loader = DataLoader(dataset=valid_set, batch_sampler=test_batch_sampler, **kwargs)

# Model
model = EmbeddingNet()

model.to(device)

# Loss and Optimizer
# The same margin is given to the triplet selector and to the loss.
margin = 1.
loss = OnlineTripletLoss(margin, HardestNegativeTripletSelector(margin))
# Only parameters with requires_grad=True are handed to Adam.
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate)
# Multiplies the learning rate by 0.1 every `step_size` scheduler steps.
scheduler = lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=0.1)

In [None]:
# Train Network
# Runs num_epochs epochs from scratch (start_epoch defaults to 0); fit() prints the losses and
# writes a checkpoint to backup/weight_<epoch>.pth after every epoch.
fit(train_loader, valid_loader, model, loss, optimizer, scheduler, num_epochs, cuda)

Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]

Epoch: 1/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 1.4202. Average validating loss: 0.5042
Saving model...


Epoch: 2/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 1.2216. Average validating loss: 0.7267
Saving model...


Epoch: 3/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 1.0227. Average validating loss: 1.4945
Saving model...


Epoch: 4/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 1.0831. Average validating loss: 0.9568
Saving model...


Epoch: 5/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 1.0606. Average validating loss: 0.9250
Saving model...


Epoch: 6/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 0.9334. Average validating loss: 0.7243
Saving model...


Epoch: 7/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 0.9294. Average validating loss: 0.8271
Saving model...


Epoch: 8/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 0.8835. Average validating loss: 0.8942
Saving model...


Epoch: 9/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 0.8806. Average validating loss: 0.5526
Saving model...


Epoch: 10/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 0.8326. Average validating loss: 1.0035
Saving model...


Epoch: 11/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 1.0813. Average validating loss: 1.0225
Saving model...


Epoch: 12/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 0.8353. Average validating loss: 1.0768
Saving model...


Epoch: 13/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 0.8314. Average validating loss: 1.4147
Saving model...


Epoch: 14/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 1.0814. Average validating loss: 0.8484
Saving model...


Epoch: 15/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 1.3139. Average validating loss: 0.9360
Saving model...


Epoch: 16/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 1.0279. Average validating loss: 1.2267
Saving model...


Epoch: 17/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 0.9849. Average validating loss: 0.9871
Saving model...


Epoch: 18/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 1.2127. Average validating loss: 1.3147
Saving model...


Epoch: 19/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 0.8601. Average validating loss: 0.9633
Saving model...


Epoch: 20/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 0.8849. Average validating loss: 0.8959
Saving model...


Epoch: 21/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 0.9806. Average validating loss: 0.8241
Saving model...


Epoch: 22/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 0.8265. Average validating loss: 1.5178
Saving model...


Epoch: 23/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 0.9056. Average validating loss: 0.9116
Saving model...


Epoch: 24/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 0.7605. Average validating loss: 0.9464
Saving model...


Epoch: 25/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 3.2244. Average validating loss: 1.1860
Saving model...


Epoch: 26/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 0.8840. Average validating loss: 0.8141
Saving model...


Epoch: 27/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 0.9804. Average validating loss: 1.0684
Saving model...


Epoch: 28/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 0.7836. Average validating loss: 0.9213
Saving model...


Epoch: 29/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 0.8966. Average validating loss: 1.0844
Saving model...


Epoch: 30/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 0.6522. Average validating loss: 0.8896
Saving model...


Epoch: 31/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 2.4527. Average validating loss: 1.3486
Saving model...


Epoch: 32/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 0.7136. Average validating loss: 0.8822
Saving model...


Epoch: 33/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 0.8006. Average validating loss: 1.0084
Saving model...


Epoch: 34/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 0.6515. Average validating loss: 0.7762
Saving model...


Epoch: 35/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 0.6023. Average validating loss: 0.4460
Saving model...


Epoch: 36/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 0.5724. Average validating loss: 0.6049
Saving model...


Epoch: 37/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 1.0373. Average validating loss: 2.7965
Saving model...


Epoch: 38/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 0.5836. Average validating loss: 0.2088
Saving model...


Epoch: 39/40


Training epoch:   0%|          | 0/480 [00:00<?, ?it/s]


Average training loss: 0.4052. Average validating loss: 0.1475
Saving model...


Epoch: 40/40





Average training loss: 0.8527. Average validating loss: 0.3525
Saving model...


