In [12]:
!pip install wandb



## Dataset

The data set is available from [Kaggle](https://www.kaggle.com/pengcw1/market-1501/data "Market1501")

In [1]:
from pathlib import Path
import sagemaker
from sagemaker import get_execution_role
from sagemaker.pytorch import PyTorch
import wandb

session = sagemaker.Session()
role = get_execution_role()
BUCKET = 'sagemaker-capra'
BASE_PATH = Path('')
SAVE_PATH = BASE_PATH / 'models' / 'convolutional_market1501'
DATA_PATH = BASE_PATH / 'data/Market-1501-v15.09.15'

In [18]:
wandb.login()

wandb: You can find your API key in your browser here: https://wandb.ai/authorize


[34m[1mwandb[0m: Paste an API key from your profile and hit enter:  ········································


wandb: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [19]:
wandb.sagemaker_auth(path='source')

In [None]:
pytorch_estimator = PyTorch('train.py',
                            source_dir='source',
                            instance_type='ml.m5.large',
                            instance_count=1,
                            framework_version='1.6.0',
                            py_version='py3',
                            role=role,
                            hyperparameters = {
                                'num-epochs': 500,
                                'learning-rate': 0.001,
                                'batch-size': 128,
                                'model-name': 'model.pt'
                            })
pytorch_estimator.fit({'training': f's3://{BUCKET}/{DATA_PATH}/bounding_box_train'})

2020-12-10 22:31:30 Starting - Starting the training job...
2020-12-10 22:31:53 Starting - Launching requested ML instancesProfilerReport-1607639489: InProgress
......
2020-12-10 22:32:54 Starting - Preparing the instances for training......
2020-12-10 22:33:55 Downloading - Downloading input data.........
2020-12-10 22:35:28 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2020-12-10 22:35:29,261 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2020-12-10 22:35:29,264 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-12-10 22:35:29,292 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2020-12-10 22:35:29,299 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[

# Code for testing and using the AutoEncoder

In [10]:
import re
import tarfile

import boto3
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
import torch
from torch.utils.data import DataLoader, Dataset

from train import AutoEncoder, get_image_transform

In [11]:
MODEL_BUCKET = session.default_bucket()
# Set MODEL_PATH to the model you have trained
MODEL_PATH = Path('pytorch-training-2020-12-10-12-52-50-641/output')
MODEL_PACKAGE = 'model.tar.gz'
MODEL_NAME = 'model.pt'
LOCAL_MODEL_DIRECTORY = Path('models')
LOCAL_MODEL_DIRECTORY.mkdir(exist_ok=True)

In [12]:
class Market1501Dataset(Dataset):
    BUCKET_INDEX = 2
    PATH_INDEX = 3
    def __init__(self, directory: Path, transform=None):
        self.directory = directory
        self.s3 = None
        if directory[:5] == 's3://':
            self.s3 = boto3.client('s3')
        self._images = None
        if not self.s3:
            self._images = list(self.directory.glob('*'))
        else:
            self._images = self.list_s3_objects()
        self.transform = transform

    def list_s3_objects(self):
        parts = self.directory.split('/')
        self.bucket = parts[self.BUCKET_INDEX]
        self.directory = '/'.join(parts[self.PATH_INDEX:])
        first = True
        continuation_token = None
        keys = []
        while first or continuation_token:
            first = False
            if continuation_token:
                response = self.s3.list_objects_v2(Bucket=self.bucket, Prefix=self.directory, ContinuationToken=continuation_token)
            else:
                response = self.s3.list_objects_v2(Bucket=self.bucket, Prefix=self.directory)
            if 'NextContinuationToken' in response:
                continuation_token = response['NextContinuationToken']
            else:
                continuation_token = None
            keys += [item['Key'] for item in response['Contents']]
        return keys
    
    def get_key(self, index):
        return self._images[index]
    
    def get_s3_image(self, index: int):
        object = self.s3.get_object(Bucket=self.bucket, Key=self._images[index])
        return mpimg.imread(object['Body'], format='jpeg')

    def __len__(self):
        return len(self._images)

    def __getitem__(self, index):
        if torch.is_tensor(index):
            index = int(index)
        try:
            if not self.s3:
                image = io.imread(str(self._images[index]))
            else:
                image = self.get_s3_image(index)
        except ValueError as e:
            print(f'Failed with ValueError on <{self._images[index]}>')
            raise

        if not self.transform:
            return image

        return self.transform(image)

In [13]:
s3 = boto3.client('s3')
s3.download_file(MODEL_BUCKET, str(MODEL_PATH / MODEL_PACKAGE), str(LOCAL_MODEL_DIRECTORY / MODEL_PACKAGE))
model_package = tarfile.open(LOCAL_MODEL_DIRECTORY / MODEL_PACKAGE, 'r:gz')
model_package.extract(MODEL_NAME, LOCAL_MODEL_DIRECTORY)

In [14]:
model = AutoEncoder()
model.load_state_dict(torch.load(LOCAL_MODEL_DIRECTORY / MODEL_NAME))
model.eval()
torch.no_grad()

<torch.autograd.grad_mode.no_grad at 0x7f1975d939d0>

In [15]:
image_transform = get_image_transform()
testset = Market1501Dataset(f's3://{BUCKET}/{DATA_PATH}/bounding_box_test', transform=image_transform)
test_loader = DataLoader(testset, batch_size=1, shuffle=False)
trainset = Market1501Dataset(f's3://{BUCKET}/{DATA_PATH}/bounding_box_train', transform=image_transform)
train_loader = DataLoader(trainset, batch_size=1, shuffle=False)

Encode all the train and test images

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X = {
    'train': [],
     'test': []
}
y = {
    'train': [],
     'test': []
}
for i, images in enumerate(train_loader):
    key = trainset.get_key(i)
    features = images.to(device)
    code = model.encode(features)
    X['train'].append(code[0].view(400).detach().numpy())
    y['train'].append(key)

In [17]:
X['test'] = []
y['test'] = []

for i, images in enumerate(test_loader):
    key = testset.get_key(i)
    features = images.to(device)
    code = model.encode(features)
    X['test'].append(code[0].view(400).detach().numpy())
    y['test'].append(key)

In [18]:
y_index = {}
y_index['train'] = [i for i in range(len(y['train']))]
y_index['test'] = [i for i in range(len(y['test']))]

## Train a kNN on the training set to check the accuracy

In [19]:
targeter = KNeighborsClassifier(n_neighbors=1)
targeter.fit(X['train'], y_index['train'])

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [20]:
y_pred = {}
y_pred['train'] = [int(targeter.predict(x.reshape(1, -1))[0]) for x in X['train']]
y_pred['test'] = [int(targeter.predict(x.reshape(1, -1))[0]) for x in X['test']]

In [21]:
train_report = classification_report(y_index['train'], y_pred['train'], output_dict=True)

In [22]:
train_report['accuracy']

1.0

## Train a kNN on the test set and check the accuracy.

In [23]:
test_targeter = KNeighborsClassifier(n_neighbors=1)
test_targeter.fit(X['test'], y_index['test'])

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [24]:
y_pred['test'] = [int(test_targeter.predict(x.reshape(1, -1))[0]) for x in X['test']]

In [25]:
test_report = classification_report(y_index['test'], y_pred['test'], output_dict=True)
test_report['accuracy']

1.0

## Test the kNN trained on the test set on the queryset 

In [26]:
X['query'] = []
y['query'] = []
queryset = Market1501Dataset(f's3://{BUCKET}/{DATA_PATH}/query', transform=image_transform)
query_loader = DataLoader(queryset, batch_size=1, shuffle=False)
for i, images in enumerate(query_loader):
    key = queryset.get_key(i)
    features = images.to(device)
    code = model.encode(features)
    X['query'].append(code[0].view(400).detach().numpy())
    y['query'].append(key)

In [27]:
y_pred['query'] = [int(test_targeter.predict(x.reshape(1, -1))[0]) for x in X['query']]

## Test with PIDs

The PID is the person identifier which is embedded in the filename

In [30]:
PATTERN = re.compile(r'([-\d]+)_c(\d)')
def pid_finder(filename):
    pid, _ = map(int, PATTERN.search(filename).groups())
    return pid

In [31]:
y_pid = {
    'query': [pid_finder(filename) for filename in y['query']],
    'test': [pid_finder(filename) for filename in y['test']]
}

In [32]:
result_tuples = []
for i, predicted_index in enumerate(y_pred['query']):
    predicted_pid = pid_finder(y['test'][predicted_index])
    pid = y_pid['query'][i]
    result_tuples.append((pid, predicted_pid))

In [33]:
expected = [row[0] for row in result_tuples]
actual = [row[1] for row in result_tuples]

In [34]:
query_results = classification_report(expected, actual, output_dict=True)
query_results['accuracy']

0.6802256532066508

Accuracy is the same as rank-1 score

In [35]:
query_results['accuracy']

0.6802256532066508

In [36]:
number_of_correct = 0
total_number = 0
for row in result_tuples:
    expected = row[0]
    actual = row[1]
    if expected == actual:
        number_of_correct += 1
    total_number += 1
print(f'rank-1: {float(number_of_correct) / total_number}')

rank-1: 0.6802256532066508


Let's check the result on a random person identifier

The Macro Average Precision is mAP (mean Average Precision)

In [37]:
query_results['macro avg']

{'precision': 0.7813895456945362,
 'recall': 0.6755984042553191,
 'f1-score': 0.6941265140184637,
 'support': 3368}

In [38]:
query_results['weighted avg']

{'precision': 0.7804660224581281,
 'recall': 0.6802256532066508,
 'f1-score': 0.6969085881390867,
 'support': 3368}