# Facial Anti-Spoofing using Deep Neural Network Approaches

In [1]:
import os
import shutil
import evaluate
import numpy as np
import time
import copy
import json
from PIL import Image

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import Dataset, DataLoader, random_split

In [4]:
from transformers import ViTConfig, ViTModel, ViTImageProcessor, ViTForImageClassification
from transformers import AutoImageProcessor
from transformers import TrainingArguments, Trainer

## Preparing Data Loaders

In [5]:
class SpoofDataset(Dataset):
    def __init__(self, data_dir, label_file, transform=None):
        """
        Args:
            data_dir: Directory with all the images, separated into 'live' and 'spoof' subdirectories.
            label_file: Path to JSON file with labels, where label 0 indicates 'live' and others indicate 'spoof'.
            transform: Optional transform.
        """
        self.data_dir = data_dir
        self.transform = transform
        with open(label_file, 'r') as f:
            self.labels = json.load(f)

        self.img_paths = []
        self.img_labels = []
        self.features_list = []
        for img_name, features in self.labels.items():
            # Correct the file extension from .png to .jpg
            #img_name = img_name.replace('.png', '.jpg')
            
            # final item in feature is for live vs spoof
            label = features[-1]
            subfolder = 'live' if label == 0 else 'spoof'
            full_path = os.path.join(self.data_dir, subfolder, img_name)
            
            if os.path.exists(full_path):
                self.img_paths.append(full_path)
                self.img_labels.append(label)
                self.features_list.append(features[:-1])

    def __len__(self):
        return len(self.img_paths)

    def __getitem__(self, idx):
        img_path = self.img_paths[idx]
        image = Image.open(img_path).convert('RGB')

        if self.transform:
            image = self.transform(image)
            
        label = self.img_labels[idx]
        features = self.features_list[idx]

        return image, label, features


In [6]:
data_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    transforms.ToPILImage()
])

In [7]:
data_dir = "/Users/jasminecjwchen/Documents/GitHub/COMS-4995-ACV-Project/preprocessed_data"
test_dir = "/Users/jasminecjwchen/Documents/GitHub/COMS-4995-ACV-Project/unseen_data"
output_dir = '/Users/jasminecjwchen/Documents/GitHub/COMS-4995-ACV-Project/split_data'

In [8]:
label_file = "/Users/jasminecjwchen/Documents/GitHub/COMS-4995-ACV-Project/preprocessing/test_labels v2.json"
dataset = SpoofDataset(data_dir, label_file, data_transforms)

In [9]:
# splitting
num_train = int(len(dataset) * 0.8)
num_val = len(dataset) - num_train
train_dataset, val_dataset = random_split(dataset, [num_train, num_val])

In [10]:
def collate_fn(batch):
    # Filter failed images first
    batch = list(filter(lambda x: x is not None, batch))

    images = [sample[0] for sample in batch]
    labels = torch.LongTensor([sample[1] for sample in batch])
    features = torch.stack([torch.Tensor(sample[2]) for sample in batch])
    
    return images, labels, features

In [11]:
dataloaders = {
    'train': DataLoader(train_dataset, batch_size=32, shuffle=True, pin_memory=True, collate_fn=collate_fn),
    'val': DataLoader(val_dataset, batch_size=32, shuffle=False, pin_memory=True, collate_fn=collate_fn)
}

In [12]:
dataset_sizes = {
    "train": len(train_dataset),
    "val": len(val_dataset)
}

In [13]:
print(dataset_sizes)

{'train': 31876, 'val': 7970}


## VIT

In [14]:
class VITConcatNeuralNetworkModel(nn.Module):
    '''
        width should either be an integer (all layers would have same width) or a list that is of length "depth"
        vit config params are passed through kwargs
    '''

    def __init__(self, freeze_vit = True, depth = 8, width = 8, **kwargs):
        super().__init__()
        
        self.tokenizer = ViTImageProcessor()
        
        #vit_config = ViTConfig(**kwargs)
        self.model = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
        
        first_layer_width = width if isinstance(width, int) else width[0]
        nn_modules = [nn.Linear(151336, first_layer_width)]
        
        for i in range(1, depth):
            if isinstance(width, int):
                nn_modules.append(nn.Linear(width, width))
            else:
                nn_modules.append(nn.Linear(width[i - 1], width[i]))
        
        logistic = nn.Linear(width, 5) if isinstance(width, int) else width[-1]
        nn_modules.append(logistic)
        self.sequential = nn.Sequential(nn_modules)
        
        if freeze_vit: 
            for param in self.model.parameters():
                param.requires_grad = False
    
    def forward(self, image, features):
        encoded_image = self.tokenizer(image, return_tensors = "pt")
        model_output = self.model(**encoded_image)
        
        flattened_output = model_output.last_hidden_state.flatten(start_dim = 1)
        #print(flattened_output.shape)
        
        # concat 40 features with flattened image
        #print(features.shape, flattened_output.shape)
        concated = torch.cat((features, flattened_output), 1)
        
        return self.sequential(concated)

## Training

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device}")
use_gpu = (device != "cpu")

Using cpu


In [16]:
def train_model(model, optimizer, criterion = None, scheduler = None, dataloaders = dataloaders, num_epochs = 1, patience = 10, output_filename = "best_model.pth"):    
    since = time.time()
    
    # Initialize best metrics tracking
    best_metrics = {
        'epoch': 0,
        'val_loss': float('inf'),
        'val_accuracy': 0,
        'val_precision': 0,
        'val_recall': 0,
        'val_f1': 0,
    }
    
    best_model_wts = copy.deepcopy(model.state_dict())
    
    if not criterion:
        criterion = nn.CrossEntropyLoss()
    
    # use patience for early stopping when validation isnt getting better
    patience_left = patience

    for epoch in range(num_epochs):
        epoch_start_time = time.time()
        
        for phase in ["train", "val"]:
            if phase == "train":
                model.train()
            else:
                model.eval()
            
            running_loss = 0.0
            all_preds = []
            all_labels = []
            
            for image, labels, features in dataloaders[phase]:
                if use_gpu:
                    labels = labels.to(device)
                
                optimizer.zero_grad()
                
                # forward
                with torch.set_grad_enabled(phase == "train"):
                    outputs = model(image, features)
                    preds = torch.argmax(outputs, 1)
                    loss = criterion(outputs, labels)
                    
                    if phase == "train":
                        loss.backward()
                        optimizer.step()
                
                running_loss += loss.item() * len(image)
                #print(running_loss, running_corrects)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
            
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_accuracy = accuracy_score(all_labels, all_preds)
            epoch_precision = precision_score(all_labels, all_preds, zero_division=0, average='macro')
            epoch_recall = recall_score(all_labels, all_preds, zero_division=0, average='macro')
            epoch_f1 = f1_score(all_labels, all_preds, zero_division=0, average='macro')

            epoch_time = time.time() - epoch_start_time

            # deep copy the model if it's best so far
            if phase == "val" and (epoch_loss < best_metrics['val_loss'] or epoch_recall > best_metrics['val_recall']):
                best_metrics.update({
                    'epoch': epoch + 1,
                    'val_loss': epoch_loss,
                    'val_accuracy': epoch_accuracy,
                    'val_precision': epoch_precision,
                    'val_recall': epoch_recall,
                    'val_f1': epoch_f1,
                })
                
                best_model_wts = copy.deepcopy(model.state_dict())
                torch.save(model.state_dict(), output_filename)
                patience_left = patience
            elif phase == "val":
                patience_left -= 1
            
            print(f'Epoch {epoch}/{num_epochs - 1} {phase} complete in {epoch_time:.4f} seconds. {phase} loss: {epoch_loss:.4f} recall: {epoch_recall:.4f}. Patience left: {patience_left}')
            
        if patience_left <= 0:
            print("Ran out of patience. Stopping early")
            break
        
        if scheduler:
            scheduler.step()
    
    time_elapsed = time.time() - since
    print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    print(f"Best Metrics at Epoch {best_metrics['epoch']}:")
    for metric, value in best_metrics.items():
        if metric != 'epoch':
            print(f"{metric.capitalize()}: {value:.4f}")

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [17]:
model = VITConcatNeuralNetworkModel(depth = 8, width = 16)
optimizer = optim.Adam(model.parameters(), lr = 0.001)

TypeError: list is not a Module subclass

In [None]:
train_model(model, optimizer, output_filename = "vit_multiclass_concat_nn_depth8_width16.pth")

Epoch 0/0 train complete in 4008.3923 seconds. train loss: 0.3308 recall: 0.9186. Patience left: 10
Epoch 0/0 val complete in 4817.2729 seconds. val loss: 0.3391 recall: 0.9277. Patience left: 10
Training complete in 80m 19s
Best Metrics at Epoch 1:
Val_loss: 0.3391
Val_accuracy: 0.9487
Val_precision: 0.9240
Val_recall: 0.9277
Val_f1: 0.9253


VITConcatModel(
  (model): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out_f