# Prepare the Resnet

In [12]:
# ! pip uninstall transformers -y
# ! pip install transformers==2.2.0

In [1]:
# Install and import necessary packages
! pip install transformers datasets
# ! pip install --upgrade transformers
# ! pip install transformers datasets av
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git

from transformers import DetrImageProcessor, DetrForObjectDetection
import torch
import torchvision
import torchvision.transforms.functional as F
from PIL import Image

Defaulting to user installation because normal site-packages is not writeable


# Define the new X-Clip class with Resnet

In [22]:
# Import necessary modules first
import torch
batch_size = 10

class X_Clip_mod(torch.nn.Module):

    def __init__(self, device, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        
        self.device = device
        
        original_model = torch.load("X-CLIP.bin", map_location = "cpu")
        
        # original_model.load_state_dict(params['state_dict'])
        # And move model to GPU
        original_model = original_model.to(device)
        
        self.resnet = torch.load("Resnet 101.bin")
        self.preprocessor = DetrImageProcessor.from_json_file("preprocessor_config.json")

        # Freeze the Resnet model parameters
        for param in self.resnet.parameters():
            param.requires_grad = False

        # Extract the vision model and make change to its parameters
        vision = original_model.base_model.vision_model
        vision.embeddings.patch_embedding = torch.nn.Conv2d(2048, 768, kernel_size=(1, 1), stride=(1, 1), bias=False)
        vision.embeddings.position_embedding = torch.nn.Embedding(1009, 768)
        vision.embeddings.position_ids = torch.arange(1009).expand((1, -1))

        # Shrink the vision encoder to 2 layers as they only need to digest the ResNet feautures, not extracting features
        # Also, change their number of frames to a typical 10 for processed data.
        vision.encoder.layers = vision.encoder.layers[0:2]
        for layer in vision.encoder.layers:
            layer.num_frames = batch_size

        self.vision = vision
        del vision

        # The visual projection module can be used as is
        self.visual_projection = original_model.base_model.visual_projection

        # Extract and change the position embedding size of the mit
        mit = original_model.base_model.mit
        mit.position_embedding = torch.nn.Parameter(torch.rand(1, batch_size, 512))

        self.mit = mit
        del mit

        # And finally the final score prediction layer
        self.scoring = torch.nn.Sequential(torch.nn.Linear(512, 1), torch.nn.Sigmoid())

    def forward(self, frames) -> torch.Tensor:
        '''
        Frames shape: (1, num_frames, height, width, RBG_channels)
        Output shape: (1, num_frames, 1)
        '''
        inputs = self.preprocessor(frames.squeeze(dim=0), return_tensors="pt")
        
        inputs = inputs.to(self.device)
        
        features, _ = self.resnet(**inputs)
        features, _ = features[-1]
        
        features = features.to(self.device)
        
        vision_output = self.vision(features)
        del features

        video_embeds = vision_output[1]
        video_embeds = self.visual_projection(video_embeds)

        cls_features = video_embeds.view(1, batch_size, -1)

        mit_output = self.mit(cls_features)

        video_embeds = mit_output[0]

        scores = self.scoring(video_embeds)
        scores = 4*scores + 1
        return scores

# Define the dataloader to be used for training

In [23]:
import json

class X_Clip_Dataset(torch.utils.data.Dataset):

    def __init__(self, device, mode="test") -> None:
        super().__init__()

        if mode == "train":
            with open("dataset/TVSum/down-sampled frames/train.json", 'r') as j:
                self.frames = json.load(j)
        else:
            with open("dataset/TVSum/down-sampled frames/test.json", 'r') as j:
                self.frames = json.load(j)

        self.device = device


    def __getitem__(self, i):
        # Read the images
        frames_locs = self.frames[i]
        # frame_set = torch.stack([F.pil_to_tensor(Image.open(item)).permute(1, 2, 0) for item in frames_locs], dim=0)
        frame_set = torch.stack([F.pil_to_tensor(Image.open(item).resize((640, 360))).permute(1, 2, 0) for item in frames_locs], dim=0)

        # Read the score tensor
        path_items = frames_locs[0].split('/')
        target = path_items[-2]
        index = int(path_items[-1].rstrip('.jpg'))
        score = torch.load(f"dataset/TVSum/ground truth/{target}.pt").mean(axis=0)
        if index+batch_size > len(score):
            score = score[-batch_size:]
        else:
            score = score[index:index+batch_size]
        score = score.reshape((batch_size, 1))
        return frame_set, score.float()

    def __len__(self):
        return len(self.frames)

# Start training

In [24]:
# Some libraries and model parameters
import os

checkpoint = 'X_Clip_mod.bin'
lr = 0.00001
workers = 1
iterations = 10000
start_epoch = 0

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Running on {device}")


Running on cuda


In [25]:
def train(train_loader, model, criterion, optimizer, epoch):
    """
    One epoch's training.

    :param train_loader: DataLoader for training data
    :param model: model
    :param criterion: content loss function (Mean Squared-Error loss)
    :param optimizer: optimizer
    :param epoch: epoch number
    """
    model.train()  # training mode enables batch normalization
    # Batches
    for i, (features, scores) in enumerate(train_loader):
        print(f'iteration: {i}')
        # Move to default device
        
        features = features.to(device)  # (batch_size=1, 10, 2048, 24, 42)
        
        scores = scores.to(device)  # (batch_size=1, 10, 1)
        # Forward prop.
        predict_scores = model(features)  # (batch_size=1, 10, 1)
        print(predict_scores, scores)
        
        # Loss
        loss = criterion(predict_scores, scores)  # scalar
        
        # Backward prop.
        optimizer.zero_grad()
        loss.backward()
        
        # Update model
        optimizer.step()

        # Print status
        if i % batch_size == 0:
            print('Epoch: [{0}][{1}/{2}]----'
                  'Loss {loss:.4f}'.format(epoch, i, len(train_loader), loss=loss))

            # Save checkpoint
            torch.save({'epoch': epoch,
                        'model': model,
                        'optimizer': optimizer},
                        'X_Clip_mod.bin')
            
    del features, scores, predict_scores  # free some memory since their histories may be stored

# Initialize model or load checkpoint
if not os.path.exists(checkpoint):
    model = X_Clip_mod(device)

    # Initialize the optimizer
    optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, model.parameters()), lr=lr)

else:
    checkpoint = torch.load(checkpoint, map_location=device)
    start_epoch = checkpoint['epoch'] + 1
    model = checkpoint['model']
    optimizer = checkpoint['optimizer']

# Move to default device
model = model.to(device)
criterion = torch.nn.MSELoss().to(device)

# Custom dataloaders
train_dataset = X_Clip_Dataset(device)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=workers, pin_memory=True)  # note that we're passing the collate function here

# Total number of epochs to train for
epochs = int(iterations // len(train_loader) + 1)

In [26]:
epochs = 20
for epoch in range(start_epoch, epochs):
    # One epoch's training
    train(train_loader=train_loader,
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            epoch=epoch)

iteration: 0
tensor([[[1.8214],
         [1.8374],
         [1.7793],
         [1.7956],
         [1.8043],
         [1.8997],
         [1.8051],
         [1.8624],
         [1.8891],
         [1.8348]]], device='cuda:0', grad_fn=<AddBackward0>) tensor([[[2.],
         [2.],
         [2.],
         [2.],
         [2.],
         [2.],
         [2.],
         [2.],
         [2.],
         [2.]]], device='cuda:0')
Epoch: [18][0/2079]----Loss 0.0294
iteration: 1
tensor([[[1.8167],
         [1.8347],
         [1.8245],
         [1.8288],
         [1.7987],
         [1.8149],
         [1.7727],
         [1.8457],
         [1.8459],
         [1.8493]]], device='cuda:0', grad_fn=<AddBackward0>) tensor([[[1.4000],
         [1.4000],
         [1.4000],
         [1.4000],
         [1.4000],
         [1.4000],
         [1.4000],
         [1.4000],
         [1.4000],
         [1.4000]]], device='cuda:0')
iteration: 2
tensor([[[2.4201],
         [2.4493],
         [2.4168],
         [2.3433],
      

KeyboardInterrupt: 

# Evaluation

In [13]:
def evaluate(test_loader, model, criterion, threshold=0.1):
    """
    Evaluate the model.

    :param test_loader: DataLoader for test data
    :param model: model
    :param criterion: loss function
    """
    model.eval()  # evaluation mode disables dropout
    total_loss = 0.0
    total_corrects = 0

    # No need to track gradients for validation, we're not optimizing.
    with torch.no_grad():
        for i, (features, scores) in enumerate(test_loader):
            features = features.to(device)
            scores = scores.to(device)

            # Forward prop.
            predict_scores = model(features)

            # Loss
            loss = criterion(predict_scores, scores)

            total_loss += loss.item()

            # Calculate accuracy            
            binary_predictions = (loss < threshold).float()
            total_corrects += (binary_predictions == 1)

        avg_loss = total_loss / len(test_loader)
        avg_acc = total_corrects.double() / len(test_loader.dataset)

    return avg_loss, avg_acc

test_dataset = X_Clip_Dataset(device)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=True, num_workers=workers, pin_memory=True)
print(evaluate(test_loader, model, criterion))

(0.2644921514249533, tensor(0.5767, device='cuda:0', dtype=torch.float64))


# Calcualte F1 score

In [26]:
import numpy as np
from sklearn.metrics import f1_score

def evaluate(model, criterion, data_loader):
    model.eval()
    all_predictions = []
    all_targets = []
    with torch.no_grad():
        for features, scores in data_loader:
            features = features.to(device)
            scores = scores.to(device)
            predict_scores = model(features)
            # Round up the scores
            predicted_labels = np.round(predict_scores.cpu().numpy())
            scores_labels = np.round(scores.cpu().numpy())
            # append labels
            all_predictions.extend(predicted_labels)
            all_targets.extend(scores_labels)
            # print(all_predictions)
            # print(all_targets)
    # convert to 1-d array
    all_predictions = np.array(all_predictions).flatten()
    all_targets = np.array(all_targets).flatten()
    f1 = f1_score(all_targets, all_predictions, average='micro')  # Calculate F1 score
    return f1

test_dataset = X_Clip_Dataset(device)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=True, num_workers=workers, pin_memory=True)
f1_score = evaluate(model, criterion, test_loader)
print("F1 Score:", f1_score) 

F1 Score: 0.635
