In [3]:
# Inflate the facebook/dino-vitb16 image transformer (ViT) checkpoint into a video transformer (ViViT)
# checkpoint configured for 32-frame clips. The result is written to ./vivit_dino_32_untrained, which the
# 32-frame training cells below load for their config / starting weights.
# NOTE(review): --tubelet_n 2 presumably sets the temporal tubelet size -- confirm in convert_vit_to_vivit.py.
!python3 vivit_transformers/convert_vit_to_vivit.py --vit_model_path facebook/dino-vitb16 --tubelet_n 2 --video_length 32 --output_path vivit_dino_32_untrained

Some weights of ViTModel were not initialized from the model checkpoint at facebook/dino-vitb16 and are newly initialized: ['pooler.dense.weight', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from data import VIVIT_UCF101
from transformers import ViTFeatureExtractor, TrainingArguments, Trainer
from vivit_transformers import ViViTForImageClassification
import joblib
import os
from sklearn.metrics import accuracy_score


def compute_metrics(data):
    """Return the top-1 accuracy of one evaluation pass.

    Passed to ``Trainer`` as its ``compute_metrics`` callback.

    Args:
        data: object exposing ``predictions`` (per-class scores; the arg-max
            over the last axis is taken as the predicted class) and
            ``label_ids`` (the reference labels).

    Returns:
        dict with a single key ``'accuracy'``.
    """
    predicted_classes = data.predictions.argmax(-1)
    top1_accuracy = accuracy_score(data.label_ids, predicted_classes)
    return {'accuracy': top1_accuracy}

# Train ViViT with random weights

In [None]:
from vivit_transformers import ViViTConfig

# Baseline run: ViViT trained on UCF-101 starting from random weights.
# Only the configuration is read from the inflated checkpoint directory; the model below is
# constructed from that config alone rather than via from_pretrained.
config = ViViTConfig.from_pretrained("./vivit_dino_32_untrained", num_labels=101)

# ViViT uses the same feature extractor as ViT (cropping, normalisation, etc.)
feature_extractor = ViTFeatureExtractor.from_pretrained('facebook/dino-vitb16')

# NOTE(review): positional args are presumably (video root, split-list dir, split index, is_train)
# -- confirm against data.VIVIT_UCF101.
dataset_train = VIVIT_UCF101('UCF-101', 'ucfTrainTestlist', 1, True, feature_extractor=feature_extractor)
dataset_test = VIVIT_UCF101('UCF-101', 'ucfTrainTestlist', 1, False, feature_extractor=feature_extractor)

model = ViViTForImageClassification(config)

# Train the whole network: explicitly mark every parameter as trainable.
for p in model.parameters():
    p.requires_grad = True


# Effective batch per device = BATCH_SIZE * GRAD_ACC_STEPS = 12 samples per optimizer step.
BATCH_SIZE = 6
GRAD_ACC_STEPS = 2

training_args = TrainingArguments(
    output_dir='./vivit_dino_32frames_random',   # checkpoints for this run
    num_train_epochs=15,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Integer division: warmup_steps is a step count, so pass an int (5000) instead of the
    # float (5000.0) that `/` produces.
    # NOTE(review): confirm this warmup is shorter than the total number of optimizer steps
    # of the run, otherwise the learning rate never reaches `learning_rate`.
    warmup_steps=10000 // GRAD_ACC_STEPS,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_strategy='steps',
    logging_steps=500,
    # Evaluate and checkpoint once per epoch; keep a single checkpoint on disk.
    evaluation_strategy='epoch',
    save_strategy="epoch",
    save_total_limit=1,
    # NOTE(review): no metric_for_best_model is set, so the Trainer's default selection metric
    # is used rather than the accuracy from compute_metrics -- confirm this is intended.
    load_best_model_at_end=True,
    fp16=True,
    dataloader_pin_memory=True,
    dataloader_num_workers=16,
    learning_rate=5e-4,                          # 10x the rate used by the DINO-initialised runs
    label_smoothing_factor=0.15,
    gradient_accumulation_steps=GRAD_ACC_STEPS,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    compute_metrics=compute_metrics,
)

trainer.train()

# Train ViViT with DINO weights

In [None]:
# Fine-tuning run: ViViT initialised from the inflated DINO checkpoint, trained on UCF-101
# with the dataset's default frame sampling (no frame_sampler is passed).
feature_extractor = ViTFeatureExtractor.from_pretrained('facebook/dino-vitb16')

# NOTE(review): positional args are presumably (video root, split-list dir, split index, is_train)
# -- confirm against data.VIVIT_UCF101.
dataset_train = VIVIT_UCF101('UCF-101', 'ucfTrainTestlist', 1, True, feature_extractor=feature_extractor)
dataset_test = VIVIT_UCF101('UCF-101', 'ucfTrainTestlist', 1, False, feature_extractor=feature_extractor)

# Load the weights written by convert_vit_to_vivit.py, with a 101-way classification head.
model = ViViTForImageClassification.from_pretrained('./vivit_dino_32_untrained', num_labels=101)

# Fine-tune the whole network: explicitly mark every parameter as trainable.
for p in model.parameters():
    p.requires_grad = True


# Effective batch per device = BATCH_SIZE * GRAD_ACC_STEPS = 12 samples per optimizer step.
BATCH_SIZE = 6
GRAD_ACC_STEPS = 2

training_args = TrainingArguments(
    output_dir='./vivit_dino_32frames',          # checkpoints for this run
    num_train_epochs=10,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Integer division: warmup_steps is a step count, so pass an int (5000) instead of the
    # float (5000.0) that `/` produces.
    # NOTE(review): confirm this warmup is shorter than the total number of optimizer steps
    # of the run, otherwise the learning rate never reaches `learning_rate`.
    warmup_steps=10000 // GRAD_ACC_STEPS,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_strategy='steps',
    logging_steps=500,
    # Evaluate and checkpoint once per epoch; keep a single checkpoint on disk.
    evaluation_strategy='epoch',
    save_strategy="epoch",
    save_total_limit=1,
    # NOTE(review): no metric_for_best_model is set, so the Trainer's default selection metric
    # is used rather than the accuracy from compute_metrics -- confirm this is intended.
    load_best_model_at_end=True,
    fp16=True,
    dataloader_pin_memory=True,
    dataloader_num_workers=16,
    learning_rate=5e-5,
    label_smoothing_factor=0.15,
    gradient_accumulation_steps=GRAD_ACC_STEPS,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    compute_metrics=compute_metrics,
)

trainer.train()

# Train ViViT with DINO weights with 32 keyframes

In [None]:
# Fine-tuning run: ViViT initialised from the inflated DINO checkpoint, trained on UCF-101
# with 32 pre-selected keyframes per video supplied through frame_sampler.
feature_extractor = ViTFeatureExtractor.from_pretrained('facebook/dino-vitb16')

# Pre-computed keyframe selections for the train / test lists.
# joblib.load unpickles its input: only load keyframe files you generated yourself or trust.
train_kfs = joblib.load("keyframes/train_01_32frames.pkl")
test_kfs = joblib.load("keyframes/test_01_32frames.pkl")

# NOTE(review): positional args are presumably (video root, split-list dir, split index, is_train)
# -- confirm against data.VIVIT_UCF101.
dataset_train = VIVIT_UCF101('UCF-101', 'ucfTrainTestlist', 1, True, feature_extractor=feature_extractor, frame_sampler=train_kfs)
dataset_test = VIVIT_UCF101('UCF-101', 'ucfTrainTestlist', 1, False, feature_extractor=feature_extractor, frame_sampler=test_kfs)

# Load the weights written by convert_vit_to_vivit.py, with a 101-way classification head.
model = ViViTForImageClassification.from_pretrained('./vivit_dino_32_untrained', num_labels=101)

# Fine-tune the whole network: explicitly mark every parameter as trainable.
for p in model.parameters():
    p.requires_grad = True


# Effective batch per device = BATCH_SIZE * GRAD_ACC_STEPS = 12 samples per optimizer step.
BATCH_SIZE = 6
GRAD_ACC_STEPS = 2

training_args = TrainingArguments(
    # Dedicated directory for the keyframe run. It previously reused './vivit_dino_32frames',
    # the output directory of the plain DINO fine-tuning run, so both runs' checkpoints shared
    # one folder (with save_total_limit=1 rotating them). The name parallels the 10-keyframe
    # run's './vivit_dino_10frames_kmeans'.
    output_dir='./vivit_dino_32frames_kmeans',
    num_train_epochs=10,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Integer division: warmup_steps is a step count, so pass an int (5000) instead of the
    # float (5000.0) that `/` produces.
    # NOTE(review): confirm this warmup is shorter than the total number of optimizer steps
    # of the run, otherwise the learning rate never reaches `learning_rate`.
    warmup_steps=10000 // GRAD_ACC_STEPS,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_strategy='steps',
    logging_steps=500,
    # Evaluate and checkpoint once per epoch; keep a single checkpoint on disk.
    evaluation_strategy='epoch',
    save_strategy="epoch",
    save_total_limit=1,
    # NOTE(review): no metric_for_best_model is set, so the Trainer's default selection metric
    # is used rather than the accuracy from compute_metrics -- confirm this is intended.
    load_best_model_at_end=True,
    fp16=True,
    dataloader_pin_memory=True,
    dataloader_num_workers=16,
    learning_rate=5e-5,
    label_smoothing_factor=0.15,
    gradient_accumulation_steps=GRAD_ACC_STEPS,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    compute_metrics=compute_metrics,
)

trainer.train()

# Train ViViT with DINO weights with 10 keyframes

In [2]:
# Inflate the facebook/dino-vitb16 image transformer (ViT) checkpoint into a video transformer (ViViT)
# checkpoint configured for 10-frame clips. The result is written to ./vivit_dino_10_untrained, which the
# 10-keyframe training cell below loads as its starting weights.
# NOTE(review): --tubelet_n 2 presumably sets the temporal tubelet size -- confirm in convert_vit_to_vivit.py.
!python3 vivit_transformers/convert_vit_to_vivit.py --vit_model_path facebook/dino-vitb16 --tubelet_n 2 --video_length 10 --output_path vivit_dino_10_untrained

Some weights of ViTModel were not initialized from the model checkpoint at facebook/dino-vitb16 and are newly initialized: ['pooler.dense.weight', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from data import VIVIT_UCF101
from transformers import ViTFeatureExtractor, TrainingArguments, Trainer
from vivit_transformers import ViViTForImageClassification
import joblib
import os
from sklearn.metrics import accuracy_score


# Fine-tuning run: ViViT initialised from the 10-frame inflated DINO checkpoint, trained on
# UCF-101 with 10 pre-selected keyframes per video supplied through frame_sampler.
feature_extractor = ViTFeatureExtractor.from_pretrained('facebook/dino-vitb16')

# Pre-computed keyframe selections for the train / test lists.
# joblib.load unpickles its input: only load keyframe files you generated yourself or trust.
train_kfs = joblib.load("keyframes/train_01_10frames.pkl")
test_kfs = joblib.load("keyframes/test_01_10frames.pkl")

# NOTE(review): positional args are presumably (video root, split-list dir, split index, is_train)
# -- confirm against data.VIVIT_UCF101.
dataset_train = VIVIT_UCF101('UCF-101', 'ucfTrainTestlist', 1, True, feature_extractor=feature_extractor, frame_sampler=train_kfs)
dataset_test = VIVIT_UCF101('UCF-101', 'ucfTrainTestlist', 1, False, feature_extractor=feature_extractor, frame_sampler=test_kfs)

# Override the datasets' clip length so it matches the 10-frame model.
dataset_train.n_frames = 10
dataset_test.n_frames = 10

# Load the weights written by convert_vit_to_vivit.py, with a 101-way classification head.
model = ViViTForImageClassification.from_pretrained('./vivit_dino_10_untrained', num_labels=101)

# Fine-tune the whole network: explicitly mark every parameter as trainable.
for p in model.parameters():
    p.requires_grad = True


# Same 12 samples per optimizer step per device as the 32-frame runs, but without
# gradient accumulation.
BATCH_SIZE = 12
GRAD_ACC_STEPS = 1

training_args = TrainingArguments(
    output_dir='./vivit_dino_10frames_kmeans',   # checkpoints for this run
    num_train_epochs=10,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Integer division: warmup_steps is a step count, so pass an int (10000) instead of the
    # float (10000.0) that `/` produces.
    # NOTE(review): with GRAD_ACC_STEPS=1 the warmup is 10000 optimizer steps, twice that of
    # the 32-frame runs; confirm it is shorter than the total number of optimizer steps of
    # the run, otherwise the learning rate never reaches `learning_rate`.
    warmup_steps=10000 // GRAD_ACC_STEPS,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_strategy='steps',
    logging_steps=500,
    # Evaluate and checkpoint once per epoch; keep a single checkpoint on disk.
    evaluation_strategy='epoch',
    save_strategy="epoch",
    save_total_limit=1,
    # NOTE(review): no metric_for_best_model is set, so the Trainer's default selection metric
    # is used rather than the accuracy from compute_metrics -- confirm this is intended.
    load_best_model_at_end=True,
    fp16=True,
    dataloader_pin_memory=True,
    dataloader_num_workers=16,
    learning_rate=5e-5,
    label_smoothing_factor=0.15,
    gradient_accumulation_steps=GRAD_ACC_STEPS,
)

# compute_metrics is defined in the imports cell near the top of the notebook, not in this
# cell: when starting from a fresh kernel, run that cell before this one.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    compute_metrics=compute_metrics,
)

trainer.train()