# Hands-on tutorial for AV-HuBERT

In this notebook, we showcase how to use pre-trained models for:
* lip reading
* feature extraction

## Preliminaries
This section installs the Python packages required by the other sections. Run it first.

In [1]:
import os 

%cd /srv/storage/talc3@storage4.nancy.grid5000.fr/multispeech/calcul/users/jayilo

!git clone https://github.com/facebookresearch/av_hubert.git

%cd av_hubert
!git submodule init
!git submodule update
!pip install scipy
!pip install sentencepiece
!pip install python_speech_features
!pip install scikit-video

%cd fairseq
!pip install ./

/srv/storage/talc3@storage4.nancy.grid5000.fr/multispeech/calcul/users/jayilo
Cloning into 'av_hubert'...
remote: Enumerating objects: 146, done.[K
remote: Counting objects: 100% (35/35), done.[K
remote: Compressing objects: 100% (23/23), done.[K
remote: Total 146 (delta 17), reused 25 (delta 12), pack-reused 111[K
Receiving objects: 100% (146/146), 4.65 MiB | 1.37 MiB/s, done.
Resolving deltas: 100% (63/63), done.
/srv/storage/talc3@storage4.nancy.grid5000.fr/multispeech/calcul/users/jayilo/av_hubert
Submodule 'fairseq' (https://github.com/pytorch/fairseq) registered for path 'fairseq'
Cloning into '/srv/storage/talc3@storage4.nancy.grid5000.fr/multispeech/calcul/users/jayilo/av_hubert/fairseq'...
Submodule path 'fairseq': checked out 'afc77bdf4bb51453ce76f1572ef2ee6ddcda8eeb'
/srv/storage/talc3@storage4.nancy.grid5000.fr/multispeech/calcul/users/jayilo/av_hubert/fairseq
Processing /srv/storage/talc3@storage4.nancy.grid5000.fr/multispeech/calcul/users/jayilo/av_hubert/fairseq
  In

## Import a pre-trained model
This section illustrates how to load a pre-trained model and use it for inference.

1. Download a model checkpoint

In [2]:
!pwd
%mkdir -p /srv/storage/talc3@storage4.nancy.grid5000.fr/multispeech/calcul/users/jayilo/pretrained_model
!wget https://dl.fbaipublicfiles.com/avhubert/model/lrs3_vox/vsr/base_vox_433h.pt -O /srv/storage/talc3@storage4.nancy.grid5000.fr/multispeech/calcul/users/jayilo/pretrained_model/finetune-model.pt

/srv/storage/talc3@storage4.nancy.grid5000.fr/multispeech/calcul/users/jayilo/av_hubert/fairseq
--2023-06-05 16:09:31--  https://dl.fbaipublicfiles.com/avhubert/model/lrs3_vox/vsr/base_vox_433h.pt
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 18.164.52.45, 18.164.52.29, 18.164.52.20, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|18.164.52.45|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1928063847 (1.8G) [binary/octet-stream]
Saving to: ‘/srv/storage/talc3@storage4.nancy.grid5000.fr/multispeech/calcul/users/jayilo/pretrained_model/finetune-model.pt’


2023-06-05 16:09:38 (249 MB/s) - ‘/srv/storage/talc3@storage4.nancy.grid5000.fr/multispeech/calcul/users/jayilo/pretrained_model/finetune-model.pt’ saved [1928063847/1928063847]



2. Extract visual features with the model

In [1]:
import numpy as np 
import matplotlib.pyplot as plt
import torch

import cv2
import tempfile
import torch
import utils as avhubert_utils
from argparse import Namespace
import fairseq
from fairseq import checkpoint_utils, options, tasks, utils
#from IPython.display import HTML

In [2]:
def load_array(path, frame_size=67):
  """Load a stack of square frames from a ``.npy`` file as a float32 tensor.

  The stored data is reshaped to ``(frame_size, frame_size, num_frames)``,
  so any array whose element count is a multiple of ``frame_size ** 2`` is
  accepted. The two spatial axes are then swapped and the frame axis is moved
  to the front.

  Args:
    path: File to read with ``np.load``.
    frame_size: Side length of each square frame. Defaults to 67, the value
      that was previously hard-coded.

  Returns:
    A ``torch.float32`` tensor of shape ``(num_frames, frame_size, frame_size)``
    in which ``out[t, i, j]`` equals element ``[j, i, t]`` of the reshaped
    input.

  Raises:
    ValueError: If the array cannot be reshaped to
      ``(frame_size, frame_size, -1)``.
  """
  raw = np.load(path).reshape(frame_size, frame_size, -1)
  # (a, b, t) -> (t, b, a). This single transpose is the composition of the
  # two transposes (1, 0, 2) then (2, 0, 1) that were applied before.
  frames = torch.from_numpy(np.transpose(raw, axes=(2, 1, 0)))
  # .float() converts from any numeric dtype; the legacy
  # torch.FloatTensor(tensor) constructor only accepts float32 tensors.
  return frames.float()

def load_array_2(path,ind=0):
  """Show one slice of a ``.npy`` frame stack as a grayscale image.

  The file is loaded, reshaped to ``(67, 67, -1)`` and its first two axes are
  swapped. The result is passed through torchvision's ``ToTensor`` and the
  entry at index ``ind`` along the first axis of that tensor is displayed.
  NOTE(review): ``ToTensor`` is expected to move the last array axis to the
  front, which would make ``ind`` a frame index; confirm against torchvision.

  Args:
    path: File to read with ``np.load``.
    ind: Index along the first axis of the converted tensor. Defaults to 0.

  Returns:
    None. The function only draws a matplotlib figure.
  """
  # Imported here so torchvision is only needed when this helper is called.
  import torchvision.transforms as trf

  stacked = np.load(path).reshape(67, 67, -1)
  swapped = np.transpose(stacked, axes=(1, 0, 2))
  as_tensor = trf.ToTensor()(swapped)

  selected = as_tensor[ind]
  plt.imshow(selected, cmap='gray')
  plt.show()


In [3]:

def extract_visual_feature(video_path, ckpt_path, user_dir, is_finetune_ckpt=False):
  """Extract visual features from a mouth-ROI frame stack with an AV-HuBERT checkpoint.

  Args:
    video_path: Path to the ``.npy`` frame stack; it is read with ``load_array``.
    ckpt_path: Path to the checkpoint passed to fairseq's
      ``load_model_ensemble_and_task``.
    user_dir: Directory registered with fairseq via ``import_user_module``
      before the checkpoint is loaded.
    is_finetune_ckpt: Unused, kept for backward compatibility. Whether the
      checkpoint is fine-tuned is detected from the presence of a ``decoder``
      attribute on the loaded model.

  Returns:
    The first output of ``model.extract_finetune`` with its leading dimension
    squeezed out when that dimension has size 1.
    NOTE(review): presumably ``(num_frames, feature_dim)`` - confirm.
  """
  utils.import_user_module(Namespace(user_dir=user_dir))
  models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task([ckpt_path])
  print(f"task, {task}")
  print(f"saved_cfg, {saved_cfg}")

  # Pick the device once: use the GPU when there is one, otherwise fall back
  # to the CPU instead of crashing on an unconditional .cuda() call.
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  frames = load_array(video_path)
  # Add two leading singleton dimensions in front of the frame stack before
  # handing it to the model.
  frames = torch.as_tensor(frames, dtype=torch.float32)
  frames = frames.unsqueeze(dim=0).unsqueeze(dim=0).to(device)

  model = models[0]
  if hasattr(model, 'decoder'):
    # Fine-tuned checkpoints have a decoder; features come from the wrapped
    # encoder model.
    print("Checkpoint: fine-tuned")
    model = model.encoder.w2v_model
  else:
    print("Checkpoint: pre-trained w/o fine-tuning")
  model.to(device)
  model.eval()
  with torch.no_grad():
    # Specify output_layer if you want to extract feature of an intermediate layer
    feature, _ = model.extract_finetune(source={'video': frames, 'audio': None}, padding_mask=None, output_layer=None)
    feature = feature.squeeze(dim=0)
  return feature

In [4]:
# Every input lives under the same storage root, so spell it out once.
storage_root = "/srv/storage/talc3@storage4.nancy.grid5000.fr/multispeech/calcul/users/jayilo"

# Frame stack to process, checkpoint to load, and the av_hubert user module
# directory that fairseq must import before loading the checkpoint.
mouth_roi_path = f"{storage_root}/sa1Raw.npy"
ckpt_path = f"{storage_root}/pretrained_model/finetune-model.pt"
user_dir = f"{storage_root}/av_hubert/avhubert"

feature = extract_visual_feature(
  video_path=mouth_roi_path,
  ckpt_path=ckpt_path,
  user_dir=user_dir,
)

task, <avhubert.hubert_pretraining.AVHubertPretrainingTask object at 0x7efb90b96340>
saved_cfg, {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 200, 'log_format': 'json', 'log_file': None, 'tensorboard_logdir': 'tblog', 'wandb_project': None, 'azureml_logging': False, 'seed': 1337, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': '/private/home/bshi/code/fairseq-py/examples/av_hubert/model', 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_

In [5]:
# Display the shape of the tensor returned by extract_visual_feature.
feature.shape

torch.Size([155, 768])