# Convert AutoAVSR label files to AV-HuBERT / MuAViC label files 

In [None]:
import os
root = '/data/sls/scratch/roudi/datasets/lrs2'
output_root = os.path.join(root, 'whisper-flamingo')

In [None]:
import pandas as pd
def read_label_file(file):
    # Read the CSV file
    df = pd.read_csv(file)
    # Extract the second column as a list
    video_list = df.iloc[:, 1].tolist()
    video_list = [os.path.join(root, 'lrs2', i) for i in video_list]
    return video_list

['/data/sls/scratch/roudi/datasets/lrs2/lrs2/lrs2_video_seg24s/main/6330311066473698535/00018.mp4', '/data/sls/scratch/roudi/datasets/lrs2/lrs2/lrs2_video_seg24s/main/6330311066473698535/00022.mp4', '/data/sls/scratch/roudi/datasets/lrs2/lrs2/lrs2_video_seg24s/main/6330311066473698535/00025.mp4', '/data/sls/scratch/roudi/datasets/lrs2/lrs2/lrs2_video_seg24s/main/6331559613336179781/00019.mp4', '/data/sls/scratch/roudi/datasets/lrs2/lrs2/lrs2_video_seg24s/main/6331559613336179781/00020.mp4', '/data/sls/scratch/roudi/datasets/lrs2/lrs2/lrs2_video_seg24s/main/6331559613336179781/00021.mp4', '/data/sls/scratch/roudi/datasets/lrs2/lrs2/lrs2_video_seg24s/main/6331559613336179781/00027.mp4', '/data/sls/scratch/roudi/datasets/lrs2/lrs2/lrs2_video_seg24s/main/6331559613336179781/00029.mp4', '/data/sls/scratch/roudi/datasets/lrs2/lrs2/lrs2_video_seg24s/main/6331559613336179781/00036.mp4', '/data/sls/scratch/roudi/datasets/lrs2/lrs2/lrs2_video_seg24s/main/6331559613336179781/00038.mp4', '/data/sl

In [3]:
# from https://github.com/facebookresearch/av_hubert/blob/main/avhubert/preparation/count_frames.py
from tqdm import tqdm
import cv2
from scipy.io import wavfile

def count_frames(fids):
    total_num_frames = []
    for fid in tqdm(fids):
        video_fn = fid
        wav_fn = video_fn.replace('.mp4', '.wav')
        num_frames_audio = len(wavfile.read(wav_fn)[1])
        cap = cv2.VideoCapture(video_fn)
        num_frames_video = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        total_num_frames.append([num_frames_audio, num_frames_video])
    return total_num_frames

In [None]:
def write_manifest(output_file, video_list, total_num_frames): 
    with open(os.path.join(output_root, output_file), 'w') as fo:
        fo.write('/\n')
        for i in range(len(total_num_frames)):
            fid = video_list[i].replace('/data/sls/scratch/roudi/datasets/lrs2/lrs2/', '')
            fo.write('\t'.join([fid, 
                                video_list[i], 
                                video_list[i].replace('.mp4', '.wav'), 
                                str(total_num_frames[i][1]), 
                                str(total_num_frames[i][0])]
                                ) + '\n')

In [None]:
test_label_file = os.path.join(root, 'labels/lrs2_test_transcript_lengths_seg24s.csv')
video_list = read_label_file(test_label_file)
total_num_frames = count_frames(video_list)
write_manifest('test.tsv', video_list, total_num_frames)

In [None]:
valid_label_file = os.path.join(root, 'labels/lrs2_valid_transcript_lengths_seg24s.csv')
video_list = read_label_file(valid_label_file)
total_num_frames = count_frames(video_list)
write_manifest('valid.tsv', video_list, total_num_frames)

100%|██████████| 1081/1081 [00:12<00:00, 88.22it/s]


In [None]:
train_label_file = os.path.join(root, 'labels/lrs2_train_transcript_lengths_seg24s.csv')
video_list = read_label_file(train_label_file)
total_num_frames = count_frames(video_list)
write_manifest('train.tsv', video_list, total_num_frames)

100%|██████████| 143683/143683 [2:16:20<00:00, 17.57it/s]  


# Make text labels

In [None]:
def load_sentences(video_list):
    text_file_list = [i.replace('video', 'text').replace('.mp4', '.txt') for i in video_list]
    sentences = []
    for text_file in text_file_list:
        with open(os.path.join(root, text_file), 'r') as file:
            first_line = file.readline().strip().lower()
            sentences.append(first_line)
    return sentences

In [None]:
def write_txt(output_file, sentences):
    with open(os.path.join(output_root, output_file), 'w') as file:
        for sentence in sentences:
            file.write(sentence + '\n')

In [6]:
valid_label_file = os.path.join(root, 'labels/lrs2_valid_transcript_lengths_seg24s.csv')
video_list = read_label_file(valid_label_file)
sentences = load_sentences(video_list)
write_txt('valid.wrd', sentences)

In [7]:
test_label_file = os.path.join(root, 'labels/lrs2_test_transcript_lengths_seg24s.csv')
video_list = read_label_file(test_label_file)
sentences = load_sentences(video_list)
write_txt('test.wrd', sentences)

In [8]:
train_label_file = os.path.join(root, 'labels/lrs2_train_transcript_lengths_seg24s.csv')
video_list = read_label_file(train_label_file)
sentences = load_sentences(video_list)
write_txt('train.wrd', sentences)