In [11]:
import webvtt
from datetime import datetime
from typing import List, Tuple
import re

In [15]:
# Some vtt files do not contain full timestamp information for each word token.
# In these cases, we need to estimate a time span for each word by
# dividing the elapsed time of the entire caption evenly among its words.

# For instance, `data/raw_videos/3EfyAzayxqQ.en.vtt` looks like the following:
# ...
# 00:00:00.680 --> 00:00:03.389
# do you need to be good-looking be an actor?

# 00:00:03.389 --> 00:00:08.410
# yes. thanks for watching. what? come on
# ...

# 1. We need a simple tokenizer that recognizes the period (.), comma (,), question mark (?), and exclamation mark (!)

# Alternative input whose captions carry no per-word timestamps:
# vtt = webvtt.read('../data/raw_videos/3EfyAzayxqQ.en.vtt')
vtt = webvtt.read('../data/raw_videos/91_5g9fTGKM.en.vtt')

# Caption count, taken from the file object and from its caption list.
print(len(vtt))
print(len(vtt.captions))

# Inspect the first caption: type, repr, plain text, time range and raw lines.
first_caption = vtt.captions[0]
print(type(first_caption))

print(first_caption)
print(first_caption.text)
print(first_caption.start, first_caption.end)

print(first_caption.lines)

# Build one string from the raw lines of every caption: lines within a caption
# are space-separated, consecutive captions are concatenated with no separator.
full_text = ''.join(' '.join(caption.lines) for caption in vtt.captions)

# Count the angle-bracket tags (e.g. inline <00:00:00.160> / <c> markers).
match = re.findall('<(.+?)>', full_text)
print(len(match))

351
351
<class 'webvtt.structures.Caption'>
00:00:00.000 00:00:01.589  \nwhat's up my fellow actors kurt you here
 
what's up my fellow actors kurt you here
00:00:00.000 00:00:01.589
[' ', "what's<00:00:00.160><c> up</c><00:00:00.240><c> my</c><00:00:00.320><c> fellow</c><00:00:00.640><c> actors</c><00:00:00.960><c> kurt</c><00:00:01.199><c> you</c><00:00:01.360><c> here</c>"]
0


In [4]:
def str_to_float(time_str: str) -> float:
    """
    Convert a WebVTT timestamp string into a number of seconds.

    :param time_str: timestamp formatted as "HH:MM:SS.fff"; the hours field
        may be omitted ("MM:SS.fff"), which the WebVTT format permits
    :return: the amount of time measured in seconds
    :raises ValueError: if time_str matches neither timestamp format
    """
    # Two colons -> hours present; otherwise fall back to the hour-less form.
    fmt = "%H:%M:%S.%f" if time_str.count(':') == 2 else "%M:%S.%f"
    # Parse once and read every field from the same parsed value.
    parsed = datetime.strptime(time_str, fmt)
    return parsed.microsecond/1e6 + parsed.second + parsed.minute*60 + parsed.hour*3600

In [7]:
# Show start and end (in seconds) of the first caption, then of the last one.
for caption in (vtt.captions[0], vtt.captions[-1]):
    print(str_to_float(caption.start))
    print(str_to_float(caption.end))

0.68
3.3890000000000002
182.39
185.91


In [20]:
def get_average_timespans(line: str, start: float, end: float):
    """
    Tokenize a caption into words and assign each word an equal share of the
    caption's time span.

    :param line: caption text; words may be separated by any whitespace,
        including the newlines found in multi-line captions
    :param start: start of the caption, in seconds
    :param end: end of the caption, in seconds
    :return: a pair (words, time_spans) where words is the list of tokens with
        trailing '.', ',', '?' and '!' removed, and time_spans is a list of
        (span_start, span_end) tuples of the same length; (None, None) when
        the text contains no word tokens
    """
    trailing_punctuation = '.,?!'

    # split() without an argument breaks on any run of whitespace, so a newline
    # between caption lines separates words just like a space does.
    # rstrip removes every trailing punctuation mark ("what?!" -> "what").
    cleaned = (token.rstrip(trailing_punctuation) for token in line.split())
    # Tokens made only of punctuation clean to '' and must not take a time slot.
    words = [word for word in cleaned if word]

    if not words:
        return None, None
    if len(words) == 1:
        # Return the caption bounds untouched to avoid float round-off.
        return words, [(start, end)]

    avg_span = (end - start) / len(words)
    time_spans = [
        (i*avg_span + start, (i+1)*avg_span + start)
        for i in range(len(words))
    ]
    return words, time_spans

In [21]:
# Spread the first caption's duration evenly over its words.
first = vtt.captions[0]
words, time_spans = get_average_timespans(
    first.text,
    str_to_float(first.start),
    str_to_float(first.end),
)
print(words)
print(time_spans)

['do', 'you', 'need', 'to', 'be', 'good-looking', 'be', 'an', 'actor']
[(0.68, 0.9810000000000001), (0.9810000000000001, 1.282), (1.282, 1.5830000000000002), (1.5830000000000002, 1.884), (1.884, 2.185), (2.185, 2.486), (2.486, 2.787), (2.787, 3.088), (3.088, 3.3890000000000002)]
