# Parse raw data

In [1]:
!pip install --user webvtt-py tqdm inflect spacy && spacy download en

Collecting webvtt-py
  Downloading webvtt_py-0.4.5-py3-none-any.whl (16 kB)
Processing /home/raltman/.cache/pip/wheels/9b/04/dd/7daf4150b6d9b12949298737de9431a324d4b797ffd63f526e/docopt-0.6.2-py2.py3-none-any.whl
Installing collected packages: docopt, webvtt-py
Successfully installed docopt-0.6.2 webvtt-py-0.4.5


In [1]:
import re
import pandas as pd
import webvtt as wv
from tqdm import tqdm
from pathlib import Path
from typing import Iterator, Dict, Any

In [113]:
def split_on_pauses(vtt_parser, split_time: float = 2.) -> Iterator[Dict[str, Any]]:
    """Group the captions of one parsed VTT file into segments separated by silence.

    Only odd-indexed captions are read (the even-indexed ones are skipped as
    repeats). Bracketed annotations such as ``[Music]`` are removed and
    whitespace is condensed before the caption text is appended to the
    current segment.

    Args:
        vtt_parser: Iterable of captions exposing ``start_in_seconds``,
            ``end_in_seconds`` and ``text``; must also provide ``file`` and
            ``total_length``.
        split_time: Gap in seconds between the end of the previous caption
            and the start of the next one above which a new segment starts.

    Yields:
        Dicts with keys ``name`` (file name), ``duration`` (``total_length``
        of the file), ``start`` / ``end`` (segment bounds in seconds) and
        ``text`` (the concatenated caption text). Segments without text are
        never yielded.
    """
    name = Path(vtt_parser.file).name
    duration = vtt_parser.total_length
    start, end = 0, 0
    text = ""
    for i, caption in enumerate(vtt_parser):
        if i % 2 == 0:  # Every other line is a new caption
            continue
        caption_start = caption.start_in_seconds
        if text and caption_start - end > split_time:  # Split on extended silence
            yield dict(name=name, duration=duration, start=start, end=end, text=text)
            text = ""
        # Non-greedy match so "[Music] hello [Laughter]" keeps "hello" instead of
        # deleting everything between the first '[' and the last ']'.
        line = re.sub(r'\[.*?\]', '', caption.text)  # Remove [Music] or [Laughter] captions
        line = re.sub(r'\s+', ' ', line).strip()  # Condense whitespace
        if line:
            if not text:
                # A segment starts at the first caption that contributes text,
                # not at 0 or at a preceding annotation-only caption.
                start = caption_start
            text = f"{text} {line}" if text else line
        # Annotation-only captions still advance `end`, so the silence check is
        # measured from the last caption seen.
        end = caption.end_in_seconds
    if text:
        yield dict(name=name, duration=duration, start=start, end=end, text=text)

In [114]:
# Parse every English subtitle file under data/ into pause-delimited segments
# and store them as one tab-separated table.
records = []
# Sort the paths: glob order depends on the filesystem, and both drop_duplicates
# (keeps the first occurrence) and the seeded shuffle used for the train/valid
# split depend on row order.
files = sorted(Path('data').glob('*.en.vtt'))
for file in tqdm(files):
    vtt_parser = wv.read(file)
    records.extend(split_on_pauses(vtt_parser))
data = pd.DataFrame.from_records(records)
data = data.dropna()  # Drop empty lines of text
data = data.drop_duplicates(subset='text')  # Drop lines from duplicate videos
data.to_csv("data/raw_dataset.tsv", sep='\t', index=False)

100%|██████████| 655/655 [00:19<00:00, 34.45it/s]


In [115]:
# Reload the saved segments and report the total video length in hours.
# `duration` is repeated on every segment row of a video, so only the first
# row per video name is counted.
data = pd.read_csv("data/raw_dataset.tsv", sep='\t')
first_row_per_video = data.groupby('name').head(1)
total_seconds = first_row_per_video.duration.sum()
content_duration = total_seconds / (60 * 60)
print(f"Dataset contains {content_duration:.2f} hours of content")

Dataset contains 99.79 hours of content


# Preprocess data

## Preprocess for text generation

In [116]:
import re
import string
import inflect
import spacy

In [117]:
number_to_words = inflect.engine().number_to_words

# Digits, optional thousands groups (",ddd"), optional decimal part (".d+").
# A trailing '.' or ',' is deliberately NOT part of the match, so sentence
# punctuation after a number stays in the text instead of being swallowed
# into the number.
_NUMBER_PATTERN = re.compile(r'[0-9]+(?:,[0-9]{3})*(?:\.[0-9]+)?')


def all_numbers_to_words(text):
    """Return `text` with every numeric literal spelled out in words.

    Thousands separators are stripped before the number is handed to
    `number_to_words`, so "1,000,000" is converted as a single number.
    """
    return _NUMBER_PATTERN.sub(
        lambda match: number_to_words(match.group(0).replace(',', '')),
        text,
    )

In [118]:
%time
data = pd.read_csv("data/raw_dataset.tsv", sep='\t')
data.text = data.text.apply(all_numbers_to_words)  # Convert numbers to words
data.text = data.text.str.replace('-', ' ')  # Remove hyphens
data.to_csv("data/cleaned_dataset.tsv", sep='\t', index=False)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.39 µs


In [119]:
# Do train/valid/test split
# Videos (not individual segments) are shuffled and assigned to a split by
# their cumulative share of the total duration, so every segment of a video
# lands in the same corpus file.
train_frac = 0.9
valid_frac = 0.1
random_seed = 42

video_data = data.groupby('name').head(1)
video_data = video_data.sample(frac=1, random_state=random_seed)
video_names = video_data.name
cumulative_duration = video_data.duration.cumsum()
# Normalise by the last cumulative value (rather than a separately computed
# sum) so the final video's fraction is exactly 1.0.
total_duration = cumulative_duration.iloc[-1] if len(cumulative_duration) else 1.0
video_fracs = cumulative_duration / total_duration
valid_end = train_frac + valid_frac

# The three masks partition every video: the valid range is closed on the right,
# so a video sitting exactly on the valid/test boundary (always the last video
# when the fractions sum to 1) goes to valid instead of being dropped.
train_names = video_names[video_fracs < train_frac]
valid_names = video_names[(train_frac <= video_fracs) & (video_fracs <= valid_end)]
test_names = video_names[valid_end < video_fracs]


def _write_corpus(path, names):
    """Write one line per video in `names`: its segment texts joined by '. '."""
    with open(path, 'w', encoding='utf-8') as fh:
        for name in names:
            lines = data[data.name == name].text
            print(*lines, sep='. ', file=fh)


_write_corpus("data/corpus.train.txt", train_names)
_write_corpus("data/corpus.valid.txt", valid_names)
_write_corpus("data/corpus.test.txt", test_names)

In [112]:
%%bash
# Stop at the first failure so a stale or partial archive is never uploaded.
set -euo pipefail
tar -czvf data/corpus.tar.gz data/corpus.*.txt
# NOTE(review): this uploads the corpus to a public third-party file host.
# The URL is quoted because an unquoted '?' is a glob character to the shell.
curl -F "file=@data/corpus.tar.gz" "https://file.io/?expires=1d"

data/corpus.test.txt
data/corpus.train.txt
data/corpus.valid.txt
{"success":true,"key":"Vr3tfD7AVqo6","link":"https://file.io/Vr3tfD7AVqo6","expiry":"14 days"}

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1535k  100    94  100 1535k    661  10.5M --:--:-- --:--:-- --:--:-- 10.5M


## Preprocess for bag-of-words (BOW)

In [120]:
import spacy
from tqdm import tqdm
# Enable tqdm's pandas integration; `progress_apply` is called on `data.text`
# in the lemmatization cell further down.
tqdm.pandas()

In [121]:
# Load the 'en' pipeline with the parser and NER components disabled.
sp = spacy.load('en', disable=['parser', 'ner'])


def lemmatize(text):
    """Return `text` with each token replaced by its lemma, joined by single spaces.

    Tokens whose lemma is the '-PRON-' placeholder keep their original text.
    """
    pieces = []
    for token in sp(text):
        if token.lemma_ == '-PRON-':
            pieces.append(token.text)
        else:
            pieces.append(token.lemma_)
    return ' '.join(pieces)

In [122]:
# Build the bag-of-words variant of the cleaned dataset: lowercase, strip
# punctuation, lemmatize.
data = pd.read_csv("data/cleaned_dataset.tsv", sep='\t')
data.text = data.text.str.lower()  # All words to lowercase
# Escape the punctuation characters before putting them in a character class
# (otherwise '\' only acts as an escape for ']' and is never removed itself),
# and request regex mode explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would remove nothing.
punctuation_pattern = f'[{re.escape(string.punctuation)}]'
data.text = data.text.str.replace(punctuation_pattern, '', regex=True)  # Remove all common punctuation
data.text = data.text.progress_apply(lemmatize)  # Lemmatize tokens
data.to_csv("data/lemmatized_dataset.tsv", sep='\t', index=False)

100%|██████████| 67098/67098 [03:05<00:00, 362.46it/s]
