# Parse raw data

In [1]:
!pip install --user webvtt-py tqdm inflect spacy && spacy download en

Collecting webvtt-py
  Downloading webvtt_py-0.4.5-py3-none-any.whl (16 kB)
Processing /home/raltman/.cache/pip/wheels/9b/04/dd/7daf4150b6d9b12949298737de9431a324d4b797ffd63f526e/docopt-0.6.2-py2.py3-none-any.whl
Installing collected packages: docopt, webvtt-py
Successfully installed docopt-0.6.2 webvtt-py-0.4.5


In [1]:
import re
import pandas as pd
import webvtt as wv
from tqdm import tqdm
from pathlib import Path
from typing import Iterator, Dict, Any

In [2]:
def split_on_pauses(vtt_parser, split_time: float = 2.) -> Iterator[Dict[str, Any]]:
    name = Path(vtt_parser.file).name
    duration = vtt_parser.total_length
    start, end = 0, 0
    text = ""
    for i, caption in enumerate(vtt_parser):
        if i % 2 == 1:  # Every other line is a new caption
            if (caption.start_in_seconds - end > split_time) and text:  # Split on extended silence
                yield dict(name=name, duration=duration, start=start, end=end, text=text)
                start = caption.start_in_seconds
                text = ""
            line = re.sub('\[.*\]', '', caption.text)  # Remove [Music] or [Laughter] captions
            line = re.sub('\s+', ' ', line)  # Condense whitespace
            if text:
                text += ' '
            text += line.strip()
            end = caption.end_in_seconds
        i += 1
    if text:
        yield dict(name=name, duration=duration, start=start, end=end, text=text)

In [3]:
records = []
files = list(Path('data').glob('*.en.vtt'))
for file in tqdm(files):
    vtt_parser = wv.read(file)
    records.extend(split_on_pauses(vtt_parser))
data = pd.DataFrame.from_records(records)
data = data.dropna()  # Drop empty lines of text
data = data.drop_duplicates(subset='text')  # Drop lines from duplicate videos
data.to_csv("data/raw_dataset.tsv", sep='\t', index=False)

100%|██████████| 655/655 [00:18<00:00, 35.12it/s]


In [4]:
data

Unnamed: 0,name,duration,start,end,text
0,UftcYumwzdA.en.vtt,202,0.000,7.939,you know comfort food may sound simple but the...
1,UftcYumwzdA.en.vtt,202,10.780,12.139,right and to keeping it that way which is what...
2,UftcYumwzdA.en.vtt,202,14.230,14.240,doing for 70 years here in San Antonio
3,UftcYumwzdA.en.vtt,202,34.900,34.910,Texas at the tip-top cafe yeah always
4,UftcYumwzdA.en.vtt,202,37.329,37.339,been great ever since that'd be too easy
...,...,...,...,...,...
72031,URKEWBXpv40.en.vtt,249,208.410,208.420,here ready to go to flavortown no clean
72032,URKEWBXpv40.en.vtt,249,212.480,216.310,Steve are you sure this is good to eat
72033,URKEWBXpv40.en.vtt,249,224.820,224.830,I just okay anyways where was that all
72034,URKEWBXpv40.en.vtt,249,228.070,228.080,right here goes nothing


In [5]:
data = pd.read_csv("data/raw_dataset.tsv", sep='\t')
content_duration = data.groupby('name').head(1).duration.sum() / (60 * 60)
print(f"Dataset contains {content_duration:.2f} hours of content")

Dataset contains 99.79 hours of content


# Preprocess data

## Preprocess for text generation

In [6]:
import re
import string
import inflect
import spacy

In [7]:
number_to_words = inflect.engine().number_to_words
all_numbers_to_words = lambda text: re.sub('([0-9]+[\.,]*[0-9]*)', lambda x: number_to_words(x[1]), text)

In [8]:
%time
data = pd.read_csv("data/raw_dataset.tsv", sep='\t')
data.text = data.text.apply(all_numbers_to_words)  # Convert numbers to words
data.text = data.text.str.replace('-', ' ')  # Remove hyphens
data.to_csv("data/cleaned_dataset.tsv", sep='\t', index=False)

CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 11.9 µs


In [9]:
data

Unnamed: 0,name,duration,start,end,text
0,UftcYumwzdA.en.vtt,202,0.000,7.939,you know comfort food may sound simple but the...
1,UftcYumwzdA.en.vtt,202,10.780,12.139,right and to keeping it that way which is what...
2,UftcYumwzdA.en.vtt,202,14.230,14.240,doing for seventy years here in San Antonio
3,UftcYumwzdA.en.vtt,202,34.900,34.910,Texas at the tip top cafe yeah always
4,UftcYumwzdA.en.vtt,202,37.329,37.339,been great ever since that'd be too easy
...,...,...,...,...,...
67093,URKEWBXpv40.en.vtt,249,208.410,208.420,here ready to go to flavortown no clean
67094,URKEWBXpv40.en.vtt,249,212.480,216.310,Steve are you sure this is good to eat
67095,URKEWBXpv40.en.vtt,249,224.820,224.830,I just okay anyways where was that all
67096,URKEWBXpv40.en.vtt,249,228.070,228.080,right here goes nothing


In [10]:
# Do train/valid/test split
train_frac = 0.9
valid_frac = 0.1
random_seed = 42

video_data = data.groupby('name').head(1)
video_data = video_data.sample(frac=1, random_state=random_seed)
video_names = video_data.name
video_fracs = video_data.duration.cumsum() / video_data.duration.sum()

with open("data/corpus.train.txt", 'w') as fh:
    train_names = video_names[(video_fracs < train_frac)]
    for name in train_names:
        train_lines = data[data.name == name].text
        print(*train_lines, sep='. ', file=fh)
with open("data/corpus.valid.txt", 'w') as fh:
    valid_names = video_names[(train_frac <= video_fracs) & (video_fracs < train_frac + valid_frac)]
    for name in valid_names:
        valid_lines = data[data.name == name].text
        print(*valid_lines, sep='. ', file=fh)
with open("data/corpus.test.txt", 'w') as fh:
    test_names = video_names[train_frac + valid_frac < video_fracs]
    for name in test_names:
        test_lines = data[data.name == name].text
        print(*test_lines, sep='. ', file=fh)

In [11]:
valid_lines

44948          this trip it's really nice lights out chefs
44949    hot hut go bacon their roots right into their ...
44950    this is an adventure what next there's louisia...
44951     lots of big flavors israeli with a twisted organ
44952    yes i'll have one of those and a funky pizza j...
                               ...                        
45158    but it's just phenomenal just brings a whole n...
45159    neighbor and he flies us up here every two wee...
45160    seriously you fly in here for lunch i got shri...
45161    bad so was that another righteous road trip fo...
45162    but don't you worry we got plenty more joints ...
Name: text, Length: 215, dtype: object

In [12]:
%%bash
tar czfv data/corpus.tar.gz data/corpus.*.txt
curl -F "file=@data/corpus.tar.gz" https://file.io/?expires=1d

data/corpus.test.txt
data/corpus.train.txt
data/corpus.valid.txt
{"success":true,"key":"BoQKvCaXfniX","link":"https://file.io/BoQKvCaXfniX","expiry":"1 day"}

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1514k  100    92  100 1514k    779  12.5M --:--:-- --:--:-- --:--:-- 12.6M


## Preprocess for BOW

In [13]:
import spacy
from tqdm import tqdm
tqdm.pandas()

In [14]:
sp = spacy.load('en', disable=['parser', 'ner'])
lemmatize = lambda text: ' '.join([x.text if x.lemma_ == '-PRON-' else x.lemma_ for x in sp(text)])

In [15]:
pd.read_csv("data/lemmatized_dataset.tsv", sep='\t')

Unnamed: 0,name,duration,start,end,text
0,UftcYumwzdA.en.vtt,202,0.000,7.939,you know comfort food may sound simple but the...
1,UftcYumwzdA.en.vtt,202,10.780,12.139,right and to keep it that way which be what th...
2,UftcYumwzdA.en.vtt,202,14.230,14.240,do for seventy year here in san antonio
3,UftcYumwzdA.en.vtt,202,34.900,34.910,texas at the tip top cafe yeah always
4,UftcYumwzdA.en.vtt,202,37.329,37.339,be great ever since that d be too easy
...,...,...,...,...,...
67093,URKEWBXpv40.en.vtt,249,208.410,208.420,here ready to go to flavortown no clean
67094,URKEWBXpv40.en.vtt,249,212.480,216.310,steve be you sure this be good to eat
67095,URKEWBXpv40.en.vtt,249,224.820,224.830,i just okay anyways where be that all
67096,URKEWBXpv40.en.vtt,249,228.070,228.080,right here go nothing


ERROR! Session/line number was not unique in database. History logging moved to new session 1077


In [14]:
data = pd.read_csv("data/cleaned_dataset.tsv", sep='\t')
data.text = data.text.apply(str.lower)  # All words to lowercase
data.text = data.text.str.replace(f'[{string.punctuation}]', '')  # Remove all common punctuation
data.text = data.text.progress_apply(lemmatize)  # Lemmatize tokens
data.to_csv("data/lemmatized_dataset.tsv", sep='\t', index=False)

  9%|▉         | 6033/67098 [00:17<02:52, 354.82it/s]
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/raltman/.local/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3343, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-15-d3b58a97d6b6>", line 4, in <module>
    data.text = data.text.progress_apply(lemmatize)  # Lemmatize tokens
  File "/home/raltman/.local/lib/python3.6/site-packages/tqdm/std.py", line 797, in inner
    return getattr(df, df_function)(wrapper, **kwargs)
  File "/home/raltman/.local/lib/python3.6/site-packages/pandas/core/series.py", line 4200, in apply
    mapped = lib.map_infer(values, f, convert=convert_dtype)
  File "pandas/_libs/lib.pyx", line 2402, in pandas._libs.lib.map_infer
  File "/home/raltman/.local/lib/python3.6/site-packages/tqdm/std.py", line 792, in wrapper
    return func(*args, **kwargs)
  File "<ipython-input-14-60d3095fb733>", line 2, in <lambda>
    lemmatize = lambda text: ' '.join([x.text if x.lemma_ == '-PRON-' else x.lemma_ for x in sp(

TypeError: object of type 'NoneType' has no len()