In [1]:
!pip install pysrt
!pip install -U pip setuptools wheel
!pip install -U spacy
!python -m spacy download en_core_web_trf



In [1]:
import pysrt
import spacy
from tqdm.auto import tqdm
from copy import deepcopy
from tabulate import tabulate

In [2]:
# Parse the subtitle file; the path is relative to the current working directory.
subtitles = pysrt.open("./Copy of 08. Lords of the air.srt")
# Sanity check: print the first five entries (index, time range, text).
for subtitle in subtitles[:5]:
    print(subtitle)

1
00:01:19,727 --> 00:01:23,356
White storks. If you wanted to pick one bird

2
00:01:23,567 --> 00:01:26,400
as a representative
of all the birds in the world,

3
00:01:26,447 --> 00:01:29,200
you could do worse
than pick the white stork.

4
00:01:29,327 --> 00:01:32,637
It's a marvellous flyer, an intrepid traveller.

5
00:01:33,167 --> 00:01:38,366
This pair will have come from Africa
to nest in this small town in Bavaria.



In [3]:
# Run transformer inference on the GPU (far more efficient than on CPU).
# NOTE(review): require_gpu() presumably raises when no GPU is usable, which
# would stop the notebook on CPU-only machines; confirm that is intended.
spacy.require_gpu()
# Load the transformer-based English pipeline into spaCy.
nlp = spacy.load("en_core_web_trf")



In [4]:
def token_filter(t, pos_filter={}):
    """
    Decide whether a single token survives preprocessing.

    A token is kept only when every one of these holds: it is not
    punctuation, its text contains no digit character, its text is longer
    than two characters, it is not a stop word, and its part-of-speech tag
    (``t.pos_``) is not a member of ``pos_filter``.

    Parameters:
    t (spacy.tokens.Token): A spaCy token from the document being preprocessed.
    pos_filter (container of str): Part-of-speech tags to discard; anything
        supporting ``in`` works. The empty default discards nothing on
        part-of-speech grounds. It is only read here, never modified.

    Returns:
    bool: True if the token should be kept, False if it should be dropped.

    """
    # The checks run in a fixed order and stop at the first failing one.
    if t.is_punct:
        return False

    text = t.text
    if any(ch.isdigit() for ch in text):
        return False
    if len(text) <= 2:
        return False
    if t.is_stop:
        return False

    return t.pos_ not in pos_filter

def preprocess(doc, pos_filter={}, lemmatize=False):
    """
    Tokenizes documents, drops unwanted tokens and lower-cases what is left.

    Every document is walked token by token. Tokens rejected by
    ``token_filter`` are discarded; the survivors are emitted as lower-cased
    strings. By default the surface form (``token.text``) is emitted, so the
    output is NOT lemmatized; pass ``lemmatize=True`` to emit ``token.lemma_``
    instead.

    Parameters:
    doc (iterable of spacy.tokens.Doc): The documents to process. Any iterable
        works, including a one-shot generator, which is consumed by this call.
    pos_filter (container of str): Part-of-speech tags to discard; forwarded
        unchanged to ``token_filter``. The empty default discards nothing.
    lemmatize (bool): When True, emit each kept token's lemma rather than its
        surface text. Defaults to False, which keeps the previous behaviour.

    Returns:
    list of list of str: One list of kept, lower-cased tokens per input
        document, in input order. A document with no surviving tokens
        contributes an empty list.
    """
    return [
        [
            (token.lemma_ if lemmatize else token.text).lower()
            for token in subtitle
            if token_filter(token, pos_filter)
        ]
        for subtitle in doc
    ]

In [5]:
# Plain text of every subtitle entry, in file order.
utterances = [subtitle.text for subtitle in subtitles]

# Feed all utterances through the spaCy pipeline.
# NOTE(review): batch_size=26000 is presumably meant to exceed the number of
# subtitles so they all go through in one batch; confirm it fits in memory.
# NOTE(review): nlp.pipe presumably returns a one-shot generator, in which case
# `corpus` is exhausted by the preprocess call below and cannot be reused.
corpus = nlp.pipe(utterances, batch_size=26000)

# Filter the tokens of each document: one list of kept tokens per subtitle.
processed = preprocess(corpus)

# Display the first 15 token lists for inspection.
processed[:15]

[['white', 'storks', 'wanted', 'pick', 'bird'],
 ['representative', 'birds', 'world'],
 ['worse', 'pick', 'white', 'stork'],
 ['marvellous', 'flyer', 'intrepid', 'traveller'],
 ['pair', 'come', 'africa', 'nest', 'small', 'town', 'bavaria'],
 ['complicated', 'courtship', 'greeting', 'rituals'],
 ['devoted', 'parents'],
 ['stand', 'birds', 'world', 'stork', 'feather'],
 ['seen', 'key', 'crucial', 'bird'],
 ['feather', 'marvellous', 'aerofoil'],
 ['man', 'invent', 'strong', 'weight', 'weight'],
 ['extremely', 'efficient', 'insulator', 'important', 'bird'],
 ['complicated', 'structure'],
 ['feather', 'separate', 'filaments'],
 ['central', 'quill']]

In [6]:
# Ad-hoc run of the pipeline on a single sentence.
# NOTE(review): `doc` is not used by any later cell visible here, so this looks
# like leftover scratch work; confirm before removing.
doc = nlp("I am an apple")

In [7]:
# Work on a deep copy so the entries in `subtitles` are left untouched.
preprocessed_subtitles = deepcopy(subtitles)

# Replace each copied entry's text with its kept tokens joined by single spaces.
# zip pairs entries by position and stops at the shorter input. An entry whose
# token list is empty ends up with empty text.
for subtitle, proc_subtitle in zip(preprocessed_subtitles, processed):
    subtitle.text = " ".join(proc_subtitle)

In [8]:
# Sanity check: print the first five rewritten entries. Indices and time
# ranges are unchanged from the originals; only the text differs.
for subtitle in preprocessed_subtitles[:5]:
    print(subtitle)

1
00:01:19,727 --> 00:01:23,356
white storks wanted pick bird

2
00:01:23,567 --> 00:01:26,400
representative birds world

3
00:01:26,447 --> 00:01:29,200
worse pick white stork

4
00:01:29,327 --> 00:01:32,637
marvellous flyer intrepid traveller

5
00:01:33,167 --> 00:01:38,366
pair come africa nest small town bavaria



In [9]:
# Persist the filtered subtitles to disk, relative to the working directory.
# NOTE(review): the target has a .txt extension, but the content presumably
# keeps the subtitle layout (index, time range, text) shown when the entries
# are printed; confirm that downstream consumers expect that.
preprocessed_subtitles.save('./preprocessed_sub.txt')