# Install & load libraries

To run this jupyter notebook:

* create a virtual environment, for example using [miniconda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html)
* install python 3.8 (so we all have the same version) into your virtual environment (if you have `miniconda` installed, you can do this while setting up the virtual environment: `conda create -n venv_name python=3.8 pip`)
* activate virtual environment `conda activate venv_name`
* install necessary packages into the environment (including `jupyter`). Before you install the `spacy` package, check the [documentation](https://spacy.io/usage) and make sure you select the installation method for your operating system. 
* after installing `spacy`, you need to download the English pipeline that this notebook loads (it contains the tagger, parser, ...): `python -m spacy download en_core_web_lg`
* run the notebook: `jupyter notebook`

In [1]:
# !pip install pysrt
# !pip install -U pip setuptools wheel
# !pip install -U spacy
# !python -m spacy download en_core_web_lg

Collecting pysrt
  Downloading pysrt-1.1.2.tar.gz (104 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.4/104.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m[36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting chardet
  Downloading chardet-4.0.0-py2.py3-none-any.whl (178 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m178.7/178.7 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m[31m8.8 MB/s[0m eta [36m0:00:01[0m
[?25hBuilding wheels for collected packages: pysrt
  Building wheel for pysrt (setup.py) ... [?25ldone
[?25h  Created wheel for pysrt: filename=pysrt-1.1.2-py3-none-any.whl size=13444 sha256=d08cbdcc2187bebd9c5d3c94053192023ddbef3f5b8e704315877e14849ff194
  Stored in directory: /Users/lenamangold/Library/Caches/pip/wheels/30/a6/ab/4705174e11f44e74d58c14b32edbacbc852644f86658316aef
Successfully built pysrt
Installing collected packages: chardet, pysrt
Successfully installed char

Collecting torch>=1.6.0
  Using cached torch-1.11.0-cp38-none-macosx_11_0_arm64.whl (43.1 MB)
Collecting spacy-alignments<1.0.0,>=0.7.2
  Using cached spacy_alignments-0.8.5-cp38-cp38-macosx_11_0_arm64.whl (302 kB)
Collecting transformers<4.20.0,>=3.4.0
  Using cached transformers-4.19.4-py3-none-any.whl (4.2 MB)
Collecting filelock
  Using cached filelock-3.7.1-py3-none-any.whl (10 kB)
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.5/101.5 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
[?25hCollecting pyyaml>=5.1
  Using cached PyYAML-6.0-cp38-cp38-macosx_11_0_arm64.whl
Collecting regex!=2019.12.17
  Using cached regex-2022.6.2-cp38-cp38-macosx_11_0_arm64.whl (281 kB)
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Using cached tokenizers-0.12.1.tar.gz (220 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting r

In [3]:
import pysrt
import spacy
from tqdm.auto import tqdm
from copy import deepcopy
from tabulate import tabulate

In [4]:
# Parse the SRT file into a pysrt collection of subtitle entries.
subtitles = pysrt.open("./Copy of 08. Lords of the air.srt")
# Print the first five entries (index, time range, text) as a quick sanity check.
for subtitle in subtitles[:5]:
    print(subtitle)

1
00:01:19,727 --> 00:01:23,356
White storks. If you wanted to pick one bird

2
00:01:23,567 --> 00:01:26,400
as a representative
of all the birds in the world,

3
00:01:26,447 --> 00:01:29,200
you could do worse
than pick the white stork.

4
00:01:29,327 --> 00:01:32,637
It's a marvellous flyer, an intrepid traveller.

5
00:01:33,167 --> 00:01:38,366
This pair will have come from Africa
to nest in this small town in Bavaria.



Load the English pipeline (`en_core_web_lg`) into spaCy.

In [5]:
# spacy.require_gpu() # Optional: run inference on GPU; left disabled here
nlp = spacy.load("en_core_web_lg") # Load the large English pipeline (note: this is not the transformer pipeline `en_core_web_trf`)

In [6]:
def token_filter(t, pos_filter=frozenset()):
    """
    Decide whether a token should be kept during preprocessing.

    A token is kept only if it is not punctuation, is not a stop word, does
    not consist solely of whitespace, contains no digit, is longer than two
    characters and its part-of-speech tag (``t.pos_``) is not listed in
    ``pos_filter``.

    Parameters:
    t (spacy.tokens.Token): A spaCy token from the document being preprocessed.
    pos_filter (collection of str): Values of ``t.pos_`` to discard. Defaults
        to an empty, immutable collection, i.e. no token is dropped because
        of its tag.

    Returns:
    bool: True if the token should be kept, False otherwise.

    """
    if t.is_punct or t.is_stop:
        return False

    text = t.text
    # Runs of spaces or newlines form their own tokens; without this check a
    # whitespace token of three or more characters would pass the length test
    # and end up in the re-joined subtitle text.
    if text.isspace():
        return False
    if len(text) <= 2 or any(ch.isdigit() for ch in text):
        return False

    return t.pos_ not in pos_filter

def preprocess(doc, pos_filter=frozenset()):
    """
    Filter and lowercase the tokens of each tokenised subtitle.

    Every token is checked with ``token_filter``; the tokens that pass are
    reduced to their lowercased surface text (``token.text.lower()``). No
    lemmatization is performed.

    Parameters:
    doc (iterable of tokenised documents): One item per subtitle, each
        iterable over its tokens (e.g. the result of ``nlp.pipe``). It is
        iterated exactly once.
    pos_filter (collection of str): Passed through unchanged to
        ``token_filter``.

    Returns:
    list of list of str: One list of kept, lowercased token texts per
        subtitle, in input order. A subtitle with no kept token yields ``[]``.
    """
    return [
        [token.text.lower() for token in subtitle if token_filter(token, pos_filter)]
        for subtitle in doc
    ]

Tokenise and preprocess. In this example, we only split each line from the subtitles into its individual tokens (see the [spaCy documentation](https://spacy.io/usage/linguistic-features#tokenization) for details on tokenization) through the `pipe` method.

The `preprocess` function removes punctuation, tokens that contain digits, words of length 2 or shorter and English stopwords (and, of, if, ...), and lowercases the tokens that remain.

In [7]:
# Raw text of every subtitle entry, in file order.
utterances = [entry.text for entry in subtitles]

# Run the texts through the spaCy pipeline. `corpus` is meant to be consumed
# once by `preprocess` below; re-run this cell to rebuild it if needed.
corpus = nlp.pipe(utterances, batch_size=26000)

# One list of kept, lowercased tokens per subtitle.
processed = preprocess(corpus)

# Preview the first 15 processed subtitles.
processed[:15]

[['white', 'storks', 'wanted', 'pick', 'bird'],
 ['representative', 'birds', 'world'],
 ['worse', 'pick', 'white', 'stork'],
 ['marvellous', 'flyer', 'intrepid', 'traveller'],
 ['pair', 'come', 'africa', 'nest', 'small', 'town', 'bavaria'],
 ['complicated', 'courtship', 'greeting', 'rituals'],
 ['devoted', 'parents'],
 ['stand', 'birds', 'world', 'stork', 'feather'],
 ['seen', 'key', 'crucial', 'bird'],
 ['feather', 'marvellous', 'aerofoil'],
 ['man', 'invent', 'strong', 'weight', 'weight'],
 ['extremely', 'efficient', 'insulator', 'important', 'bird'],
 ['complicated', 'structure'],
 ['feather', 'separate', 'filaments'],
 ['central', 'quill']]

In [8]:
# Work on a deep copy so `subtitles` keeps its original text; only the copy
# receives the filtered text.
preprocessed_subtitles = deepcopy(subtitles)

# zip() stops at the shorter input without complaint. If `processed` is short
# or empty, the copy would silently keep its raw text, so fail loudly instead.
assert len(processed) == len(preprocessed_subtitles), (
    f"expected {len(preprocessed_subtitles)} processed subtitles, got {len(processed)}"
)

# Replace each entry's text with its kept tokens joined by single spaces.
for subtitle, proc_subtitle in zip(preprocessed_subtitles, processed):
    subtitle.text = " ".join(proc_subtitle)

In [9]:
# Preview the first five filtered entries to check the result of the text replacement.
for subtitle in preprocessed_subtitles[:5]:
    print(subtitle)

1
00:01:19,727 --> 00:01:23,356
white storks wanted pick bird

2
00:01:23,567 --> 00:01:26,400
representative birds world

3
00:01:26,447 --> 00:01:29,200
worse pick white stork

4
00:01:29,327 --> 00:01:32,637
marvellous flyer intrepid traveller

5
00:01:33,167 --> 00:01:38,366
pair come africa nest small town bavaria



In [10]:
# Write the filtered subtitles to disk, next to the notebook.
# NOTE(review): the target has a .txt extension; the content is presumably still SRT-formatted — confirm.
preprocessed_subtitles.save('./preprocessed_sub.txt')