# Process subtitle text and extract parts of speech after pronoun (coreference) resolution

In [1]:
# install spacy version 2.1.3 due to issues with the neural co ref library 
!pip install spacy==2.1.3


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy==2.1.3
  Downloading spacy-2.1.3-cp37-cp37m-manylinux1_x86_64.whl (27.7 MB)
[K     |████████████████████████████████| 27.7 MB 1.6 MB/s 
Collecting preshed<2.1.0,>=2.0.1
  Downloading preshed-2.0.1-cp37-cp37m-manylinux1_x86_64.whl (82 kB)
[K     |████████████████████████████████| 82 kB 397 kB/s 
Collecting blis<0.3.0,>=0.2.2
  Downloading blis-0.2.4-cp37-cp37m-manylinux1_x86_64.whl (3.2 MB)
[K     |████████████████████████████████| 3.2 MB 37.4 MB/s 
[?25hCollecting jsonschema<3.0.0,>=2.6.0
  Downloading jsonschema-2.6.0-py2.py3-none-any.whl (39 kB)
Collecting srsly<1.1.0,>=0.0.5
  Downloading srsly-1.0.5-cp37-cp37m-manylinux2014_x86_64.whl (184 kB)
[K     |████████████████████████████████| 184 kB 56.5 MB/s 
[?25hCollecting thinc<7.1.0,>=7.0.2
  Downloading thinc-7.0.8-cp37-cp37m-manylinux1_x86_64.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 46.3 MB/

In [2]:
# install the neural co ref
!pip install neuralcoref

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting neuralcoref
  Downloading neuralcoref-4.0-cp37-cp37m-manylinux1_x86_64.whl (286 kB)
[K     |████████████████████████████████| 286 kB 4.2 MB/s 
Collecting boto3
  Downloading boto3-1.24.14-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 49.2 MB/s 
Collecting botocore<1.28.0,>=1.27.14
  Downloading botocore-1.27.14-py3-none-any.whl (8.9 MB)
[K     |████████████████████████████████| 8.9 MB 49.4 MB/s 
[?25hCollecting s3transfer<0.7.0,>=0.6.0
  Downloading s3transfer-0.6.0-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 7.4 MB/s 
[?25hCollecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 63.2 MB/s 
Installing collected packages: urlli

In [5]:
# download the small english library
!python -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en_core_web_sm==2.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz (11.1 MB)
[K     |████████████████████████████████| 11.1 MB 4.3 MB/s 
[?25hBuilding wheels for collected packages: en-core-web-sm
  Building wheel for en-core-web-sm (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-sm: filename=en_core_web_sm-2.1.0-py3-none-any.whl size=11074433 sha256=3b1aac9fdf4c7948419255b6eb6bab2aee7dad4c18aa8362c4545959f2f18455
  Stored in directory: /tmp/pip-ephem-wheel-cache-ped71gix/wheels/59/4f/8c/0dbaab09a776d1fa3740e9465078bfd903cc22f3985382b496
Successfully built en-core-web-sm
Installing collected packages: en-core-web-sm
  Attempting uninstall: en-core-web-sm
    Found existing installation: en-core-web-sm 3.3.0
    Uninstalling en-core-web-sm-3.3.0:
      Successfully uninstalled en-core

In [8]:
# imports
import spacy
import neuralcoref

100%|██████████| 40155833/40155833 [00:04<00:00, 9664544.21B/s] 


In [9]:
# Build the spaCy pipeline from the small English model.
model_name = "en_core_web_sm"
nlp = spacy.load(model_name)


In [10]:
# Add the neuralcoref component to the spaCy pipeline.
# The guard keeps this cell re-runnable: adding a component whose name is
# already present in the pipeline makes spaCy raise an error.
if "neuralcoref" not in nlp.pipe_names:
  neuralcoref.add_to_pipe(nlp, greedyness=0.52)

# quick test to check it works: the pronouns should be replaced by the names they refer to
doc = nlp("Tell Sam that he will have to leave without Arthur, as he is sick.")
print(doc._.coref_resolved)

Tell Sam that Sam will have to leave without Arthur, as Sam is sick.


In [60]:
# Load the subtitle file and run it through the pipeline.
# "with" guarantees the file handle is closed once the text has been read.
with open("life_of_birds_epi_1_srt_text.txt") as f:
  doc = nlp(f.read())


In [61]:
# Take the coreference-resolved text of the document, i.e. the version in which
# pronouns have been replaced by the nouns they refer to.
resolved_doc = doc._.coref_resolved

# Show only the beginning of the text as a sanity check.
preview_length = 500
print(resolved_doc[:preview_length])

Birds are the most accomplished aeronauts the world has ever seen.
Birds fly high and low.
At great speed and very slowly.
And always with extraordinary precision and control  But birds are not the only creatures in the air.
There are also small furry mammals, bats, like birds in Texas.
birds are so competent in the air  that birds have just made a journey from Mexico, a thousand miles away,  simply in order to rear birds young in this cave,  which is particularly suitable for birds as a nursery


In [62]:
# Save the resolved text to disk.
# "with" closes (and flushes) the file even if the write fails.
with open("birds_resolved.txt", "w") as f_out:
  f_out.write(resolved_doc)

In [63]:
# load the list of bird names extracted from wikipedia
# bird_dict maps every lower-cased bird name, and the last word of that name,
# to True so that membership can be tested with "in".

bird_dict = {}
# "with" makes sure the file handle is closed after reading
with open("bird_list.txt") as bird_file:
  birds = bird_file.read().lower().splitlines()
for bird in birds:
  # drop surrounding whitespace so padded lines still match
  bird = bird.strip()
  # skip blank lines and single letters A, B, C etc
  if len(bird) <= 1:
    continue
  bird_dict[bird] = True
  # insert the last word too e.g. yellow crested parrot > parrot
  bird_dict[bird.split(" ")[-1]] = True

# check it has worked
list(bird_dict.keys())[:5]

["abbott's babbler", 'babbler', "abbott's booby", 'booby', "abbott's starling"]

## Extract the nouns, adjectives and proper nouns, highlighting bird names

In [64]:
# split the subtitles into a set of rows for processing
# "with" makes sure the file handle is closed after reading
with open("birds_resolved.txt") as resolved_file:
  resolved_subtitles = resolved_file.read()

# one entry per line of the resolved text
subtitle_rows = resolved_subtitles.split("\n")
print(subtitle_rows[:5])


['Birds are the most accomplished aeronauts the world has ever seen.', 'Birds fly high and low.', 'At great speed and very slowly.', 'And always with extraordinary precision and control  But birds are not the only creatures in the air.', 'There are also small furry mammals, bats, like birds in Texas.']


In [73]:
# dump the filtered resolved subtitles to a file
# Output format: each subtitle row on its own line, followed by one
# tab-separated line per kept token (text, lemma, POS tag, is-bird flag),
# then a blank line.

valid_tokens = ["NOUN", 'ADJ', 'PROPN']

# "with" closes (and flushes) the output file even if the pipeline raises part-way through
with open("birds_resolved_filtered.txt", "w") as filtered_subtitle_file:
  for row in subtitle_rows:
    filtered_subtitle_file.write(row + "\n")
    row_doc = nlp(row)
    for token in row_doc:
      # keep only nouns, adjectives and proper nouns
      if token.pos_ not in valid_tokens:
        continue
      # bird_dict keys are lower-cased, so look up the lower-cased lemma
      is_bird = token.pos_ == "NOUN" and token.lemma_.lower() in bird_dict
      fields = [token.text, token.lemma_, token.pos_, str(is_bird)]
      filtered_subtitle_file.write("\t".join(fields) + "\n")
    filtered_subtitle_file.write("\n")