Singular and plural instances of the same word count as separate words.

The lemma of each token recovers the singular form of the word only when we use the large model (`en_core_web_lg`):

In [111]:
import pysrt
import spacy
import numpy as np
from collections import Counter
from copy import deepcopy
from tabulate import tabulate

Example:

In [112]:
# Load the large English pipeline and run it on a few singular/plural nouns.
nlp = spacy.load("en_core_web_lg")
doc = nlp("donors nannies children child")

# For every token show its lemma, tag, surface text and part of speech,
# one value per line, followed by a separator line.
for token in doc:
    print(token.lemma_, token.tag_, token.text, token.pos_, '...', sep='\n')

donor
NNS
donors
NOUN
...
nanny
NNS
nannies
NOUN
...
child
NNS
children
NOUN
...
child
NN
child
NOUN
...


In [113]:
# Read the whole preprocessed subtitle file and parse it with the pipeline.
# The context manager closes the file handle even if reading or parsing fails.
with open("preprocessed_sub.txt") as f:
    doc = nlp(f.read())

print(doc[:50])  # first 50 tokens (the slice is indexed by token, not by character)

1
00:01:19,727 --> 00:01:23,356
white storks wanted pick bird

2
00:01:23,567 --> 00:01:26,400
representative birds world

3
00:01:26,447 --> 00:01:29,200
worse pick white stork

4
00:01:29,327 --> 00:01:32,637
marvellous flyer intrepid traveller

5



In [114]:
def plural2sing(doc, pos_filter=None):
    """
    Tokenize each document and replace plural nouns with their singular lemma.

    Tokens whose tag is "NNS" are replaced by ``token.lemma_``; every other
    token is kept as its lower-cased text (it is NOT lemmatized).

    Parameters:
    doc (iterable of spacy documents): one document per subtitle, e.g. the
        generator returned by ``nlp.pipe``. It is consumed exactly once.
    pos_filter: currently unused; kept only so existing callers keep working.
        The default is None rather than a shared mutable ``{}``.

    Returns:
    list of list of str: one list of token strings per input document,
        in input order.
    """
    out = []
    for subtitle in doc:
        # Only plural nouns (tag "NNS") are singularized.
        out.append([
            token.lemma_ if token.tag_ == "NNS" else token.text.lower()
            for token in subtitle
        ])
    return out

In [115]:
# Collect the text of every subtitle entry, in order.
utterances = [entry.text for entry in subtitles]

Here we can see how Counter recognizes "bird" and "birds" as separate words:

In [116]:
# Join all utterances into one string and split it back on single spaces.
text = ' '.join(utterances)
text_for_counts = text.split(" ")

# Tally the words, then separate the words from their frequencies.
counts = Counter(text_for_counts)
labels = tuple(counts)
values = tuple(counts.values())

# Indices that order the frequencies from highest to lowest.
indSort = np.argsort(values)[::-1]

# Apply that ordering to both sequences and show the 15 most frequent words.
labels, values = np.array(labels)[indSort], np.array(values)[indSort]
print(labels[:15])

['birds' 'bird' 'feathers' 'like' 'wings' 'air' 'great' 'way' 'display'
 'flight' 'nest' 'beak' 'africa' 'body' 'sun']


Replace plural words with their singular forms:

In [117]:
# Run every utterance through the spaCy pipeline in batches.
corpus = nlp.pipe(
    utterances,
    batch_size=26000,
)

# Replace plural nouns with their lemma in each processed utterance.
processed = plural2sing(corpus)

# Preview the first 15 processed utterances.
processed[:15]

[['white', 'stork', 'wanted', 'pick', 'bird'],
 ['representative', 'bird', 'world'],
 ['worse', 'pick', 'white', 'stork'],
 ['marvellous', 'flyer', 'intrepid', 'traveller'],
 ['pair', 'come', 'africa', 'nest', 'small', 'town', 'bavaria'],
 ['complicated', 'courtship', 'greeting', 'ritual'],
 ['devoted', 'parent'],
 ['stand', 'bird', 'world', 'stork', 'feather'],
 ['seen', 'key', 'crucial', 'bird'],
 ['feather', 'marvellous', 'aerofoil'],
 ['man', 'invent', 'strong', 'weight', 'weight'],
 ['extremely', 'efficient', 'insulator', 'important', 'bird'],
 ['complicated', 'structure'],
 ['feather', 'separate', 'filament'],
 ['central', 'quill']]

In [118]:
def flatten(xss):
    """Return one flat list holding the items of every inner iterable of ``xss``, in order."""
    flat = []
    for inner in xss:
        flat.extend(inner)
    return flat

In [119]:
# Merge the per-utterance token lists into one list, join them into a single
# string, and split it back on single spaces.
flat_processed = flatten(processed)
text = ' '.join(flat_processed)
text_for_counts = text.split(" ")

# Tally the words, then separate the words from their frequencies.
counts = Counter(text_for_counts)
labels = tuple(counts)
values = tuple(counts.values())

# Indices that order the frequencies from highest to lowest.
indSort = np.argsort(values)[::-1]

# Apply that ordering to both sequences and show the 15 most frequent words.
labels, values = np.array(labels)[indSort], np.array(values)[indSort]
print(labels[:15])

['bird' 'feather' 'wing' 'like' 'nest' 'beak' 'air' 'stork' 'display'
 'great' 'way' 'egg' 'flight' 'body' 'young']


In [120]:
# Work on a deep copy so the original subtitle objects keep their text.
preprocessed_subtitles = deepcopy(subtitles)

# Overwrite each copied subtitle's text with its space-joined processed tokens.
# zip pairs the two sequences by position and stops at the shorter one.
for entry, tokens in zip(preprocessed_subtitles, processed):
    entry.text = " ".join(tokens)

In [121]:
# Write the subtitles with singularized text to a new file.
# NOTE(review): presumably a pysrt subtitle collection saved in SubRip format — confirm.
preprocessed_subtitles.save('./preprocessed_singular_sub.txt')