In [10]:
# if not downloaded yet
# nltk.download('punkt')
from collections import defaultdict
import nltk
from nltk import tokenize
from nltk.stem.snowball import SnowballStemmer

# try simple English
textfile = "datasets/nlp/the old man and the sea.txt"
# or more complex
# textfile = "datasets/nlp/MenOfGoodWill.txt"

stemmer = SnowballStemmer("english")

# Tokens emitted by word_tokenize that are pure punctuation and must not be
# counted as words. Built once as a frozenset so the membership test in the
# loop below is O(1) instead of rebuilding and scanning a tuple per word.
PUNCTUATION = frozenset((',', '.', ':', '-', ';', '?', '!', '"', "``", "`", "''"))

# Read the whole text (lower-cased) and release the file handle immediately;
# none of the processing below needs the file to stay open.
# NOTE(review): open() uses the platform default encoding here -- pass an
# explicit encoding= once the encoding of the dataset files is confirmed.
with open(textfile) as f:
    text = f.read().lower()

# Let's split the text into sentences first.

# The next 2 parts are expected to give the same result. Either the explicit one:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sentences = tokenizer.tokenize(text)
print("Sentences with English punkt:", len(sentences))
# or the "out of the box" helper:
sentences = tokenize.sent_tokenize(text)
print("Sentences with default method:", len(sentences))

# Drop empty sentences from `sentences` itself (not only from the derived
# lists), so that sentences[k], words[k] and lexemes[k] always describe the
# same sentence.
sentences = [sentence for sentence in sentences if sentence]

# Let's explode the sentences into words and lexemes (stems).
# words[k][m] and lexemes[k][m] refer to the same token of sentence k.
words = []
lexemes = []
for sentence in sentences:
    s_words = [
        word
        for word in tokenize.word_tokenize(sentence)
        if word not in PUNCTUATION
    ]
    s_lexemes = [stemmer.stem(word) for word in s_words]
    words.append(s_words)
    lexemes.append(s_lexemes)

# Sanity check: show one sentence, then its word list, then its stem list.
for stage in (sentences, words, lexemes):
    print(stage[400])

Sentences with English punkt: 1923
Sentences with default method: 1923
he rowed slowly and steadily toward where the bird was circling.
['he', 'rowed', 'slowly', 'and', 'steadily', 'toward', 'where', 'the', 'bird', 'was', 'circling']
['he', 'row', 'slowli', 'and', 'steadili', 'toward', 'where', 'the', 'bird', 'was', 'circl']


In [23]:
import itertools

# Vocabulary of distinct surface forms: flatten the per-sentence word lists
# and deduplicate. chain.from_iterable avoids unpacking every sentence list
# into a separate call argument.
lexicon1 = set(itertools.chain.from_iterable(words))
print("Words lexicon:", len(lexicon1))

# Vocabulary of distinct stems. The label differs from the one above so the
# two counts can be told apart in the output.
lexicon2 = set(itertools.chain.from_iterable(lexemes))
print("Lexemes lexicon:", len(lexicon2))

# Will the numbers change if you change the text file?

In [19]:
# Group every surface form under the stem it was reduced to.
# lexemes and words are walked in parallel: position (k, m) in one
# corresponds to position (k, m) in the other.
clusters = defaultdict(set)
for sentence_stems, sentence_words in zip(lexemes, words):
    for stem, surface in zip(sentence_stems, sentence_words):
        clusters[stem].add(surface)

# Report only the stems that cover at least three distinct word forms.
for stem, forms in clusters.items():
    if len(forms) >= 3:
        print(stem, forms)

fish {'fish', 'fished', 'fishes', 'fishing'}
form {'formed', 'forms', 'form'}
see {'see', 'sees', 'seeing'}
come {'coming', 'comes', 'come'}
carri {'carrying', 'carried', 'carry'}
coil {'coiled', 'coil', 'coils'}
sail {'sails', 'sailing', 'sailed', 'sail'}
patch {'patch', 'patched', 'patches'}
look {'looking', 'looks', 'look', 'looked'}
back {'backing', 'back', 'backed', 'backs'}
skin {'skinned', 'skinning', 'skin'}
bring {'bring', 'bringing', 'brings'}
love {'lovely', 'loved', 'love', 'loving'}
stay {'stays', 'stay', 'stayed'}
rememb {'remember', 'remembers', 'remembered'}
know {'knows', 'know', 'knowing'}
leav {'leaves', 'leaving', 'leave'}
show {'showing', 'showed', 'shows', 'show'}
drift {'drift', 'drifted', 'drifting'}
plank {'plank', 'planks', 'planking'}
end {'end', 'ended', 'ends'}
wait {'waiting', 'waited', 'wait'}
cut {'cuts', 'cut', 'cutting'}
salt {'salting', 'salted', 'salt'}
smell {'smell', 'smelled', 'smelling'}
drop {'drop', 'drops', 'dropped', 'dropping'}
think {'think

In [18]:
# Display the whole stem -> set-of-word-forms mapping as the cell's output.
# NOTE(review): this prints one entry per stem, so the output can get long.
clusters

defaultdict(set,
            {'the': {'the'},
             'old': {'old'},
             'man': {'man'},
             'and': {'and'},
             'sea': {'sea', 'seas'},
             'he': {'he'},
             'was': {'was'},
             'an': {'an'},
             'who': {'who'},
             'fish': {'fish', 'fished', 'fishes', 'fishing'},
             'alon': {'alone'},
             'in': {'in'},
             'a': {'a'},
             'skiff': {'skiff'},
             'gulf': {'gulf'},
             'stream': {'stream'},
             'had': {'had'},
             'gone': {'gone'},
             'eighty-four': {'eighty-four'},
             'day': {'day', 'days'},
             'now': {'now'},
             'without': {'without'},
             'take': {'take', 'taking'},
             'first': {'first'},
             'forti': {'forty'},
             'boy': {'boy', 'boys'},
             'been': {'been'},
             'with': {'with'},
             'him': {'him'},
             'but': {'but'},
 