In [24]:
import nltk
import re

# NOTE(review): `sentences` is not used by any cell visible here; kept so that
# code elsewhere relying on the name keeps working.
sentences = []

# Read the whole book at once. The context manager closes the file handle as
# soon as the lines are in memory instead of leaving it open for the lifetime
# of the kernel.
with open('../../code/datasets/nlp/the old man and the sea.txt') as book_file:
    lines = book_file.readlines()

# One list of tokens per sentence fragment. Empty fragments (blank lines, text
# after a trailing '.', '?' or '!') are kept as empty token lists on purpose:
# later cells pick sample sentences by position in this list.
tokenized_sentences = []
for line in lines:
    # Drop surrounding whitespace first, then any double quotes, backticks and
    # single quotes wrapping the line (in that order).
    line = line.strip().strip('"').strip('`').strip("'")
    # Raw string: same pattern as before, but without relying on Python
    # passing the unrecognised escapes '\.' and '\?' through unchanged.
    for sentence in re.split(r'\.|\?|!', line):
        tokenized_sentences.append(nltk.word_tokenize(sentence))

In [25]:
# sample sentences
# Sanity check on one arbitrarily chosen entry (index 201): first print its raw
# token list ...
print(tokenized_sentences[201])
# ... then leave the nltk.pos_tag result as the last expression so the notebook
# displays the list of (token, tag) pairs as the cell output.
nltk.pos_tag(tokenized_sentences[201])

['I', "'ll", 'be', 'back', 'when', 'I', 'have', 'the', 'sardines']


[('I', 'PRP'),
 ("'ll", 'MD'),
 ('be', 'VB'),
 ('back', 'RB'),
 ('when', 'WRB'),
 ('I', 'PRP'),
 ('have', 'VBP'),
 ('the', 'DT'),
 ('sardines', 'NNS')]

In [26]:
def get_bigrams(sentence):
    """Return the lowercased pairs of adjacent tokens in `sentence`.

    `sentence` is a sequence of string tokens. The result is a list of
    2-tuples `(token, next_token)`, in order, with both members lowercased.
    A sequence with fewer than two tokens yields an empty list.
    """
    # Pair every token with its successor; zip stops at the shorter sequence,
    # so the final token never starts a pair.
    return [
        (left.lower(), right.lower())
        for left, right in zip(sentence, sentence[1:])
    ]

# sample
# Print the lowercased adjacent-token pairs for the tokenized sentence at
# index 101, as a quick check of get_bigrams.
print(get_bigrams(tokenized_sentences[101]))

[('his', 'hope'), ('hope', 'and'), ('and', 'his'), ('his', 'confidence'), ('confidence', 'had'), ('had', 'never'), ('never', 'gone')]


In [18]:
# One bigram list per tokenized sentence, in the same order as tokenized_sentences.
bigrammed_sentences = list(map(get_bigrams, tokenized_sentences))

In [38]:
### Most common words
# Count tokens case-insensitively ("The" and "the" are the same word).
word_map = {}
for sentence in tokenized_sentences:
    for word in sentence:
        lowered = word.lower()
        word_map[lowered] = word_map.get(lowered, 0) + 1
# From here on word_map is a list of (word, count) pairs, most frequent first.
# sorted() is stable, so words with equal counts stay in first-seen order.
word_map = sorted(word_map.items(), key=lambda kv: kv[1], reverse=True)

# Slice rather than index 0..19: a text with fewer than 20 distinct tokens
# then yields a shorter list instead of raising IndexError.
top_20 = word_map[:20]

print("Top 20 words: ")
print(top_20)
print()


### Most common bigrams
# Count bigrams keyed by "first second" (the two tokens joined by one space).
bigram_map = {}
for sentence in bigrammed_sentences:
    for bigram in sentence:
        key = bigram[0] + " " + bigram[1]
        bigram_map[key] = bigram_map.get(key, 0) + 1
# From here on bigram_map is a list of (key, count) pairs, most frequent first.
# sorted() is stable, so bigrams with equal counts stay in first-seen order.
bigram_map = sorted(bigram_map.items(), key=lambda kv: kv[1], reverse=True)

# Slice rather than index 0..19: with fewer than 20 distinct bigrams this
# yields a shorter list instead of raising IndexError.
top_20 = bigram_map[:20]

print("Top 20 bigrams: ")
print(top_20)
print()


################ It can be done differently with NLTK: ################
# Flatten the per-sentence token lists into a single list of tokens.
all_words = [token for tokens in tokenized_sentences for token in tokens]
# FreqDist counts the lowercased tokens; most_common(20) returns the 20
# highest-count (word, count) pairs.
fdist = nltk.FreqDist(token.lower() for token in all_words)
top_20 = fdist.most_common(20)

print("Top 20 words via another way:")
print(top_20)
print()


############ ADVANCED: most popular nouns and verbs ############
# Tag every sentence in one pos_tag_sents call. Calling nltk.pos_tag once per
# sentence repeats the tagger setup on each call, which dominates the runtime
# of this cell on a book-sized corpus; the resulting tags are the same.
all_tagged_words = []
for tagged_sentence in nltk.pos_tag_sents(tokenized_sentences):
    all_tagged_words += tagged_sentence

# Count (token, tag) pairs keyed by "token TAG". Tokens are NOT lowercased
# here, so "Man NN" and "man NN" are counted separately.
word_map = {}
for token, tag in all_tagged_words:
    key = token + " " + tag
    word_map[key] = word_map.get(key, 0) + 1
# From here on word_map is a list of (key, count) pairs, most frequent first.
word_map = sorted(word_map.items(), key=lambda kv: kv[1], reverse=True)

# Keep the 20 most frequent entries whose tag is exactly "NN" / "VBD". Other
# noun and verb tags (e.g. "NNS") are deliberately not matched, as before.
# The tag is whatever follows the last space of the key; rsplit cannot fail on
# an unusual token the way indexing the result of a plain split() could.
top_20_nouns = [
    entry for entry in word_map if entry[0].rsplit(" ", 1)[-1] == "NN"
][:20]
top_20_actions = [
    entry for entry in word_map if entry[0].rsplit(" ", 1)[-1] == "VBD"
][:20]

print("Most used nouns: ")
print(top_20_nouns)
print()
print("Most used actions: ")
print(top_20_actions)

Top 20 words: 
[('the', 2316), ('and', 1259), ('he', 1166), (',', 784), ('of', 540), ('i', 508), ('it', 494), ('to', 454), ('his', 446), ('was', 435), ('a', 397), ('in', 359), ('that', 299), ('fish', 285), ('man', 264), ('old', 248), ('but', 233), ('him', 230), ('not', 217), ('with', 205)]

Top 20 bigrams: 
[('of the', 248), ('old man', 229), ('the old', 205), ('in the', 197), (", ''", 179), ('and the', 172), (', he', 161), ('and he', 151), ('he thought', 145), ('the fish', 139), ('he was', 107), ('on the', 100), ('the boy', 96), ('he said', 96), ("'' the", 89), ('it was', 88), ("'' he", 87), ('the line', 83), ('he had', 80), ('the water', 73)]

Top 20 words via another way:
[('the', 2316), ('and', 1259), ('he', 1166), (',', 784), ('of', 540), ('i', 508), ('it', 494), ('to', 454), ('his', 446), ('was', 435), ('a', 397), ('in', 359), ('that', 299), ('fish', 285), ('man', 264), ('old', 248), ('but', 233), ('him', 230), ('not', 217), ('with', 205)]

Most used nouns: 
[('man NN', 262), ('f