# Text Recommendations

In [2]:
import collections
import json
import re
import pandas as pd
import nltk
from nltk.book import text6

To provide recommendations of a next word given the previous word, I'ld like to get a list of all the words that typically follow another word and how often those words were mentioned. To do this I am going to create a dictionary whose keywords are the words and the items are named tuples. The named tuples will have a list of words that occurced after the keyword and a list the number of times it was used.

For example, suppose we were only analyzing the text that read as follows.

"Hello world. Hello world. Why hello there." 

The word "hello" would have two lists ['world', 'there'] and [2,1] because the word "world" succeeds "hello" twice in the text while "there" succeeds "hello" once.



In [2]:
Word = collections.namedtuple("Word", ['after_words','counts'])

In the spirit of python we will use Monty Python and the Holy Grail for this. Let's get a list of bigrams.



We need to write some functions to clean the text and omit certain text.

In [44]:
#Returns True for text we want to omit and False otherwise
def omit_text(text):
    
    #Eliminate these marks for now as I don't think they will be meaningful
    is_a_mark = text in ["_", "$", "#", ";", ",", "]", "[", "{", "}", "", " ", ":", "!", "'", "."]
    
    #You can add more conditions. In previous commits I eliminated numbers, but I decided to loosen up a bit.
    
    answer = True if is_a_mark else False
    
    return answer


In [5]:
omit_text(" ")

True

In [46]:
words = [w for w in text6.tokens if not omit_text(w)]

words[:5]


['SCENE', '1', 'wind', 'clop', 'clop']

In [47]:
word_counts =pd.Series(words).value_counts()

word_counts[:5]

the       299
I         255
ARTHUR    225
?         207
you       204
dtype: int64

In [None]:
bigrams = [b for b in zip(words[:-1], words[1:])]

bigrams[:5]





In [71]:
words = [w[0] for w in bigram_list]

word_counts = pd.Series(words).value_counts()

word_counts[:10]

yyyzzz    454
the       331
i         234
a         224
you       219
and       178
of        175
to        146
s         138
we        121
dtype: int64

In [59]:
word_dict = {}

for word in list(set(words)):
    all_words_after = [w[1] for w in bigrams if w[0] == word]
    word_counts = pd.Series(all_words_after).value_counts()
    
    word_dict[word] = Word(after_words=word_counts.index.tolist(), counts=word_counts.tolist())

In [62]:
word_dict['King']

Word(after_words=['Arthur', 'of', 'you', 'who'], counts=[16, 9, 1, 1])

Now, we need to save the data.

In [63]:
with open("data/monty_python_holy_grail_word_data.json", "w") as file:
    json.dump(word_dict, file)