# Text Recommendations

In [9]:
import collections
import json
import re
import pandas as pd
import nltk
from nltk.book import text6

To provide recommendations of a next word given the previous word, I'ld like to get a list of all the words that typically follow another word and how often those words were mentioned. To do this I am going to create a dictionary whose keywords are the words and the items are named tuples. The named tuples is the list of words that occurced after the keyword and the number of times it was used. 

In [None]:
Word = collections.namedtuple("Word", ['after_words','counts'])

In the spirit of python we will use Monty Python and the Holy Grail for this. Let's get a list of bigrams.

In [2]:
dirty_bigrams = list(nltk.bigrams(text6))

print(dirty_bigrams[:10])

[('SCENE', '1'), ('1', ':'), (':', '['), ('[', 'wind'), ('wind', ']'), (']', '['), ('[', 'clop'), ('clop', 'clop'), ('clop', 'clop'), ('clop', ']')]


We need to write some functions to clean the text and omit certain text.

In [3]:
#Returns True for text we want to omit and False otherwise
def omit_text(text):
    
    is_a_mark = text in [",", "!", "_", "$", "#", ":", ";", ",", "'", '"', " ",
                         "]", "[", "{", "}", ""]
    
    condition = is_a_mark or text.isnumeric()
    
    answer = True if condition else False
    
    return answer

In [4]:
omit_text("1")

True

In [5]:
def clean_text(text):
    
    text = text.strip()
    
    text = text.lower()
    
    #Temporarily replace '
    text = text.replace("'", "yyyzzz")
    
    text = re.sub(r'[^\w\s]','',text)
    
    #put "'" back
    text = text.replace("yyyzzz", "'")
    
    return text

In [6]:
clean_text("  it's!...  ")

"it's"

In [8]:
cleaned_bigrams = [(clean_text(b[0]),clean_text(b[1])) for b in dirty_bigrams]

def keep_bigram(bigram):
    keep_b0 = not omit_text(bigram[0])
    keep_b1 = not omit_text(bigram[1])
    
    return keep_b0 and keep_b1

bigram_list = [b for b in cleaned_bigrams if keep_bigram(b)]

bigram_list[:5]

[('clop', 'clop'),
 ('clop', 'clop'),
 ('king', 'arthur'),
 ('whoa', 'there'),
 ('clop', 'clop')]

In [12]:
word0 = [w[0] for w in bigram_list]

word_list = list(set(word0))

word_list[:5]

['lost', 'sell', 'inside', 'nothing', 'keep']

In [37]:
word_dict = {}

for word in word_list:
    all_words_after = [w[1] for w in bigram_list if w[0] == word]
    word_counts = pd.Series(all_words_after).value_counts()
    
    word_dict[word] = Word(after_words=word_counts.index.tolist(), counts=word_counts.tolist())

In [38]:
word_dict['knights']

Word(after_words=['of', 'who', 'went', 'in', 'seemed'], counts=[25, 8, 1, 1, 1])

Now, we need to save the data.

In [39]:
with open("data/monty_python_holy_grail_word_data.json", "w") as file:
    json.dump(word_dict, file)