The Google of Quotes

In [1]:
import re
import math

In [2]:
# Module-level list of "<quote> - <speaker>" strings. get_full_quotes() fills it,
# and the postings / TF-IDF / search functions below read it as the corpus.
quotes = []

In [3]:
# build a list of full quotes
def get_full_quotes():
    """Load 'quotes.txt' into the module-level ``quotes`` list.

    The file is decoded as ISO-8859-1 and read as alternating lines: every
    even-numbered line (0-based) is a quote and the line after it is the
    speaker. Each pair is stored as ``'<quote> - <speaker>'`` with surrounding
    whitespace stripped from both parts. Blank lines are not skipped; they
    count as a (possibly empty) quote or speaker line. A final quote line that
    has no speaker line after it is dropped.

    The contents of ``quotes`` are replaced in place, so calling this function
    again (e.g. re-running the notebook cell) does not duplicate entries.

    Returns:
        None. The result is stored in the global ``quotes`` list.

    Raises:
        OSError: if 'quotes.txt' cannot be opened; ``quotes`` is left untouched.
    """
    loaded = []
    quote = ''
    # The context manager closes the file even if reading raises part-way through.
    with open('quotes.txt', 'r', encoding='ISO-8859-1') as fp:
        for index, line in enumerate(fp):
            line = line.strip()
            if index % 2 == 0:
                quote = line
            else:
                loaded.append(quote + ' - ' + line)
    # Slice assignment mutates the existing list object, so other references
    # to ``quotes`` see the freshly loaded corpus.
    quotes[:] = loaded

In [4]:
# get a list of the words in a full quote
def get_word_list(s):
    """Return the lower-cased word tokens of ``s`` in order of appearance.

    A token is a maximal run of the characters ``a-z``, ``0-9`` and ``_`` after
    lower-casing; every other character (including non-ASCII letters) acts as
    a separator. Repeated words are kept, so the list can be counted.

    Args:
        s: the text to tokenize, e.g. a full "quote - speaker" string.

    Returns:
        A list of token strings. It never contains empty strings, and it is
        empty when ``s`` has no token characters at all.
    """
    # findall collects the tokens themselves. Splitting on the separators would
    # emit '' whenever the text starts or ends with punctuation.
    return re.findall(r'[a-z0-9_]+', s.lower())

In [5]:
# build a postings-list dictionary
def get_postings_list(qs):
    """Count word occurrences for each quote in ``qs``.

    Args:
        qs: an iterable of full quote strings.

    Returns:
        A dict keyed by quote string. Each value is a dict mapping every token
        produced by get_word_list() for that quote to the number of times it
        occurs. Identical quote strings share a single entry.
    """
    index = {}
    for text in qs:
        tally = {}
        for token in get_word_list(text):
            tally[token] = tally.get(token, 0) + 1
        index[text] = tally
    return index

In [6]:
# build a reverse postings-list dictionary
def get_reverse_postings_list(ws):
    """Map each word in ``ws`` to the quotes (from the global ``quotes``) that contain it.

    Args:
        ws: an iterable of words, expected in the same lower-cased token form
            that get_postings_list() uses for its keys.

    Returns:
        A dict keyed by word. Each value is a dict whose keys are the quote
        strings whose postings contain that word and whose values are all 1.
        A word found in no quote maps to an empty dict.
    """
    # The per-quote postings do not depend on the word being looked up, so they
    # are built once here instead of once per word.
    postings_list = get_postings_list(quotes)
    reverse_postings_list = {}
    for w in ws:
        # Postings keys are unique quote strings, so each matching quote is
        # recorded exactly once with the value 1.
        reverse_postings_list[w] = {
            q: 1 for q, postings in postings_list.items() if w in postings
        }
    return reverse_postings_list

In [7]:
# compute the TF-IDF of a word in a full quote
def tf_idf(w, q):
    """Score word ``w`` against full quote ``q`` with TF-IDF.

    TF is the count of ``w`` in ``q`` divided by the count of the most frequent
    word in ``q``. IDF is the natural log of ``len(quotes)`` divided by the sum
    of the values in the reverse-postings entry for ``w``.

    Args:
        w: the word to score, in the token form used as postings keys.
        q: the full quote string to score it against.

    Returns:
        ``tf * idf``. Either factor stays 0 when its denominator would be zero,
        so the score is zero when ``w`` is absent from ``q`` or from every quote.
    """
    corpus_size = len(quotes)
    term_counts = get_postings_list([q])[q]
    containing = get_reverse_postings_list([w])[w]

    hits = term_counts.get(w, 0)
    # Seeding with 0 keeps the peak at 0 for a quote with no tokens.
    peak = max([0, *term_counts.values()])
    doc_freq = sum(containing.values())

    tf = 1.0 * hits / peak if peak != 0 else 0
    idf = math.log(1.0 * corpus_size / doc_freq) if doc_freq != 0 else 0
    return tf * idf

In [8]:
# input: a word
# output: a dictionary - keys are full quotes containing that word
#                      - values are the TF-IDF score of that word for that full quote
def single_quote_search(w):
    """Find every quote in the global ``quotes`` list that contains word ``w``.

    Args:
        w: the word to search for, matched against get_word_list() tokens.

    Returns:
        A dict mapping each matching full quote to the tf_idf() score of ``w``
        for that quote. The dict is empty when no quote contains ``w``.
    """
    return {
        text: tf_idf(w, text)
        for text in quotes
        if w in get_word_list(text)
    }

In [9]:
# input: a list of words
# output: a dictionary - keys are full quotes containing one or more of the words in that list
#                      - values are the sum of TF-IDF scores of the words in that list for that full quote
def multiple_quote_search(ws):
    """Search for several words at once and add up their scores per quote.

    Args:
        ws: a list of words; each one is looked up with single_quote_search().

    Returns:
        A dict mapping every full quote that matched at least one word to the
        sum of the scores returned for it. A word listed more than once
        contributes once per occurrence.
    """
    combined = {}
    for term in ws:
        for text, score in single_quote_search(term).items():
            combined[text] = combined[text] + score if text in combined else score
    return combined

In [10]:
def main():
    """Populate the global ``quotes`` list by calling get_full_quotes()."""
    get_full_quotes()

In [11]:
# Load the quotes before any cell below reads the global `quotes` list.
main()

In [12]:
# Sample case: call each function defined above once and print what it returns.
# quotes[0] and quotes[0:2] read the loaded corpus, so main() must have run first;
# quotes[0] raises IndexError if the list is empty.
print('Get a list of the words in a full quote\n"' + quotes[0] + '": ')
print(get_word_list(quotes[0]))
print('\nBuild a postings-list dictionary for the first two quotes: ')
print(get_postings_list(quotes[0:2]))
print('\nBuild a reverse postings-list dictionary for a list [\'entertainer\']: ')
print(get_reverse_postings_list(['entertainer']))
print('\nTF-IDF of a word \'entertainer\' in the Marlon Brando quote: ')
# The quote is passed as a literal string rather than looked up in `quotes`.
print(tf_idf('entertainer', 'An actor is at most a poet and at least an entertainer. - Marlon Brando'))
print('\nQuote search using a single word \'entertainer\': ')
print(single_quote_search('entertainer'))
print('\nQuote search using multiple words \'entertainer\' and \'foundation\': ')
print(multiple_quote_search(['entertainer', 'foundation']))

Get a list of the words in a full quote
"How we spend our days is, of course, how we spend our lives. - Annie Dillard": 
['how', 'we', 'spend', 'our', 'days', 'is', 'of', 'course', 'how', 'we', 'spend', 'our', 'lives', 'annie', 'dillard']

Build a postings-list dictionary for the first two quotes: 
{'How we spend our days is, of course, how we spend our lives. - Annie Dillard': {'how': 2, 'we': 2, 'spend': 2, 'our': 2, 'days': 1, 'is': 1, 'of': 1, 'course': 1, 'lives': 1, 'annie': 1, 'dillard': 1}, 'Two roads diverged in a wood, and I...I took the one less traveled by, and that has made all the difference. - Robert Frost': {'two': 1, 'roads': 1, 'diverged': 1, 'in': 1, 'a': 1, 'wood': 1, 'and': 2, 'i': 2, 'took': 1, 'the': 2, 'one': 1, 'less': 1, 'traveled': 1, 'by': 1, 'that': 1, 'has': 1, 'made': 1, 'all': 1, 'difference': 1, 'robert': 1, 'frost': 1}}

Build a reverse postings-list dictionary for a list ['entertainer']: 
{'entertainer': {'An actor is at most a poet and at least an en