In [1]:
import nltk
from nltk import tokenize
from nltk.stem.snowball import SnowballStemmer
import itertools
from collections import Counter
import math

def load_database(textfile):
    stemmer = SnowballStemmer("english")
    sentences = []
    words = []
    lexemes = []
    with open(textfile) as f:
        text = f.read().lower()
        sentences = tokenize.sent_tokenize(text)
        for sentence in sentences:
            if not sentence:
                continue
            s_words = [word for word
                        in tokenize.word_tokenize(sentence)
                        if word not in (',', '.', ':', '-', ';', '?', '!', '"', "``", "`", "''")
                    ]
            s_lexemes = [stemmer.stem(word) for word in s_words]
            words.append(s_words)
            lexemes.append(s_lexemes)
    return sentences, words, lexemes

In [2]:
sentences, words, lexemes = load_database("C:/Users/diego/PycharmProjects/AI/week_1/Homework/the old man and the sea.txt")
text_stemmed = list(itertools.chain(*lexemes))
sentences_stemmed = []
for i in range(len(lexemes)):
    temp = " ".join(lexemes[i])
    sentences_stemmed.append(temp)
stemmed_lexicon = Counter(text_stemmed).most_common(len(text_stemmed))


In [3]:
# TODO: write the code that will find ALL sentences which contain all words of query
def exact_match(query):
    result = []
    query = query.lower()
    for sentence in sentences:
        if query in sentence:
            result.append(sentence)
    return result

# TODO: write the code that will find TOP sentences with THE BEST matches with query
def ranked_match(query, top=5):
    stemmer = SnowballStemmer("english")
    ranked_result = []
    intersection = []
    intersection_stemmed = []
    query = [word.lower() for word in query.split(" ")]
    query = [stemmer.stem(word) for word in query]
    
    for word in query:
        for i in range(len(lexemes)):
            for j in range(len(lexemes[i])):
                if word == lexemes[i][j]:
                    if sentences[i] not in intersection:
                        intersection.append(sentences[i])
                        intersection_stemmed.append(sentences_stemmed[i])
    
    f = []
    N = len(lexemes)
    n = len(intersection)
    search_result = []
                
    for sentence in intersection_stemmed:
        s = 0
        number_of_words = len(sentence.split())
        for word in query: 
            n_temp = []
            if word in sentence:
                n_temp.append(sentence)

            f_word = sentence.count(word)
            if len(n_temp) == 0:
                s += 0
            else:
                s += ( 0.5 + 0.5 *  f_word / number_of_words) * math.log(N / len(n_temp))

        search_result.append(s)   

    for i in range(n):
        ranked_result.append((intersection[i], search_result[i]))
    
    ranked_result.sort(key=lambda x: x[1], reverse = True)     
    return [item[0] for item in ranked_result[:5]]

In [4]:
def ranked_cos(query, top=5):
    stemmer = SnowballStemmer("english")
    ranked_result = []
    intersection = []
    intersection_stemmed = []
    intersection_word = []
    query_vector = []
    sentence_vector = []
    Search = []
    query = [word.lower() for word in query.split(" ")]
    query = [stemmer.stem(word) for word in query]
    
    for word in query:
        for i in range(len(lexemes)):
            for j in range(len(lexemes[i])):
                if word == lexemes[i][j]:
                    if sentences[i] not in intersection:
                        intersection.append(sentences[i])
                        intersection_stemmed.append(sentences_stemmed[i])

    for sentence in intersection_stemmed:
        s_words = [word for word
                        in tokenize.word_tokenize(sentence)
                        if word not in (',', '.', ':', '-', ';', '?', '!', '"', "``", "`", "''")
                    ]
        
        intersection_word.append(s_words)
    
    for sentence in intersection_word:
        query_temporal = []
        sentence_temporal = []
        for word in sentence:
            check = False
            for quer in query:
                if word == quer: 
                    check = True
                    break
            if not check:
                query_temporal.append(0)
            else:
                query_temporal.append(1)
                
            for i in range(len(stemmed_lexicon)):
                if word == stemmed_lexicon[i][0]:
                        sentence_temporal.append(stemmed_lexicon[i][1])
               
        query_vector.append(query_temporal)
        sentence_vector.append(sentence_temporal)
    
    AB = 0
    A_sq = 0
    B_sq = 0
    temp_result = 0
    for i in range(len(query_vector)):
        for j in range(len(query_vector[i])):
            AB += query_vector[i][j]*sentence_vector[i][j]
            A_sq += query_vector[i][j]**2
            B_sq += sentence_vector[i][j]**2
        temp_result = AB/(A_sq**0.5*B_sq**0.5)
        Search.append(temp_result)
        AB = 0
        A_sq = 0
        B_sq = 0
    
    for i in range(len(Search)):
        ranked_result.append((intersection[i], Search[i]))
    
    ranked_result.sort(key=lambda x: x[1], reverse = True)
    return [item[0] for item in ranked_result[:5]]

In [5]:
print(exact_match("yellow Gulf weed"))
print(ranked_match('yellow Gulf weed'))
print(ranked_cos("yellow Gulf weed"))

['it was the yellow gulf weed that had made so much phosphorescence in the night.', 'so he hooked a patch of yellow gulf weed with the gaff as they passed and shook it so that the small shrimps that were in it fell onto the planking of the skiff.']
['it was the yellow gulf weed that had made so much phosphorescence in the night.', 'so he hooked a patch of yellow gulf weed with the gaff as they passed and shook it so that the small shrimps that were in it fell onto the planking of the skiff.', 'there was yellow weed on the line but the old man knew that only made an added drag and he was pleased.', 'there were only the flying fish that went up from his bow sailing away to either side and the yellow patches of gulf-weed.', 'but the bird was almost out of sight now and nothing showed on the surface of the water but some patches of yellow, sun-bleached sargasso weed and the purple, formalized, iridescent, gelatinous bladder of a portuguese man-of-war floating close beside the boat.']
['"a 