In [3]:
import nltk
from nltk import tokenize
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

def load_database(textfile):
    sentences = []
    words = []
    lexemes = []
    with open(textfile) as f:
        text = f.read().lower()
        sentences = tokenize.sent_tokenize(text)
        for sentence in sentences:
            if not sentence:
                continue
            s_words = [word for word
                        in tokenize.word_tokenize(sentence)
                        if word not in (',', '.', ':', '-', ';', '?', '!', '"', "``", "`", "''")
                    ]
            s_lexemes = [stemmer.stem(word) for word in s_words]
            words.append(s_words)
            lexemes.append(s_lexemes)
    return sentences, words, lexemes

In [7]:
sentences, words, lexemes = load_database("../../code/datasets/nlp/the old man and the sea.txt")

In [14]:
unique_stems = set()
for sentence in lexemes:
    for stem in sentence:
        unique_stems.add(stem)
        
index = 0
indexed_stems = {}
for stem in unique_stems:
    indexed_stems[stem] = index
    index += 1

In [39]:
import math

##### Get the TF
vectors_sentences = [[0] * len(indexed_stems) for i in range(len(lexemes))]
index = 0
for sentence in lexemes:
    for stem in sentence:
        vectors_sentences[index][indexed_stems[stem]] += 1
    index += 1

tf = [[0] * len(indexed_stems) for i in range(len(lexemes))]
index = 0
for sentence in lexemes:
    for stem in sentence:
        tf[index][indexed_stems[stem]] = vectors_sentences[index][indexed_stems[stem]] / len(sentence)
    index += 1

## Calculate the total amount of occurence for each word
total_vector = [0] * len(indexed_stems)
for vector in vectors_sentences:
    index = 0
    for element in vector:
        total_vector[index] += element
        index += 1
        
total_occs_per_sentence = [0] * len(indexed_stems)
for indexed_stem in indexed_stems.items():
    occs = 0
    for vector in vectors_sentences:
        if vector[indexed_stem[1]] > 0:
            occs += 1
    
    total_occs_per_sentence[indexed_stem[1]] = occs

##### Calculate IDF
idf = [[0] * len(indexed_stems) for i in range(len(lexemes))]
index = 0
for sentence in lexemes:
    for stem in sentence:
        idf[index][indexed_stems[stem]] = math.log(len(lexemes) / total_occs_per_sentence[indexed_stems[stem]])
    index += 1
        

tf_idf = [[0] * len(indexed_stems) for i in range(len(lexemes))]
index = 0
for sentence in lexemes:
    for stem in sentence:
        tf_idf[index][indexed_stems[stem]] = tf[index][indexed_stems[stem]] * idf[index][indexed_stems[stem]]
    index += 1

In [92]:
from scipy import spatial

def amount_contains_sentence(bag, sentence):
    amount = 0
    for word in bag:
        if word in sentence:
            amount += 1
            
    return amount
# TODO: write the code that will find ALL sentences which contain all words of query
def exact_match(query):
    result = []
    stemmed_bag_query = [stemmer.stem(word.lower()) for word in query.split()]
    
    index = 0
    for sentence in lexemes:
        if amount_contains_sentence(stemmed_bag_query, sentence) == len(stemmed_bag_query):
            print(index)
            result.append(sentences[index])
        index += 1
    
    return result

# TODO: write the code that will find TOP sentences with THE BEST matches with query
def ranked_match(query, top=5):
    ranked_result = []
    stemmed_bag_query = [0] * len(indexed_stems)
    for word in query.split():
        stem = stemmer.stem(word.lower())
        stemmed_bag_query[indexed_stems[stem]] += 1 / total_vector[indexed_stems[stem]] 
    
    for index in range(len(lexemes)):
        simi = 1 - spatial.distance.cosine(stemmed_bag_query, tf_idf[index])
        ranked_result.append((simi, sentences[index]))
    
    ranked_result.sort(reverse=True)
    return [item for item in ranked_result[:top]]


In [95]:
print("Exact match: ")
print(*exact_match("yellow Gulf weeded"), sep='\n')
print()
print("Ranked match: ")
print(*ranked_match("yellow Gulf weeded"), sep='\n')

Exact match: 
709
1403
it was the yellow gulf weed that had made so much phosphorescence in the night.
so he hooked a patch of yellow gulf weed with the gaff as they passed and shook it so that the small shrimps that were in it fell onto the planking of the skiff.

Ranked match: 
(0.656776285338411, 'it was the yellow gulf weed that had made so much phosphorescence in the night.')
(0.4074825976942933, 'so he hooked a patch of yellow gulf weed with the gaff as they passed and shook it so that the small shrimps that were in it fell onto the planking of the skiff.')
(0.29698873034935236, 'there was yellow weed on the line but the old man knew that only made an added drag and he was pleased.')
(0.2915515596793602, 'the dark water of the true gulf is the greatest healer that there is.')
(0.23366898929817892, 'he saw the phosphorescence of the gulf weed in the water as he rowed over the part of the ocean that the fishermen called the great well because there was a sudden deep of seven hundre