In [5]:
import nltk
from nltk import tokenize
from nltk.stem.snowball import SnowballStemmer

def load_database(textfile):
    """Read a text file and split it into sentences, words and stemmed lexemes.

    The whole file is lower-cased before tokenisation. Punctuation tokens
    listed in ``skipped`` are dropped from every sentence before stemming.

    Args:
        textfile: path of the text file to read.

    Returns:
        A ``(sentences, words, lexemes)`` tuple of three parallel lists:
        the sentence strings, the kept word tokens of each sentence, and
        the English Snowball stems of those tokens.
    """
    snowball = SnowballStemmer("english")
    # Tokens that carry no content and must not end up in the index.
    skipped = (',', '.', ':', '-', ';', '?', '!', '"', "``", "`", "''")

    with open(textfile) as handle:
        lowered = handle.read().lower()

    sentences = tokenize.sent_tokenize(lowered)
    words, lexemes = [], []
    for sent in sentences:
        kept = []
        for token in tokenize.word_tokenize(sent):
            if token in skipped:
                continue
            kept.append(token)
        words.append(kept)
        lexemes.append(list(map(snowball.stem, kept)))
    return sentences, words, lexemes

In [4]:
# Load the corpus once; `sentences`, `words` and `lexemes` are parallel lists
# that the cells below read as module-level globals.
# NOTE(review): this cell's execution count (4) is lower than that of the cell
# defining load_database (5) -- re-run from a fresh kernel to confirm ordering.
sentences, words, lexemes = load_database("datasets/nlp/facts.txt")

In [6]:
def q2stem(query):
    """Turn a free-text query into a list of stemmed lexemes.

    The query is lower-cased, tokenised and stemmed with the English
    Snowball stemmer. Punctuation tokens are dropped so the result is
    comparable with lexemes that had the same punctuation removed;
    otherwise a trailing "?" or "," in the query becomes a term that no
    punctuation-free document can contain.

    Args:
        query: the query string.

    Returns:
        List of stems, in query order, without punctuation tokens.
    """
    # Same punctuation tokens that are stripped when the corpus is loaded.
    punctuation = (',', '.', ':', '-', ';', '?', '!', '"', "``", "`", "''")
    stemmer = SnowballStemmer("english")
    return [
        stemmer.stem(word)
        for word in tokenize.word_tokenize(query.lower())
        if word not in punctuation
    ]

# Find ALL sentences which contain all words of the query
def exact_match(query):
    """Return every sentence that contains all stemmed terms of the query.

    Reads the module-level ``lexemes`` and ``sentences`` lists, which are
    indexed in parallel. An empty query matches every sentence.

    Args:
        query: the query string; it is stemmed with ``q2stem``.

    Returns:
        Matching sentences in corpus order.
    """
    wanted = set(q2stem(query))
    return [
        sentences[pos]
        for pos, doc_lexemes in enumerate(lexemes)
        if wanted.issubset(doc_lexemes)
    ]


# Find the TOP sentences with THE BEST matches with the query
def ranked_match(query, top=5):
    """Return the ``top`` sentences sharing the most distinct terms with the query.

    Each sentence is scored by the number of distinct query stems it
    contains. Sentences are ordered by descending ``(score, sentence)``,
    so ties are broken by the sentence text, and sentences with a score
    of zero are still eligible.

    Args:
        query: the query string; it is stemmed with ``q2stem``.
        top: maximum number of sentences to return.

    Returns:
        List of at most ``top`` sentences, best first.
    """
    terms = set(q2stem(query))
    scored = [
        (len(terms.intersection(doc_lexemes)), sentences[pos])
        for pos, doc_lexemes in enumerate(lexemes)
    ]
    best_first = sorted(scored, reverse=True)
    return [text for _, text in best_first[:top]]

In [14]:
def construct_lookup(lexemes):
    """Build an inverted index from lexeme to the documents containing it.

    Args:
        lexemes: list of documents, each a list of lexemes.

    Returns:
        Dict mapping every lexeme to the set of positions (indices into
        ``lexemes``) of the documents in which it occurs.
    """
    postings = {}
    for doc_id, doc_lexemes in enumerate(lexemes):
        for term in doc_lexemes:
            # Create the posting list on first sight, then record the document.
            postings.setdefault(term, set()).add(doc_id)
    return postings

In [16]:
# Inverted index over the loaded corpus: lexeme -> set of sentence indices.
index = construct_lookup(lexemes)

In [21]:
# Boolean AND retrieval through the inverted index: intersect the posting
# lists of all query lexemes, printing each intermediate step.
q = "english languages"
proc = q2stem(q)
print(proc)
res = None
for lex in proc:
    # A lexeme that never occurs in the corpus has an empty posting list
    # (a plain index[lex] would raise KeyError and abort the cell).
    pl = index.get(lex, set())
    print(lex, pl)
    res = pl if res is None else res & pl

print(res)
# res is still None when the query produced no lexemes; print nothing then.
for i in res or ():
    print(sentences[i])

['english', 'languag']
english {99, 155}
languag {99}
{99}
94. of all the words in the english language, the word "set" has the most definitions.


In [22]:
# Demo: strict AND matching first, then overlap-ranked matching, separated
# by a divider line.
print(exact_match("english languages"))
print("================================")
print(ranked_match("humans earth"))

['94. of all the words in the english language, the word "set" has the most definitions.']
['153. for every human on earth there are 1.6 million ants.', 'this means we have only seen 5% of the universe from earth.', 'that is the equivalent of a human jumping the empire state building.', '9. one in every five adults believe that aliens are hiding in our planet disguised as humans.', '88. earth is the only planet that is not named after a god.']


In [5]:
import itertools
import math
from collections import defaultdict

# Vocabulary of the corpus, plus a lexeme -> vector position mapping.
lexicon = set(itertools.chain.from_iterable(lexemes))
inverted_lexicon = {word: i for i, word in enumerate(lexicon)}
print("Lexicon size =", len(lexicon))

# DF[word]    -- number of documents that contain the word at least once.
# WF[i][word] -- number of occurrences of the word in document i.
DF = defaultdict(int)
WF = [defaultdict(int) for _ in lexemes]
for i, doc in enumerate(lexemes):
    for word in doc:
        WF[i][word] += 1
    for word in set(doc):
        DF[word] += 1
        

def tf(word, doc_i):
    """Term frequency of ``word`` in document ``doc_i``.

    Computed as the word's occurrence count in the document divided by
    the document length, using the module-level ``WF`` and ``lexemes``.

    Args:
        word: the lexeme to look up.
        doc_i: index of the document in ``lexemes`` / ``WF``.

    Returns:
        The relative frequency; 0.0 for an empty document.
    """
    doc_len = len(lexemes[doc_i])
    if doc_len == 0:
        # An empty document contains no terms; avoid dividing by zero.
        return 0.0
    # .get() keeps a lookup of an absent word from inserting it into WF.
    return WF[doc_i].get(word, 0) / doc_len


def idf(word):
    """Inverse document frequency of ``word`` over the corpus.

    Computed as ``-log(DF[word] / N)`` where ``N`` is the number of
    documents in the module-level ``lexemes``.

    Args:
        word: the lexeme to look up.

    Returns:
        The IDF weight; 0.0 for a word that occurs in no document, since
        such a word cannot contribute to any match and ``log(0)`` is
        undefined.
    """
    # .get() keeps a lookup of an unseen word from inserting it into DF.
    doc_freq = DF.get(word, 0)
    if doc_freq == 0:
        return 0.0
    return -math.log(doc_freq / len(lexemes))


# use the same method to build a vector for documents (just use lookup) and for new queries
def to_vector(tokens, i=None):
    """Build a weighted vector over the lexicon for a document or a query.

    Args:
        tokens: list of lexemes to vectorise.
        i: index of the document the tokens belong to, or ``None`` for a
            new query. For a document the term weight is ``tf(word, i)``;
            for a query it is the raw count of the word in ``tokens``.

    Returns:
        List of length ``len(lexicon)``. The position of each known token
        holds its term weight times ``idf(word)``; other positions are 0.
        Tokens that are not in the lexicon are ignored.
    """
    result = [0] * len(lexicon)
    is_query = i is None
    if is_query:
        local_tf = {word: tokens.count(word) for word in set(tokens)}
    for word in tokens:
        if word not in lexicon:
            continue
        weight = local_tf[word] if is_query else tf(word, i)
        result[inverted_lexicon[word]] = weight * idf(word)
    return result


def create_tdm(lexemes):
    """Build the term-document matrix as one ``to_vector`` row per document.

    Args:
        lexemes: list of documents, each a list of lexemes.

    Returns:
        List of vectors; row ``k`` is ``to_vector(lexemes[k], k)``.
    """
    return [to_vector(doc, row) for row, doc in enumerate(lexemes)]

Lexicon size = 1925


In [6]:
# Term-document matrix: one dense vector of length len(lexicon) per document,
# read by search() as a module-level global.
TDM = create_tdm(lexemes)

In [7]:
from numpy import dot

def search(query, top=5):
    """Return the ``top`` sentences most similar to the query by cosine similarity.

    The query is stemmed and vectorised with the same ``to_vector`` used
    for documents, then compared with every row of the module-level
    ``TDM``. Results are ordered by descending ``(score, sentence)``, so
    ties are broken by the sentence text.

    Args:
        query: the query string.
        top: maximum number of sentences to return.

    Returns:
        List of at most ``top`` sentences, best first.
    """
    v = to_vector(q2stem(query))
    query_sq = dot(v, v)
    ranked_result = []
    for i, vect in enumerate(TDM):
        # cosine = v.d / (|v| * |d|); the norms are the square roots of the
        # self dot products, so the square root goes over their product.
        denom_sq = query_sq * dot(vect, vect)
        if denom_sq > 0:
            score = float(dot(v, vect) / denom_sq ** 0.5)
        else:
            # A zero vector (query without known terms, or an empty document)
            # has no direction; score it 0 instead of producing nan.
            score = 0.0
        ranked_result.append((score, sentences[i]))
    ranked_result.sort(reverse=True)
    return [item[1] for item in ranked_result[:top]]
    
# Demo query against the TF-IDF index.
# NOTE(review): the recorded output below does not look like sentences from
# "datasets/nlp/facts.txt" (no numbered facts) -- presumably a different corpus
# was loaded when this cell ran; re-run from a fresh kernel to confirm.
search("yellow Gulf weed")

['it was the yellow gulf weed that had made so much phosphorescence in the night.',
 'so he hooked a patch of yellow gulf weed with the gaff as they passed and shook it so that the small shrimps that were in it fell onto the planking of the skiff.',
 'there was yellow weed on the line but the old man knew that only made an added drag and he was pleased.',
 'just before it was dark, as they passed a great island of sargasso weed that heaved and swung in the light sea as though the ocean were making love with something under a yellow blanket, his small line was taken by a dolphin.',
 'he saw the phosphorescence of the gulf weed in the water as he rowed over the part of the ocean that the fishermen called the great well because there was a sudden deep of seven hundred fathoms where all sorts of fish congregated because of the swirl the current made against the steep walls of the floor of the ocean.']