In [137]:
def read_file(name):
    with open(name) as f:
        data = f.read()
    return data
        
documents_data = read_file('documents.txt')
keywords_data = read_file('keywords.txt')

def sum_squared(values):
    return reduce(lambda x, y: x + y, map(lambda x: x ** 2, list(values)))

def tokenify(text):
    words = word_tokenize(text)
    words = [word for word in words if word.isalpha()]
    tokens = [stemmer.stem(word) for word in words]
    
    return tokens
    

In [141]:
# create documents
from nltk import word_tokenize
from nltk.stem import *

docs = documents_data.split('\n\n')
documents = []

for i, doc in enumerate(docs): 
    doc_data = doc.split('\n')
    title = doc_data[0]
    body = ' '.join(doc_data[1:len(doc_data)])                        

    documents.append({'idx': i, 'title': title, 'body': body, 'tokens': tokenify(doc)})

In [140]:
# create keywords 

keywords = tokenify(keywords_data)
keywords_vector = {}

for keyword in keywords: 
    keywords_vector[keyword]= 0
    

In [None]:
# create bag of words
bag_of_words = []

for i, doc in enumerate(documents):
    keywords_vec = keywords_vector.copy()
    for token in doc['tokens']:
        if token in keywords:
            keywords_vec[token] += 1
            
    bag_of_words.append({'idx': i, 'vector': keywords_vec, 'max': max(keywords_vec.values())})
    
bag_of_words


In [130]:
# tf representation
from functools import reduce
from math import sqrt
tf = []

for bag in bag_of_words:
    idx = bag['idx']
    vec = bag['vector']
    max_n = bag['max']

    keywords_vec = keywords_vector.copy()
    
    for key in vec.keys():
        keywords_vec[key] = vec[key]/max_n
    
    vector_length = sqrt(sum_squared(keywords_vec.values()))
    
    tf.append({'idx': idx, 'vector': keywords_vec, 'vector_length': vector_length})
    

In [132]:
# idf representation
from math import log10

idf = keywords_vector.copy()

document_count = len(documents)

for key in idf.keys():
    word_count = 0
    
    for doc in documents:
        if key in doc['tokens']:
            word_count += 1
    
    idf[key] = word_count / document_count


In [135]:
# tfidf representation

tfidf = []

for i, doc in enumerate(tf):
    keywords_vec = keywords_vector.copy()

    for key in doc['vector'].keys():
        keywords_vec[key] = doc['vector'][key] * idf[key]
        
    vector_length = sqrt(sum_squared(keywords_vec.values()))
        
    tfidf.append({'idx': i, 'vector': keywords_vec, 'vector_length': vector_length})
    

In [148]:
# query 
query_string = 'Morgan Kaufmann Series in Machine Learning for Bioinformatics'

query = tokenify(query_string)

query_bag_of_words = {}
query_bag_of_words['vector'] = keywords_vector.copy()

for token in query_bag_of_words['vector']:
    if token in query:
        query_bag_of_words['vector'][token] += 1

query_bag_of_words['max'] = max(query_bag_of_words['vector'].values())


In [None]:
# query tf
vector = query_bag_of_words['vector']
max_n = query_bag_of_words['max']

query_tf = {}
query_tf['vector'] = keywords_vector.copy()

for key in vector.keys():
    query_tf['vector'][key] = vector[key]/max_n

query_tf['vector_length'] = sqrt(sum_squared(query_tf['vector'].values()))


In [154]:
#query tfidf
query_tfidf = {}
query_tfidf['vector'] = keywords_vector.copy()

for key in query_tf['vector'].keys():
    query_tfidf['vector'][key] = query_tf['vector'][key] * idf[key]
    
query_tfidf['vector_length'] = sqrt(sum_squared(query_tfidf['vector'].values()))


In [151]:
#measure similarity!

def cos_similiarity(a, b):
    ab_sum = 0

    for key in a['vector'].keys():
        ab_sum += a['vector'][key]*b['vector'][key]
    
    return ab_sum / a['vector_length'] * b['vector_length']
    

In [None]:
# calculate scores
scores = []
for doc_tfidf in tfidf:
    scores.append({'idx': doc_tfidf['idx'], 'score': cos_similiarity(query_tfidf, doc_tfidf)})
    

In [169]:
from operator import itemgetter

def show_best_matches(scores, docs, n = 10):
    sorted_list = sorted(scores, key=itemgetter('score'), reverse=True)
    
    for i, elem in enumerate(sorted_list):
        if i > 5:
            return
        
        print('Score: {} . Document title: {}'.format(elem['score'], docs[elem['idx']]['title']))

show_best_matches(scores, documents, 5)

Score: 2.0842203069370195 . Document title: Genetics-Based Machine Learning
Score: 2.0393249717459705 . Document title: Journal of Machine Learning Research Homepage
Score: 2.02627376418614 . Document title: The Machine Learning Systems Group at JPL | Home
Score: 2.0058696685300634 . Document title: ICML-99
Score: 1.9626210853613513 . Document title: Yahoo! Groups : machine-learning
Score: 1.9584403753474926 . Document title: Machine Learning Research Software


In [None]:
documents