In [3]:
import time

In [4]:
import logging
import sys
import os

def write_run(model_name, data, out_f,
              max_objects_per_query=sys.maxsize,
              skip_sorting=False):
    """
    Write a run to an output file.
    Parameters:
        - model_name: identifier of run.
        - data: dictionary mapping topic_id to object_assesments;
            object_assesments is an iterable (list or tuple) of
            (relevance, object_id) pairs.
            The object_assesments iterable is sorted by decreasing order.
        - out_f: output file stream.
        - max_objects_per_query: cut-off for number of objects per query.
    """
    for subject_id, object_assesments in data.items():
        if not object_assesments:
            logging.warning('Received empty ranking for %s; ignoring.',
                            subject_id)

            continue

        # Probe types, to make sure everything goes alright.
        # assert isinstance(object_assesments[0][0], float) or \
        #     isinstance(object_assesments[0][0], np.float32)
        assert isinstance(object_assesments[0][1], str) or \
            isinstance(object_assesments[0][1], bytes)

        if not skip_sorting:
            object_assesments = sorted(object_assesments, reverse=True)

        if max_objects_per_query < sys.maxsize:
            object_assesments = object_assesments[:max_objects_per_query]

        if isinstance(subject_id, bytes):
            subject_id = subject_id.decode('utf8')

        for rank, (relevance, object_id) in enumerate(object_assesments):
            if isinstance(object_id, bytes):
                object_id = object_id.decode('utf8')

            out_f.write(
                '{subject} Q0 {object} {rank} {relevance} '
                '{model_name}\n'.format(
                    subject=subject_id,
                    object=object_id,
                    rank=rank + 1,
                    relevance=relevance,
                    model_name=model_name))
            
# The following writes the run to standard output.
# In your code, you should write the runs to local
# storage in order to pass them to trec_eval.
write_run(
    model_name='example',
    data={
        'Q1': ((1.0, 'DOC1'), (0.5, 'DOC2'), (0.75, 'DOC3')),
        'Q2': ((-0.1, 'DOC1'), (1.25, 'DOC2'), (0.0, 'DOC3')),
    },
    out_f=sys.stdout,
    max_objects_per_query=1000)

Q2 Q0 DOC2 1 1.25 example
Q2 Q0 DOC3 2 0.0 example
Q2 Q0 DOC1 3 -0.1 example
Q1 Q0 DOC1 1 1.0 example
Q1 Q0 DOC3 2 0.75 example
Q1 Q0 DOC2 3 0.5 example


In [5]:
# ./trec_eval

In [6]:
import pyndri

index = pyndri.Index('index/')

ImportError: No module named 'pyndri'

In [7]:
print("There are %d documents in this collection." % (index.maximum_document() - index.document_base()))

NameError: name 'index' is not defined

In [8]:
token2id, id2token, _ = index.get_dictionary()
print(list(id2token.items())[:15])

NameError: name 'index' is not defined

In [9]:
print([id2token[word_id] for word_id in example_document[1] if word_id > 0])

NameError: name 'example_document' is not defined

In [10]:
# tokenizing
query_tokens = index.tokenize("University of Massachusetts")
print("Query by tokens:", query_tokens)
# get the token ids
query_id_tokens = [token2id.get(query_token,0) for query_token in query_tokens]
print("Query by ids with stopwords:", query_id_tokens)
# token ids without stop words (where token_id = 0)
query_id_tokens = [word_id for word_id in query_id_tokens if word_id > 0]
print("Query by ids without stopwords:", query_id_tokens)

NameError: name 'index' is not defined

In [11]:
matching_words = sum([True for word_id in example_document[1] if word_id in query_id_tokens])
print("Document %s has %d word matches with query: \"%s\"." % (example_document[0], matching_words, ' '.join(query_tokens)))
print("Document %s and query \"%s\" have a %.01f%% overlap." % (example_document[0], ' '.join(query_tokens),matching_words/float(len(example_document[1]))*100))

NameError: name 'example_document' is not defined

In [12]:
import collections
import io
import logging
import sys

def parse_topics(file_or_files,
                 max_topics=sys.maxsize, delimiter=';'):
    assert max_topics >= 0 or max_topics is None

    topics = collections.OrderedDict()

    if not isinstance(file_or_files, list) and \
            not isinstance(file_or_files, tuple):
        if hasattr(file_or_files, '__iter__'):
            file_or_files = list(file_or_files)
        else:
            file_or_files = [file_or_files]

    for f in file_or_files:
        assert isinstance(f, io.IOBase)

        for line in f:
            assert(isinstance(line, str))

            line = line.strip()

            if not line:
                continue

            topic_id, terms = line.split(delimiter, 1)

            if topic_id in topics and (topics[topic_id] != terms):
                    logging.error('Duplicate topic "%s" (%s vs. %s).',
                                  topic_id,
                                  topics[topic_id],
                                  terms)

            topics[topic_id] = terms

            if max_topics > 0 and len(topics) >= max_topics:
                break

    return topics

with open('./ap_88_89/topics_title', 'r') as f_topics:
    print(parse_topics([f_topics]))

FileNotFoundError: [Errno 2] No such file or directory: './ap_88_89/topics_title'

In [13]:
with open('./ap_88_89/topics_title', 'r') as f_topics:
    queries = parse_topics([f_topics])

index = pyndri.Index('index/')

num_documents = index.maximum_document() - index.document_base()

dictionary = pyndri.extract_dictionary(index)

tokenized_queries = {
    query_id: [dictionary.translate_token(token)
               for token in index.tokenize(query_string)
               if dictionary.has_token(token)]
    for query_id, query_string in queries.items()}

query_term_ids = set(
    query_term_id
    for query_term_ids in tokenized_queries.values()
    for query_term_id in query_term_ids)

print('Gathering statistics about', len(query_term_ids), 'terms.')

# inverted index creation.

document_lengths = {}
unique_terms_per_document = {}

inverted_index = collections.defaultdict(dict)
collection_frequencies = collections.defaultdict(int)

total_terms = 0

for int_doc_id in range(index.document_base(), index.maximum_document()):
    ext_doc_id, doc_token_ids = index.document(int_doc_id)

    document_bow = collections.Counter(
        token_id for token_id in doc_token_ids
        if token_id > 0)
    document_length = sum(document_bow.values())

    document_lengths[int_doc_id] = document_length
    total_terms += document_length

    unique_terms_per_document[int_doc_id] = len(document_bow)

    for query_term_id in query_term_ids:
        assert query_term_id is not None

        document_term_frequency = document_bow.get(query_term_id, 0)

        if document_term_frequency == 0:
            continue

        collection_frequencies[query_term_id] += document_term_frequency
        inverted_index[query_term_id][int_doc_id] = document_term_frequency

avg_doc_length = total_terms / num_documents
avg_doc_length

# print('Inverted index creation took', time.time() - start_time, 'seconds.')

FileNotFoundError: [Errno 2] No such file or directory: './ap_88_89/topics_title'

In [14]:
def run_retrieval(model_name, score_fn):
    """
    Runs a retrieval method for all the queries and writes the TREC-friendly results in a file.
    
    :param model_name: the name of the model (a string)
    :param score_fn: the scoring function (a function - see below for an example) 
    """
    run_out_path = '{}.run'.format(model_name)

    if os.path.exists(run_out_path):
        return

    retrieval_start_time = time.time()

    print('Retrieving using', model_name)

    data = {}

    # TODO: fill the data dictionary. 
    # The dictionary data should have the form: query_id --> (document_score, external_doc_id)
    
    
    
    with open(run_out_path, 'w') as f_out:
        write_run(
            model_name=model_name,
            data=data,
            out_f=f_out,
            max_objects_per_query=1000)

tf_idf = 0
for every term in query:
    tf_idf += tf-idf(term)
    
print(tf_idf)

In [15]:
import numpy as np

In [16]:
matching_words = sum([True for word_id in example_document[1] if word_id in query_id_tokens])
# word matches
example_document[0], matching_words, ' '.join(query_tokens)

NameError: name 'example_document' is not defined

In [17]:
doc = index.document(10)
query_term_id = 400
matching_words = sum([True for word_id in doc[1] if word_id == query_term_id])
# print(doc)
matching_words

NameError: name 'index' is not defined

In [18]:
with open('./ap_88_89/topics_title', 'r') as f_topics:
    queries = parse_topics([f_topics])

index = pyndri.Index('index/')

num_documents = index.maximum_document() - index.document_base()

dictionary = pyndri.extract_dictionary(index)

tokenized_queries = {
    query_id: [dictionary.translate_token(token)
               for token in index.tokenize(query_string)
               if dictionary.has_token(token)]
    for query_id, query_string in queries.items()}

query_term_ids = set(
    query_term_id
    for query_term_ids in tokenized_queries.values()
    for query_term_id in query_term_ids)

print('Gathering statistics about', len(query_term_ids), 'terms.')

#inverted index creation.

document_lengths = {}
unique_terms_per_document = {}

inverted_index = collections.defaultdict(dict)
collection_frequencies = collections.defaultdict(int)

total_terms = 0

for int_doc_id in range(index.document_base(), index.maximum_document()):     #for all documents
    ext_doc_id, doc_token_ids = index.document(int_doc_id)

    document_bow = collections.Counter(               #make a BoW for each document with all non-stop words
        token_id for token_id in doc_token_ids
        if token_id > 0)
    document_length = sum(document_bow.values())       #keep track of doc length

    document_lengths[int_doc_id] = document_length     # >>
    total_terms += document_length                     #keep track of total length of all documents

    unique_terms_per_document[int_doc_id] = len(document_bow)    #keep track of unique terms per document (w/o stop words)

    for query_term_id in query_term_ids:                #for each query term
        assert query_term_id is not None                #validity check

        document_term_frequency = document_bow.get(query_term_id, 0)      #get query term freq from doc BoW, else 0

        if document_term_frequency == 0:                #term doesnt exist in text
            continue

        collection_frequencies[query_term_id] += document_term_frequency       # total times each term exists in corpus
        inverted_index[query_term_id][int_doc_id] = document_term_frequency    # total times each term exists in a
#         if int_doc_id==1: #and query_term_id==5814:
#             print('YOLO')
#             print(inverted_index[query_term_id][int_doc_id])
                                                                               #      specific document
avg_doc_length = total_terms / num_documents
avg_doc_length

FileNotFoundError: [Errno 2] No such file or directory: './ap_88_89/topics_title'

In [2]:
# number of docs
num_docs = index.maximum_document() - index.document_base()

def tfidf(int_document_id, query_term_id, document_term_freq):
    """
    Scoring function for a document and a query term
    
    :param int_document_id: the document id
    :param query_token_id: the query term id (assuming you have split the query to tokens)
    :param document_term_freq: the document term frequency of the query term 
    """
    doc = index.document(int_document_id)
    
    # TODO implement the function
    idf = np.log(num_docs / document_term_freq)
    tf = np.log(1 + sum([True for word_id in doc[1] if word_id == query_term_id]))
    tf_idf = tf*idf
    return tf_idf

# combining the two functions above: 
run_retrieval('tfidf', tfidf)

# TODO implement tools to help you with the analysis of the results.

NameError: name 'index' is not defined

In [1]:
def scorefunction(doc_id, q_term_id, tf, mode):
    if mode==1:
        #print('Tfidf mothafucka')
        return tfidf(doc_id, q_term_id, tf)
#     else:
#         print('Not compatible mode type detected')
        
scorefunction(1, 190, 5, 1)

NameError: name 'tfidf' is not defined

In [None]:
tf_idfs = collections.defaultdict(dict)
bm25s = collections.defaultdict(dict)
mode=1 #tfidf

for doc_id in range(index.document_base(), index.maximum_document()):    
    for query_id in tokenized_queries: 
        print('Doc '+str(doc_id)+' with query '+str(query_id))
        query_doc_score = 0
        for term_id in tokenized_queries[query_id]:
            df = collection_frequencies[term_id]
            freq = inverted_index.get(term_id, {}).get(doc_id, -1)
            if freq!=-1:
                tf = inverted_index[term_id][doc_id]
                query_doc_score += scorefunction(doc_id, term_id, tf, mode)
                #print('Term '+str(term_id)+' agumented the q_d_score to '+str(query_doc_score))
        tf_idfs[query_id][int_doc_id] = query_doc_score

In [55]:
print(tf_idfs.values())

dict_values([{164597: 0}])
