In [1]:
import numpy as np
import pandas as pd
import math
from xml.dom import minidom
from xml.etree import cElementTree as ElementTree
import os
import nltk
import pickle
import csv
from scipy import linalg as LA
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
# Ranked Retrieval and Document Vectorization

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\diego\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\diego\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\diego\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
def documentReader(path, queries = False):
    """
    DocString
    :return: Nothing
    """
    documents_path = os.path.join(os.getcwd(), path)
    documentos = {}
    for filename in os.listdir(documents_path):
        file_path = os.path.join(documents_path, filename)
        xmldoc = minidom.parse(file_path)
        id = xmldoc.getElementsByTagName('public')[0].attributes['publicId'].value
        title = '' if queries else xmldoc.getElementsByTagName('fileDesc')[0].attributes['title'].value
        data = next(ElementTree.parse(file_path).iter('raw')).text
        documentos[id] = (title + ' ' + data).replace(u'\xa0', u' ').replace('\n', ' ')

    return documentos
documentos = documentReader('docs/docs-raw-texts')
NRO_DOCS = len(documentos)
DOCS_IDs = list(documentos.keys())
print(list(documentos.items())[0])

('d001', 'William Beaumont and the Human Digestion William Beaumont and the Human Digestion.  William Beaumont: Physiology of digestion Image Source.  On November 21, 1785, US-American surgeon William Beaumont was born. He became best known as “Father of Gastric Physiology” following his research on human digestion. William Beaumont was born in Lebanon, Connecticut and became a physician. He served as a surgeon’s mate in the Army during the War of 1812. He opened a private practice in Plattsburgh, New York, but rejoined the Army as a surgeon in 1819. Beaumont was stationed at Fort Mackinac on Mackinac Island in Michigan in the early 1820s when it existed to protect the interests of the American Fur Company. The fort became the refuge for a wounded 19-year-old French-Canadian fur trader named Alexis St. Martin when a shotgun went off by accident in the American Fur Company store at close range June 6th, 1822. St. Martin’s wound was quite serious because his stomach was perforated and se

In [4]:
def tokenization(documentos):
    """
    :param documentos:
    :return:
    """
    nltk_stop_words_en = set(nltk.corpus.stopwords.words("english"))
    wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()

    word_tok = {key: nltk.word_tokenize(doc) for key, doc in documentos.items()}
    word_tok_sw = {key: [token for token in doc if token not in nltk_stop_words_en] for key, doc in word_tok.items()}
    nltk_lemmaList = {key: [wordnet_lemmatizer.lemmatize(word) for word in doc] for key, doc in word_tok_sw.items()}

    return nltk_lemmaList
tokenized_docs = tokenization(documentos)
print(list(tokenized_docs.items())[0])

('d001', ['William', 'Beaumont', 'Human', 'Digestion', 'William', 'Beaumont', 'Human', 'Digestion', '.', 'William', 'Beaumont', ':', 'Physiology', 'digestion', 'Image', 'Source', '.', 'On', 'November', '21', ',', '1785', ',', 'US-American', 'surgeon', 'William', 'Beaumont', 'born', '.', 'He', 'became', 'best', 'known', '“', 'Father', 'Gastric', 'Physiology', '”', 'following', 'research', 'human', 'digestion', '.', 'William', 'Beaumont', 'born', 'Lebanon', ',', 'Connecticut', 'became', 'physician', '.', 'He', 'served', 'surgeon', '’', 'mate', 'Army', 'War', '1812', '.', 'He', 'opened', 'private', 'practice', 'Plattsburgh', ',', 'New', 'York', ',', 'rejoined', 'Army', 'surgeon', '1819', '.', 'Beaumont', 'stationed', 'Fort', 'Mackinac', 'Mackinac', 'Island', 'Michigan', 'early', '1820s', 'existed', 'protect', 'interest', 'American', 'Fur', 'Company', '.', 'The', 'fort', 'became', 'refuge', 'wounded', '19-year-old', 'French-Canadian', 'fur', 'trader', 'named', 'Alexis', 'St.', 'Martin', 's

In [5]:
def makeInvertedIndex(tokenized_docs):
    index = {}

    for id, doc in tokenized_docs.items():
        id = int(id[-3:]) #paasa dnjk al entero njk.
        for token in doc:
            if token in index :
                if index[token]['posting'][-1][0] == id:
                    index[token]['posting'][-1][1] += 1
                else:
                    index[token]['posting'].append([id, 1])
                    index[token]['freq'] += 1

            else:
                index[token] = {
                    'posting': [[id, 1]],
                    'freq': 1
                }
    return index

invertedIndex = makeInvertedIndex(tokenized_docs)
print(list(invertedIndex.items())[0])


('William', {'posting': [[1, 6], [15, 6], [28, 4], [35, 2], [55, 4], [56, 5], [69, 6], [88, 3], [91, 1], [92, 1], [95, 1], [98, 2], [102, 5], [106, 1], [109, 1], [111, 1], [129, 1], [136, 8], [138, 3], [147, 1], [175, 1], [179, 2], [180, 1], [189, 2], [190, 1], [191, 1], [197, 1], [212, 1], [230, 1], [241, 2], [254, 1], [257, 1], [266, 2], [272, 1], [273, 8], [274, 1], [289, 1], [291, 1], [294, 1], [299, 1], [300, 1], [309, 1], [310, 5], [320, 6], [323, 1], [330, 7]], 'freq': 46})


In [6]:
print(list(invertedIndex.items())[1])
print(len(list(invertedIndex.keys())))

('Beaumont', {'posting': [[1, 13]], 'freq': 1})
20446


In [8]:
def tfidfWeightedVector(invertedIndex):

    weightedVectorMatrix = []
    index = []
    columns = []
    for term, term_dict in invertedIndex.items():
        weighted_vector = np.zeros(NRO_DOCS)
        freq = term_dict['freq']
        index.append(term)
        for id, t_freq in term_dict['posting']:
            tfidf = np.log(1 + t_freq) * np.log10(NRO_DOCS / freq)
            weighted_vector[ id - 1] = tfidf

        weightedVectorMatrix.append(weighted_vector)


    weighted_vector_df = pd.DataFrame.from_records(data=weightedVectorMatrix, index=index, columns=DOCS_IDs)
    return weighted_vector_df, index

weighted_vector_df, term_index = tfidfWeightedVector(invertedIndex)
weighted_vector_df.head()

Unnamed: 0,d001,d002,d003,d004,d005,d006,d007,d008,d009,d010,...,d322,d323,d324,d325,d326,d327,d328,d329,d330,d331
William,1.667782,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.594076,0.0,0.0,0.0,0.0,0.0,0.0,1.782227,0.0
Beaumont,6.649971,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Human,1.36346,0.860247,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Digestion,3.493223,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
print(f'Matriz tfidf de dimension {weighted_vector_df.shape}')

Matriz tfidf de dimension (20446, 331)


In [10]:
def norma(v):
    suma = sum(v[i]**2 for i in range(len(v)))
    return math.sqrt(suma)

def dot_product(v1, v2):
    product = sum( v1[0][i]*v2[i][0] for i in range(len(v2)) )
    return product

def cosine_Similarity(doc_vec1, doc_vec2):
    # print('.')
    return (dot_product(doc_vec1, doc_vec2)) / (norma(doc_vec1.flatten()) * norma(doc_vec2.flatten()))

def cosine_Similarity_normQ(query, doc):
    return dot_product(query, doc) / norma(doc.flatten())

# HAcer ejemplo a mano a ver si sirve

In [11]:
queries = documentReader('docs/queries-raw-texts', True)
print(list(queries.items())[0])

('q01', ' Fabrication of music instruments')


In [12]:
tokenized_queries = tokenization(queries)
print(list(tokenized_queries.items())[0])

('q01', ['Fabrication', 'music', 'instrument'])


In [14]:

def vectorize_queries(queries, term_index):
    vector_queries = []
    queries_index = []
    for id, query in queries.items():
        queries_index.append(id)
        query_vector = np.zeros(len(term_index)) #Vector de ceros de dimensión V
        len_query = len(query)
        for term in query:
            try:
                index = term_index.index(term)
                query_vector[index] = 1 / math.sqrt(len_query) #Pone en 1 la dimensión del vector correpondiente al termino en term
            except:
                print(f'El término "{term}" de la query {id} no está en los docs')

        vector_queries.append(query_vector)
    return vector_queries, queries_index

vector_queries, queries_index = vectorize_queries(tokenized_queries, term_index)

# vector_queries[0][:1000]


El término "Fabrication" de la query q01 no está en los docs
El término "Computers" de la query q24 no está en los docs
El término "WWII" de la query q25 no está en los docs
El término "Religious" de la query q38 no está en los docs
El término "Personalities" de la query q41 no está en los docs
El término "Campaign" de la query q44 no está en los docs
El término "Friends" de la query q45 no está en los docs


In [34]:
matrix_queries = pd.DataFrame.from_records(data=vector_queries, index=queries_index, columns=term_index)
matrix_queries.head()
# print(vector_queries)
# print(term_index.index('computer'))

# pdf = pd.DataFrame.from_records([[1, 2, 3], [5, 6, 7]], index=['a', 'b'], columns=['c', 'd', 'e'])
# pdf2 = pd.DataFrame.from_records([[1, 2, 3], [5, 6, 7]], index=['a', 'b'], columns=['c', 'd', 'e'])
# pdf

Unnamed: 0,William,Beaumont,Human,Digestion,.,:,Physiology,digestion,Image,Source,...,Gila,Viceroy,Arcángel,247,presidio,Assisi,Asiacutes,36.000,Commanche,Apache
q01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
q02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
q03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
q04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
q06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
print(matrix_queries.iloc[1].sum())

1.7320508075688776


In [36]:
def getCosineSimilarity(queries, documents, query_index, docs_index):
    similarity_matrix = []
    col = []
    row = []
    for query in query_index:
        print(query)
        row_query = queries.loc[[query]].values
        query_doc_sim = []
        # row = row_query
        # print(row_query.shape)
        for document in docs_index:
            col_document = documents[[document]].values
            # col = col_document
            # print(col_document)
            cos_sim = cosine_Similarity_normQ(row_query, col_document)
            # print(f'Cos sim: {cos_sim}')
            query_doc_sim.append(cos_sim)
            # break
        # print(query_doc_sim)
        # break
        similarity_matrix.append(query_doc_sim)
    # return pd.DataFrame.from_records(data=similarity_matrix, index=query_index, columns=docs_index)
    return similarity_matrix

similarity_matrix = getCosineSimilarity(matrix_queries, weighted_vector_df, queries_index, DOCS_IDs)
# similarity_matrix.head()

print(len(similarity_matrix))

q01
q02
q03
q04
q06
q07
q08
q09
q10
q12
q13
q14
q16
q17
q18
q19
q22
q23
q24
q25
q26
q27
q28
q29
q32
q34
q36
q37
q38
q40
q41
q42
q44
q45
q46
35


In [37]:
# pre_dot = [row[0][i]*col[i] for i in range(len(col)) ]
# sum(pre_dot)[0]
# cosine_Similarity(row, col)
# pdf = pd.DataFrame.from_records([[1, 2, 0], [5, 6, 0]], index=['a', 'b'], columns=['c', 'd', 'e'])
# pdf2 = pd.DataFrame.from_records([[1, 2, 0], [5, 6, 0]], index=['a', 'b'], columns=['c', 'd', 'e'])
# pdf
# print('hello')
# (pdf.loc[['b']].values @ pdf[['d']].values)
# np.dot(row, col)
# row.dot(col)
# copy = pdf.loc[['a']].sort_values(by='a', axis=1, ascending=False, inplace=False)
# copy.loc[:, (copy != 0 ).any(axis=0)]

In [38]:
# LA.norm(row.reshape(-1,1))
# LA.norm(row.flatten())
# norma(row.flatten())
# np.dot(pdf.loc[['b']].values,  pdf[['d']].values).sum()
# dot_product(pdf.loc[['b']].values,  pdf[['d']].values)
# len(row[0])

# cosine_Similarity(pdf.loc[['b']].values, pdf[['d']].values)
# pdf.loc[['a']].values.shape
# (10 + 36 )/((61)**(1/2)*(40)**(1/2))

 ## Save cosine similarity Matrix


In [39]:
with open('docs/cos_sim_matrix', 'wb') as picklefile:
    pickle.dump(similarity_matrix,picklefile)

## Read cosine similarity Matrix

In [15]:
with open('docs/cos_sim_matrix', 'rb') as matrix:
    similarity_matrix = pd.DataFrame.from_records(pickle.load(matrix), index=queries_index, columns=DOCS_IDs)


In [16]:
len(similarity_matrix)

35

### Retrieve ordered docs per query

In [17]:
def retrieve_docs(similarity_matrix, query_index):
    results = {}
    for query in query_index:
        order = similarity_matrix.loc[[query]].sort_values(by=query, axis=1, ascending=False, inplace=False)
        relevant = order.loc[:, (order != 0 ).any(axis=0)]
        results[query] = relevant.columns.values.tolist()
    return results

results = retrieve_docs(similarity_matrix, queries_index)
results['q01'][:5]


['d254', 'd016', 'd153', 'd209', 'd186']

In [18]:
len(results['q02'])

194

## Evaluation

In [19]:
def read_judgemnts_file():
    """
    DocString
    :return: Nothing
    """
    document_path = os.path.join(os.getcwd(), 'docs/relevance-judgments.tsv')
    tsv_file = open(document_path)
    read_tsv = csv.reader(tsv_file, delimiter="\t")
    relevance = {}
    for row in read_tsv:
        documents = row[1].split(',')
        query_relevance = {pair.split(':')[0] : pair.split(':')[1] for pair in documents }
        query_relevance = dict(sorted(query_relevance.items(), key=lambda item: item[0]))
        relevance[row[0]] = query_relevance
    return relevance


relevance = read_judgemnts_file()
print(relevance)

{'q01': {'d016': '5', 'd186': '4', 'd254': '5'}, 'q02': {'d136': '2', 'd139': '2', 'd143': '4', 'd147': '2', 'd149': '2', 'd164': '4', 'd228': '4', 'd283': '4', 'd291': '4', 'd293': '4', 'd318': '2'}, 'q03': {'d105': '2', 'd147': '3', 'd152': '3', 'd283': '4', 'd291': '4', 'd318': '2'}, 'q04': {'d010': '3', 'd019': '2', 'd049': '2', 'd270': '3', 'd275': '3', 'd286': '2', 'd330': '2'}, 'q06': {'d026': '4', 'd069': '2', 'd233': '3', 'd257': '2', 'd297': '3', 'd329': '5'}, 'q07': {'d004': '3', 'd077': '3', 'd179': '3', 'd266': '2'}, 'q08': {'d005': '4', 'd028': '3', 'd081': '2', 'd108': '3', 'd110': '4', 'd117': '3', 'd121': '2', 'd180': '2', 'd205': '2', 'd251': '5', 'd271': '3', 'd292': '2'}, 'q09': {'d177': '2', 'd198': '3', 'd199': '5', 'd205': '3', 'd217': '2', 'd223': '2'}, 'q10': {'d052': '2', 'd065': '3', 'd068': '2', 'd076': '3', 'd100': '2', 'd199': '4', 'd215': '2', 'd231': '4'}, 'q12': {'d239': '4', 'd250': '4', 'd258': '3', 'd277': '4'}, 'q13': {'d049': '4', 'd056': '4', 'd23

In [22]:
def make_binary_result(results, relevant_res):
    bin_relevant = {}
    rel_scale_repr = {}
    for query, relevant_docs in relevant_res.items():
        bin_repr = []
        scaled_repr = []
        M = len(relevant_docs)
        for doc_id, rel_scale in relevant_docs.items():
            bin = 1 if doc_id in results[query][:M] else 0
            bin_repr.append(bin)
            scaled_repr.append(bin * int(rel_scale))
        bin_relevant[query] = bin_repr
        rel_scale_repr[query] = scaled_repr
    return bin_relevant, rel_scale_repr

bin_results, scaled_results = make_binary_result(results, relevance)
bin_results

{'q01': [1, 0, 1],
 'q02': [0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1],
 'q03': [1, 1, 1, 1, 1, 1],
 'q04': [1, 1, 1, 1, 1, 1, 0],
 'q06': [1, 1, 1, 1, 1, 1],
 'q07': [1, 0, 0, 0],
 'q08': [1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1],
 'q09': [1, 1, 1, 0, 1, 1],
 'q10': [1, 0, 1, 0, 0, 0, 1, 1],
 'q12': [0, 1, 1, 1],
 'q13': [1, 1, 0, 0, 1],
 'q14': [0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0],
 'q16': [0, 0],
 'q17': [1, 1, 1, 0],
 'q18': [1, 1, 1, 1, 1, 1, 0],
 'q19': [0, 1],
 'q22': [1, 1, 0, 0, 1, 1, 0],
 'q23': [0, 0, 0, 0, 0, 0, 1, 1],
 'q24': [0, 0, 0, 0, 0],
 'q25': [1, 0, 1, 0],
 'q26': [1],
 'q27': [1, 0, 1, 1, 0, 0, 0, 0],
 'q28': [0, 1, 0],
 'q29': [1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1],
 'q32': [1, 1, 1, 1, 1],
 'q34': [1],
 'q36': [0, 1, 1, 0, 0, 1, 1, 0, 1, 0],
 'q37': [1, 1, 0],
 'q38': [0, 0, 0, 0, 0, 1, 0, 0],
 'q40': [1, 1, 1, 1, 0, 0, 1, 1, 1],
 'q41': [1, 1, 1, 1, 1, 1, 0],
 'q42': [0, 1, 0],
 'q44': [0, 1, 0, 1, 0, 0, 1, 0, 1, 0],
 'q45': [1, 1, 1, 0, 1, 1, 1, 0],
 'q46': [0, 0, 1, 1, 1, 0]}

In [23]:
scaled_results['q01']

[5, 0, 5]

In [24]:
def precision_at_k(relevance: list, k: int):
    """
    DocString
    :return: Nothing
    """
    if k == 0:
        return 0
    l = np.array(relevance[:k]).sum()/k
    return l

def recall_at_k(relevance: list, nr_relevant: int, k: int):
    """
    DocString
    :return: Nothing
    """
    l = np.array(relevance[:k]).sum()/nr_relevant
    return l

def average_precision(relevance):
    """
    DocString
    :return: Nothing
    """

    length = len(relevance)
    sum = 0
    for i in range(length):
        if relevance[i]:
            sum += precision_at_k(relevance, i+1)
    if np.array(relevance).sum()==0:
        return 0
    else:
        return sum / np.array(relevance).sum()

def mean_avg_precision(l):
    """
    DocString
    :return: Nothing
    """
    average = 0
    for lista in l:
        #print(average_precision(lista))
        average+= average_precision(lista)

    mean = average / len(l)
    return mean

mean_avg_precision([[0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 1, 1], [0, 1, 0, 1, 1, 1, 1]])

0.35468253968253965

In [27]:
def dcg_at_k(relevance, k: int):
    """
    DocString
    :return: Nothing
    """

    sum = 0
    i =  0
    for rel_i in relevance[: k]:
        i+= 1
        sum += rel_i/np.log2(max(i, 2))

    return sum

dcg_at_k([4, 4, 3, 0, 0, 1, 3, 3, 3, 0], 6)

def ndcg_at_k(relevance, k):
    """
    DocString
    :return: Nothing
    """
    rel_sorted = sorted(relevance, reverse=True)
    max = dcg_at_k(rel_sorted, k)
    real = dcg_at_k(relevance, k)

    return real/ max


ndcg_at_k([4, 4, 3, 0, 0, 1, 3, 3, 3, 0], 6)

0.7424602308163405

In [25]:
print(recall_at_k(bin_results['q01'], 3, 3))


0.6666666666666666


## Compute Evaluation Metrics for each query

In [28]:
def evaluation_metric(bin_queries, query_index, scaled_results):
    COLUMNS = ['P@M', 'R@M', 'NDCG@M']
    records = []
    for query, bin_vec in bin_queries.items():
        scaled = scaled_results[query]
        M = len(bin_vec)
        pm = precision_at_k(bin_vec, M)
        rm = recall_at_k(bin_vec, M, M)
        ndcg = ndcg_at_k(scaled, M)
        records.append([pm, rm, ndcg])
        
    return pd.DataFrame.from_records(records, index=query_index, columns=COLUMNS)
        
metrics = evaluation_metric(bin_results, queries_index, scaled_results)
metrics.head(10)
        



    

  return real/ max


Unnamed: 0,P@M,R@M,NDCG@M
q01,0.666667,0.666667,0.815465
q02,0.545455,0.545455,0.570012
q03,1.0,1.0,0.87422
q04,0.857143,0.857143,0.933486
q06,1.0,1.0,0.86393
q07,0.25,0.25,1.0
q08,0.75,0.75,0.84214
q09,0.833333,0.833333,0.880115
q10,0.5,0.5,0.642423
q12,0.75,0.75,0.797833


### MAP

In [29]:
def overall_map(bin_results):
    matrix = [vector for key, vector in bin_results.items() ]
    return mean_avg_precision(matrix)

print(f'MAP resultante de todas las queries: {overall_map(bin_results)}')

MAP resultante de todas las queries: 0.7209512864308784
