# HW 5 - Indexing and Document retrieval

In [None]:
!pip3 install scikit-learn
!pip3 install pandas
!pip3 install nltk

In [54]:
# top k relevant results parameter
k = 15

# number of documents
documentsCount = 1400

# number of queries and results
queryCount = 225

### Results

In [55]:
results = []

for resultId in range(1, queryCount + 1):
    file = open(f"../cranfield/r/{resultId}.txt")
    results.append([int(number) for number in file.read().split("\n") if number != ""])


In [56]:
def recall(queryId, retrievedDocuments):
    return float(len(set(retrievedDocuments).intersection(set(results[queryId - 1]))) / len(results[queryId - 1]))

In [57]:
def precision(queryId, retrievedDocuments):
    return float(len(set(retrievedDocuments).intersection(set(results[queryId - 1]))) / len(retrievedDocuments))

In [58]:
def fMeasure(precision, recall):
    if precision + recall == 0:
        return 0
    return 2 * precision * recall / (precision + recall)

## TF-IDF representation

In [59]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
import numpy as np
import pandas as pd

# prepare corpus
corpus = []

for documentId in range(1, documentsCount + 1):
    file = open(f"../cranfield/d/{documentId}.txt")
    corpus.append(file.read())

# dataframe for storing the results
df = pd.DataFrame([], columns=["query id",
                               "cosine recall",
                               "euclidean recall",
                               "cosine precision",
                               "euclidean precision",
                               "cosine f-measure",
                               "euclidean f-measure",
                               "cosine similarity results",
                               "euclidean similarity results",
                               ]
                  )  # [] are empty data

cosineRecallSum = 0
euclideanRecallSum = 0
cosinePrecisionSum = 0
euclideanPrecisionSum = 0
cosineFmeasureSum = 0
euclideanFmeasureSum = 0

# init vectorizer
tfidfVectorizer = TfidfVectorizer(stop_words="english")

# add query to corpus
for queryId in range(1, queryCount + 1):
    file = open(f"../cranfield/q/{queryId}.txt")
    corpus.append(file.read())

    # prepare matrix
    tfidfMatrix = tfidfVectorizer.fit_transform(corpus)

    query = tfidfMatrix[len(corpus) - 1]
    documents = tfidfMatrix[0: (len(corpus) - 1)]

    # compute cosine similarity and euclidean distance between query and all docs (tf-idf) and get top k relevant
    # [0] because otherwise return array like [[...]]
    cosineSimilarity = np.array(cosine_similarity(query, documents)[0])
    euclideanDistance = np.array(euclidean_distances(query, documents)[0])
    # get indexes of k most relevant documents, reverse them and add 1, because documents are indexed from 1
    topKRelevantCosineSimilarity = cosineSimilarity.argsort()[-k:][::-1] + 1 # reversed - top k most similar
    topKRelevantEuclideanDistance = euclideanDistance.argsort()[:k] + 1 # not reversed - top k smallest distances

    cosineRecall = recall(queryId, topKRelevantCosineSimilarity)
    euclideanRecall = recall(queryId, topKRelevantEuclideanDistance)
    cosinePrecision = precision(queryId, topKRelevantCosineSimilarity)
    euclideanPrecision = precision(queryId, topKRelevantEuclideanDistance)
    cosineFmeasure = fMeasure(cosinePrecision, cosineRecall)
    euclideanFmeasure = fMeasure(euclideanPrecision, euclideanRecall)
    cosineRecallSum += cosineRecall
    euclideanRecallSum += euclideanRecall
    cosinePrecisionSum += cosinePrecision
    euclideanPrecisionSum += euclideanPrecision
    cosineFmeasureSum += cosineFmeasure
    euclideanFmeasureSum += euclideanFmeasure

    df.loc[len(df.index)] = [queryId,
                             cosineRecall,
                             euclideanRecall,
                             cosinePrecisionSum,
                             euclideanPrecision,
                             cosineFmeasure,
                             euclideanFmeasure,
                             topKRelevantCosineSimilarity,
                             topKRelevantEuclideanDistance
                             ]
    # save result to dataframe

    corpus.pop()  # remove query

print(f"average recall cosine similarity: {cosineRecallSum / len(results)}")
print(
    f"average recall euclidean distance: {euclideanRecallSum / len(results)}")
print(
    f"average precision cosine similarity: {cosinePrecisionSum / len(results)}")
print(
    f"average precision euclidean distance: {euclideanPrecisionSum / len(results)}")
print(
    f"average f-measure cosine similarity: {cosineFmeasureSum / len(results)}")
print(
    f"average f-measure euclidean distance: {euclideanFmeasureSum / len(results)}")
display(df)

df.to_csv("../results/tfidf.csv", index=False)

average recall cosine similarity: 0.4118004319281962
average recall euclidean distance: 0.3952715661908679
average precision cosine similarity: 0.194962962962963
average precision euclidean distance: 0.18666666666666676
average f-measure cosine similarity: 0.24749918024035936
average f-measure euclidean distance: 0.2370569572850955


Unnamed: 0,query id,cosine recall,euclidean recall,cosine precision,euclidean precision,cosine f-measure,euclidean f-measure,cosine similarity results,euclidean similarity results
0,1,0.172414,0.172414,0.333333,0.333333,0.227273,0.227273,"[13, 184, 51, 12, 486, 359, 429, 327, 792, 746...","[995, 471, 13, 184, 51, 12, 486, 359, 429, 327..."
1,2,0.160000,0.160000,0.600000,0.266667,0.200000,0.200000,"[12, 51, 184, 746, 884, 875, 792, 578, 726, 10...","[995, 471, 12, 51, 184, 746, 884, 875, 792, 57..."
2,3,0.888889,0.888889,1.133333,0.533333,0.666667,0.666667,"[485, 5, 144, 181, 399, 542, 91, 707, 90, 584,...","[471, 995, 485, 5, 144, 181, 399, 542, 91, 707..."
3,4,1.000000,1.000000,1.333333,0.200000,0.333333,0.333333,"[166, 1275, 185, 236, 1189, 575, 317, 488, 137...","[995, 471, 166, 1275, 185, 236, 1189, 575, 317..."
4,5,0.400000,0.400000,1.466667,0.133333,0.200000,0.200000,"[103, 1374, 401, 575, 26, 360, 410, 568, 552, ...","[471, 995, 103, 1374, 401, 575, 26, 360, 410, ..."
...,...,...,...,...,...,...,...,...,...
220,221,0.105263,0.105263,42.933333,0.133333,0.117647,0.117647,"[1386, 388, 3, 1366, 55, 851, 1227, 395, 836, ...","[471, 995, 1386, 388, 3, 1366, 55, 851, 1227, ..."
221,222,0.300000,0.300000,43.133333,0.200000,0.240000,0.240000,"[1130, 400, 1070, 1399, 888, 1176, 393, 1048, ...","[995, 471, 1130, 400, 1070, 1399, 888, 1176, 3..."
222,223,0.400000,0.400000,43.266667,0.133333,0.200000,0.200000,"[400, 484, 393, 388, 1399, 943, 1244, 1387, 10...","[471, 995, 400, 484, 393, 388, 1399, 943, 1244..."
223,224,0.555556,0.444444,43.600000,0.266667,0.416667,0.333333,"[1312, 537, 317, 1139, 656, 1157, 1316, 1313, ...","[471, 995, 1312, 537, 317, 1139, 656, 1157, 13..."


## Pure term Frequency

In [60]:
import scipy
from sklearn.feature_extraction.text import CountVectorizer

df.drop(df.index, inplace=True)

cosineRecallSum = 0
euclideanRecallSum = 0
cosinePrecisionSum = 0
euclideanPrecisionSum = 0
cosineFmeasureSum = 0
euclideanFmeasureSum = 0

countVectorizer = CountVectorizer(stop_words="english")

def normalize(M):
    if (M > 0):
        return 1 / M
    return 0

vectorizedNormalization = np.vectorize(normalize)

# add query to corpus
for queryId in range(1, queryCount + 1):
    file = open(f"../cranfield/q/{queryId}.txt")
    corpus.append(file.read())

    # prepare matrix
    frequencyMatrix = countVectorizer.fit_transform(corpus)
    
    # row normalization
    sums = frequencyMatrix.sum(axis=1)
    norms = vectorizedNormalization(sums)
    normalizedFrequencyMatrix = frequencyMatrix.multiply(norms)

    # convert back to csr_matrix (compressed sparse row matrix)
    normalizedFrequencyMatrix = scipy.sparse.csr_matrix(normalizedFrequencyMatrix)

    query = frequencyMatrix[len(corpus) - 1]
    documents = frequencyMatrix[0: (len(corpus) - 1)]

    # compute cosine similarity and euclidean distance between query and all docs (tf-idf) and get top k relevant
    # [0] because otherwise return array like [[...]]
    cosineSimilarity = np.array(cosine_similarity(query, documents)[0])
    euclideanDistance = np.array(euclidean_distances(query, documents)[0])
    # get indexes of k most relevant documents and add 1, because documents are indexed from 1
    topKRelevantCosineSimilarity = cosineSimilarity.argsort()[-k:][::-1] + 1 # reversed - top k most similar
    topKRelevantEuclideanDistance = euclideanDistance.argsort()[:k] + 1 # not reversed - top k smallest distances

    cosineRecall = recall(queryId, topKRelevantCosineSimilarity)
    euclideanRecall = recall(queryId, topKRelevantEuclideanDistance)
    cosinePrecision = precision(queryId, topKRelevantCosineSimilarity)
    euclideanPrecision = precision(queryId, topKRelevantEuclideanDistance)
    cosineFmeasure = fMeasure(cosinePrecision, cosineRecall)
    euclideanFmeasure = fMeasure(euclideanPrecision, euclideanRecall)
    cosineRecallSum += cosineRecall
    euclideanRecallSum += euclideanRecall
    cosinePrecisionSum += cosinePrecision
    euclideanPrecisionSum += euclideanPrecision
    cosineFmeasureSum += cosineFmeasure
    euclideanFmeasureSum += euclideanFmeasure
    
    df.loc[len(df.index)] = [queryId,
                             cosineRecall,
                             euclideanRecall,
                             cosinePrecisionSum,
                             euclideanPrecision,
                             cosineFmeasure,
                             euclideanFmeasure,
                             topKRelevantCosineSimilarity,
                             topKRelevantEuclideanDistance
                             ]
    # save result to dataframe

    corpus.pop()  # remove query

print(f"average recall cosine similarity: {cosineRecallSum / len(results)}")
print(f"average recall euclidean distance: {euclideanRecallSum / len(results)}")
print(f"average precision cosine similarity: {cosinePrecisionSum / len(results)}")
print(f"average precision euclidean distance: {euclideanPrecisionSum / len(results)}")
print(f"average f-measure cosine similarity: {cosineFmeasureSum / len(results)}")
print(f"average f-measure euclidean distance: {euclideanFmeasureSum / len(results)}")
display(df)

df.to_csv("../results/tf.csv", index=False)

average recall cosine similarity: 0.3522248906441924
average recall euclidean distance: 0.02450924639430387
average precision cosine similarity: 0.16888888888888876
average precision euclidean distance: 0.011851851851851856
average f-measure cosine similarity: 0.2130027720756295
average f-measure euclidean distance: 0.015025596706319786


Unnamed: 0,query id,cosine recall,euclidean recall,cosine precision,euclidean precision,cosine f-measure,euclidean f-measure,cosine similarity results,euclidean similarity results
0,1,0.172414,0.034483,0.333333,0.066667,0.227273,0.045455,"[429, 12, 13, 184, 51, 792, 114, 1063, 578, 43...","[995, 471, 429, 670, 1045, 507, 320, 382, 3, 4..."
1,2,0.160000,0.000000,0.600000,0.000000,0.200000,0.000000,"[12, 578, 51, 429, 792, 588, 746, 114, 1063, 1...","[471, 995, 429, 670, 1045, 507, 320, 875, 854,..."
2,3,0.555556,0.222222,0.933333,0.133333,0.416667,0.166667,"[181, 5, 144, 485, 399, 542, 707, 159, 350, 55...","[995, 471, 670, 507, 1045, 320, 382, 854, 485,..."
3,4,1.000000,0.000000,1.133333,0.000000,0.333333,0.000000,"[317, 236, 166, 185, 575, 1286, 378, 1077, 975...","[995, 471, 507, 1045, 670, 320, 382, 3, 405, 8..."
4,5,0.200000,0.000000,1.200000,0.000000,0.100000,0.000000,"[103, 360, 26, 573, 1158, 568, 327, 1002, 1374...","[995, 471, 670, 507, 1045, 320, 382, 405, 3, 8..."
...,...,...,...,...,...,...,...,...,...
220,221,0.052632,0.000000,37.066667,0.000000,0.058824,0.000000,"[388, 1386, 404, 1283, 310, 3, 491, 498, 984, ...","[471, 995, 3, 320, 507, 382, 1045, 607, 670, 5..."
221,222,0.400000,0.000000,37.333333,0.000000,0.320000,0.000000,"[400, 1130, 1399, 1400, 888, 1048, 1070, 419, ...","[995, 471, 507, 1045, 3, 670, 320, 382, 854, 4..."
222,223,0.400000,0.000000,37.466667,0.000000,0.200000,0.000000,"[400, 484, 1399, 1400, 393, 1387, 1244, 388, 9...","[471, 995, 3, 854, 507, 1045, 320, 670, 382, 1..."
223,224,0.444444,0.000000,37.733333,0.000000,0.333333,0.000000,"[1312, 1157, 317, 1313, 329, 656, 1316, 1257, ...","[995, 471, 670, 1045, 507, 320, 405, 533, 382,..."


## Binary representation

In [61]:
import scipy
from sklearn.feature_extraction.text import CountVectorizer

df.drop(df.index, inplace=True)

cosineRecallSum = 0
euclideanRecallSum = 0
cosinePrecisionSum = 0
euclideanPrecisionSum = 0
cosineFmeasureSum = 0
euclideanFmeasureSum = 0

countVectorizer = CountVectorizer(stop_words="english", binary=True)

def normalize(M):
    if (M > 0):
        return 1 / M
    return 0

vectorizedNormalization = np.vectorize(normalize)

# add query to corpus
for queryId in range(1, queryCount + 1):
    file = open(f"../cranfield/q/{queryId}.txt")
    corpus.append(file.read())

    # prepare matrix
    frequencyMatrix = countVectorizer.fit_transform(corpus)
    
    # row normalization
    sums = frequencyMatrix.sum(axis=1)
    norms = vectorizedNormalization(sums)
    normalizedFrequencyMatrix = frequencyMatrix.multiply(norms)

    # convert back to csr_matrix (compressed sparse row matrix)
    normalizedFrequencyMatrix = scipy.sparse.csr_matrix(normalizedFrequencyMatrix)

    query = frequencyMatrix[len(corpus) - 1]
    documents = frequencyMatrix[0: (len(corpus) - 1)]

    # compute cosine similarity and euclidean distance between query and all docs (tf-idf) and get top k relevant
    # [0] because otherwise return array like [[...]]
    cosineSimilarity = np.array(cosine_similarity(query, documents)[0])
    euclideanDistance = np.array(euclidean_distances(query, documents)[0])
    # get indexes of k most relevant documents and add 1, because documents are indexed from 1
    topKRelevantCosineSimilarity = cosineSimilarity.argsort()[-k:][::-1] + 1 # reversed - top k most similar
    topKRelevantEuclideanDistance = euclideanDistance.argsort()[:k] + 1 # not reversed - top k smallest distances

    cosineRecall = recall(queryId, topKRelevantCosineSimilarity)
    euclideanRecall = recall(queryId, topKRelevantEuclideanDistance)
    cosinePrecision = precision(queryId, topKRelevantCosineSimilarity)
    euclideanPrecision = precision(queryId, topKRelevantEuclideanDistance)
    cosineFmeasure = fMeasure(cosinePrecision, cosineRecall)
    euclideanFmeasure = fMeasure(euclideanPrecision, euclideanRecall)
    cosineRecallSum += cosineRecall
    euclideanRecallSum += euclideanRecall
    cosinePrecisionSum += cosinePrecision
    euclideanPrecisionSum += euclideanPrecision
    cosineFmeasureSum += cosineFmeasure
    euclideanFmeasureSum += euclideanFmeasure
    
    df.loc[len(df.index)] = [queryId,
                             cosineRecall,
                             euclideanRecall,
                             cosinePrecisionSum,
                             euclideanPrecision,
                             cosineFmeasure,
                             euclideanFmeasure,
                             topKRelevantCosineSimilarity,
                             topKRelevantEuclideanDistance
                             ]
    # save result to dataframe

    corpus.pop()  # remove query

print(f"average recall cosine similarity: {cosineRecallSum / len(results)}")
print(f"average recall euclidean distance: {euclideanRecallSum / len(results)}")
print(f"average precision cosine similarity: {cosinePrecisionSum / len(results)}")
print(f"average precision euclidean distance: {euclideanPrecisionSum / len(results)}")
print(f"average f-measure cosine similarity: {cosineFmeasureSum / len(results)}")
print(f"average f-measure euclidean distance: {euclideanFmeasureSum / len(results)}")
display(df)

df.to_csv("../results/binary.csv", index=False)

average recall cosine similarity: 0.3489526181549953
average recall euclidean distance: 0.036008796227187036
average precision cosine similarity: 0.16177777777777771
average precision euclidean distance: 0.01659259259259261
average f-measure cosine similarity: 0.20678550947192909
average f-measure euclidean distance: 0.021485892465035034


Unnamed: 0,query id,cosine recall,euclidean recall,cosine precision,euclidean precision,cosine f-measure,euclidean f-measure,cosine similarity results,euclidean similarity results
0,1,0.172414,0.034483,0.333333,0.066667,0.227273,0.045455,"[12, 878, 429, 13, 1111, 345, 430, 184, 1063, ...","[995, 471, 3, 507, 1045, 320, 429, 405, 31, 38..."
1,2,0.080000,0.000000,0.466667,0.000000,0.100000,0.000000,"[12, 878, 578, 429, 700, 1087, 364, 321, 726, ...","[471, 995, 1045, 3, 507, 320, 429, 405, 31, 67..."
2,3,0.555556,0.333333,0.800000,0.200000,0.416667,0.250000,"[399, 181, 5, 485, 542, 159, 476, 144, 584, 35...","[995, 471, 485, 507, 3, 1045, 320, 405, 399, 2..."
3,4,0.666667,0.000000,0.933333,0.000000,0.222222,0.000000,"[166, 1011, 378, 1085, 1255, 517, 488, 1189, 5...","[995, 471, 507, 1045, 3, 320, 405, 31, 1152, 3..."
4,5,0.600000,0.000000,1.133333,0.000000,0.300000,0.000000,"[488, 355, 1296, 1272, 68, 1032, 103, 401, 625...","[995, 471, 1045, 3, 507, 320, 405, 31, 1152, 6..."
...,...,...,...,...,...,...,...,...,...
220,221,0.052632,0.000000,35.400000,0.000000,0.058824,0.000000,"[3, 1283, 388, 326, 637, 528, 1233, 587, 540, ...","[995, 471, 3, 507, 320, 1045, 405, 382, 31, 48..."
221,222,0.500000,0.000000,35.733333,0.000000,0.400000,0.000000,"[1399, 1048, 1130, 400, 1050, 31, 1400, 1396, ...","[471, 995, 3, 31, 507, 1045, 320, 405, 1358, 1..."
222,223,0.600000,0.000000,35.933333,0.000000,0.300000,0.000000,"[400, 1358, 1399, 1357, 1048, 1050, 31, 1400, ...","[471, 995, 3, 507, 31, 1045, 320, 1358, 405, 1..."
223,224,0.444444,0.111111,36.200000,0.066667,0.333333,0.083333,"[1318, 1395, 1312, 175, 1286, 1299, 401, 323, ...","[995, 471, 1045, 3, 507, 320, 405, 31, 1317, 6..."


## Existing model from Hugging face

In [62]:
!pip3 install -q sentence_transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [67]:

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-mpnet-base-v2")

Downloading (…)a8e1d/.gitattributes: 100%|██████████| 1.18k/1.18k [00:00<00:00, 10.9MB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 1.84MB/s]
Downloading (…)b20bca8e1d/README.md: 100%|██████████| 10.6k/10.6k [00:00<00:00, 67.4MB/s]
Downloading (…)0bca8e1d/config.json: 100%|██████████| 571/571 [00:00<00:00, 6.20MB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 1.13MB/s]
Downloading (…)e1d/data_config.json: 100%|██████████| 39.3k/39.3k [00:00<00:00, 362kB/s]
Downloading pytorch_model.bin: 100%|██████████| 438M/438M [03:01<00:00, 2.41MB/s]
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 460kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 2.69MB/s]
Downloading (…)a8e1d/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 1.44MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 363/363 [00:00<00:00, 3.74MB/s]
Downloading (…)8e1d/train_script.py: 100%|██

In [68]:
corpusEmbeddings = model.encode(corpus)

print(corpusEmbeddings)
corpusEmbeddings = [embedding.reshape(1, -1) for embedding in corpusEmbeddings]

[[-0.05654993 -0.04450176  0.0155262  ...  0.01719159  0.06158198
   0.01929863]
 [-0.0061557  -0.0629016   0.00471515 ... -0.00511769  0.01629838
   0.02362414]
 [-0.02843554 -0.04713926 -0.00580276 ... -0.0245406   0.01785917
   0.01753359]
 ...
 [ 0.00858319 -0.05427675  0.01913198 ...  0.06678689  0.01607494
  -0.06318463]
 [ 0.00231377 -0.05604492  0.01889677 ...  0.07238677 -0.00293132
  -0.05263372]
 [-0.00948589 -0.05798604  0.02810153 ...  0.07426447 -0.028294
  -0.04074294]]


In [71]:
df.drop(df.index, inplace=True)

cosineRecallSum = 0
euclideanRecallSum = 0
cosinePrecisionSum = 0
euclideanPrecisionSum = 0
cosineFmeasureSum = 0
euclideanFmeasureSum = 0

for queryId in range(1, queryCount + 1):
    file = open(f"../cranfield/q/{queryId}.txt")
    query = file.read()
    queryEmbedding = model.encode(query).reshape(1, -1)
    
    cosineSimilarity = np.array([cosine_similarity(queryEmbedding, embedding)[0] for embedding in corpusEmbeddings]).flatten()
    euclideanDistance = np.array([euclidean_distances(queryEmbedding, embedding) for embedding in corpusEmbeddings]).flatten()
    
    # get indexes of k most relevant documents and add 1, because documents are indexed from 1
    topKRelevantCosineSimilarity = cosineSimilarity.argsort()[-k:][::-1] + 1 # reversed - top k most similar
    topKRelevantEuclideanDistance = euclideanDistance.argsort()[:k] + 1 # not reversed - top k smallest distances

    cosineRecall = recall(queryId, topKRelevantCosineSimilarity)
    euclideanRecall = recall(queryId, topKRelevantEuclideanDistance)
    cosinePrecision = precision(queryId, topKRelevantCosineSimilarity)
    euclideanPrecision = precision(queryId, topKRelevantEuclideanDistance)
    cosineFmeasure = fMeasure(cosinePrecision, cosineRecall)
    euclideanFmeasure = fMeasure(euclideanPrecision, euclideanRecall)
    cosineRecallSum += cosineRecall
    euclideanRecallSum += euclideanRecall
    cosinePrecisionSum += cosinePrecision
    euclideanPrecisionSum += euclideanPrecision
    cosineFmeasureSum += cosineFmeasure
    euclideanFmeasureSum += euclideanFmeasure
    
    df.loc[len(df.index)] = [queryId,
                             cosineRecall,
                             euclideanRecall,
                             cosinePrecisionSum,
                             euclideanPrecision,
                             cosineFmeasure,
                             euclideanFmeasure,
                             topKRelevantCosineSimilarity,
                             topKRelevantEuclideanDistance
                             ]

print(f"average recall cosine similarity: {cosineRecallSum / len(results)}")
print(f"average recall euclidean distance: {euclideanRecallSum / len(results)}")
print(f"average precision cosine similarity: {cosinePrecisionSum / len(results)}")
print(f"average precision euclidean distance: {euclideanPrecisionSum / len(results)}")
print(f"average f-measure cosine similarity: {cosineFmeasureSum / len(results)}")
print(f"average f-measure euclidean distance: {euclideanFmeasureSum / len(results)}")
display(df)

df.to_csv("../results/huggingface.csv", index=False)

average recall cosine similarity: 0.5134356955669295
average recall euclidean distance: 0.5134356955669295
average precision cosine similarity: 0.24237037037037054
average precision euclidean distance: 0.24237037037037054
average f-measure cosine similarity: 0.30709821408436894
average f-measure euclidean distance: 0.30709821408436894


Unnamed: 0,query id,cosine recall,euclidean recall,cosine precision,euclidean precision,cosine f-measure,euclidean f-measure,cosine similarity results,euclidean similarity results
0,1,0.275862,0.275862,0.533333,0.533333,0.363636,0.363636,"[486, 184, 51, 13, 12, 746, 497, 1328, 860, 29...","[486, 184, 51, 13, 12, 746, 497, 1328, 860, 29..."
1,2,0.200000,0.200000,0.866667,0.333333,0.250000,0.250000,"[12, 746, 51, 792, 875, 253, 925, 658, 747, 14...","[12, 746, 51, 792, 875, 253, 925, 658, 747, 14..."
2,3,0.777778,0.777778,1.333333,0.466667,0.583333,0.583333,"[399, 485, 5, 144, 181, 91, 102, 978, 542, 982...","[399, 485, 5, 144, 181, 91, 102, 978, 542, 982..."
3,4,1.000000,1.000000,1.533333,0.200000,0.333333,0.333333,"[236, 166, 1295, 167, 488, 103, 1372, 1374, 69...","[236, 166, 1295, 167, 488, 103, 1372, 1374, 69..."
4,5,0.600000,0.600000,1.733333,0.200000,0.300000,0.300000,"[488, 401, 552, 574, 1061, 1374, 1391, 1204, 3...","[488, 401, 552, 574, 1061, 1374, 1391, 1204, 3..."
...,...,...,...,...,...,...,...,...,...
220,221,0.210526,0.210526,53.600000,0.266667,0.235294,0.235294,"[292, 1365, 376, 562, 611, 54, 943, 1182, 94, ...","[292, 1365, 376, 562, 611, 54, 943, 1182, 94, ..."
221,222,0.700000,0.700000,54.066667,0.466667,0.560000,0.560000,"[1399, 1400, 1396, 419, 1120, 412, 1357, 863, ...","[1399, 1400, 1396, 419, 1120, 412, 1357, 863, ..."
222,223,0.600000,0.600000,54.266667,0.200000,0.300000,0.300000,"[1399, 1398, 400, 1400, 412, 1396, 1120, 1357,...","[1399, 1398, 400, 1400, 412, 1396, 1120, 1357,..."
223,224,0.111111,0.111111,54.333333,0.066667,0.083333,0.083333,"[656, 575, 171, 541, 556, 329, 317, 1299, 318,...","[656, 575, 171, 541, 556, 329, 317, 1299, 318,..."
