In [16]:
import gensim
from elasticsearch import Elasticsearch
from gensim.models import FastText
from sklearn.datasets import fetch_20newsgroups

# Load the dataset
newsgroups = fetch_20newsgroups(subset='all')

# Preprocess the documents
preprocessed_docs = []
for doc in newsgroups.data:
    # Tokenize the document
    tokens = gensim.utils.simple_preprocess(doc.lower())
    # Remove stop words and stem the tokens
    stemmed_tokens = [gensim.parsing.porter.PorterStemmer().stem(token) for token in tokens if token not in gensim.parsing.preprocessing.STOPWORDS]
    # Join the stemmed tokens back into a string
    preprocessed_doc = ' '.join(stemmed_tokens)
    preprocessed_docs.append(preprocessed_doc)

# Train the FastText model
model = FastText(preprocessed_docs, vector_size=300, window=5, min_count=5, workers=4)

es = Elasticsearch(['http://localhost:9200'])

# Delete the index if it already exists
index_name = 'my_index'
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)

# Iterate over preprocessed documents and generate vectors
for i, doc in enumerate(preprocessed_docs):
    # Split the preprocessed document into tokens
    tokens = doc.split()

    # Try to generate the vector for the document
    try:
        vector_sum = 0
        count = 0
        for token in tokens:
            if token in model.wv:
                vector_sum += model.wv[token]
                count += 1
        vector = vector_sum / count
    except ZeroDivisionError:
        vector = None

    # Store the document and its vector in the Elasticsearch index
    if vector is not None:
        es.index(index=index_name, id=i, body={'text': doc, 'vector': vector.tolist()})


In [28]:
# print first 10 documents
for i in range(10):
    print(es.get(index=index_name, id=i))

{'_index': 'my_index', '_id': '0', '_version': 1, '_seq_no': 0, '_primary_term': 1, 'found': True, '_source': {'text': 'mamatha devineni ratnam mr andrew cmu edu subject pen fan reaction organ post offic carnegi mellon pittsburgh pa line nntp post host po andrew cmu edu sure basher pen fan pretti confus lack kind post recent pen massacr devil actual bit puzzl bit reliev go end non pittsburgh relief bit prais pen man kill devil wors thought jagr show better regular season stat lot fo fun watch playoff bowman let jagr lot fun coupl game pen go beat pulp jersei disappoint island lose final regular season game pen rule', 'vector': [-5.303909347276203e-05, 7.180315151344985e-05, 1.3399499948718585e-05, 1.995188995351782e-06, 0.00016512000001966953, -0.0001458563783671707, 4.174573768978007e-05, 5.5028234783094376e-05, -0.00010918670886894688, 9.83250793069601e-05, -1.875324232969433e-05, -5.182790118851699e-05, -4.935436663799919e-05, -6.43477906123735e-05, 2.2507061657961458e-05, 7.6989519

In [None]:
import numpy as np

# Load the pre-trained Word2Vec model
# word2vec_model = Word2Vec.load("word2vec_model.bin")
word2vec_model = model

# Example user query
user_query = "cricket"

# Tokenize the query
query_tokens = user_query.lower().split()

# Initialize an empty vector for the query
query_vector = np.zeros(word2vec_model.vector_size)

# Iterate through query tokens and create a vector representation
for token in query_tokens:
    if token in word2vec_model.wv:
        query_vector += word2vec_model.wv[token]

# Normalize the query vector
query_vector /= len(query_tokens)

# Now, 'query_vector' contains the vector representation of the user's query
print(query_vector)

In [48]:
from elasticsearch import Elasticsearch
import math

# Initialize the Elasticsearch client
es = Elasticsearch(['http://localhost:9200'])
index_name = 'my_index'

# Sample benchmark queries
queries = ["cricket", "tennis", "football"]

# Set the value of 'K' for top K documents to retrieve
K = 10

# Dictionary for retrieving the document IDs from the index
retrieval_results = {
    0: ["doc1", "doc2", "doc3", "doc4", "doc5", "doc6", "doc7", "doc8", "doc9", "doc10"],
    1: ["doc2", "doc5", "doc11", "doc12", "doc13", "doc14", "doc15", "doc16", "doc17", "doc18"],
    2: ["doc1", "doc6", "doc12", "doc19", "doc20", "doc21", "doc22", "doc23", "doc24", "doc25"],
}

# Initialize dictionaries to store evaluation results
relevance_judgments = {
    0: ["doc1", "doc2", "doc3", "doc4", "doc5"],
    1: ["doc11", "doc12", "doc13"],
    2: ["doc19", "doc20", "doc21", "doc22"],
}

# Initialize dictionaries to store evaluation results
precision_at_k = {}
recall_at_k = {}
average_precision = {}
ndcg = {}

# Iterate through each query
for query_id, query in enumerate(queries):
    # Get the relevance judgments and retrieved documents for the current query
    relevant_docs = set(relevance_judgments[query_id])
    retrieved_docs = retrieval_results[query_id]

    # Compute Precision@K
    K = 10  # You can adjust this value
    retrieved_at_k = retrieved_docs[:K]
    true_positives = len(set(retrieved_at_k).intersection(relevant_docs))
    precision_at_k[query] = true_positives / K

    # Compute Recall@K
    recall_at_k[query] = true_positives / len(relevant_docs)

    # Compute Average Precision (AP)
    average_precision[query] = 0.0
    num_relevant = len(relevant_docs)
    num_retrieved = len(retrieved_docs)
    if num_relevant > 0:
        for i, doc_id in enumerate(retrieved_docs):
            if doc_id in relevant_docs:
                average_precision[query] += (i + 1) / (i + 1 + num_relevant)

    if num_relevant > 0:
        average_precision[query] /= num_relevant

    # Compute Normalized Discounted Cumulative Gain (NDCG)
    dcg = 0.0
    idcg = 0.0
    for i, doc_id in enumerate(retrieved_docs):
        if doc_id in relevant_docs:
            dcg += 1.0 / (math.log2(i + 2))
        idcg += 1.0 / (math.log2(i + 2))

    if idcg > 0:
        ndcg[query] = dcg / idcg
    else:
        ndcg[query] = 0.0

# Calculate mean values for each metric
mean_precision_at_k = sum(precision_at_k.values()) / len(precision_at_k)
mean_recall_at_k = sum(recall_at_k.values()) / len(recall_at_k)
mean_map = sum(average_precision.values()) / len(average_precision)
mean_ndcg = sum(ndcg.values()) / len(ndcg)

# Print or store the evaluation results
print(f"Mean Precision@{K}: {mean_precision_at_k}")
print(f"Mean Recall@{K}: {mean_recall_at_k}")
print(f"Mean Average Precision: {mean_map}")
print(f"Mean NDCG: {mean_ndcg}")


Mean Precision@10: 0.39999999999999997
Mean Recall@10: 1.0
Mean Average Precision: 0.49760702260702255
Mean NDCG: 0.4235342045270885
