In [None]:
!pip install rank_bm25
!git clone https://github.com/cr-nlp/project1-2023.git

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2
Cloning into 'project1-2023'...
remote: Enumerating objects: 8, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 8 (delta 0), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (8/8), 2.30 MiB | 5.22 MiB/s, done.


Reranking with Word2Vect and changing bm25 parameters

In [None]:
from rank_bm25 import BM25Okapi
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
from sklearn.metrics import ndcg_score
from collections import defaultdict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')

# Step 1: Load NFCorpus Dataset
def loadNFCorpus():
    dir = "./project1-2023/"
    filename_doc = dir + "dev.docs"
    filename_queries = dir + "dev.all.queries"
    filename_qrel = dir + "dev.2-1-0.qrel"

    dicDoc = dict()
    with open(filename_doc) as file:
        for line in file:
            key, value = line.split('\t')
            dicDoc[key] = value

    dicReq = dict()
    with open(filename_queries) as file:
        for line in file:
            key, value = line.split('\t')
            dicReq[key] = value

    dicReqDoc = defaultdict(dict)
    with open(filename_qrel) as file:
        for line in file:
            req, _, doc, score = line.strip().split('\t')
            dicReqDoc[req][doc] = int(score)

    return dicDoc, dicReq, dicReqDoc

# Function to preprocess text
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalpha() and token not in stop_words]
    return tokens

from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

def train_word2vec_model(corpus):
    model = Word2Vec(corpus, vector_size=100, window=5, min_count=1, workers=4)
    return model

def get_vector(word2vec_model, word):
    return word2vec_model.wv[word] if word in word2vec_model.wv else np.zeros(word2vec_model.vector_size)

def document_vector(word2vec_model, doc_tokens):
    return np.mean([get_vector(word2vec_model, word) for word in doc_tokens], axis=0)


# Function to run BM25 retrieval
def run_bm25(dicDoc, dicReq, dicReqDoc, startDoc, endDoc, bm25_k1=3, bm25_b=0.80):
    docsToKeep = []
    reqsToKeep = []
    dicReqDocToKeep = defaultdict(dict)

    i = startDoc
    for reqId in dicReqDoc:
        if i > (endDoc - startDoc):
            break
        for docId in dicReqDoc[reqId]:
            dicReqDocToKeep[reqId][docId] = dicReqDoc[reqId][docId]
            docsToKeep.append(docId)
            i = i + 1
        reqsToKeep.append(reqId)
    docsToKeep = list(set(docsToKeep))

    # Preprocess documents
    allVocab = set()
    for k in docsToKeep:
        docTokenList = preprocess_text(dicDoc[k])
        allVocab.update(docTokenList)
    allVocabListDoc = list(allVocab)

    # Preprocess queries
    allVocab = set()
    for k in reqsToKeep:
        reqTokenList = preprocess_text(dicReq[k])
        allVocab.update(reqTokenList)
    allVocabListReq = list(allVocab)

    # BM25 indexing
    corpusDocTokenList = [preprocess_text(dicDoc[k]) for k in docsToKeep]
    bm25 = BM25Okapi(corpusDocTokenList, k1=bm25_k1, b=bm25_b)

    ndcgBM25Cumul = 0
    nbReq = 0


    # Run BM25 and then rerank with Word2Vec
    ndcgRerankCumul = 0
    for req in reqsToKeep:
        reqTokenList = preprocess_text(dicReq[req])
        doc_scores = bm25.get_scores(reqTokenList)
        top_doc_indices = np.argsort(doc_scores)[::-1][:5]  # Get top 5 documents

        query_vector = document_vector(word2vec_model, reqTokenList)
        rerank_scores = []
        trueDocs = []
        for idx in top_doc_indices:
            docId = docsToKeep[idx]
            doc_vector = document_vector(word2vec_model, corpusDocTokenList[idx])
            rerank_score = cosine_similarity([query_vector], [doc_vector])[0][0]
            rerank_scores.append(rerank_score)

            # Error handling for missing document IDs
            true_score = dicReqDocToKeep[req].get(docId, 0)
            trueDocs.append(true_score)

        ndcgRerankCumul += ndcg_score([trueDocs], [rerank_scores])

    ndcgRerankCumul /= len(reqsToKeep)
    print("Average NDCG after reranking with Word2Vec:", ndcgRerankCumul)
    return ndcgBM25Cumul

# Run BM25 retrieval using NFCorpus data
dicDoc, dicReq, dicReqDoc = loadNFCorpus()
# Train Word2Vec model
corpus = [preprocess_text(doc) for doc in dicDoc.values()]
word2vec_model = train_word2vec_model(corpus)

nb_docs = 3192  # all docs
run_bm25(dicDoc, dicReq, dicReqDoc, 0, nb_docs)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Average NDCG after reranking with Word2Vec: 0.59248720026296


0

Reranking with Word2Vec and adding lemmatization

In [None]:
from rank_bm25 import BM25Okapi
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
from sklearn.metrics import ndcg_score
from collections import defaultdict

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')

# Step 1: Load NFCorpus Dataset
def loadNFCorpus():
    dir = "./project1-2023/"
    filename_doc = dir + "dev.docs"
    filename_queries = dir + "dev.all.queries"
    filename_qrel = dir + "dev.2-1-0.qrel"

    dicDoc = dict()
    with open(filename_doc) as file:
        for line in file:
            key, value = line.split('\t')
            dicDoc[key] = value

    dicReq = dict()
    with open(filename_queries) as file:
        for line in file:
            key, value = line.split('\t')
            dicReq[key] = value

    dicReqDoc = defaultdict(dict)
    with open(filename_qrel) as file:
        for line in file:
            req, _, doc, score = line.strip().split('\t')
            dicReqDoc[req][doc] = int(score)

    return dicDoc, dicReq, dicReqDoc

# Function to preprocess text
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))

    # Remove punctuation and convert to lowercase
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text.lower())

    # Tokenization using NLTK
    tokens = word_tokenize(text)

    # Remove stopwords and apply lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalpha() and token not in stop_words]

    return tokens

from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

def train_word2vec_model(corpus):
    model = Word2Vec(corpus, vector_size=100, window=5, min_count=1, workers=4)
    return model

def get_vector(word2vec_model, word):
    return word2vec_model.wv[word] if word in word2vec_model.wv else np.zeros(word2vec_model.vector_size)

def document_vector(word2vec_model, doc_tokens):
    return np.mean([get_vector(word2vec_model, word) for word in doc_tokens], axis=0)


# Function to run BM25 retrieval
def run_bm25(dicDoc, dicReq, dicReqDoc, startDoc, endDoc, bm25_k1=3, bm25_b=0.80):
    docsToKeep = []
    reqsToKeep = []
    dicReqDocToKeep = defaultdict(dict)

    i = startDoc
    for reqId in dicReqDoc:
        if i > (endDoc - startDoc):
            break
        for docId in dicReqDoc[reqId]:
            dicReqDocToKeep[reqId][docId] = dicReqDoc[reqId][docId]
            docsToKeep.append(docId)
            i = i + 1
        reqsToKeep.append(reqId)
    docsToKeep = list(set(docsToKeep))

    # Preprocess documents
    allVocab = set()
    for k in docsToKeep:
        docTokenList = preprocess_text(dicDoc[k])
        allVocab.update(docTokenList)
    allVocabListDoc = list(allVocab)

    # Preprocess queries
    allVocab = set()
    for k in reqsToKeep:
        reqTokenList = preprocess_text(dicReq[k])
        allVocab.update(reqTokenList)
    allVocabListReq = list(allVocab)

    # BM25 indexing
    corpusDocTokenList = [preprocess_text(dicDoc[k]) for k in docsToKeep]
    bm25 = BM25Okapi(corpusDocTokenList, k1=bm25_k1, b=bm25_b)

    ndcgBM25Cumul = 0
    nbReq = 0


    # Run BM25 and then rerank with Word2Vec
    ndcgRerankCumul = 0
    for req in reqsToKeep:
        reqTokenList = preprocess_text(dicReq[req])
        doc_scores = bm25.get_scores(reqTokenList)
        top_doc_indices = np.argsort(doc_scores)[::-1][:5]  # Get top 5 documents

        query_vector = document_vector(word2vec_model, reqTokenList)
        rerank_scores = []
        trueDocs = []
        for idx in top_doc_indices:
            docId = docsToKeep[idx]
            doc_vector = document_vector(word2vec_model, corpusDocTokenList[idx])
            rerank_score = cosine_similarity([query_vector], [doc_vector])[0][0]
            rerank_scores.append(rerank_score)

            # Error handling for missing document IDs
            true_score = dicReqDocToKeep[req].get(docId, 0)
            trueDocs.append(true_score)

        ndcgRerankCumul += ndcg_score([trueDocs], [rerank_scores])

    ndcgRerankCumul /= len(reqsToKeep)
    print("Average NDCG after reranking with Word2Vec:", ndcgRerankCumul)
    return ndcgBM25Cumul

# Run BM25 retrieval using NFCorpus data
dicDoc, dicReq, dicReqDoc = loadNFCorpus()
# Train Word2Vec model
corpus = [preprocess_text(doc) for doc in dicDoc.values()]
word2vec_model = train_word2vec_model(corpus)

nb_docs = 3192  # all docs
run_bm25(dicDoc, dicReq, dicReqDoc, 0, nb_docs)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Average NDCG after reranking with Word2Vec: 0.5982509347285934


0

Reranking with BERT

In [None]:
!pip install sentence-transformers


Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m81.9/86.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=sentence_trans

In [None]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import string
import numpy as np
from sklearn.metrics import ndcg_score
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')

# Download pre-trained BERT model
model_name = 'paraphrase-MiniLM-L6-v2'
bert_model = SentenceTransformer(model_name)


def loadNFCorpus():
    dir = "./project1-2023/"
    filename_doc = dir + "dev.docs"
    filename_queries = dir + "dev.all.queries"
    filename_qrel = dir + "dev.2-1-0.qrel"

    dicDoc = dict()
    with open(filename_doc) as file:
        for line in file:
            key, value = line.split('\t')
            dicDoc[key] = value

    dicReq = dict()
    with open(filename_queries) as file:
        for line in file:
            key, value = line.split('\t')
            dicReq[key] = value

    dicReqDoc = defaultdict(dict)
    with open(filename_qrel) as file:
        for line in file:
            req, _, doc, score = line.strip().split('\t')
            dicReqDoc[req][doc] = int(score)

    return dicDoc, dicReq, dicReqDoc

# Instantiate WordNetLemmatizer outside the function
lemmatizer = WordNetLemmatizer()

# Function to preprocess text
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))

    # Remove punctuation and convert to lowercase
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text.lower())

    # Tokenization using NLTK
    tokens = word_tokenize(text)

    # Remove stopwords and apply lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalpha() and token not in stop_words]

    return tokens

# Function to get BERT embeddings
def get_bert_embedding(model, text):
    return model.encode(text)

# Function to run BM25 retrieval with BERT reranking
def run_bm25_with_bert(dicDoc, dicReq, dicReqDoc, startDoc, endDoc, bm25_k1=1.5, bm25_b=0.75):
    docsToKeep = []
    reqsToKeep = []
    dicReqDocToKeep = defaultdict(dict)

    doc_count = startDoc
    for reqId in dicReqDoc:
        if doc_count > (endDoc - startDoc):
            break
        for docId in dicReqDoc[reqId]:
            dicReqDocToKeep[reqId][docId] = dicReqDoc[reqId][docId]
            docsToKeep.append(docId)
            doc_count += 1
        reqsToKeep.append(reqId)
    docsToKeep = list(set(docsToKeep))

    # Preprocess documents
    allVocab = set()
    for k in docsToKeep:
        docTokenList = preprocess_text(dicDoc[k])
        allVocab.update(docTokenList)
    allVocabListDoc = list(allVocab)

    # Preprocess queries
    allVocab = set()
    for k in reqsToKeep:
        reqTokenList = preprocess_text(dicReq[k])
        allVocab.update(reqTokenList)
    allVocabListReq = list(allVocab)

    # BM25 indexing
    corpusDocTokenList = np.array([preprocess_text(dicDoc[k]) for k in docsToKeep])
    bm25 = BM25Okapi(corpusDocTokenList, k1=bm25_k1, b=bm25_b)

    ndcgBM25Cumul = 0
    nbReq = 0

    # Run BM25 and then rerank with BERT
    ndcgRerankCumul = 0
    for req in reqsToKeep:
        reqTokenList = preprocess_text(dicReq[req])

        # Get BERT embeddings for the query
        query_embedding = get_bert_embedding(bert_model, ' '.join(reqTokenList))

        # Get BM25 scores
        doc_scores = bm25.get_scores(reqTokenList)
        top_doc_indices = np.argsort(doc_scores)[::-1][:5]  # Get top 5 documents

        rerank_scores = []
        trueDocs = []
        for idx in top_doc_indices:
            docId = docsToKeep[idx]

            # Get BERT embeddings for the document
            doc_embedding = get_bert_embedding(bert_model, dicDoc[docId])

            # Calculate cosine similarity between query and document embeddings
            rerank_score = cosine_similarity([query_embedding], [doc_embedding])[0][0]
            rerank_scores.append(rerank_score)

            # Error handling for missing document IDs
            true_score = dicReqDocToKeep[req].get(docId, 0)
            trueDocs.append(true_score)

        ndcgRerankCumul += ndcg_score([trueDocs], [rerank_scores])

    ndcgRerankCumul /= len(reqsToKeep)
    print("Average NDCG after reranking with BERT:", ndcgRerankCumul)
    return ndcgBM25Cumul

# Run BM25 retrieval using NFCorpus data
dicDoc, dicReq, dicReqDoc = loadNFCorpus()

# Specify the number of documents to process
nb_docs = 3192  # all docs

# Run BM25 retrieval with BERT reranking
run_bm25_with_bert(dicDoc, dicReq, dicReqDoc, 0, nb_docs)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  corpusDocTokenList = np.array([preprocess_text(dicDoc[k]) for k in docsToKeep])


Average NDCG after reranking with BERT: 0.675554045841826


0

In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive
