In [None]:
!pip install datasets rank_bm25 scikit-learn tqdm

In [None]:
import os
import numpy as np
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import lib
# Reload lib so that edits to lib.py take effect without restarting the kernel
import importlib
importlib.reload(lib)

In [None]:
import os
from datetime import datetime, timezone

import mwxml
import psycopg2
from psycopg2.extras import execute_batch
from tqdm import tqdm

# Connection string for PostgreSQL. Set the PG_CONN_STRING environment variable
# to point somewhere else instead of editing credentials in the notebook; the
# fallback is the local development instance.
conn_string = os.getenv(
    "PG_CONN_STRING",
    "postgresql://postgres:postgres@localhost:6666",
)

# Database connection URL
db_url = conn_string
# Module-level connection opened once when this cell runs
# (search_bm25 uses it as the default value of its `conn` parameter).
conn = psycopg2.connect(db_url)
def SQL(query, conn_string = conn_string, conn = None, queryargs=()):
    """Run a SQL statement and return the rows it produced, if any.

    Args:
        query: SQL text handed to ``cursor.execute``.
        conn_string: connection string used to open a temporary psycopg2
            connection when ``conn`` is not supplied.
        conn: optional, already-open connection. When given, the statement is
            executed on it and this function neither commits, rolls back nor
            closes it.
        queryargs: parameters forwarded to ``cursor.execute``.

    Returns:
        The result of ``cursor.fetchall()``, or ``None`` when the statement
        produced no result set (e.g. DDL).
    """
    def execute(cursor):
        cursor.execute(query, queryargs)
        # `description` is None when the statement returned no result set;
        # checking it avoids matching on the text of the driver's exception.
        if cursor.description is None:
            return None
        return cursor.fetchall()

    if conn is not None:
        with conn.cursor() as cursor:
            return execute(cursor)

    own_conn = psycopg2.connect(conn_string)
    try:
        # psycopg2's connection context manager commits on success and rolls
        # back on error, but does NOT close the connection ...
        with own_conn:
            with own_conn.cursor() as cursor:
                return execute(cursor)
    finally:
        # ... so close it explicitly to avoid leaking one connection per call.
        own_conn.close()


In [None]:
# Step 1: Download the dataset
# Each config of the dataset is fetched separately and indexed by the split
# that carries the same name as the config.
DATASET = "BeIR/quora"
corpus = load_dataset(path=DATASET, name="corpus")["corpus"]
queries = load_dataset(path=DATASET, name="queries")["queries"]

In [None]:
import os
import json
from typing import Iterable, List
import psycopg


# Name of the dataset directory under data/; the DATASET environment variable
# takes precedence over the "quora" default.
DATASET = os.environ.get("DATASET", "quora")

def read_file(file_name: str) -> Iterable[tuple]:
    """Lazily yield ``(_id, text)`` pairs from a JSON Lines file.

    Args:
        file_name: path to a file holding one JSON object per line; every
            object must have the keys ``"_id"`` and ``"text"``.

    Yields:
        A 2-tuple ``(row["_id"], row["text"])`` for each non-blank line.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks ``"_id"`` or ``"text"``.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the locale.
    with open(file_name, "r", encoding="utf-8") as file:
        for line in file:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            row = json.loads(line)
            yield row["_id"], row["text"]

def jsonl_to_csv(jsonl_file, csv_file, columns=None):
    """Convert a JSON Lines file into a CSV file with a header row.

    Args:
        jsonl_file: path of the input file, one JSON object per line.
        csv_file: path of the CSV file to create (overwritten if present).
        columns: keys to export, in output order. When ``None`` the keys of
            the first record are used.

    Raises:
        KeyError: if a record lacks one of the selected columns.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    import json
    import csv
    # newline='' is what the csv module requires for files it writes to;
    # without it row terminators get translated a second time on some platforms.
    with open(jsonl_file, 'r', encoding='utf-8') as src, \
            open(csv_file, 'w', newline='', encoding='utf-8') as dst:
        csv_writer = csv.writer(dst)
        header_written = False

        for line in src:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            data = json.loads(line)
            if columns is None:
                # The column set can only be inferred once a record was read.
                columns = list(data.keys())
            if not header_written:
                csv_writer.writerow(columns)
                header_written = True
            csv_writer.writerow([data[col] for col in columns])

        # Empty input with explicit columns still produces a header-only CSV.
        if not header_written and columns is not None:
            csv_writer.writerow(columns)


def main():
    """Load the corpus of DATASET into the ``quora_corpus_tmp`` Postgres table.

    Steps:
      1. Convert ``data/{DATASET}/corpus.jsonl`` to a CSV with the columns
         ``_id`` and ``text``.
      2. Drop and recreate the unlogged table ``quora_corpus_tmp``
         (``doc_id INTEGER, body TEXT``).
      3. Stream the CSV into the table with ``COPY ... FROM STDIN``.

    The table name is fixed; only the data directory follows DATASET.
    """
    corpus_jsonl = f"data/{DATASET}/corpus.jsonl"  # DATASET collection
    # Keep the CSV next to its source instead of a hard-coded data/quora path,
    # so a non-default DATASET does not write into another dataset's directory.
    corpus_csv = f"data/{DATASET}/corpus.csv"
    jsonl_to_csv(corpus_jsonl, corpus_csv, columns = ["_id", "text"])

    # Recreate the staging table from scratch so that reruns start clean.
    SQL("""
        DROP TABLE IF EXISTS quora_corpus_tmp;
        CREATE UNLOGGED TABLE quora_corpus_tmp (
            doc_id INTEGER,
            body TEXT
        );     
        """)

    # Feed COPY in large blocks rather than one write per CSV line.
    copy_chunk_chars = 1 << 20
    with open(corpus_csv, 'r', encoding='utf-8') as csv_file:
        with psycopg.connect(conn_string) as pg_conn:
            with pg_conn.cursor() as cursor:
                with cursor.copy("COPY quora_corpus_tmp FROM STDIN CSV HEADER") as copy:
                    for chunk in iter(lambda: csv_file.read(copy_chunk_chars), ""):
                        copy.write(chunk)


if __name__ == '__main__':
    main()

In [None]:
# NOTE(review): presumably derives a text-search column from `body`; the cells
# below refer to a `body_simple` column — confirm against lib.
lib.add_tsvector_column(
    "quora_corpus_tmp",
    "body",
)

In [None]:
# Term-frequency step over the `body_simple` column, keyed by `doc_id`.
lib.calculate_term_frequency(
    "quora_corpus_tmp",
    "body_simple",
    id_column="doc_id",
)

In [None]:
# Word-count step over the `body_simple` column, keyed by `doc_id`.
lib.total_words(
    "quora_corpus_tmp",
    "body_simple",
    id_column="doc_id",
)

In [None]:
# Pick up any edits made to lib.py before calling into it.
importlib.reload(lib)
lib.inverted_frequency(
    "quora_corpus_tmp",
    "body_simple",
    id_column="doc_id",
)

In [None]:
# Pick up any edits made to lib.py before calling into it.
importlib.reload(lib)

# Smoke-test query against the prepared table.
lib.bm25_query(
    "was the kohinoor really gifted to england queen",
    "quora_corpus_tmp",
    "body_simple",
    id_column="doc_id",
)

In [None]:
import re

def load_queries():
    """Load the queries of DATASET together with their relevant document ids.

    Reads ``data/{DATASET}/queries.jsonl`` (one JSON object per line, keyed by
    ``"_id"``) and ``data/{DATASET}/qrels/test.tsv`` (first line skipped as a
    header; remaining lines are ``query_id<TAB>doc_id<TAB>score``).

    Returns:
        dict mapping query id to the query record extended with a ``"doc_ids"``
        list holding the ids (as read from the TSV, i.e. strings) of every
        document whose score is greater than zero. Queries without any such
        document are left out.

    Raises:
        KeyError: if the qrels file references a query id that is absent from
            the queries file.
    """
    queries = {}

    with open(f"data/{DATASET}/queries.jsonl", "r", encoding="utf-8") as file:
        for line in file:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            row = json.loads(line)
            queries[row["_id"]] = { **row, "doc_ids": [] }

    with open(f"data/{DATASET}/qrels/test.tsv", "r", encoding="utf-8") as file:
        # Skip the header row; the default keeps an empty file from raising.
        next(file, None)
        for line in file:
            fields = line.strip()
            if not fields:
                continue  # a blank line would otherwise break the unpacking below
            query_id, doc_id, score = fields.split("\t")
            if int(score) > 0:
                queries[query_id]["doc_ids"].append(doc_id)

    # Keep only the queries that have at least one relevant document.
    return {
        query_id: query
        for query_id, query in queries.items()
        if query["doc_ids"]
    }


def sanitize_query_for_tantivy(query):
    """Return `query` with query-syntax characters blanked out.

    Each occurrence of one of the characters ``+ - ! ( ) { } [ ] ^ " ~ * ? : \\ <``
    is replaced by a single space (the characters are removed, not escaped);
    everything else is left untouched.
    """
    special_chars = re.compile(r'([+\-!(){}\[\]^"~*?:\\<])')
    return special_chars.sub(r' ', query)

def search_bm25(query, limit, conn=conn):
    """Return the documents matching `query`, at most `limit` of them.

    The query text is sanitized, parsed against the ``body`` field through the
    global `index`, and searched with the global `searcher`; every hit address
    is then resolved to its stored document.

    NOTE(review): `index` and `searcher` are not defined in the visible part of
    this notebook, and the `conn` parameter is not used — confirm where the
    search objects are meant to come from.
    """
    parsed_query = index.parse_query(sanitize_query_for_tantivy(query), ['body'])
    documents = []
    for _score, address in searcher.search(parsed_query, limit).hits:
        documents.append(searcher.doc(address))
    return documents



def main():
    """Evaluate `search_bm25` against the relevance judgments of DATASET.

    For each query returned by `load_queries` (at most `number_of_queries`) the
    top `limit` results are fetched and compared with the query's relevant
    document ids. Per-query progress is printed, followed by overall hit rate,
    micro-averaged precision, and the mean per-query precision and recall.

    Note: once this cell runs, this definition replaces the `main` that the
    corpus-loading cell earlier in the notebook defines under the same name.
    """
    from itertools import islice  # local import keeps the cell self-contained

    limit = 10
    number_of_queries = 100_000

    queries = load_queries()

    n = 0      # relevant documents seen across all evaluated queries
    hits = 0   # relevant documents that showed up in the results

    recalls = []
    precisions = []

    for query in islice(queries.values(), number_of_queries):
        result = search_bm25(query["text"], limit)

        # NOTE(review): relevant ids are strings read from the qrels TSV; this
        # assumes the stored `doc_id` values compare equal to them — confirm.
        found_ids = [hit["doc_id"][0] for hit in result]

        relevant_ids = query["doc_ids"]
        query_hits = sum(1 for doc_id in relevant_ids if doc_id in found_ids)

        n += len(relevant_ids)
        hits += query_hits

        recalls.append(query_hits / len(relevant_ids))
        # Precision is taken over `limit`, even when fewer results came back.
        precisions.append(query_hits / limit)

        print(f"Processing query: {query}, hits: {query_hits}")

    num_queries = len(recalls)
    if num_queries == 0:
        # Without this guard every ratio below divides by zero.
        print("No queries with relevance judgments were loaded; nothing to evaluate.")
        return

    print(f"Total hits: {hits} out of {n}, which is {hits/n}")

    print(f"Precision: {hits/(num_queries * limit)}")

    average_precision = sum(precisions) / len(precisions)

    print(f"Average precision: {average_precision}")

    average_recall = sum(recalls) / len(recalls)

    print(f"Average recall: {average_recall}")


if __name__ == "__main__":
    main()