In [1]:
class QuoraConstant:
    link = 'beir/quora/test'
    name = 'quora'
    cosine_threshold = 0.3
    ngram = (1, 2)
    min_df = 2
    max_df = 0.8
    vectorized_path = 'quora_vectorizer.pkl'
    cluster_path = 'quora_cluster.pkl'
    tfidf_path = ''


class ClinicalConstant:
    link = 'clinicaltrials/2017/trec-pm-2017'
    name = 'clinical'
    cosine_threshold = 0.001
    ngram = (1, 3)
    min_df = 2
    max_df = 0.8
    vectorized_path = 'clinical_vectorizer.pkl'
    cluster_path = 'clinical_cluster.pkl'
    tfidf_path = ''

In [2]:
from nltk.corpus import wordnet
import numpy as np
from datetime import datetime
import re
from nltk import WordNetLemmatizer, pos_tag
import num2words
import numpy as np
import pycountry
from nltk import RegexpTokenizer
from nltk.corpus import stopwords

def replace_under_score_with_space(text: str) -> str:
    new_tokens = []
    for token in text.split():
        new_tokens.append(re.sub(r'_', ' ', token))
    return " ".join(new_tokens)


def remove_stopwords(text: str, dataset: str) -> str:
    stop_words = set(stopwords.words('english'))
    questions = {'what', 'which', 'who', 'where', 'why', 'when', 'how', 'whose', 'how often', 'how long', 'how far',
                 'how old', 'how come', 'how much', 'how many', 'what type', 'what kind', 'which type', 'which kind'}
    if dataset == 'quora':
        stop_words = stop_words - questions
    else:
        stop_words = stop_words - {'a'}
    return " ".join([word for word in str(text).split() if word not in stop_words])

def remove_punctuations(text: str) -> str:
    tokenizer = RegexpTokenizer(r'\w+')
    result = tokenizer.tokenize(text)
    return " ".join(result)


def remove_markers(text: str) -> str:
    normalized_tokens = []
    for token in text.split():
        normalized_tokens.append(re.sub(r'\u00AE', '', token))
    return " ".join(normalized_tokens)

def _normalize_country_names(text: str) -> str:
    normalized_tokens = []
    for token in text.split():
        upper_token = token.upper()
        country_name = None

        # Try to lookup by alpha-2 code
        country = pycountry.countries.get(alpha_2=upper_token)
        if not country:
            # Try to lookup by alpha-3 code if alpha-2 lookup fails
            country = pycountry.countries.get(alpha_3=upper_token)

        if country:
            country_name = country.name

        # Append the found country name or the original token if not found
        normalized_tokens.append(country_name if country_name else token)

    return " ".join(normalized_tokens)



def convert_numbers(text: str) -> str:
    new_text = []
    for w in text.split():
        try:
            w = num2words(int(w))
        except:
            a = 0
        new_text.append(w)
    new_text = np.char.replace(new_text, "-", " ")
    return " ".join(new_text)


def remove_commas_from_numbers(text: str) -> str:
    # Define the regex pattern
    pattern = r'(?<=\d),(?=\d)'
    new_text = []
    # Process each string in the list
    for w in text.split():
        # Replace commas that are between two digits
        w = re.sub(pattern, '', w)
        new_text.append(w)

    return " ".join(new_text)



def lowercase_letters(text: str) -> str:
    return text.lower()

def lemmatize_words(text: str) -> str:
    lemmatizer = WordNetLemmatizer()
    tagged_tokens = pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in tagged_tokens])


# perform part-of-speech (POS) tagging on the tokens.
def get_wordnet_pos(tag: str) -> str:
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def _normalize_dates(text: str) -> str:
    date_pattern = r'(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})|' \
                   r'(\d{4}[-/]\d{1,2}[-/]\d{1,2})|' \
                   r'(\d{1,2}\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{2,4})|' \
                   r'((Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},\s+\d{2,4})|' \
                   r'(\d{1,2}\s+(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{2,4})|' \
                   r'((January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{2,4})'
    format_strings = ['%d-%m-%Y', '%d/%m/%Y', '%d.%m.%Y', '%Y-%m-%d', '%Y/%m/%d', '%Y.%m.%d', '%d %b %Y', '%b %d, %Y',
                      '%d %B %Y', '%B %d, %Y', '%m/%d/%Y', '%m-%d-%Y', '%m.%d.%Y', '%d-%m-%y', '%d/%m/%y', '%d.%m.%y',
                      '%y-%m-%d', '%y/%m/%d', '%y.%m.%d', '%d %b %y', '%b %d, %y',
                      '%d %B %y', '%B %d, %y', '%m/%d/%y', '%m-%d-%y', '%m.%d.%y']
    normalized_tokens = []
    for token in text.split():
        matches = re.findall(date_pattern, token)
        if matches:
            match = matches[0][0]
            for fmt in format_strings:
                try:
                    date_obj = datetime.strptime(match, fmt)
                    break
                except ValueError:
                    pass
            else:
                continue
            normalized_date = date_obj.strftime('%Y-%m-%d')
            token = token.replace(match, normalized_date)
        normalized_tokens.append(token)

    return " ".join(normalized_tokens)


def remove_apostrophe(text: str) -> str:
    new_tokens = []
    for token in text.split():
        new_tokens.append(str(np.char.replace(token, "'", " ")))
    return " ".join(new_tokens)


def normalize_abbreviations(text: str) -> str:
    resolved_terms = {}
    new_tokens = []

    for token in text.split():
        synsets = wordnet.synsets(token)
        if synsets:
            resolved_term = synsets[0].lemmas()[0].name()
            resolved_terms[token] = resolved_term
            new_tokens.append(resolved_term)
        else:
            new_tokens.append(token)

    return " ".join(new_tokens)


def process_text_quora(text: str) -> str:
    lowercase = lowercase_letters(text)
    normalize_numbers = remove_commas_from_numbers(lowercase)
    num2word = convert_numbers(normalize_numbers)
    punctuations_removed = remove_punctuations(num2word)
    apostrophe_removed = remove_apostrophe(punctuations_removed)
    stopwords_removed = remove_stopwords(apostrophe_removed, 'quora')
    # markers_removed = remove_markers(stopwords_removed)
    # stemmed = stem_words(markers_removed)
    normalized_dates = _normalize_dates(stopwords_removed)
    normalized_country_names = _normalize_country_names(normalized_dates)
    abbreviations = normalize_abbreviations(normalized_country_names)
    lowercase = lowercase_letters(abbreviations)
    new_tokens = replace_under_score_with_space(lowercase)
    lemmatized = lemmatize_words(new_tokens)
    new_tokens = lemmatized
    return new_tokens


def process_text_clinical(text: str) -> str:
    lowercase = lowercase_letters(text)
    normalize_numbers = remove_commas_from_numbers(lowercase)
    num2word = convert_numbers(normalize_numbers)
    punctuations_removed = remove_punctuations(num2word)
    apostrophe_removed = remove_apostrophe(punctuations_removed)
    stopwords_removed = remove_stopwords(apostrophe_removed, 'clinical')
    markers_removed = remove_markers(stopwords_removed)
    # stemmed = stem_words(markers_removed)
    normalized_dates = _normalize_dates(markers_removed)
    normalized_country_names = _normalize_country_names(normalized_dates)
    abbreviations = normalize_abbreviations(normalized_country_names)
    lowercase = lowercase_letters(abbreviations)
    new_tokens = replace_under_score_with_space(lowercase)
    lemmatized = lemmatize_words(new_tokens)
    new_tokens = lemmatized
    return new_tokens

In [3]:
from typing import Dict
from pymongo import MongoClient
import ir_datasets

def get_dataset_docs(dataset_name: str) -> Dict[str, str]:
    print("get dataset docs " + dataset_name)
    i = 0
    if dataset_name == "clinical":
        dataset = ir_datasets.load("clinicaltrials/2017/trec-pm-2017")
        docs_iter = dataset.docs_iter()
        random_corpus = {}
        for doc in docs_iter:
            doc_id = doc[0]
            detailed_description = (doc[1] + ' ' + doc[2] + ' ' + doc[3] + ' '
                                    + doc[4] + ' ' + doc[5])
            random_corpus[doc_id] = detailed_description
        random_corpus_ids = set(random_corpus.keys())
        search_qrels = list(ir_datasets.load("clinicaltrials/2017/trec-pm-2017").qrels_iter())
        qrels_docs_ids = set(qrel.doc_id for qrel in search_qrels)
        docs_ids = random_corpus_ids.union(qrels_docs_ids)
        docs_store = ir_datasets.load("clinicaltrials/2017/trec-pm-2017").docs_store()
        mapped_docs = dict(docs_store.get_many(docs_ids))
        corpus = {
            doc_id: doc.title + ' ' + doc.condition + ' ' + doc.summary +
                    ' ' + doc.detailed_description + ' ' + doc.eligibility
            for doc_id, doc in mapped_docs.items()
        }
    else:  # dataset_name == "quora":
        random_corpus = dict(ir_datasets.load("beir/quora/test").docs_iter())
        random_corpus_ids = set(random_corpus.keys())
        qrels = list(ir_datasets.load("beir/quora/test").qrels_iter())
        qrels_docs_ids = set(qrel.doc_id for qrel in qrels)
        docs_ids = random_corpus_ids.union(qrels_docs_ids)
        docs_store = ir_datasets.load("beir/quora/test").docs_store()
        mapped_docs = dict(docs_store.get_many(docs_ids))
        corpus = {}
        for doc_id, doc in mapped_docs.items():
            # i += 1
            # if i >= 10000:
            #     break
            doc_id = doc[0]
            detailed_description = (doc[1])
            corpus[doc_id] = detailed_description
        # corpus = {doc_id: doc.text for doc_id, doc in mapped_docs.items()}

    print(len(corpus))
    mongo_client = MongoClient('localhost', 27017)
    db = mongo_client['IR_DOCS']
    collection = db[dataset_name]
    collection.drop()
    documents_to_insert = []
    index = 0
    for doc_id, text in corpus.items():
        documents_to_insert.append({'index': index, "_id": doc_id, "text": text})
        index += 1
    collection.insert_many(documents_to_insert)
    return corpus

In [4]:
import pickle
def save_tfidf_matrix(dataset_name: str, inverted_index: dict):
    with open(dataset_name + '_inverted_index.pickle', 'wb') as file:
        pickle.dump(inverted_index, file)

In [5]:
import os

import joblib
from nltk import word_tokenize
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

def create_inverted_index(dataset: str):
    print("create inverted index " + dataset)
    corpus = get_dataset_docs(dataset)
    print("after get_dataset_docs " + dataset)
    if dataset == QuoraConstant.name:
        vectorizer = TfidfVectorizer(preprocessor=process_text_quora, tokenizer=word_tokenize,
                                     # max_df=QuoraConstant.max_df,
                                     # min_df=QuoraConstant.min_df,
                                     # norm="l2",
                                     # ngram_range=QuoraConstant.ngram,
                                     # sublinear_tf=True,
                                     # use_idf=True,
                                     # use_idf=True,

                                     )

    else:
        vectorizer = TfidfVectorizer(preprocessor=process_text_clinical, tokenizer=word_tokenize,
                                     # max_df=ClinicalConstant.max_df,
                                     # min_df=ClinicalConstant.min_df,
                                     norm="l2",
                                     # ngram_range=ClinicalConstant.ngram,
                                     sublinear_tf=True,
                                     use_idf=True,
                                     stop_words='english',
                                     lowercase=False,
                                     )

    print("before fit transform " + dataset)
    tfidf_matrix = vectorizer.fit_transform(corpus.values())
    if dataset == QuoraConstant.name:
        joblib.dump(vectorizer, QuoraConstant.vectorized_path)
    else:
        joblib.dump(vectorizer, ClinicalConstant.vectorized_path)
    print("after fit transform " + dataset)
    save_tfidf_matrix(dataset, tfidf_matrix)
    # save_doc_ids(dataset, corpus.keys())
    return tfidf_matrix

In [6]:
def load_tfidf_matrix(dataset_name: str) -> dict:
    with open(dataset_name + '_inverted_index.pickle', 'rb') as file:
        tfidf_matrix = pickle.load(file)
    return tfidf_matrix

In [7]:
def customize_result_from_index_to_doc_id(result, dataset: str):
    mongo_client = MongoClient('localhost', 27017)
    db = mongo_client['IR_DOCS']
    collection = db[dataset]
    data = collection.find({"index": {"$in": [int(s) for s in list(result)]}}, {'_id': 1, 'index': 1})
    new_result = {}
    for item in data:
        new_result[item['_id']] = result[str(item['index'])]
    return new_result

def retrieve(query, dataset, tfidf_matrix) -> Dict[str, float]:
    result = ranking(query, dataset, tfidf_matrix)
    return customize_result_from_index_to_doc_id(result, dataset)

In [8]:
import os

import ir_measures
import joblib
from ir_measures import R, AP, P, RR

def evaluate(dataset_collection, dataset_name):
    qrels = dataset_collection.qrels_iter()
    queries = dataset_collection.queries_iter()
    ranking_docs = dict()
    i = 0
    tfidf_matrix = load_tfidf_matrix(dataset_name)
    for query in queries:
        if dataset_name == QuoraConstant.name:
            retrieved_docs = retrieve(query.text, dataset_name, tfidf_matrix)
        else:
            if query.other == 'None':
                text = query.disease + ' ' + query.gene + ' ' + query.demographic
            else:
                text = query.disease + ' ' + query.gene + ' ' + query.demographic + ' ' + query.other
            print(text)
            retrieved_docs = retrieve(text, dataset_name, tfidf_matrix)
        ranking_docs[query.query_id] = retrieved_docs
        i += 1
        if i % 100 == 99:
            break
    metrics = [AP(rel=1), P(rel=1) @ 10, R(rel=1) @ 10, RR(rel=1)]
    qrels_map = dict()
    for qrel in qrels:
        if qrel.query_id in ranking_docs.keys():
            if qrel.query_id in qrels_map:
                qrels_map[qrel.query_id].update({qrel.doc_id: qrel.relevance})
            else:
                qrels_map[qrel.query_id] = {qrel.doc_id: qrel.relevance}
    score = ir_measures.calc_aggregate(metrics, qrels_map, ranking_docs)
    return score

In [9]:
create_inverted_index("clinical")

create inverted index clinical
get dataset docs clinical
241006
after get_dataset_docs clinical
before fit transform clinical




after fit transform clinical


<241006x265248 sparse matrix of type '<class 'numpy.float64'>'
	with 39668590 stored elements in Compressed Sparse Row format>

In [10]:
import numpy as np
from typing import Dict
from sklearn.metrics.pairwise import cosine_similarity

class VectorizeQuery:
    quora_vectorizer = joblib.load(QuoraConstant.vectorized_path)
    clinica_vectorizer = joblib.load(ClinicalConstant.vectorized_path)

    @staticmethod
    def get_vectorize(query, dataset):
        if dataset == QuoraConstant.name:
            query_vector = VectorizeQuery.quora_vectorizer
        else:
            query_vector = VectorizeQuery.clinica_vectorizer
        return query_vector.transform([query])

def ranking(query: str, dataset: str, tfidf_matrix) -> Dict[str, float]:
    if dataset == QuoraConstant.name:
        cosine_threshold = QuoraConstant.cosine_threshold
    else:
        cosine_threshold = ClinicalConstant.cosine_threshold
    query_vector = VectorizeQuery.get_vectorize(query, dataset)
    similarities = cosine_similarity(query_vector, tfidf_matrix)
    matched_indices = similarities.argsort()[0][::-1].flatten()
    top_indices = []
    for i in matched_indices:
        if similarities[0][i] >= cosine_threshold:
            top_indices.append(i.item())
    results = {}
    for index in top_indices:
        doc_id = str(index)
        similarity = similarities[0][index]
        results[doc_id] = similarity
    return results



In [11]:
dataset_quora = ir_datasets.load("beir/quora/test")
score_quora = evaluate(dataset_quora, "quora")
print("Evaluation score for beir/quora/test:", score_quora)

Evaluation score for beir/quora/test: {AP: 0.6761926014667079, RR: 0.7591320983854364, R@10: 0.756593331354097, P@10: 0.12323232323232308}


In [12]:
dataset_clinical = ir_datasets.load("clinicaltrials/2017/trec-pm-2017")
score_clinical = evaluate(dataset_clinical, "clinical")
print("Evaluation score for clinicaltrials/2017/trec-pm-2017:", score_clinical)

Liposarcoma CDK4 Amplification 38-year-old male GERD
Colon cancer KRAS (G13D), BRAF (V600E) 52-year-old male Type II Diabetes, Hypertension
Meningioma NF2 (K322), AKT1(E17K) 45-year-old female
Breast cancer FGFR1 Amplification, PTEN (Q171) 67-year-old female Depression, Hypertension, Heart Disease
Melanoma BRAF (V600E), CDKN2A Deletion 45-year-old female
Melanoma NRAS (Q61K) 55-year-old male Hypertension
Lung cancer EGFR (L858R) 50-year-old female Lupus
Lung cancer EML4-ALK Fusion transcript 52-year-old male Hypertension, Osteoarthritis
Gastrointestinal stromal tumor KIT Exon 9 (A502_Y503dup) 49-year-old female
Lung adenocarcinoma KRAS (G12C) 61-year-old female Hypertension, Hypercholesterolemia
Gastric cancer PIK3CA (E545K) 54-year-old male Depression
Colon cancer BRAF (V600E) 35-year-old male
Cholangiocarcinoma BRCA2 72-year-old male Diabetes
Cholangiocarcinoma IDH1 (R132H) 64-year-old male Neuropathy
Cervical cancer STK11 26-year-old female
Pancreatic cancer CDKN2A 54-year-old male 