In [1]:
class QuoraConstant:
    """Settings for the BEIR Quora test collection."""

    # Dataset identity: short name used to pick the Quora code paths, and the
    # identifier string of the collection.
    name: str = 'quora'
    link: str = 'beir/quora/test'

    # Minimum cosine similarity a document needs to be returned by match_query.
    cosine_threshold: float = 0.3

    # Not referenced in this part of the notebook; presumably TF-IDF
    # vectorizer settings — TODO confirm against the cell that uses them.
    ngram: tuple = (1, 2)
    min_df: int = 2
    max_df: float = 0.8

    # Pickle file names; neither is read nor written in this part of the notebook.
    vectorized_path: str = 'quora_vectorizer.pkl'
    cluster_path: str = 'quora_cluster.pkl'

class ClinicalConstant:
    """Settings for the TREC Precision Medicine 2017 clinical-trials collection."""

    # Dataset identity: short name used to pick the clinical code paths, and
    # the identifier string of the collection.
    name: str = 'clinical'
    link: str = 'clinicaltrials/2017/trec-pm-2017'

    # Minimum cosine similarity a document needs to be returned by match_query.
    cosine_threshold: float = 0.001

    # Not referenced in this part of the notebook; presumably TF-IDF
    # vectorizer settings — TODO confirm against the cell that uses them.
    ngram: tuple = (1, 3)
    min_df: int = 2
    max_df: float = 0.8

    # Pickle file names; neither is read nor written in this part of the notebook.
    vectorized_path: str = 'clinical_vectorizer.pkl'
    cluster_path: str = 'clinical_cluster.pkl'

In [2]:
from nltk.corpus import wordnet
import numpy as np
from datetime import datetime
import re
from nltk import WordNetLemmatizer, pos_tag
import num2words
import numpy as np
import pycountry
from nltk import RegexpTokenizer
from nltk.corpus import stopwords

def replace_under_score_with_space(text: str) -> str:
    """Turn every underscore in ``text`` into a space.

    The text is split on whitespace and re-joined with single spaces, so runs
    of whitespace collapse and leading/trailing whitespace disappears.
    """
    return " ".join(re.sub(r'_', ' ', piece) for piece in text.split())


# Question words/phrases that are kept in Quora text even though some of them
# appear in the English stop-word list.
_QUESTION_WORDS = frozenset({
    'what', 'which', 'who', 'where', 'why', 'when', 'how', 'whose', 'how often', 'how long', 'how far',
    'how old', 'how come', 'how much', 'how many', 'what type', 'what kind', 'which type', 'which kind',
})

# Stop-word sets keyed by "is this the quora flavour?". Built lazily so that
# the stop-word list is fetched once per flavour instead of once per document.
_STOP_WORD_CACHE = {}


def _stop_words_for(dataset: str) -> frozenset:
    """Return the (cached) stop-word set that applies to ``dataset``."""
    is_quora = dataset == 'quora'
    cached = _STOP_WORD_CACHE.get(is_quora)
    if cached is None:
        english = set(stopwords.words('english'))
        # Quora keeps its question words; every other dataset keeps only 'a'.
        kept = _QUESTION_WORDS if is_quora else {'a'}
        cached = frozenset(english - kept)
        _STOP_WORD_CACHE[is_quora] = cached
    return cached


def remove_stopwords(text: str, dataset: str) -> str:
    """Drop English stop words from ``text``.

    Args:
        text: Input text; it is coerced with ``str()`` and split on whitespace.
        dataset: ``'quora'`` keeps the question words listed in
            ``_QUESTION_WORDS``; any other value keeps the article ``'a'``.

    Returns:
        The remaining tokens joined by single spaces. Matching is
        case-sensitive, so callers are expected to lower-case first.
    """
    stop_words = _stop_words_for(dataset)
    return " ".join(word for word in str(text).split() if word not in stop_words)

def remove_punctuations(text: str) -> str:
    """Keep only the runs of word characters in ``text``.

    Everything that is not matched by the ``\\w+`` tokenizer pattern
    (punctuation, symbols, whitespace) acts as a separator; the surviving runs
    are joined by single spaces.
    """
    word_runs = RegexpTokenizer(r'\w+').tokenize(text)
    return " ".join(word_runs)


def remove_markers(text: str) -> str:
    """Strip the registered-trademark sign (U+00AE) from every token.

    Tokens are produced by whitespace splitting and re-joined with single
    spaces. A token that consists only of the sign becomes an empty string,
    which leaves a doubled space in the result.
    """
    return " ".join(re.sub(r'\u00AE', '', piece) for piece in text.split())

def _normalize_country_names(text: str) -> str:
    """Replace ISO country codes with the country's name.

    Each whitespace-separated token is upper-cased and looked up first as an
    alpha-2 code and then, if that finds nothing, as an alpha-3 code. A token
    with a hit is replaced by the country's ``name``; every other token is
    kept unchanged. Tokens are re-joined with single spaces.

    NOTE(review): the lookup is case-insensitive, so ordinary words that happen
    to spell a code are looked up as well — confirm that is intended.
    """
    lookup = pycountry.countries.get
    rewritten = []
    for piece in text.split():
        code = piece.upper()
        # The alpha-3 lookup only runs when the alpha-2 lookup came back empty.
        country = lookup(alpha_2=code)
        if not country:
            country = lookup(alpha_3=code)
        name = country.name if country else None
        # An empty or missing name falls back to the original token.
        rewritten.append(name if name else piece)
    return " ".join(rewritten)



def convert_numbers(text: str) -> str:
    """Spell out integer tokens and replace hyphens with spaces.

    The text is split on whitespace. A token that ``int()`` accepts is
    replaced by its spelled-out form; every other token is kept as is.
    Hyphens in every token (including the spelled-out numbers, e.g.
    "twenty-one") are then turned into spaces and the tokens are re-joined
    with single spaces.

    Args:
        text: Input text.

    Returns:
        The converted text; an empty input yields an empty string.
    """
    converted = []
    for word in text.split():
        try:
            value = int(word)
        except ValueError:
            # Not an integer token: keep it verbatim.
            value = None
        if value is not None:
            try:
                # The file does `import num2words`, which binds the module;
                # the converter is the module's attribute of the same name.
                # Calling the module itself raised TypeError on every token.
                word = num2words.num2words(value)
            except OverflowError:
                # Too large to spell out: best effort, keep the digits.
                pass
        converted.append(word.replace("-", " "))
    return " ".join(converted)


def remove_commas_from_numbers(text: str) -> str:
    """Delete thousands separators such as the commas in "1,000,000".

    Only a comma that sits directly between two digits is removed; all other
    commas stay. The text is split on whitespace and re-joined with single
    spaces.
    """
    # Look-behind / look-ahead keep the digits themselves out of the match.
    pattern = r'(?<=\d),(?=\d)'
    return " ".join(re.sub(pattern, '', piece) for piece in text.split())



def lowercase_letters(text: str) -> str:
    """Return ``text`` converted to lower case; whitespace is left untouched."""
    lowered = text.lower()
    return lowered

def lemmatize_words(text: str) -> str:
    """Lemmatize every whitespace-separated token of ``text``.

    The tokens are POS-tagged together, each tag is mapped to a WordNet
    part of speech with ``get_wordnet_pos``, and the token is lemmatized with
    that part of speech. The lemmas are joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token, tag in pos_tag(text.split()):
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))
    return " ".join(lemmas)


# Translate a POS tag produced by the tagger into a WordNet part of speech.
def get_wordnet_pos(tag: str) -> str:
    """Map a POS tag to the matching WordNet part-of-speech constant.

    Only the first character of ``tag`` matters: ``J`` is an adjective, ``V`` a
    verb, ``N`` a noun and ``R`` an adverb. Any other tag, including an empty
    one, is treated as a noun.
    """
    by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return by_initial.get(tag[:1], wordnet.NOUN)

# Date shapes that are recognised. The month-name alternatives contain
# whitespace, so they can only match when the text is scanned as a whole
# rather than one whitespace-separated token at a time.
_DATE_PATTERN = re.compile(
    r'(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})|'
    r'(\d{4}[-/]\d{1,2}[-/]\d{1,2})|'
    r'(\d{1,2}\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{2,4})|'
    r'((Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},\s+\d{2,4})|'
    r'(\d{1,2}\s+(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{2,4})|'
    r'((January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{2,4})'
)

# strptime formats, tried in this order; the first one that parses wins, so
# day-first readings take precedence over month-first ones.
_DATE_FORMATS = (
    '%d-%m-%Y', '%d/%m/%Y', '%d.%m.%Y', '%Y-%m-%d', '%Y/%m/%d', '%Y.%m.%d', '%d %b %Y', '%b %d, %Y',
    '%d %B %Y', '%B %d, %Y', '%m/%d/%Y', '%m-%d-%Y', '%m.%d.%Y', '%d-%m-%y', '%d/%m/%y', '%d.%m.%y',
    '%y-%m-%d', '%y/%m/%d', '%y.%m.%d', '%d %b %y', '%b %d, %y',
    '%d %B %y', '%B %d, %y', '%m/%d/%y', '%m-%d-%y', '%m.%d.%y',
)


def _parse_date(candidate: str):
    """Return the datetime for ``candidate`` or None when no format fits."""
    for fmt in _DATE_FORMATS:
        try:
            return datetime.strptime(candidate, fmt)
        except ValueError:
            continue
    return None


def _normalize_dates(text: str) -> str:
    """Rewrite recognisable dates in ``text`` as ``YYYY-MM-DD``.

    Whitespace is collapsed to single spaces first, then every span matched
    by ``_DATE_PATTERN`` is parsed with the formats in ``_DATE_FORMATS``.
    A span that parses is replaced by its ISO form; a span that looks like a
    date but cannot be parsed (for example ``99/99/9999``) is left unchanged.

    Earlier behaviour that this fixes: only the first capture group was read,
    so dates matched by any other alternative (ISO dates among them) produced
    an empty string, and the whole token was then dropped from the text.

    Args:
        text: Input text.

    Returns:
        The text with single-space separators and normalised dates.
    """
    def _to_iso(match):
        raw = match.group(0)
        parsed = _parse_date(raw)
        # isoformat() always yields a zero-padded four-digit year.
        return parsed.date().isoformat() if parsed else raw

    return _DATE_PATTERN.sub(_to_iso, " ".join(text.split()))


def remove_apostrophe(text: str) -> str:
    """Replace every apostrophe (``'``) with a space.

    The text is split on whitespace and re-joined with single spaces; the
    space that replaces an apostrophe is kept, so "dogs' toys" ends up with
    two spaces between the words.

    Uses ``str.replace`` directly instead of routing each token through a
    numpy string array, which was much slower and gave the same result for
    ordinary text.
    """
    return " ".join(token.replace("'", " ") for token in text.split())


def normalize_abbreviations(text: str) -> str:
    """Replace each token with the first lemma of its first WordNet synset.

    Tokens come from whitespace splitting. A token for which WordNet returns
    at least one synset is replaced by ``synsets[0].lemmas()[0].name()``;
    a token without synsets is kept unchanged. The result is joined with
    single spaces.

    Lookups are memoised for the duration of one call, so a token that
    repeats within the text is resolved only once.

    NOTE(review): this applies to every token, not only to abbreviations —
    confirm that rewriting ordinary words this way is intended.
    """
    resolved_terms = {}
    new_tokens = []

    for token in text.split():
        if token not in resolved_terms:
            synsets = wordnet.synsets(token)
            # Misses are cached too (as the token itself).
            resolved_terms[token] = synsets[0].lemmas()[0].name() if synsets else token
        new_tokens.append(resolved_terms[token])

    return " ".join(new_tokens)


def process_text_quora(text: str) -> str:
    """Run the Quora pre-processing pipeline and return the cleaned text.

    The steps run strictly in the listed order, each one receiving the output
    of the previous one. Number conversion and marker removal are not applied
    to Quora text.

    NOTE(review): punctuation is stripped before the apostrophe and date
    steps, so the apostrophes and date separators those steps look for are
    already gone when they run — confirm the intended order.
    """
    steps = (
        lowercase_letters,
        remove_punctuations,
        remove_apostrophe,
        lambda value: remove_stopwords(value, 'quora'),
        _normalize_dates,
        _normalize_country_names,
        normalize_abbreviations,
        # Country names and WordNet lemmas can re-introduce capitals.
        lowercase_letters,
        replace_under_score_with_space,
        lemmatize_words,
    )
    for step in steps:
        text = step(text)
    return text


def process_text_clinical(text: str) -> str:
    """Run the clinical-trials pre-processing pipeline and return the cleaned text.

    The steps run strictly in the listed order, each one receiving the output
    of the previous one. Compared with the Quora pipeline this one also
    normalises numbers (before punctuation is stripped) and removes markers.

    NOTE(review): punctuation is stripped before the apostrophe and date
    steps, so the apostrophes and date separators those steps look for are
    already gone when they run — confirm the intended order.
    """
    steps = (
        lowercase_letters,
        remove_commas_from_numbers,
        convert_numbers,
        remove_punctuations,
        remove_apostrophe,
        lambda value: remove_stopwords(value, 'clinical'),
        remove_markers,
        _normalize_dates,
        _normalize_country_names,
        normalize_abbreviations,
        # Country names and WordNet lemmas can re-introduce capitals.
        lowercase_letters,
        replace_under_score_with_space,
        lemmatize_words,
    )
    for step in steps:
        text = step(text)
    return text

In [29]:
import gensim
import ir_datasets
from nltk import word_tokenize
import pickle
import numpy as np

# Load both collections through the identifiers kept in the constant classes,
# so each dataset link is spelled out in one place only.
quora_dataset = ir_datasets.load(QuoraConstant.link)
clinical_dataset = ir_datasets.load(ClinicalConstant.link)

def save_model(obj, file_name: str):
    """Pickle ``obj`` into ``file_name``, overwriting any existing file."""
    with open(file_name, mode='wb') as sink:
        pickle.dump(obj, sink)


def load_model(file_name):
    """Return the object pickled in ``file_name``.

    Unpickling can execute arbitrary code, so only load files that this
    notebook (or another trusted source) has written.
    """
    with open(file_name, mode='rb') as source:
        return pickle.load(source)


class QuoraMyCorpus:
    """Iterable over the pre-processed Quora documents.

    Iterating has a side effect that callers rely on: the processed texts and
    their ids are collected, in iteration order, in the class attributes
    ``documents`` and ``documents_ids``, and ``i`` counts the documents
    consumed so far.

    Every pass starts from a clean slate. Previously the class-level state
    survived between passes, so iterating a second time (e.g. re-running the
    cell) duplicated the collected documents, or yielded nothing at all once
    ``i`` had reached the cap.
    """

    # Upper bound on the number of documents consumed in a single pass.
    MAX_DOCS = 522931

    # Filled while iterating; shared at class level because callers read
    # them as QuoraMyCorpus.documents / QuoraMyCorpus.documents_ids.
    documents = []
    documents_ids = []
    i = 0

    def __iter__(self):
        # Reset in place so existing references to the lists stay valid.
        QuoraMyCorpus.documents.clear()
        QuoraMyCorpus.documents_ids.clear()
        QuoraMyCorpus.i = 0
        for doc in quora_dataset.docs_iter():
            if QuoraMyCorpus.i >= QuoraMyCorpus.MAX_DOCS:
                break
            # doc[0] is used as the document id and doc[1] as its text.
            tokens = process_text_quora(doc[1])
            QuoraMyCorpus.i += 1
            QuoraMyCorpus.documents.append(tokens)
            QuoraMyCorpus.documents_ids.append(doc[0])
            yield tokens


def get_embedding_vector(model, tokens):
    """Average the word vectors of ``tokens`` into one vector.

    Args:
        model: Trained embedding model exposing ``vector_size`` and ``wv``
            (with ``key_to_index`` and ``get_vector``).
        tokens: Sequence of token strings.

    Returns:
        A vector of length ``model.vector_size``: the mean over all tokens,
        where a token unknown to the model contributes a zero vector (so
        unknown tokens pull the mean towards zero). An empty token sequence
        yields the zero vector.
    """
    size = model.vector_size
    if len(tokens) < 1:
        return np.zeros(size)
    # key_to_index is a dict, so membership is a hash lookup; scanning the
    # index_to_key list for every token was linear in the vocabulary size.
    vocabulary = model.wv.key_to_index
    embeddings = [
        model.wv.get_vector(token) if token in vocabulary else np.zeros(size)
        for token in tokens
    ]
    return np.mean(embeddings, axis=0)


# Train and persist the Word2Vec model and document matrix for Quora.
def word_2_vector_for_quora():
    """Train Word2Vec on the Quora corpus and pickle the model and doc vectors.

    Writes 'quora_word_2_vector.pickle' (the model) and
    'matrix_word_2_vector.pickle' (one averaged vector per document, in the
    order the corpus produced the documents).
    """
    corpus = QuoraMyCorpus()
    # Consuming the corpus also fills QuoraMyCorpus.documents as a side effect.
    tokenized_docs = [processed.split() for processed in corpus]
    print("before word 2 vec model")
    model = gensim.models.Word2Vec(sentences=tokenized_docs, min_count=1, workers=4)
    save_model(model, 'quora_word_2_vector.pickle')
    print("after word 2 vec model")
    processed_docs = QuoraMyCorpus.documents
    print("before train_matrix")
    train_matrix = []
    for processed in processed_docs:
        train_matrix.append(get_embedding_vector(model, word_tokenize(processed)))
    save_model(train_matrix, 'matrix_word_2_vector.pickle')
    print("after train_matrix")



class ClinicalMyCorpus:
    """Iterable over the pre-processed clinical-trial documents.

    Iterating has a side effect that callers rely on: the processed texts and
    their ids are collected, in iteration order, in the class attributes
    ``documents`` and ``documents_ids``.

    Every pass starts from a clean slate. Previously the class-level lists
    survived between passes, so iterating a second time (e.g. re-running the
    cell) appended every document again.
    """

    # Filled while iterating; shared at class level because callers read
    # them as ClinicalMyCorpus.documents / ClinicalMyCorpus.documents_ids.
    documents = []
    documents_ids = []

    def __iter__(self):
        # Reset in place so existing references to the lists stay valid.
        ClinicalMyCorpus.documents.clear()
        ClinicalMyCorpus.documents_ids.clear()
        for doc in clinical_dataset.docs_iter():
            # doc[0] is used as the id; fields 1..5 are concatenated into one
            # text (assumes they are all strings — TODO confirm).
            tokens = process_text_clinical(doc[1] + ' ' + doc[2] + ' ' + doc[3] + ' ' + doc[4] + ' ' + doc[5])
            ClinicalMyCorpus.documents.append(tokens)
            ClinicalMyCorpus.documents_ids.append(doc[0])
            yield tokens


# Train and persist the Word2Vec model and document matrix for clinical trials.
def word_2_vector_for_clinical():
    """Train Word2Vec on the clinical corpus and pickle the model and doc vectors.

    Writes 'clinical_word_2_vector.pickle' (the model) and
    'clinical_matrix_word_2_vector.pickle' (one averaged vector per document,
    in the order the corpus produced the documents).
    """
    corpus = ClinicalMyCorpus()
    # Consuming the corpus also fills ClinicalMyCorpus.documents as a side effect.
    tokenized_docs = [processed.split() for processed in corpus]
    print("before word 2 vec model")
    model = gensim.models.Word2Vec(sentences=tokenized_docs, min_count=1, workers=4)
    save_model(model, 'clinical_word_2_vector.pickle')
    print("after word 2 vec model")
    processed_docs = ClinicalMyCorpus.documents
    print("before train_matrix")
    train_matrix = []
    for processed in processed_docs:
        train_matrix.append(get_embedding_vector(model, word_tokenize(processed)))
    save_model(train_matrix, 'clinical_matrix_word_2_vector.pickle')
    print("after train_matrix")


In [31]:
import ir_measures
from ir_measures import AP, P, R, RR
from nltk import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity


def match_query(dataset, query: str, matrix, model, docs_id) -> dict:
    """Return the documents whose embedding is similar enough to the query.

    Args:
        dataset: Dataset name; ``QuoraConstant.name`` selects the Quora
            pre-processing and threshold, anything else the clinical ones.
        query: Raw query text.
        matrix: Document vectors, one row per document.
        model: Embedding model used to vectorise the query.
        docs_id: Document ids, positionally aligned with ``matrix``.

    Returns:
        ``{doc_id: cosine_similarity}`` for every document whose similarity
        reaches the dataset's threshold, in matrix order.
    """
    if dataset == QuoraConstant.name:
        cleaned_query = process_text_quora(query)
        threshold = QuoraConstant.cosine_threshold
    else:
        cleaned_query = process_text_clinical(query)
        threshold = ClinicalConstant.cosine_threshold

    query_vector = get_embedding_vector(model, word_tokenize(cleaned_query))
    # One row per document; each row holds the single score for this query.
    scores = cosine_similarity(matrix, [query_vector])
    return {
        docs_id[position]: float(row[0])
        for position, row in enumerate(scores)
        if row >= threshold
    }



def evaluate(dataset_collection, dataset_name):
    """Score the embedding-based retrieval on every query of a collection.

    Loads the pickled model and document matrix for the dataset, retrieves
    documents for each query with ``match_query`` and aggregates AP, P@10,
    R@10 and RR (all with ``rel=1``) against the collection's qrels.

    Args:
        dataset_collection: Collection exposing ``qrels_iter()``,
            ``queries_iter()`` and ``docs_iter()``.
        dataset_name: ``QuoraConstant.name`` for Quora; any other value is
            handled as the clinical collection.

    Returns:
        The aggregate produced by ``ir_measures.calc_aggregate``.
    """
    qrels = dataset_collection.qrels_iter()
    queries = dataset_collection.queries_iter()
    is_quora = dataset_name == QuoraConstant.name

    if is_quora:
        model = load_model('quora_word_2_vector.pickle')
        matrix = load_model('matrix_word_2_vector.pickle')
    else:
        model = load_model('clinical_word_2_vector.pickle')
        matrix = load_model('clinical_matrix_word_2_vector.pickle')

    # Ids in collection order; match_query pairs them with matrix rows by position.
    documents_ids = [doc[0] for doc in dataset_collection.docs_iter()]

    print('start evaluation')
    ranking_docs = {}
    for position, query in enumerate(queries, start=1):
        print(position)
        if is_quora:
            text = query.text
        else:
            # Clinical queries are assembled from their fields; the literal
            # string 'None' in `other` means there is nothing to add.
            fields = [query.disease, query.gene, query.demographic]
            if query.other != 'None':
                fields.append(query.other)
            text = ' '.join(fields)
        ranking_docs[query.query_id] = match_query(dataset_name, text, matrix, model, documents_ids)

    metrics = [AP(rel=1), P(rel=1) @ 10, R(rel=1) @ 10, RR(rel=1)]

    # Keep only the judgements that belong to queries that were actually run.
    qrels_map = {}
    for qrel in qrels:
        if qrel.query_id not in ranking_docs:
            continue
        qrels_map.setdefault(qrel.query_id, {})[qrel.doc_id] = qrel.relevance

    return ir_measures.calc_aggregate(metrics, qrels_map, ranking_docs)


In [32]:
# Score the word-embedding retrieval on the clinical-trials collection.
# evaluate() unpickles 'clinical_word_2_vector.pickle' and
# 'clinical_matrix_word_2_vector.pickle', so both files must already exist.
c_score = evaluate(clinical_dataset, 'clinical')
print("Evaluation score with word embedding for clinical:", c_score)

start evaluation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
Evaluation score with word embedding for clinical: {AP: 0.0026224844654136143, P@10: 0.006896551724137932, R@10: 0.0010756495949714488, RR: 0.03169902802997383}


In [None]:
# Score the word-embedding retrieval on the Quora collection.
# evaluate() unpickles 'quora_word_2_vector.pickle' and
# 'matrix_word_2_vector.pickle', so both files must already exist.
score = evaluate(quora_dataset, 'quora')
print("Evaluation score with word embedding for beir/quora/test:", score)

start evaluation
1
2
