# Tansformer BM25

Using the classic Cranfield dataset, this notebook shows how to use BM25 to calculate the similarity scores between a query and the documents and show the evaluation scores, i.e., precision and recall. Note that the ranking of the returned documents is not yet considered.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# load data into dataframes
docs = pd.read_json("data/cranfield_docs.json")
queries = pd.read_json("data/cranfield_queries.json")
relevance = pd.read_json("data/cranfield_relevance.json")

In [3]:
docs.head()

Unnamed: 0,author,bibliography,body,id,title
0,"brenckman,m.","j. ae. scs. 25, 1958, 324.",experimental investigation of the aerodynamics...,1,experimental investigation of the aerodynamics...
1,ting-yili,"department of aeronautical engineering, rensse...",simple shear flow past a flat plate in an inco...,2,simple shear flow past a flat plate in an inco...
2,m. b. glauert,"department of mathematics, university of manch...",the boundary layer in simple shear flow past a...,3,the boundary layer in simple shear flow past a...
3,"yen,k.t.","j. ae. scs. 22, 1955, 728.",approximate solutions of the incompressible la...,4,approximate solutions of the incompressible la...
4,"wasserman,b.","j. ae. scs. 24, 1957, 924.",one-dimensional transient heat conduction into...,5,one-dimensional transient heat conduction into...


In [4]:
queries.head()

Unnamed: 0,query,query_id
0,what similarity laws must be obeyed when const...,1
1,what are the structural and aeroelastic proble...,2
2,what problems of heat conduction in composite ...,3
3,can a criterion be developed to show empirical...,4
4,what chemical kinetic system is applicable to ...,5


In [5]:
queries.set_index('query_id', inplace=True)
queries.head()

Unnamed: 0_level_0,query
query_id,Unnamed: 1_level_1
1,what similarity laws must be obeyed when const...
2,what are the structural and aeroelastic proble...
3,what problems of heat conduction in composite ...
4,can a criterion be developed to show empirical...
5,what chemical kinetic system is applicable to ...


In [6]:
queries['query'][1]

'what similarity laws must be obeyed when constructing aeroelastic models of heated high speed aircraft .'

In [7]:
relevance.head()

Unnamed: 0,doc_id,query_id,r_score
0,184,1,2
1,29,1,2
2,31,1,2
3,12,1,3
4,51,1,3


In [8]:
set(relevance['r_score'].values)

{1, 2, 3, 4}

In [9]:
train_docs = docs['body'].tolist()
print(len(train_docs))
train_docs[:1]

1400


['experimental investigation of the aerodynamics of a wing in a slipstream .   an experimental study of a wing in a propeller slipstream was made in order to determine the spanwise distribution of the lift increase due to slipstream at different angles of attack of the wing and at different free stream to slipstream velocity ratios .  the results were intended in part as an evaluation basis for different theoretical treatments of this problem .   the comparative span loading curves, together with supporting evidence, showed that a substantial part of the lift increment produced by the slipstream was due to a /destalling/ or boundary-layer-control effect .  the integrated remaining lift increment, after subtracting this destalling lift, was found to agree well with a potential flow theory .   an empirical evaluation of the destalling effects was made for the specific configuration of the experiment .']

In [10]:
# https://tartarus.org/martin/PorterStemmer/def.txt
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
import re # regular expression


def stemming_tokenizer(str_input):
    words = re.sub(r"[^A-Za-z\-]", " ", str_input).lower().split() # delete non letter charactors
    #words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split() # include numbers
    words = [porter_stemmer.stem(word) for word in words]
    return words

In [11]:
from __future__ import absolute_import, division, print_function, unicode_literals
import numpy as np
import scipy.sparse as sp
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from sklearn.feature_extraction.text import _document_frequency

class BM25Transformer(BaseEstimator, TransformerMixin):
    """
    Parameters
    ----------
    use_idf : boolean, optional (default=True)
    k1 : float, optional (default=2.0)
    b : float, optional (default=0.75)
    References
    ----------
    Okapi BM25: a non-binary model - Introduction to Information Retrieval
    http://nlp.stanford.edu/IR-book/html/htmledition/okapi-bm25-a-non-binary-model-1.html
    """
    def __init__(self, use_idf=True, k1=2.0, b=0.75):
        self.use_idf = use_idf
        self.k1 = k1
        self.b = b

    def fit(self, X):
        """
        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            document-term matrix
        """
        if not sp.issparse(X):
            X = sp.csc_matrix(X)
        if self.use_idf:
            n_samples, n_features = X.shape
            df = _document_frequency(X)
            idf = np.log((n_samples - df + 0.5) / (df + 0.5))
            self._idf_diag = sp.spdiags(idf, diags=0, m=n_features, n=n_features)
        return self

    def transform(self, X, copy=True):
        """
        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            document-term matrix
        copy : boolean, optional (default=True)
        """
        if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
            # preserve float family dtype
            X = sp.csr_matrix(X, copy=copy)
        else:
            # convert counts or binary occurrences to floats
            X = sp.csr_matrix(X, dtype=np.float64, copy=copy)

        n_samples, n_features = X.shape

        # Document length (number of terms) in each row
        # Shape is (n_samples, 1)
        dl = X.sum(axis=1)
        # Number of non-zero elements in each row
        # Shape is (n_samples, )
        sz = X.indptr[1:] - X.indptr[0:-1]
        # In each row, repeat `dl` for `sz` times
        # Shape is (sum(sz), )
        # Example
        # -------
        # dl = [4, 5, 6]
        # sz = [1, 2, 3]
        # rep = [4, 5, 5, 6, 6, 6]
        rep = np.repeat(np.asarray(dl), sz)
        # Average document length
        # Scalar value
        avgdl = np.average(dl)
        # Compute BM25 score only for non-zero elements
        data = X.data * (self.k1 + 1) / (X.data + self.k1 * (1 - self.b + self.b * rep / avgdl))
        X = sp.csr_matrix((data, X.indices, X.indptr), shape=X.shape)

        if self.use_idf:
            check_is_fitted(self, '_idf_diag', 'idf vector is not fitted')

            expected_n_features = self._idf_diag.shape[0]
            if n_features != expected_n_features:
                raise ValueError("Input has n_features=%d while the model"
                                 " has been trained with n_features=%d" % (
                                     n_features, expected_n_features))
            # *= doesn't work
            X = X * self._idf_diag

        return X

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(tokenizer=stemming_tokenizer, max_df=0.9, min_df=0.1, stop_words='english', ngram_range=(1, 3))
bm25_transformer = BM25Transformer(k1=2.0, b=0.75)
bm25_transformer.fit(vectorizer.fit_transform(train_docs))

  'stop_words.' % sorted(inconsistent))


BM25Transformer(b=0.75, k1=2.0, use_idf=True)

In [13]:
# testing data
# first sentence is the query
# rest are the documents
test_texts = [
    "photo-thermoelastic investigation",
    "a simple model study of transient temperature and thermal stress distribution due to aerodynamic heating .   the present work is concerned with the determination of transient temperatures and thermal stresses in simple models intended to simulate parts or the whole of an aircraft structure of the built- up variety subjected to aerodynamic heating .   the first case considered is that of convective heat transfer into one side of a flat plate, representing a thick skin, and the effect of the resulting temperature distribution in inducing thermal stresses associated with bending restraint at the plate edges . numerical results are presented for the transient temperature differentials in the plate when the environment temperature first increases linearly with time and then remains constant, the period of linear increase representing the time of acceleration of the aircraft .  corresponding thermal stress information is presented .   the second case is that of the wide-flanged i-beam with convective heat transfer into the outer faces of the flanges .  numerical results are presented for transient temperature differentials for a wide range of values of the applicable parameters and for an environment temperature variation as described above . corresponding thermal stresses in a beam of infinite length are determined .  a theoretical analysis of the stress distribution in a beam of finite length is carried out and numerical results obtained for one case .",
    "photo-thermoelastic investigation of transient thermal stresses in a multiweb wing structure .   photothermoelastic experiments were performed on a long multiweb wing model for which a theoretical analysis is available in the literature .  the experimental procedures utilized to simulate the conditions prescribed in the theory are fully described .   correlation of theory and experiment in terms of dimensionless temperature, stress, time, and biot number revealed that the theory predicted values higher than the experimentally observed maximum thermal stresses at the center of the web .  ",
]

test_X = vectorizer.transform(test_texts)

test_result = bm25_transformer.transform(test_X)
pd.DataFrame(test_result.toarray())



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,120,121,122,123,124,125,126,127,128,129
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.021937,0.0,0.0,0.879272,0.0,0.0,1.058348,0.0,0.0,0.0,...,0.0,0.0,0.797763,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.580463,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.433954,0.0,0.0,0.0,0.0,0.0,0.0,2.78382


In [14]:
# cosine similarity for the testing data
from sklearn.metrics.pairwise import cosine_similarity 

test_similarity = cosine_similarity(test_result)
pd.DataFrame(test_similarity)

Unnamed: 0,0,1,2
0,1.0,0.0,0.136481
1,0.0,1.0,0.411099
2,0.136481,0.411099,1.0


In [15]:
# given a query and similarity_thershold return the ids of docs
# this is the relevant docs that based on bm25 algorithm for the query
def get_doc_ids(query_id, similarity_threshold):
    
    all_docs = train_docs.copy() # make a copy of all docs list
    all_docs.insert(0, queries['query'][query_id]) # inser the current query at the begining of the list
    
    test_result = bm25_transformer.transform(vectorizer.transform(all_docs)) # generate the tfidf matrix
    # pd.DataFrame(test_result.toarray())
    df = pd.DataFrame(cosine_similarity(test_result)) # calculate the pair-wise similarity and convert to df
    df = df.rename(columns = {0:'similarity'}) # rename the first column 
    df = df.sort_values('similarity', ascending=False) # sort the result based on similarity score
    df_filtered = df[df['similarity']>similarity_threshold] # filter the result based on similarity threshold
    returned_doc_ids_list = df_filtered.index.tolist() # get the ids of the returned docs
    return returned_doc_ids_list

In [16]:
# get the doc ids with relevance score 
# this is the ground truth of relevance docs for the query based on the human annotated data
def get_true_doc_ids(query_id, relevance_threshold):
    # filter based on r_score (1, 2, 3, or 4) and relevance_threshold
    true_doc_ids = relevance[(relevance['query_id']==query_id) & (relevance['r_score']<=relevance_threshold)]
    true_doc_ids_list = true_doc_ids['doc_id'].to_list()
    return true_doc_ids_list

In [17]:
# calculate evaluation scores: precision and recall

def show_eval_scores(returned_doc_ids_list, true_doc_ids_list):
    
    # true positive 
    tp = [x for x in true_doc_ids_list if x in returned_doc_ids_list]
    #tp.sort()
    #print(tp, len(tp))
    
    # false positive
    fp = [x for x in returned_doc_ids_list if x not in tp]
    #fp.sort()
    #print(fp, len(fp))
    
    # false negative
    fn = [x for x in true_doc_ids_list if x not in tp]
    #fn.sort()
    #print(fn, len(fn))
    
    precision = len(tp)/(len(tp)+len(fp))
    recall = len(tp)/(len(tp)+len(fn))


    print(f'precision is {precision:.3%}')
    print(f'recall is {recall:.3%}')
    
    return precision, recall

In [18]:
# utility function to put everything together
def show_result(query_id, similarity_threshold, relevance_threshold):
    
    print(f"query: {queries['query'][query_id]}")
    print('calculating results......')
    # we set a similarity threshold and get the ids of those documents
    # similarity_threshold 0.65 is used in https://www.aaai.org/Papers/FLAIRS/2003/Flairs03-082.pdf
    returned_doc_ids_list = get_doc_ids(query_id, similarity_threshold)

    # we choose relevance_threshold = 3, relevance 1, 2 and 3 mean relevant for returned documents
    # see readme for definitions about r_score
    true_doc_ids_list = get_true_doc_ids(query_id, relevance_threshold)

    print(f'similarity_threshold is {similarity_threshold} and relevance_threshold is {relevance_threshold}')

    show_eval_scores(returned_doc_ids_list, true_doc_ids_list)

In [19]:
# query_id = 1, similarity_threshold = 0.3, relevance_threshold = 3
show_result(1, 0.3, 3)

query: what similarity laws must be obeyed when constructing aeroelastic models of heated high speed aircraft .
calculating results......
similarity_threshold is 0.3 and relevance_threshold is 3
precision is 22.581%
recall is 31.818%




In [20]:
# query_id = 2, similarity_threshold = 0.3, relevance_threshold = 3
show_result(2, 0.3, 3)

query: what are the structural and aeroelastic problems associated with flight of high speed aircraft .
calculating results......
similarity_threshold is 0.3 and relevance_threshold is 3
precision is 10.256%
recall is 19.048%


