In [2]:
# This file aims to create a function to measure the similarity of the text fields of two bugs. It's an extension of bm25fe measure. The derivative
# is used to tune the parameters in the function. For the formulas, please refer to the techinical report.The final file used in Triage Robot is 
# 'bm25fe.py'

import numpy as np

class bm25fe():
    """Implements the BM25F scoring algorithm.
    """
 
    def __init__(self, K1=1.2, d_B=(0.75, 0.75), d_W = (1, 1), K3=1.2, q_B=(0.75, 0.75), q_W=(1, 1)):
        """
        
        :param B: free parameter, see the BM25 literature. Keyword arguments of
            the form ``fieldname_B`` (for example, ``body_B``) set field-
            specific values for B.
        :param K1: free parameter, see the BM25 literature.
        """
        
        self.K1 = K1
        self.d_B = d_B
        self.d_W = d_W
        -
        self.K3 = K3
        self.q_B = q_B
        self.q_W = q_W
        
        self.fields = len(d_B)
        
        
    def score(self, idf, avgfl, doc, query):
        
        idf = numpy.array(idf)
        avgfl = numpy.array(avgfl)
        doc = numpy.array(doc)
        query = numpy.array(query)
        
        doc_n = numpy.zeros(doc[0].size)
        query_n = numpy.zeros(query[0].size)
        for i in xrange(self.fields):
            doc_n += self.d_W[i]/(1-self.d_B[i]+self.d_B[i]*doc[i].sum()/avgfl[i])*doc[i]
            query_n += self.q_W[i]/(1-self.q_B[i]+self.q_B[i]*query[i].sum()/avgfl[i])*query[i]
        
        return (idf*(self.K1+1)*doc_n/(self.K1+doc_n)*(self.K3+1)*query_n/(self.K3+query_n)).sum()
    
    
    def derivative(self, idf, avgfl, doc, query):
        
        idf = numpy.array(idf)
        avgfl = numpy.array(avgfl)
        doc = numpy.array(doc)
        query = numpy.array(query)
        
        doc_n = numpy.zeros(doc[0].size)
        query_n = numpy.zeros(query[0].size)
        for i in xrange(self.fields):
            doc_n += self.d_W[i]/(1-self.d_B[i]+self.d_B[i]*doc[i].sum()/avgfl[i])*doc[i]
            query_n += self.q_W[i]/(1-self.q_B[i]+self.q_B[i]*query[i].sum()/avgfl[i])*query[i]
        
        der =[]
        
        der.append((idf*doc_n*(doc_n-1)/((self.K1+doc_n)**2)*(self.K3+1)*query_n/(self.K3+query_n)).sum())
        
        for i in xrange(self.fields):
            der.append((idf*self.K1*(self.K1+1)/((self.K1+doc_n)**2)*self.d_W[i]*(1-doc[i].sum()/avgfl[i])/((1-self.d_B[i]+self.d_B[i]*doc[i].sum()/avgfl[i])**2)*doc[i]
                        *(self.K3+1)*query_n/(self.K3+query_n)).sum())
            
        for i in xrange(self.fields):
            der.append((idf*self.K1*(self.K1+1)/((self.K1+doc_n)**2)/(1-self.d_B[i]+self.d_B[i]*doc[i].sum()/avgfl[i])*doc[i]*(self.K3+1)*query_n/(self.K3+query_n)).sum())
        
        
        der.append((idf*(self.K1+1)*doc_n/(self.K1+doc_n)*query_n*(query_n-1)/((self.K3+query_n)**2)).sum())
        
        for i in xrange(self.fields):
            der.append((idf*(self.K1+1)*doc_n/(self.K1+doc_n)*self.K3*(self.K3+1)/((self.K3+query_n)**2)
                        *self.q_W[i]*(1-query[i].sum()/avgfl[i])/((1-self.q_B[i]+self.q_B[i]*query[i].sum()/avgfl[i])**2)*query[i]).sum())
            
        for i in xrange(self.fields):
            der.append((idf*(self.K1+1)*doc_n/(self.K1+doc_n)*self.K3*(self.K3+1)/((self.K3+query_n)**2)/(1-self.q_B[i]+self.q_B[i]*query[i].sum()/avgfl[i])*query[i]).sum())
        
        return np.array(der)
        
        

In [1]:
# test a bug in system_test.ipynb when use the data in the table bugs_cpdplatform_ff

import MySQLdb
import pandas
import itertools
import numpy as np
# import bm25fe
import pickle
# import subprocess
# import jsd

conn = MySQLdb.connect(host='10.117.8.41', port=3306, user='root', passwd='vmware', db='bugfeature')
cur = conn.cursor()

sql = '''SELECT *
FROM bugs_cpdplatform_ff'''

bugs = pandas.io.sql.read_sql(sql, conn)

from gensim import corpora
from gensim import matutils

# dictionary = corpora.Dictionary(list(bugs_train['text']))
# dictionary.filter_extremes(no_below = 10, no_above = 0.9, keep_n = 100000)
dictionary = corpora.Dictionary.load_from_text('dictionary.txt')
num_terms = len(dictionary)
print num_terms

# bugs['summary'] = bugs['short_desc']
bugs['text'] = (bugs['short_desc'] +' '+ bugs['long_desc']).map(lambda x: dictionary.doc2bow(x.split()))
bugs.loc[:,'short_desc'] = bugs['short_desc'].map(lambda x: matutils.corpus2dense([dictionary.doc2bow(x.split())], num_terms, 1)[:,0])
bugs.loc[:,'long_desc'] = bugs['long_desc'].map(lambda x: matutils.corpus2dense([dictionary.doc2bow(x.split())], num_terms, 1)[:,0])

appearance = np.array(list(bugs['text'].map(lambda x: matutils.corpus2dense([x], num_terms, 1)[:,0]>0)))
df = appearance.sum(0)
idf = np.log(bugs.shape[0]/df)
avgfl = np.array([np.array(list(bugs['short_desc'])).sum(1).mean(), np.array(list(bugs['long_desc'])).sum(1).mean()])

bugs = bugs.set_index(['bug_id'])

with open('test.txt', 'rb') as f:
    test = pickle.load(f)

doc = np.array([bugs.iloc[0,13], bugs.iloc[0,14]])

8647


In [5]:
doc = np.array([bugs.iloc[0,13], bugs.iloc[0,14]])
query = np.array([bugs.loc[test[0]['query'],'short_desc'], bugs.loc[test[0]['query'],'long_desc']])
doc_n = np.zeros(doc[0].size)
query_n = np.zeros(query[0].size)
K1=1.2
d_B=(0.75, 0.75)
d_W = (1, 1)
K3=1.2
q_B=(0.75, 0.75)
q_W=(1, 1)
for i in xrange(2):
    doc_n += d_W[i]/(1-d_B[i]+d_B[i]*doc[i].sum()/avgfl[i])*doc[i]
    query_n += q_W[i]/(1-q_B[i]+q_B[i]*query[i].sum()/avgfl[i])*query[i]
    print doc_n
    print doc_n.sum()
    print query_n
    print query_n.sum()
        
score = (idf*(K1+1)*doc_n/(K1+doc_n)*(K3+1)*query_n/(K3+query_n)).sum()
print score

[ 0.  0.  0. ...,  0.  0.  0.]
9.39240860939
[ 0.  0.  0. ...,  0.  0.  0.]
8.55537128448
[ 0.  0.  0. ...,  0.  0.  0.]
1072.04653031
[ 0.  0.  0. ...,  0.  0.  0.]
54.7104203701
nan


In [25]:
# temp = idf*(K1+1)*doc_n/(K1+doc_n)*(K3+1)#*query_n/(K3+query_n)
# print temp.sum()
print df[7474]
print idf[7474]
# for idx,item in enumerate(temp):
#     if item != 0:
#         print idx, item

0
-inf
