In [2]:
# Train a learn-to-rank model using duplicate bugs assigned to CPD platform team, using  the LambdaMART algorithm (the trained 
# model is stored in 'LambdaMART_top100.txt'). The difference with 'learn_to_rank.ipynb' is that when generate the learn-to-rank train data 
# it uses the top 100 most similar (measured by bm25fe score) irrelevant bugs rather than randomly selected 100 irrelevant bugs

import MySQLdb
import pandas
import itertools
import numpy as np
import bm25fe
import pickle
import subprocess
import jsd

In [3]:
conn = MySQLdb.connect(host='10.117.8.41', port=3306, user='root', passwd='vmware', db='bugfeature')
cur = conn.cursor()

sql = '''SELECT *
FROM bugs_cpdplatform'''

# bugs = pandas.io.sql.read_sql(sql, conn).set_index(['bug_id'])
bugs = pandas.io.sql.read_sql(sql, conn)

In [4]:
# change the text fields of bugs into bag-of-words vectors

from gensim import corpora
from gensim import matutils

# dictionary = corpora.Dictionary(list(bugs_train['text']))
# dictionary.filter_extremes(no_below = 10, no_above = 0.9, keep_n = 100000)
dictionary = corpora.Dictionary.load_from_text('dictionary.txt')
num_terms = len(dictionary)
print num_terms

bugs['text'] = (bugs['short_desc'] +' '+ bugs['long_desc']).map(lambda x: dictionary.doc2bow(x.split()))
bugs.loc[:,'short_desc'] = bugs['short_desc'].map(lambda x: matutils.corpus2dense([dictionary.doc2bow(x.split())], num_terms, 1)[:,0])
bugs.loc[:,'long_desc'] = bugs['long_desc'].map(lambda x: matutils.corpus2dense([dictionary.doc2bow(x.split())], num_terms, 1)[:,0])

8647


In [5]:
conn = MySQLdb.connect(host='10.117.8.41', port=3306, user='root', passwd='vmware', db='bugfeature')
cur = conn.cursor()

sql = '''SELECT dup_of, dup
FROM dups_cpdplatform left join bugs_cpdplatform as b1 on dups_cpdplatform.dup=b1.bug_id left join bugs_cpdplatform as b2 on dups_cpdplatform.dup_of=b2.bug_id where b1.long_desc != '' and b2.long_desc != '' '''

cur.execute(sql)
dups = cur.fetchall()

dupset = set(itertools.chain(*dups))

In [6]:
criterion = bugs['bug_id'].map(lambda x:  not x in dupset)
bugs_filtered = bugs[criterion]

In [7]:
# print bugs_train['short_desc'][0].sum()
# print bugs_train['short_desc'].as_matrix()[4].sum()
# print bugs_train['short_desc'].as_matrix().shape

In [8]:
appearance = np.array(list(bugs['text'].map(lambda x: matutils.corpus2dense([x], num_terms, 1)[:,0]>0)))
df = appearance.sum(0)
idf = np.log(bugs.shape[0]/df)
avgfl = np.array([np.array(list(bugs['short_desc'])).sum(1).mean(), np.array(list(bugs['long_desc'])).sum(1).mean()])

bugs = bugs.set_index(['bug_id'])
bugs_filtered = bugs_filtered.set_index(['bug_id'])

In [9]:
# print appearance[1].sum()
# bugs.columns
# print bugs.index[0]

In [17]:
# generate the learn-to-rank train data, following the format of the Ranklib tool

from gensim.models import hdpmodel
hdp = hdpmodel.HdpModel.load('hdpmodel.txt')

bm = bm25fe.bm25fe(K1=1.2, d_B=(0.75, 0.75), d_W = (2, 1), K3=1.2, q_B=(0.75, 0.75), q_W=(2, 1))

with open('train100.txt', 'rb') as f:
    train = pickle.load(f)

# num_query = len(test)
# num_hit = 0

lines = []
for item in train:
    target = 3
    sim_title = bugs.loc[item['rel'],'short_desc'][bugs.loc[item['query'],'short_desc']>0].sum()/max(bugs.loc[item['rel'],'short_desc'].sum(), 1)
    score = bm.score(idf, avgfl, [bugs.loc[item['rel'],'short_desc'], bugs.loc[item['rel'],'long_desc']],[bugs.loc[item['query'],'short_desc'], bugs.loc[item['query'],'long_desc']])
    cluster = hdp.inference([bugs.loc[item['rel'],'text'], bugs.loc[item['query'],'text']])
    dis_topic = jsd.JSD(cluster[0], cluster[1])
    sim_hos = False
    if (bugs.loc[item['rel'],'host_op_sys'] == bugs.loc[item['query'],'host_op_sys']) and (bugs.loc[item['rel'],'host_op_sys'] != 'Unknown'):
        sim_hos = True
    sim_gos = False
    if (bugs.loc[item['rel'],'guest_op_sys'] == bugs.loc[item['query'],'guest_op_sys']) and (bugs.loc[item['rel'],'guest_op_sys'] != 'Unknown'):
        sim_gos = True
    sim_pd = False
    if (bugs.loc[item['rel'],'product_id'] == bugs.loc[item['query'],'product_id']):
        sim_pd = True
    sim_cg = False
    if (bugs.loc[item['rel'],'category_id'] == bugs.loc[item['query'],'category_id']):
        sim_cg = True
    sim_cp = False
    if (bugs.loc[item['rel'],'component_id'] == bugs.loc[item['query'],'component_id']):
        sim_cp = True
    lines.append(str(target)+' qid:'+str(item['query'])+' 1:'+str(sim_title)+' 2:'+str(score)+' 3:'+str(dis_topic)+' 4:'+str(int(sim_hos))+' 5:'+str(int(sim_gos))+' 6:'+str(int(sim_pd))+' 7:'+str(int(sim_cg))+' 8:'+str(int(sim_cp))+' # '+str(item['rel'])+'\n')
    
    bugs_filtered['score'] = bugs_filtered.apply(lambda x: bm.score(idf, avgfl, [x[1], x[7]],[bugs.loc[item['query'],'short_desc'], bugs.loc[item['query'],'long_desc']]), axis = 1)
    bugs_sorted = bugs_filtered.sort(['score'], ascending = False).iloc[:100]
    idx = 0
    for idx in xrange(100):
        target = 0
        sim_title = bugs_sorted.iloc[idx]['short_desc'][bugs.loc[item['query'],'short_desc']>0].sum()/max(bugs_sorted.iloc[idx]['short_desc'].sum(), 1)
        score = bugs_sorted.iloc[idx]['score']
        cluster = hdp.inference([bugs_sorted.iloc[idx]['text'], bugs.loc[item['query'],'text']])
        dis_topic = jsd.JSD(cluster[0], cluster[1])
        sim_hos = False
        if (bugs_sorted.iloc[idx]['host_op_sys'] == bugs.loc[item['query'],'host_op_sys']) and (bugs_sorted.iloc[idx]['host_op_sys'] != 'Unknown'):
            sim_hos = True
        sim_gos = False
        if (bugs_sorted.iloc[idx]['guest_op_sys'] == bugs.loc[item['query'],'guest_op_sys']) and (bugs_sorted.iloc[idx]['guest_op_sys'] != 'Unknown'):
            sim_gos = True
        sim_pd = False
        if (bugs_sorted.iloc[idx]['product_id'] == bugs.loc[item['query'],'product_id']):
            sim_pd = True
        sim_cg = False
        if (bugs_sorted.iloc[idx]['category_id'] == bugs.loc[item['query'],'category_id']):
            sim_cg = True
        sim_cp = False
        if (bugs_sorted.iloc[idx]['component_id'] == bugs.loc[item['query'],'component_id']):
            sim_cp = True
        # if sim_pd and sim_cg and sim_cp:
        #     target = 2
        # elif sim_pd and sim_cg:
        #     target = 1
        lines.append(str(target)+' qid:'+str(item['query'])+' 1:'+str(sim_title)+' 2:'+str(score)+' 3:'+str(dis_topic)+' 4:'+str(int(sim_hos))+' 5:'+str(int(sim_gos))+' 6:'+str(int(sim_pd))+' 7:'+str(int(sim_cg))+' 8:'+str(int(sim_cp))+' # '+str(bugs.index[idx])+'\n')

In [18]:
# print len(bugs)
print len(lines)

10100


In [21]:
f = open('train_top100.txt', 'wb')

f.writelines(lines)

In [22]:
# call the Ranklib tool to train a LambdaMART model

subprocess.call(('java', '-jar', 'RankLib-2.1-patched.jar', '-train', 'train_top100.txt', '-ranker', '6', '-metric2t', 'NDCG@5', '-save', 'LambdaMART_top100.txt'))

0

In [73]:
# bugs_train

In [74]:
# print bugs_train['guest_op_sys']

In [100]:
print len(lines)
print lines[10053]


10100
0 qid:1224296 1:0.142857 2:77.9385298109 3:0.0788403087743 4:1 5:0 6:1 7:0 8:0 # 1425012



In [87]:
print bugs_train.loc[485186,'short_desc'].sum()

0.0
