### Preprocessing Eclipse Bug reports

#### Description of attributes
    Size of data: (103805, 13)
    Bug ID: Unique Identifier
    Important attributes: Product, Component, Assignee, Summary, Severity, Priority, and Changed (possibly usable as a change-frequency feature, via the difference between Opened and Changed).
    Unimportant attributes: Status and Resolution. [We will not know them for the test samples]
    
    Summary has 103802 non-null values, i.e. 3 values are missing; these rows need to be investigated.


### Fixing of data formatting in the csv
    Some rows are shifted right by a few columns because of how the data is delimited in the csv, e.g. rows 3613 and 258.
    That has been fixed and a new file total_fixed.csv has been saved

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../data/total.csv')

In [6]:
df.shape

(103805, 13)

In [17]:
import numpy as np

In [19]:
len(np.unique(df['Bug ID']))

103805

In [5]:
df.drop('Bug ID',axis=1,inplace=False).describe()

Unnamed: 0,Product,Component,Assignee,Status,Resolution,Summary,Changed,Priority,Severity,Keywords,Summary.1,Opened
count,103805,103805,103805,103805,103805,103802,103805,103805,103805,6814,103802,103805
unique,4,27,307,6,7,102339,88794,5,7,106,102339,96130
top,Platform,UI,jdt-ui-inbox@eclipse.org,RESOLVED,FIXED,#NAME?,9/27/05 9:12,P3,normal,performance,#NAME?,10/10/01 23:05
freq,60644,37694,3668,75695,53211,37,759,88902,71546,1272,37,61


In [10]:
df.loc[df.Summary.isnull(),:]

Unnamed: 0,Bug ID,Product,Component,Assignee,Status,Resolution,Summary,Changed,Priority,Severity,Keywords,Summary.1,Opened
538,1719,JDT,Debug,darin.eclipse@gmail.com,CLOSED,INVALID,,11/2/01 8:56,P3,normal,,,10/10/01 22:18
26852,38524,Platform,SWT,platform-swt-inbox@eclipse.org,CLOSED,INVALID,,6/5/03 13:04,P3,normal,,,6/5/03 11:35
44151,65246,JDT,UI,dirk_baeumer@ch.ibm.com,VERIFIED,FIXED,,6/11/04 10:41,P3,normal,,,6/2/04 5:39


In [13]:
df.iloc[[257,258],:]

Unnamed: 0,Bug ID,Product,Component,Assignee,Status,Resolution,Summary,Changed,Priority,Severity,Keywords,Summary.1,Opened
3610,5706,JDT,Core,philippe_mulet@fr.ibm.com,RESOLVED,WORKSFORME,Cannot add two folders w/ same name but diff p...,2/7/02 8:46,P1,normal,,Cannot add two folders w/ same name but diff p...,11/9/01 8:59
3611,8230,Platform,Debug,darin.eclipse@gmail.com,VERIFIED,FIXED,Console does not show process as <terminated>;...,2/7/02 9:14,P3,normal,,Console does not show process as <terminated>;...,1/23/02 14:09


In [14]:
# Write the frame to total_fixed.csv with a header row and without the pandas index.
# NOTE(review): no cell shown in this notebook modifies df between loading total.csv and
# this save -- confirm the shifted-row fix described above is really applied to the data
# that ends up in total_fixed.csv.
df.to_csv('../data/total_fixed.csv',index=False,header=True)

### Text preprocessing according to the Paper

1. Tokenization - word level
2. Stop-Word removal - 30 stop words: TF-IDF has that feature
3. Stemming - Porter Stemming.

This will be done only on the Summary attribute as others are categorical attributes.


In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('../data/total_fixed.csv')

In [3]:
df.drop(['Keywords','Summary.1','Status','Resolution','Changed','Opened'],axis=1,inplace=True)

In [4]:
df.columns

Index([u'Bug ID', u'Product', u'Component', u'Assignee', u'Summary',
       u'Priority', u'Severity'],
      dtype='object')

In [5]:
df.dropna(inplace=True)

In [6]:
df.isnull().sum()

Bug ID       0
Product      0
Component    0
Assignee     0
Summary      0
Priority     0
Severity     0
dtype: int64

In [7]:
# Renumber rows 0..n-1 after dropna so positional fold/row indices line up with the frame.
# drop=True discards the old index instead of inserting it as a stray 'index' column.
df.reset_index(drop=True, inplace=True)

### Null values removed and only important features are kept

In [65]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
import re

# Tokenizer that keeps runs of word characters (letters, digits, underscore) and drops the rest.
tokenizer = RegexpTokenizer(r'\w+')
# NLTK's English stop-word list, stored as a set for membership tests.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be downloaded first -- confirm
# on a fresh environment.
stopwords = set(nltk.corpus.stopwords.words('english'))
# Porter stemmer, the stemming step listed in the preprocessing plan.
stemmer = PorterStemmer()

In [66]:
# tokenizer.tokenize('Eighty-seven miles to go, yet.  Onward!')
def tokenize_stop_stem(text):
    """Tokenize ``text`` and return its lowercased, filtered and stemmed tokens.

    Steps, in order: split with the module-level ``tokenizer``, lowercase each
    token, drop tokens found in ``stopwords``, drop tokens containing any digit,
    and stem what remains with the module-level ``stemmer``.

    Parameters
    ----------
    text : str
        Raw summary text.

    Returns
    -------
    list
        Stemmed tokens in their original order. Tokens (or the whole text) that
        raise ``UnicodeDecodeError`` are reported on stdout and skipped, so the
        result may be shorter than the input or empty.
    """
    # Created before anything that can raise, so there is always a list to return.
    filtered_tokens = []
    try:
        for token in tokenizer.tokenize(text):
            token = token.lower()
            if token in stopwords:
                continue
            # Drop every token that contains a digit (e.g. "123", "v2", "jdk1").
            if re.search('[0-9]', token):
                continue
            try:
                filtered_tokens.append(stemmer.stem(token))
            except UnicodeDecodeError:
                # Best effort: skip the one token the stemmer cannot handle.
                print('illeagal token ignored: %s' % token)
    except UnicodeDecodeError:
        # Tokenization itself failed; no current token is guaranteed to exist here,
        # so report the offending text and keep whatever was collected so far.
        print('text ignored (could not be tokenized): %r' % (text,))
    return filtered_tokens

### Run sklearn countvectorizer

In [115]:
from sklearn.feature_extraction.text import CountVectorizer

In [116]:
# Bag-of-words term counts for the Summary text, tokenized by tokenize_stop_stem.
# lowercase=False: the vectorizer does not lowercase the raw text; tokenize_stop_stem
# lowercases each token itself.
# max_features=395996 is far above the 17868 terms in the fitted matrix shown below,
# so the cap does not limit the vocabulary on this data.
countVect = CountVectorizer(input='content',lowercase=False, max_features=395996, tokenizer=tokenize_stop_stem, decode_error='ignore')

In [117]:
countVector = countVect.fit_transform(df.Summary)

In [118]:
countVector

<103802x17868 sparse matrix of type '<type 'numpy.int64'>'
	with 584285 stored elements in Compressed Sparse Row format>

In [119]:
countVector.shape

(103802, 17868)

### Summary attribute has been preprocessed. Convert the categorical attributes to dummy variables, store them as a CSR matrix, and stack them with the count vectors

In [16]:
dummy_df = pd.get_dummies( df[['Product','Component','Assignee','Severity']] )

In [17]:
dummy_df.head()

Unnamed: 0,Product_Equinox,Product_JDT,Product_PDE,Product_Platform,Component_APT,Component_Ant,Component_Build,Component_CVS,Component_Compare,Component_Compendium,...,Assignee_wtp-dev@eclipse.org,Assignee_ymnk@jcraft.com,Assignee_zhiyongl@ca.ibm.com,Severity_blocker,Severity_critical,Severity_enhancement,Severity_major,Severity_minor,Severity_normal,Severity_trivial
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [120]:
from scipy import sparse

In [19]:
dummy_sparse = sparse.csr_matrix(dummy_df.values)

In [121]:
other_sparse = sparse.csr_matrix(other_features.values)

In [20]:
inputDF = sparse.hstack((dummy_sparse,countVector),format="csr")

In [122]:
# Stack the non-text features with the Summary term counts into one CSR design matrix.
# Use the sparse matrix built from other_features (other_sparse) rather than the DataFrame
# itself, so hstack receives two sparse blocks and needs no implicit conversion.
inputDF = sparse.hstack((other_sparse,countVector),format="csr")

In [75]:
print inputDF[0]

  (0, 3)	1
  (0, 26)	1
  (0, 49)	1
  (0, 343)	1
  (0, 345)	2973
  (0, 346)	2024
  (0, 347)	2338
  (0, 348)	2973
  (0, 349)	2024
  (0, 350)	2338
  (0, 351)	2973
  (0, 352)	2024
  (0, 353)	2338
  (0, 354)	2973
  (0, 355)	2024
  (0, 356)	2338
  (0, 15006)	1
  (0, 15974)	1
  (0, 16071)	1


In [76]:
print countVector[0]

  (0, 15714)	1
  (0, 14649)	1
  (0, 15617)	1


In [77]:
print other_sparse[0]

  (0, 3)	1
  (0, 26)	1
  (0, 49)	1
  (0, 343)	1
  (0, 345)	2973
  (0, 346)	2024
  (0, 347)	2338
  (0, 348)	2973
  (0, 349)	2024
  (0, 350)	2338
  (0, 351)	2973
  (0, 352)	2024
  (0, 353)	2338
  (0, 354)	2973
  (0, 355)	2024
  (0, 356)	2338


### Input Data is ready for algorithms to process

##### Random Forest with training accuracy

In [123]:
from sklearn.ensemble import RandomForestClassifier

In [124]:
clf = RandomForestClassifier(n_jobs=-1)

In [80]:
%time clf.fit(inputDF,df['Priority'])

CPU times: user 42.9 s, sys: 224 ms, total: 43.1 s
Wall time: 8.7 s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [81]:
clf.score(inputDF,df['Priority'])

0.9900579950289975

In [82]:
# Training accuracy restricted to the rows of each priority class, plus the class size.
# Rows are selected by position (np.flatnonzero on the boolean mask) instead of by index
# label, so the selection stays correct even if df does not carry a default 0..n-1 index.
priority_values = df['Priority'].values
for priority_label in ['P1', 'P2', 'P3', 'P4', 'P5']:
    class_rows = np.flatnonzero(priority_values == priority_label)
    class_score = clf.score(inputDF[class_rows], priority_values[class_rows])
    print('%s: %s %s' % (priority_label, class_score, len(class_rows)))

P1: 0.949629171817 3236
P2: 0.948781441418 7673
P3: 0.999651289666 88899
P4: 0.878724297104 2383
P5: 0.903165735568 1611


In [33]:
# Training accuracy restricted to the rows of each priority class, plus the class size.
# Rows are selected by position (np.flatnonzero on the boolean mask) instead of by index
# label, so the selection stays correct even if df does not carry a default 0..n-1 index.
priority_values = df['Priority'].values
for priority_label in ['P1', 'P2', 'P3', 'P4', 'P5']:
    class_rows = np.flatnonzero(priority_values == priority_label)
    class_score = clf.score(inputDF[class_rows], priority_values[class_rows])
    print('%s: %s %s' % (priority_label, class_score, len(class_rows)))

P1: 0.936032138443 3236
P2: 0.943829010817 7673
P3: 0.999550051182 88899
P4: 0.844733529165 2383
P5: 0.889509621353 1611


## Cross validation on the dataset with F1-Score (our evaluation metric)

In [83]:
from sklearn.naive_bayes import MultinomialNB

In [84]:
clf = MultinomialNB()

In [85]:
# cross_val_score lives in sklearn.model_selection; the old sklearn.cross_validation module
# is deprecated and absent from later scikit-learn releases. This notebook already imports
# KFold from sklearn.model_selection further down.
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score



In [86]:
f1scorer_macro = make_scorer(f1_score, average='macro')
f1scorer_weighted = make_scorer(f1_score, average='weighted')
f1scorer_perclass = make_scorer(f1_score, average=None)

In [87]:
f1scorer_perclass

make_scorer(f1_score, average=None)

In [40]:
%time cvscores_wf1 = cross_val_score(clf, inputDF, df['Priority'], scoring=f1scorer_weighted , cv=10, n_jobs=-1)
print cvscores_wf1
print 'Weighted F1 mean,std:', np.mean(cvscores), np.std(cvscores)

CPU times: user 832 ms, sys: 76 ms, total: 908 ms
Wall time: 2min 4s


In [89]:
import numpy as np




In [90]:
%time cvscores_f1 = cross_val_score(clf, inputDF, df['Priority'], scoring=f1scorer_macro , cv=10, n_jobs=-1)
print cvscores_f1
print 'F1 mean,std:', np.mean(cvscores_f1), np.std(cvscores_f1)

CPU times: user 948 ms, sys: 92 ms, total: 1.04 s
Wall time: 1.61 s
[ 0.19922428  0.2453444   0.29564239  0.292726    0.25145097  0.25054827
  0.25181577  0.24395555  0.22604281  0.21915404]
F1 mean,std: 0.247590448633 0.0283164018714


In [92]:
cvscores_acc = cross_val_score(clf, inputDF, df['Priority'],scoring='accuracy', cv=10, n_jobs=-1)

In [127]:
np.mean(cvscores_acc),np.std(cvscores_acc)

(0.83135657064570512, 0.043975080942427851)

### Measuring Per Class F1 scores as per the paper using KFold

In [125]:
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

In [126]:
# Classifier under evaluation; swap in one of the commented alternatives to compare models.
#clf = RandomForestClassifier(n_jobs=-1)
#clf = AdaBoostClassifier()
#clf = GradientBoostingClassifier()
# random_state pins the forest's bootstrap/feature sampling so the per-class scores below
# can be reproduced on a re-run.
clf = RandomForestClassifier(class_weight="balanced", n_jobs=-1, random_state=0)
# 10 consecutive (unshuffled) folds over the rows of the design matrix.
kf = KFold(n_splits=10)

In [127]:
cvscores = []
for train,test in kf.split(inputDF):
    train_input = inputDF[train]
    test_input = inputDF[test]
    train_output = df['Priority'][train]
    test_output = df['Priority'][test]
    %time clf.fit(train_input,train_output)
    prediction = clf.predict(test_input)
    scores = f1_score(test_output, prediction, average=None)
    print scores
    cvscores.append(scores)

CPU times: user 42.6 s, sys: 296 ms, total: 42.9 s
Wall time: 9.35 s
[ 0.302267    0.18350515  0.92603378  0.23529412  0.15525114]
CPU times: user 40.6 s, sys: 96 ms, total: 40.7 s
Wall time: 8.38 s
[ 0.26213592  0.14945055  0.92646593  0.17275748  0.23923445]
CPU times: user 40.5 s, sys: 36 ms, total: 40.5 s
Wall time: 8.19 s
[ 0.28078818  0.16107383  0.92836827  0.16153846  0.27102804]
CPU times: user 41.2 s, sys: 68 ms, total: 41.3 s
Wall time: 8.53 s
[ 0.25118483  0.13333333  0.92848115  0.18532819  0.2815534 ]
CPU times: user 45 s, sys: 8 ms, total: 45 s
Wall time: 9.31 s
[ 0.26633166  0.14790997  0.9264115   0.21568627  0.31884058]
CPU times: user 53.3 s, sys: 624 ms, total: 53.9 s
Wall time: 11.8 s
[ 0.22828784  0.15350389  0.92756072  0.22857143  0.28571429]
CPU times: user 52.5 s, sys: 268 ms, total: 52.7 s
Wall time: 11.1 s
[ 0.22429907  0.16768916  0.92261082  0.18181818  0.27160494]
CPU times: user 50.1 s, sys: 140 ms, total: 50.3 s
Wall time: 10.4 s
[ 0.25        0.1844127

In [105]:
clf.feature_importances_[345:357]

array([ 0.02804619,  0.02679268,  0.04025911,  0.02318562,  0.03207399,
        0.02952609,  0.02597535,  0.02539812,  0.03137977,  0.02445142,
        0.02760929,  0.02762818])

In [129]:
np.mean(cvscores)

0.35995002947810001

In [128]:
print '\tP1\t\tP2\t\tP3\t\tP4\tP5'
print '\t',np.mean(cvscores,axis=0)
cvscores

	P1		P2		P3		P4	P5
	[ 0.25789875  0.15352413  0.92717314  0.19530204  0.26585209]


[array([ 0.302267  ,  0.18350515,  0.92603378,  0.23529412,  0.15525114]),
 array([ 0.26213592,  0.14945055,  0.92646593,  0.17275748,  0.23923445]),
 array([ 0.28078818,  0.16107383,  0.92836827,  0.16153846,  0.27102804]),
 array([ 0.25118483,  0.13333333,  0.92848115,  0.18532819,  0.2815534 ]),
 array([ 0.26633166,  0.14790997,  0.9264115 ,  0.21568627,  0.31884058]),
 array([ 0.22828784,  0.15350389,  0.92756072,  0.22857143,  0.28571429]),
 array([ 0.22429907,  0.16768916,  0.92261082,  0.18181818,  0.27160494]),
 array([ 0.25      ,  0.18441273,  0.92873345,  0.15827338,  0.24210526]),
 array([ 0.26065163,  0.13125   ,  0.92667443,  0.21678322,  0.3286385 ]),
 array([ 0.25304136,  0.12311266,  0.93039138,  0.1969697 ,  0.26455026])]

In [97]:
print '\tP1\t\tP2\t\tP3\t\tP4\tP5'
print '\t',np.mean(cvscores,axis=0)
cvscores

	P1		P2		P3		P4	P5
	[ 0.08830426  0.07866991  0.92049622  0.14514483  0.22485906]


[array([ 0.08044383,  0.04022627,  0.79617796,  0.41274238,  0.45138889]),
 array([ 0.16113744,  0.07654987,  0.84787319,  0.16027875,  0.32773109]),
 array([ 0.12345679,  0.07969349,  0.9126399 ,  0.13333333,  0.30188679]),
 array([ 0.12903226,  0.09279609,  0.94214536,  0.20833333,  0.23809524]),
 array([ 0.11570248,  0.07446809,  0.93965829,  0.1048951 ,  0.35514019]),
 array([ 0.17777778,  0.12815126,  0.93686122,  0.15770609,  0.18918919]),
 array([ 0.032     ,  0.08979592,  0.96259477,  0.09782609,  0.13636364]),
 array([ 0.06349206,  0.06392694,  0.95530839,  0.10084034,  0.16949153]),
 array([ 0.        ,  0.07246377,  0.97441481,  0.03488372,  0.07692308]),
 array([ 0.        ,  0.06862745,  0.93728831,  0.04060914,  0.00238095])]

In [98]:
np.mean(cvscores)

0.29149485632471661

In [96]:
print '\tP1\t\tP2\t\tP3\t\tP4\tP5'
print '\t',np.mean(cvscores,axis=0)
cvscores
#np.mean(cvscores)

	P1		P2		P3		P4	P5
	[ 0.11279759  0.09646663  0.91850115  0.08118626  0.01135761]


[array([ 0.04953999,  0.08259587,  0.79588336,  0.02331002,  0.04402516]),
 array([ 0.22727273,  0.09788769,  0.8428645 ,  0.04580153,  0.02409639]),
 array([ 0.14044944,  0.10748003,  0.90991423,  0.07407407,  0.        ]),
 array([ 0.11034483,  0.12962963,  0.94013304,  0.09195402,  0.        ]),
 array([ 0.10833333,  0.10899873,  0.93713224,  0.11347518,  0.        ]),
 array([ 0.17045455,  0.13226453,  0.93455905,  0.14545455,  0.        ]),
 array([ 0.09230769,  0.12015504,  0.96227744,  0.12765957,  0.04545455]),
 array([ 0.11347518,  0.05504587,  0.95473876,  0.13513514,  0.        ]),
 array([ 0.06451613,  0.07453416,  0.97260002,  0.03529412,  0.        ]),
 array([ 0.05128205,  0.05607477,  0.93490881,  0.01970443,  0.        ])]

### Running algorithms on  Data With Temporal features

In [58]:
import pandas as pd




In [59]:
# Load the dataset variant that also carries the temporal feature columns
# (TMP1..TMP12 in the null-count output below).
df = pd.read_csv('../data/data_with_temporal.csv')
# Drop the attributes that are not used as model inputs.
df.drop(['Keywords','Summary.1','Status','Resolution','Changed','Opened'],axis=1,inplace=True)
print df.shape
# Remove rows with any missing value; the two printed shapes show how many rows are lost.
df.dropna(inplace=True)
print df.shape
# reset_index without drop=True keeps the old index as a new 'index' column (listed first
# in the output below). The later positional slice df.columns[9:] relies on that extra
# column being present, so do not change this without updating that slice.
df.reset_index(inplace=True)
# Sanity check: every column should report zero nulls.
df.isnull().sum()

(103805, 19)
(103802, 19)


index        0
Bug ID       0
Product      0
Component    0
Assignee     0
Summary      0
Priority     0
Severity     0
TMP1         0
TMP2         0
TMP3         0
TMP4         0
TMP5         0
TMP6         0
TMP7         0
TMP8         0
TMP9         0
TMP10        0
TMP11        0
TMP12        0
dtype: int64

In [106]:
# Shuffle all rows before the unshuffled KFold split. The fixed random_state makes the
# shuffle, and therefore the fold contents and scores, reproducible across runs.
df = df.sample(frac=1.0, random_state=0)

In [107]:
df.reset_index(inplace=True)

In [108]:
dummy_df = pd.get_dummies( df[['Product','Component','Assignee','Severity']] )

In [110]:
df.columns[9:]

Index([u'TMP1', u'TMP2', u'TMP3', u'TMP4', u'TMP5', u'TMP6', u'TMP7', u'TMP8',
       u'TMP9', u'TMP10', u'TMP11', u'TMP12'],
      dtype='object')

In [111]:
# Non-text feature block: one-hot categorical columns plus the temporal TMP* columns.
# The temporal columns are picked by name instead of by position (df.columns[9:]), so the
# selection does not silently shift when index columns are added or removed upstream.
temporal_columns = [column for column in df.columns if column.startswith('TMP')]
other_features = pd.concat([dummy_df, df[temporal_columns]],axis=1)

In [112]:
other_features.shape

(103802, 357)

In [113]:
other_features.dropna().shape

(103802, 357)

In [114]:
dummy_df.shape

(103802, 345)