Import libraries required

In [8]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from time import time
from sklearn.model_selection import train_test_split

## Read data in

In [2]:
df = pd.read_csv('../data/total_fixed.csv', parse_dates=['Changed', 'Opened'])
cols = df.columns

### Train / Test Split - 80:20

In [45]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('Priority', axis=1), df.Priority, test_size=0.20, random_state=42, stratify=df.Priority)

In [51]:
X_train['Priority'] = y_train
X_test['Priority'] = y_test

print X_train.shape
print X_test.shape

(83044, 13)
(20761, 13)


In [53]:
# Save Splitted data to disk
X_train.to_csv('../data/train.csv', index=False)
X_test.to_csv('../data/test.csv', index=False)

In [56]:
data = X_train
n_rows = len(data)
n_cols = len(cols)
bugIDs = data['Bug ID']
# print data.head()
# print bugIDs.head()

### Read in train and test data separately

In [3]:
data= pd.read_csv('../data/train.csv', parse_dates=['Changed', 'Opened'])
X_test = pd.read_csv('../data/test.csv', parse_dates=['Changed', 'Opened'])

In [4]:
print df.shape
print data.shape
print X_test.shape

(103805, 13)
(83044, 13)
(20761, 13)


In [5]:
print data['Changed'].head()
print set(data['Severity'])

severity_levels = {'enhancement': 0, 'trivial' : 1, 'minor' : 2, 'normal' : 3, 'major' : 4, 'critical' : 5, 'blocker' : 6}

0   2004-06-02 14:34:00
1   2002-04-30 10:39:00
2   2004-04-22 11:32:00
3   2006-05-20 06:05:00
4   2004-07-15 11:40:00
Name: Changed, dtype: datetime64[ns]
set(['major', 'normal', 'blocker', 'critical', 'enhancement', 'trivial', 'minor'])


In [6]:
print set(data['Priority'])

set(['P2', 'P3', 'P1', 'P4', 'P5'])


In [19]:
print data['Changed'].head()
print set(data['Severity'])

severity_levels = {'enhancement': 0, 'trivial' : 1, 'minor' : 2, 'normal' : 3, 'major' : 4, 'critical' : 5, 'blocker' : 6}
np_opened = np.array(data['Opened'])
np_opened = np_opened.reshape(len(np_opened), 1)
# severity_levels = ['enhancement', 'trivial', 'minor', 'normal', 'major', 'critical', 'blocker']

0   2004-06-02 14:34:00
1   2002-04-30 10:39:00
2   2004-04-22 11:32:00
3   2006-05-20 06:05:00
4   2004-07-15 11:40:00
Name: Changed, dtype: datetime64[ns]
set(['major', 'normal', 'blocker', 'critical', 'enhancement', 'trivial', 'minor'])


### Temporal Feature Generation - TMP1 to 12.

In [59]:
temporal_dict = {}

# Generates temporal dict which is used to generate temporal functions
# This should be based only on training data
for i in range(len(data)):
    row = data.iloc[i]
    t = (row['Opened'].date() - datetime(1970,1,1).date()).days
    s = row['Severity']
    if (t not in temporal_dict):
        temporal_dict[t] = {}
    if (s not in temporal_dict[t]):
        temporal_dict[t][s] = 0
    temporal_dict[t][s] += 1

    

In [60]:
print len(temporal_dict)

# Function for generating temporal features (Fast version)
def temporal_feature1(x, n):
    d = (x['Opened'].date() - datetime(1970,1,1).date()).days
    total = 0
    for t in range(d-n, d+1):
        if t in temporal_dict:
            total += sum(temporal_dict[t].values())
    return total

def temporal_feature2(x, n):
    d = (x['Opened'].date() - datetime(1970,1,1).date()).days
    s = x['Severity']
    total = 0
    for t in range(d-n, d+1):
        if t in temporal_dict:
            if s in temporal_dict[t]:
                total += (temporal_dict[t][s])
    return total

def temporal_feature3(x, n):
    d = (x['Opened'].date() - datetime(1970,1,1).date()).days
    sev = x['Severity']
    ind = severity_levels.index(sev)
    greater_s = severity_levels[ind:]
    total = 0
    for t in range(d-n, d+1):
        if t in temporal_dict:
            for s in greater_s:
                if s in temporal_dict[t]:
                    total += (temporal_dict[t][s])
    return total
    

2248


In [61]:
start = time()
print temporal_feature1(data.iloc[0], 7)
end = time() - start
print "Time taken = ", end

823
Time taken =  0.000661849975586


In [64]:
def generate_temporal_features(data2):
    TMP1 = data2.apply(lambda x: temporal_feature1(x, 7), axis=1)
    TMP2 = data2.apply(lambda x: temporal_feature2(x, 7), axis=1)
    TMP3 = data2.apply(lambda x: temporal_feature3(x, 7), axis=1)
    TMP4 = data2.apply(lambda x: temporal_feature1(x, 30), axis=1)
    TMP5 = data2.apply(lambda x: temporal_feature2(x, 30), axis=1)
    TMP6 = data2.apply(lambda x: temporal_feature3(x, 30), axis=1)
    TMP7 = data2.apply(lambda x: temporal_feature1(x, 1), axis=1)
    TMP8 = data2.apply(lambda x: temporal_feature2(x, 1), axis=1)
    TMP9 = data2.apply(lambda x: temporal_feature3(x, 1), axis=1)
    TMP10 = data2.apply(lambda x: temporal_feature1(x, 3), axis=1)
    TMP11 = data2.apply(lambda x: temporal_feature2(x, 3), axis=1)
    TMP12 = data2.apply(lambda x: temporal_feature3(x, 3), axis=1)
    temporal_cols = ['TMP1', 'TMP2', 'TMP3', 'TMP4', 'TMP5', 'TMP6', 'TMP7', 'TMP8', 'TMP9', 'TMP10', 'TMP11', 'TMP12']
    temporal_features = [TMP1, TMP2, TMP3, TMP4, TMP5, TMP6, TMP7, TMP8, TMP9, TMP10, TMP11, TMP12]

    temporal_data = pd.concat(temporal_features, axis=1)
    temporal_data.columns = temporal_cols
    return temporal_data

#### The function above uses the tempral_feature functions and temporal_dict (created using training data) to generate features by using DataFrame.apply function. The lines below use this function to generate these features for both training and test dataset. 

In [65]:
temporal_train_data = generate_temporal_features(data)

<b> Do this again for test data </b>

In [66]:
temoral_test_data = generate_temporal_features(X_test)

In [67]:
print temporal_train_data.shape
print temoral_test_data.shape

(83044, 12)
(20761, 12)


In [69]:
# Concatenating temporal data features with existing data
train_data_with_temporal = pd.concat([data, temporal_train_data], axis=1)
test_data_with_temporal = pd.concat([X_test, temoral_test_data], axis=1)

In [70]:
# Save data with temporal features
train_data_with_temporal.to_csv('../data/train_data_with_temporal.csv', index=False)
test_data_with_temporal.to_csv('../data/test_data_with_temporal.csv', index=False)

### Author Realted Feature Generation

In [73]:
print len(set(data['Assignee']))

296


In [76]:
print data.shape

(83044, 13)


In [9]:
author_dict = {}

# Generates temporal dict which is used to generate temporal functions
for i in range(len(data)):
    row = data.iloc[i]
    t = (row['Opened'].date() - datetime(1970,1,1).date()).days
    a = row['Assignee']
    p = row['Priority']
    if (a not in author_dict):
        author_dict[a] = {}
    if (p not in author_dict[a]):
        author_dict[a][p] = {}
    if (t not in author_dict[a][p]):
        author_dict[a][p][t] = 0
    author_dict[a][p][t] += 1

In [20]:
priority_levels = {'P1': 1, 'P2': 2, 'P3': 3, 'P4': 4, 'P5': 5}

In [55]:
print len(author_dict)
import math

# Function for generating temporal features (Fast version)
def author_feature1(x):
    d = (x['Opened'].date() - datetime(1970,1,1).date()).days
    a = x['Assignee']
    total = 0
    if a in author_dict:
        sub_data = author_dict[a]
    else:
        return 5
    
    pcount = {}
    
    for p in sub_data:
        s = 0
        for k in sub_data[p]:
            if k <= d:
                s += sub_data[p][k]
        
        pcount[p] = s
    
    N = 0;
    S = 0.0;
    for p in pcount:
        N += pcount[p]
        S += priority_levels[p]*pcount[p]
    
    if (N == 0):
        return 5
    return int(math.ceil(S/N))

def author_feature2(x):
    d = (x['Opened'].date() - datetime(1970,1,1).date()).days
    a = x['Assignee']
    total = 0
    if a in author_dict:
        sub_data = author_dict[a]
    else:
        return 5
    
    pcount = {}
    
    for p in sub_data:
        s = 0
        for k in sub_data[p]:
            if k <= d:
                s += sub_data[p][k]
        
        pcount[p] = s
    
    plist = []
    for p in pcount:
        plist += [priority_levels[p]]*pcount[p]

    med = np.median(plist)
    if np.isnan(med):
        return 5
    return int(med)

def author_feature3(x):
    d = (x['Opened'].date() - datetime(1970,1,1).date()).days
    a = x['Assignee']
    total = 0
    if a in author_dict:
        sub_data = author_dict[a]
    else:
        return 5
    
    pcount = 0
    
    for p in sub_data:
        s = 0
        for k in sub_data[p]:
            if k <= d:
                s += sub_data[p][k]
        
        pcount += s
        
    return pcount

296


In [None]:
AUT1 = data.apply(lambda x: author_feature3(x), axis=1)
print AUT1.head()

In [35]:
print author_feature3(data.iloc[0])

715


In [38]:
def generate_author_features(data2):
    AUT1 = data2.apply(lambda x: author_feature1(x), axis=1)
    AUT2 = data2.apply(lambda x: author_feature2(x), axis=1)
    AUT3 = data2.apply(lambda x: author_feature3(x), axis=1)
    author_cols = ['AUT1', 'AUT2', 'AUT3']
    author_features = [AUT1, AUT2, AUT3]

    author_data = pd.concat(author_features, axis=1)
    author_data.columns = author_cols
    return author_data

In [52]:
author_train_data = generate_author_features(data)

In [56]:
author_test_data = generate_author_features(X_test)

In [57]:
print author_train_data.head()
print author_test_data.head()

   AUT1  AUT2  AUT3
0     3     3   715
1     3     2   126
2     4     3   255
3     4     3  2039
4     4     3   154
   AUT1  AUT2  AUT3
0     3     3  1036
1     3     3    99
2     4     3  1262
3     4     3  1584
4     3     3  1555


In [58]:
# Save data with temporal features
author_train_data.to_csv('../data/train_author_features.csv', index=False)
author_test_data.to_csv('../data/test_author_features.csv', index=False)

## Product Features

- This has a total of 22 features.
- 11 are for 'Product' Feature and 11 are for 'Component' Feature

In [15]:
product_dict = {}

# Generates temporal dict which is used to generate temporal functions
for i in range(len(data)):
    row = data.iloc[i]
    t = (row['Opened'].date() - datetime(1970,1,1).date()).days
    p = row['Component'] # or Product
    s = row['Severity']
    
    if p not in product_dict:
        product_dict[p] = {}
    if s not in product_dict[p]:
        product_dict[p][s] = {}
    if t not in product_dict[p][s]:
        product_dict[p][s][t] = 0
    product_dict[p][s][t] += 1
    

In [16]:
product_dict_priority = {}
for i in range(len(data)):
    row = data.iloc[i]
    t = (row['Opened'].date() - datetime(1970,1,1).date()).days
    p = row['Component'] # or Product
    s = row['Priority']
    
    if p not in product_dict_priority:
        product_dict_priority[p] = {}
    if s not in product_dict_priority[p]:
        product_dict_priority[p][s] = {}
    if t not in product_dict_priority[p][s]:
        product_dict_priority[p][s][t] = 0
    product_dict_priority[p][s][t] += 1

In [36]:
print len(product_dict)
import math

# Function for generating product features

def product_feature1(x, f='Product'):
    return x[f]

def product_feature2(x, f='Product'):
    d = (x['Opened'].date() - datetime(1970,1,1).date()).days
    p = x[f]
    sev = x['Severity']
    sub_data = product_dict[p]
    
    total = 0
    for s in sub_data:
        for t in sub_data[s]:
            if (t <= d):
                total += sub_data[s][t]
    return total

def product_feature3(x, f):
    d = (x['Opened'].date() - datetime(1970,1,1).date()).days
    p = x[f]
    sev = x['Severity']
    sub_data = product_dict[p]
    
    total = 0
    if sev in sub_data:
        for t in sub_data[sev]:
            if (t <= d):
                total += sub_data[sev][t]
    return total

def product_feature4(x, f):
    d = (x['Opened'].date() - datetime(1970,1,1).date()).days
    p = x[f]
    sev = x['Severity']
    sub_data = product_dict[p]
    
    total = 0
    for s in sub_data:
        if (severity_levels[s] >= severity_levels[sev]):
            for t in sub_data[s]:
                if (t <= d):
                    total += sub_data[s][t]
    return total

def product_feature5to9(x, f, P):
    d = (x['Opened'].date() - datetime(1970,1,1).date()).days
    prod = x[f]
    sev = x['Priority']
    sub_data = product_dict_priority[prod]

    total = 0
    N = 0
    if P in sub_data:
        for t in sub_data[P]:
            if (t <= d):
                total += sub_data[P][t]
    for P in sub_data:
        for t in sub_data[P]:
            if (t <= d):
                N += sub_data[P][t]
    
    if N == 0:
        return 0.0
    return float(total)/N

def product_feature11(x, f):
    d = (x['Opened'].date() - datetime(1970,1,1).date()).days
    p = x[f]
    total = 0
    if p in product_dict_priority:
        sub_data = product_dict_priority[p]
    else:
        return 5
    
    pcount = {}
    
    for p in sub_data:
        s = 0
        for k in sub_data[p]:
            if k <= d:
                s += sub_data[p][k]
        
        pcount[p] = s
    
    plist = []
    for p in pcount:
        plist += [priority_levels[p]]*pcount[p]

    med = np.median(plist)
    if np.isnan(med):
        return 5
    return int(med)

def product_feature10(x, f):
    d = (x['Opened'].date() - datetime(1970,1,1).date()).days
    a = x[f]
    total = 0
    if a in product_dict_priority:
        sub_data = product_dict_priority[a]
    else:
        return 5
    
    pcount = {}
    
    for p in sub_data:
        s = 0
        for k in sub_data[p]:
            if k <= d:
                s += sub_data[p][k]
        
        pcount[p] = s
    
    N = 0;
    S = 0.0;
    for p in pcount:
        N += pcount[p]
        S += priority_levels[p]*pcount[p]
    
    if (N == 0):
        return 5
    return int(math.ceil(S/N))

27


In [69]:
PRO2 = data.apply(lambda x: product_feature2(x, 'Product'), axis=1)

In [89]:
PRO2.head()

0    24034
1     5329
2    22457
3    21441
4    26242
dtype: int64

In [13]:
def generate_product_features1(data2, f):
    product_features= []
    PRO1 = data2.apply(lambda x: product_feature1(x, f), axis=1)
    product_features.append(PRO1)
    PRO2 = data2.apply(lambda x: product_feature2(x, f), axis=1)
    product_features.append(PRO2)
    PRO3 = data2.apply(lambda x: product_feature3(x, f), axis=1)
    product_features.append(PRO3)
    PRO4 = data2.apply(lambda x: product_feature4(x, f), axis=1)
    product_features.append(PRO4)
    PRO5 = data2.apply(lambda x: product_feature5to9(x, f, 'P1'), axis=1)
    product_features.append(PRO5)
    
    product_cols = ['PRO1', 'PRO2', 'PRO3', 'PRO4', 'PRO5']

    product_data = pd.concat(product_features, axis=1)
    product_data.columns = product_cols
    return product_data


def generate_product_features2(data2, f):
    product_features= []
    PRO6 = data2.apply(lambda x: product_feature5to9(x, f, 'P2'), axis=1)
    product_features.append(PRO6)
    PRO7 = data2.apply(lambda x: product_feature5to9(x, f, 'P3'), axis=1)
    product_features.append(PRO7)
    PRO8 = data2.apply(lambda x: product_feature5to9(x, f, 'P4'), axis=1)
    product_features.append(PRO8)
    PRO9 = data2.apply(lambda x: product_feature5to9(x, f, 'P5'), axis=1)
    product_features.append(PRO9)
    PRO10 = data2.apply(lambda x: product_feature10(x, f), axis=1)
    product_features.append(PRO10)
    PRO11 = data2.apply(lambda x: product_feature11(x, f), axis=1)
    product_features.append(PRO11)
    
    product_cols = ['PRO6', 'PRO7', 'PRO8', 'PRO9', 'PRO10', 'PRO11']

    product_data = pd.concat(product_features, axis=1)
    product_data.columns = product_cols
    return product_data

### Generating Product related features for Train Data

In [78]:
prod1 = generate_product_features1(data, 'Product')

In [90]:
prod1.head()

Unnamed: 0,PRO1,PRO2,PRO3,PRO4,PRO5
0,Platform,24034,16103,19852,0
1,Platform,5329,3984,4415,0
2,Platform,22457,3118,22457,0
3,JDT,21441,14871,17275,0
4,Platform,26242,17591,21728,0


In [91]:
print data.iloc[0]
print product_feature5to9(data.iloc[0], 'Product', 'P1')

Bug ID                                             63706
Product                                         Platform
Component                                           Team
Assignee                             valentam@ca.ibm.com
Status                                          RESOLVED
Resolution                                     DUPLICATE
Summary       Invalid dirty indicator after checking out
Changed                              2004-06-02 14:34:00
Severity                                          normal
Keywords                                             NaN
Summary.1     Invalid dirty indicator after checking out
Opened                               2004-05-24 11:58:00
Priority                                              P3
Name: 0, dtype: object
0.0552550553383


In [92]:
PRO5 = data.apply(lambda x: product_feature5to9(x, 'Product', 'P1'), axis=1)

In [93]:
prod1.PRO5 = PRO5

In [94]:
prod1.head()

Unnamed: 0,PRO1,PRO2,PRO3,PRO4,PRO5
0,Platform,24034,16103,19852,0.055255
1,Platform,5329,3984,4415,0.077876
2,Platform,22457,3118,22457,0.056775
3,JDT,21441,14871,17275,0.039084
4,Platform,26242,17591,21728,0.052473


In [97]:
prod2 = generate_product_features2(data, 'Product')

In [98]:
prod2.head()

Unnamed: 0,PRO6,PRO7,PRO8,PRO9,PRO10,PRO11
0,0.105975,0.775235,0.037613,0.025922,3,5
1,0.102271,0.688122,0.050854,0.080878,3,5
2,0.105446,0.772187,0.038696,0.026896,3,5
3,0.097384,0.830745,0.028357,0.004431,3,5
4,0.103955,0.781876,0.036735,0.02496,3,5


In [102]:
prod_features = pd.concat([prod1, prod2], axis=1)

In [104]:
prod_features.to_csv('../data/train_prod_features.csv', index=False)

### Generating Product related features for Test Data

In [105]:
tprod1 = generate_product_features1(X_test, 'Product') 

In [106]:
tprod2 = generate_product_features2(X_test, 'Product') `

In [107]:
tprod_features = pd.concat([tprod1, tprod2], axis=1)
tprod_features.shape

(20761, 11)

In [108]:
tprod_features.to_csv('../data/test_prod_features.csv', index=False)

### Generating component related features for Training Data

In [21]:
prod3 = generate_product_features1(data, 'Component')

In [24]:
prod4 = generate_product_features2(data, 'Component')

In [None]:
prod4.PRO10

In [39]:
prod_features = pd.concat([prod3, prod4], axis=1)
prod_features.shape
prod_features.columns = ['PRO12', 'PRO12', 'PRO14', 'PRO15', 'PRO16', 'PRO17', 'PRO18', 'PRO19', 'PRO20', 'PRO21', 'PRO22']

In [41]:
prod_features.head()

Unnamed: 0,PRO12,PRO12.1,PRO14,PRO15,PRO16,PRO17,PRO18,PRO19,PRO20,PRO21,PRO22
0,Team,3411,2612,2928,0.04251,0.045441,0.807681,0.033128,0.07124,4,3
1,Debug,1247,947,1081,0.276664,0.198877,0.510826,0.012831,0.000802,3,3
2,Debug,3963,560,3963,0.107494,0.150391,0.725965,0.01514,0.001009,3,3
3,UI,23377,15564,18723,0.046584,0.116696,0.773196,0.049536,0.013988,3,3
4,User Assistance,818,577,686,0.026895,0.074572,0.845966,0.0489,0.003667,3,3


In [44]:
prod_features.to_csv('../data/train_component_features.csv', index=False)

### Generating component related features for Test Data

In [37]:
tprod3 = generate_product_features1(X_test, 'Component')

In [38]:
tprod4 = generate_product_features2(X_test, 'Component')



In [42]:
tprod_features = pd.concat([tprod3, tprod4], axis=1)
tprod_features.shape
tprod_features.columns = ['PRO12', 'PRO12', 'PRO14', 'PRO15', 'PRO16', 'PRO17', 'PRO18', 'PRO19', 'PRO20', 'PRO21', 'PRO22']

In [43]:
tprod_features.to_csv('../data/test_component_features.csv', index=False)

## Merge all components into single file

In [46]:
# Original Training and testing data. For getting columns, priority, severity, bugID
train_original= pd.read_csv('../data/train.csv', parse_dates=['Changed', 'Opened'])
test_original = pd.read_csv('../data/test.csv', parse_dates=['Changed', 'Opened'])

In [45]:
# Temporal Features
train_temporal = pd.read_csv('../data/train_data_with_temporal.csv')
test_temporal = pd.read_csv('../data/test_data_with_temporal.csv')

In [47]:
# author Features
train_author = pd.read_csv('../data/train_author_features.csv')
test_author = pd.read_csv('../data/test_author_features.csv')

In [48]:
# Product Features
train_prod = pd.read_csv('../data/train_prod_features.csv')
test_prod = pd.read_csv('../data/test_prod_features.csv')

In [49]:
# Component Features
train_comp = pd.read_csv('../data/train_component_features.csv')
test_comp = pd.read_csv('../data/test_component_features.csv')

In [57]:
tmp_cols = list(train_temporal.columns)[13:]

In [59]:
train_prod.columns

Index([u'PRO1', u'PRO2', u'PRO3', u'PRO4', u'PRO5', u'PRO6', u'PRO7', u'PRO8',
       u'PRO9', u'PRO10', u'PRO11'],
      dtype='object')

In [60]:
train_final = [train_original.loc[:,['Bug ID', 'Severity', 'Priority']], train_temporal.loc[:, tmp_cols], 
               train_author,  train_prod, train_comp]

train_final = pd.concat(train_final, axis=1)

In [61]:
train_final.columns

Index([u'Bug ID', u'Severity', u'Priority', u'TMP1', u'TMP2', u'TMP3', u'TMP4',
       u'TMP5', u'TMP6', u'TMP7', u'TMP8', u'TMP9', u'TMP10', u'TMP11',
       u'TMP12', u'AUT1', u'AUT2', u'AUT3', u'PRO1', u'PRO2', u'PRO3', u'PRO4',
       u'PRO5', u'PRO6', u'PRO7', u'PRO8', u'PRO9', u'PRO10', u'PRO11',
       u'PRO12', u'PRO12.1', u'PRO14', u'PRO15', u'PRO16', u'PRO17', u'PRO18',
       u'PRO19', u'PRO20', u'PRO21', u'PRO22'],
      dtype='object')

In [62]:
train_final.to_csv('../data/train_processed.csv', index=False)

In [63]:
test_final = [test_original.loc[:,['Bug ID', 'Severity', 'Priority']], test_temporal.loc[:, tmp_cols], 
               test_author,  test_prod, test_comp]

test_final = pd.concat(test_final, axis=1)

In [64]:
test_final.to_csv('../data/test_processed.csv', index=False)