Import the required libraries

In [41]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from time import time
from sklearn.model_selection import train_test_split

Read data in

In [43]:
# Load the full dataset; 'Changed' and 'Opened' are parsed into datetime columns.
df = pd.read_csv('../data/total_fixed.csv', parse_dates=['Changed', 'Opened'])
# Column labels of the full dataset (these include the 'Priority' target column).
cols = df.columns

In [45]:
# Hold out 20% of the rows as a test set. The split is stratified on Priority
# and uses a fixed random_state so it is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(df.drop('Priority', axis=1), df.Priority, test_size=0.20, random_state=42, stratify=df.Priority)

In [51]:
# Re-attach the target column so each split is a self-contained table
# (features + Priority) that can be written to disk as-is.
# .assign() returns new frames rather than writing into the objects handed back
# by train_test_split, which may trigger pandas' SettingWithCopyWarning.
# Values are aligned on the row index, exactly as with column assignment.
X_train = X_train.assign(Priority=y_train)
X_test = X_test.assign(Priority=y_test)

# Single-argument print(...) behaves the same under Python 2 and Python 3.
print(X_train.shape)
print(X_test.shape)

(83044, 13)
(20761, 13)


In [53]:
# Save the split data to disk; index=False leaves the row index out of the CSV files.
X_train.to_csv('../data/train.csv', index=False)
X_test.to_csv('../data/test.csv', index=False)

In [56]:
# From here on `data` refers to the training split (same object as X_train, not a copy).
data = X_train
n_rows = len(data)
# Number of columns in the full dataset, taken from `cols` (df.columns).
n_cols = len(cols)
bugIDs = data['Bug ID']
# Debug output, disabled:
# print data.head()
# print bugIDs.head()

In [57]:
# Quick look at the parsed 'Changed' timestamps and the distinct severity labels.
print data['Changed'].head()
print set(data['Severity'])

# Severity label -> integer rank (0 = enhancement ... 6 = blocker).
# NOTE: a later cell rebinds severity_levels to a list of the same labels,
# so this dict form does not survive a top-to-bottom run.
severity_levels = {'enhancement': 0, 'trivial' : 1, 'minor' : 2, 'normal' : 3, 'major' : 4, 'critical' : 5, 'blocker' : 6}

43114   2004-06-02 14:34:00
8262    2002-04-30 10:39:00
39223   2004-04-22 11:32:00
79672   2006-05-20 06:05:00
46042   2004-07-15 11:40:00
Name: Changed, dtype: datetime64[ns]
set(['major', 'normal', 'blocker', 'critical', 'enhancement', 'trivial', 'minor'])


In [58]:
# Quick look at the parsed 'Changed' timestamps and the distinct severity labels.
# Single-argument print(...) behaves the same under Python 2 and Python 3.
print(data['Changed'].head())
print(set(data['Severity']))

# Opening timestamps as an (n_rows, 1) column vector.
np_opened = np.array(data['Opened'])
np_opened = np_opened.reshape(len(np_opened), 1)

# Severity labels in ascending rank order (enhancement lowest, blocker highest).
# This must be a list: temporal_feature3 calls .index() on it and slices it.
# The dict that used to be assigned just above was dead code, overwritten by
# this list within the same cell, and has been removed.
severity_levels = ['enhancement', 'trivial', 'minor', 'normal', 'major', 'critical', 'blocker']

43114   2004-06-02 14:34:00
8262    2002-04-30 10:39:00
39223   2004-04-22 11:32:00
79672   2006-05-20 06:05:00
46042   2004-07-15 11:40:00
Name: Changed, dtype: datetime64[ns]
set(['major', 'normal', 'blocker', 'critical', 'enhancement', 'trivial', 'minor'])


Temporal Feature Generation - TMP1 to TMP12.

In [59]:
# Per-day severity histogram used by the temporal feature functions:
#   temporal_dict[day][severity] -> number of rows of `data` opened on that day
# where `day` is the number of days between 1970-01-01 and the 'Opened' date.
# This should be based only on training data.
temporal_dict = {}

# Loop-invariant reference date, computed once instead of once per row.
epoch_date = datetime(1970, 1, 1).date()

# Iterate the two columns directly. Building a full row Series with
# data.iloc[i] for every record is much slower and yields the same values.
for opened, s in zip(data['Opened'], data['Severity']):
    t = (opened.date() - epoch_date).days
    day_counts = temporal_dict.setdefault(t, {})
    day_counts[s] = day_counts.get(s, 0) + 1

    

In [60]:
# Number of distinct opening days recorded in temporal_dict.
print len(temporal_dict)

# Function for generating temporal features (Fast version)
def temporal_feature1(x, n):
    """Count every record in temporal_dict within the n-day window ending on x's opening day.

    Parameters:
        x: row-like object; x['Opened'] must expose .date().
        n: number of days to look back; the window is [day - n, day] inclusive.

    Returns:
        Sum of the counts of all severities over the days of the window that
        are present in temporal_dict (0 if none are).
    """
    opened_day = (x['Opened'].date() - datetime(1970, 1, 1).date()).days
    window = range(opened_day - n, opened_day + 1)
    return sum(
        sum(temporal_dict[day].values())
        for day in window
        if day in temporal_dict
    )

def temporal_feature2(x, n):
    """Count records with x's severity within the n-day window ending on x's opening day.

    Parameters:
        x: row-like object; x['Opened'] must expose .date() and x['Severity']
           is the severity label to match.
        n: number of days to look back; the window is [day - n, day] inclusive.

    Returns:
        Sum of temporal_dict[day][severity] over the days of the window that
        have an entry for that severity (0 if none do).
    """
    opened_day = (x['Opened'].date() - datetime(1970, 1, 1).date()).days
    severity = x['Severity']
    return sum(
        temporal_dict[day].get(severity, 0)
        for day in range(opened_day - n, opened_day + 1)
        if day in temporal_dict
    )

def temporal_feature3(x, n):
    """Count records at x's severity or later in severity_levels within the n-day window.

    Parameters:
        x: row-like object; x['Opened'] must expose .date() and x['Severity']
           must be an element of severity_levels.
        n: number of days to look back; the window is [day - n, day] inclusive.

    Returns:
        Sum of the counts in temporal_dict, over the days of the window, for
        every label from x's own position in severity_levels to the end of it.

    Relies on severity_levels supporting .index() and slicing (i.e. the list
    form); .index() raises ValueError for a label that is not in the list.
    """
    opened_day = (x['Opened'].date() - datetime(1970, 1, 1).date()).days
    position = severity_levels.index(x['Severity'])
    at_least = severity_levels[position:]
    count = 0
    for day in range(opened_day - n, opened_day + 1):
        per_severity = temporal_dict.get(day)
        if per_severity is None:
            continue
        count += sum(per_severity.get(level, 0) for level in at_least)
    return count
    

2248


In [61]:
# Rough timing of a single temporal_feature1 call on the first training row.
start = time()
print temporal_feature1(data.iloc[0], 7)
# NOTE: despite its name, `end` holds the elapsed time in seconds, not a timestamp.
end = time() - start
print "Time taken = ", end

823
Time taken =  0.000661849975586


In [64]:
def generate_temporal_features(data2):
    """Build the twelve temporal feature columns TMP1..TMP12 for every row of data2.

    Each look-back window (7, 30, 1 and 3 days, in that order) contributes
    three consecutive columns, computed row-wise with temporal_feature1,
    temporal_feature2 and temporal_feature3 respectively:

        TMP1-TMP3   -> 7 days      TMP4-TMP6   -> 30 days
        TMP7-TMP9   -> 1 day       TMP10-TMP12 -> 3 days

    Parameters:
        data2: DataFrame whose rows are accepted by the temporal_feature functions.

    Returns:
        DataFrame with columns 'TMP1'..'TMP12', one row per row of data2.
    """
    window_sizes = (7, 30, 1, 3)
    feature_fns = (temporal_feature1, temporal_feature2, temporal_feature3)

    series_list = []
    for days in window_sizes:
        for fn in feature_fns:
            # Bind fn/days as defaults so each lambda keeps its own pair.
            series_list.append(
                data2.apply(lambda row, fn=fn, days=days: fn(row, days), axis=1)
            )

    temporal_data = pd.concat(series_list, axis=1)
    temporal_data.columns = ['TMP%d' % k for k in range(1, len(series_list) + 1)]
    return temporal_data

The function above uses the temporal_feature functions and temporal_dict (created from the training data) to generate features via the DataFrame.apply function. The lines below use this function to generate these features for both the training and test datasets.

In [65]:
# TMP1..TMP12 for the training split (`data` is X_train).
temporal_train_data = generate_temporal_features(data)

In [66]:
# TMP1..TMP12 for the test split, computed from the same temporal_dict.
# NOTE: the variable name is misspelled ("temoral"); it is kept as-is because
# later cells refer to it by this name.
temoral_test_data = generate_temporal_features(X_test)

In [67]:
# Sanity check: each feature frame should have one row per input row and 12 columns.
print temporal_train_data.shape
print temoral_test_data.shape

(83044, 12)
(20761, 12)


In [69]:
# Append the temporal feature columns to the existing data.
# pd.concat with axis=1 places the frames side by side, aligning rows on the index.
train_data_with_temporal = pd.concat([data, temporal_train_data], axis=1)
test_data_with_temporal = pd.concat([X_test, temoral_test_data], axis=1)

In [40]:
# Save the data with temporal features; index=False leaves the row index out of the CSV files.
train_data_with_temporal.to_csv('../data/train_data_with_temporal.csv', index=False)
test_data_with_temporal.to_csv('../data/test_data_with_temporal.csv', index=False)

Author-Related Feature Generation

In [None]:
# Per-day author histogram:
#   author_dict[day][author] -> number of rows of `data` opened on that day by that author
# where `day` is the number of days between 1970-01-01 and the 'Opened' date.
author_dict = {}

# Loop-invariant reference date, computed once instead of once per row.
author_epoch_date = datetime(1970, 1, 1).date()

# Iterate the two columns directly rather than building a row Series per record.
for opened, a in zip(data['Opened'], data['Author']):
    t = (opened.date() - author_epoch_date).days
    per_author = author_dict.setdefault(t, {})
    # Key by the row's author. The previous code keyed by `s`, a stale severity
    # value left over from the temporal_dict loop, so authors were never counted.
    per_author[a] = per_author.get(a, 0) + 1