In [61]:
import pandas as pd
import numpy as np
import scipy
from copy import deepcopy

In [9]:
from sklearn.linear_model import LinearRegression

In [10]:
df = pd.read_csv('../data/individual_features/train_data_with_temporal.csv')
df.drop(['Keywords','Summary.1','Status','Resolution','Changed','Opened'],axis=1,inplace=True)
print df.shape
df.dropna(inplace=True)
print df.shape
df.reset_index(inplace=True)
df.isnull().sum()

(83044, 19)
(83041, 19)


index        0
Bug ID       0
Product      0
Component    0
Assignee     0
Summary      0
Severity     0
Priority     0
TMP1         0
TMP2         0
TMP3         0
TMP4         0
TMP5         0
TMP6         0
TMP7         0
TMP8         0
TMP9         0
TMP10        0
TMP11        0
TMP12        0
dtype: int64

In [11]:
df = df.sample(frac=1.0)

In [12]:
df.reset_index(inplace=True)

In [13]:
dummy_df = pd.get_dummies( df[['Product','Component','Assignee','Severity']] )

In [14]:
df.columns[9:]

Index([u'TMP1', u'TMP2', u'TMP3', u'TMP4', u'TMP5', u'TMP6', u'TMP7', u'TMP8',
       u'TMP9', u'TMP10', u'TMP11', u'TMP12'],
      dtype='object')

In [15]:
other_features = pd.concat([dummy_df, df[df.columns[9:]]],axis=1)

In [16]:
other_features.shape

(83041, 346)

In [17]:
other_features.dropna().shape

(83041, 346)

In [18]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
import re

tokenizer = RegexpTokenizer(r'\w+')
stopwords = set(nltk.corpus.stopwords.words('english'))
stemmer = PorterStemmer()

In [19]:
# tokenizer.tokenize('Eighty-seven miles to go, yet.  Onward!')
def tokenize_stop_stem(text):
    try:
        tokens = tokenizer.tokenize(text)
        # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation) and stem
        filtered_tokens = []
        for token in tokens:
            token = token.lower()
            if token not in stopwords:
                if not re.search('[0-9]', token):
                    try:
                        token = stemmer.stem(token)
                        filtered_tokens.append(token)
                    except UnicodeDecodeError:
                        print 'illeagal token ignored:',token
                        pass
    except UnicodeDecodeError:
        print 'illeagal token ignored:',token
        pass
    return filtered_tokens

### Run sklearn countvectorizer

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
countVect = CountVectorizer(input='content',lowercase=False, max_features=395996, tokenizer=tokenize_stop_stem, decode_error='ignore')

In [22]:
%time countVector = countVect.fit_transform(df.Summary)

CPU times: user 28.7 s, sys: 471 ms, total: 29.2 s
Wall time: 29 s


In [23]:
countVector

<83041x15905 sparse matrix of type '<type 'numpy.int64'>'
	with 467339 stored elements in Compressed Sparse Row format>

In [24]:
def ctoi(x):
    if x=='P1':
        return 1 
    if x=='P2':
        return 2
    if x=='P3':
        return 3
    if x=='P4':
        return 4
    return 5

In [25]:
df['Priority_int'] = df['Priority'].apply(lambda x: ctoi(x))

### Make spare representation

In [26]:
from scipy import sparse
other_sparse = sparse.csr_matrix(other_features.values)
inputDF = sparse.hstack((other_features,countVector),format="csr")
inputDF

<83041x16251 sparse matrix of type '<type 'numpy.float64'>'
	with 1795995 stored elements in Compressed Sparse Row format>

In [58]:
training_ip = inputDF[0:83041/2]
training_op = df['Priority_int'][0:83041/2]
print training_ip.shape
validation_ip = inputDF[83041/2:]
validation_op = df['Priority'][83041/2:]
print validation_ip.shape

(41520, 16251)
(41521, 16251)


### Train Linear Regression on Training set

In [46]:
lr = LinearRegression(n_jobs=-1)

In [None]:
for 

In [47]:
lr.fit(training_ip, training_op)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)

#### Initializing Thresholds

In [48]:
validation_set_preds = lr.predict(validation_ip)
validation_set_preds

array([ 2.89514973,  3.15731004,  3.57644414, ...,  2.79892992,
        2.99556056,  3.04039715])

In [49]:
p1 = int((df['Priority']=='P1').sum()/float(df['Priority'].shape[0])*100)
p2 = int((df['Priority']=='P2').sum()/float(df['Priority'].shape[0])*100)
p3 = int((df['Priority']=='P3').sum()/float(df['Priority'].shape[0])*100)
p4 = int((df['Priority']=='P4').sum()/float(df['Priority'].shape[0])*100)
print p1,p2,p3,p4

3 7 85 2


In [50]:
T0 = validation_set_preds.min()
(T1,T2,T3,T4) = np.percentile(validation_set_preds,[p1,p2,p3,p4])

In [51]:
T0,T1,T2,T3,T4

(0.013646676378780587,
 2.2617357718857014,
 2.4745763855771483,
 3.1628268058150759,
 2.1682701317809125)

In [105]:
# Function to map regression output to class labels according to threshold
T = {'T0': T0, 'T1': T1, 'T2': T2, 'T3': T3, 'T4': T4}

In [53]:
# function to get class label based on threhsolds for a single test sample
def itoc(x, T):
    if x <= T['T1']:
        return 'P1'
    if x <= T['T2']:
        return 'P2'
    if x <= T['T3']:
        return 'P3'
    if x <= T['T4']:
        return 'P4'
    return 'P5'

In [63]:
# get f1 score for given set of thresholds
def F1ScoreTH(T, val_preds, actual_labels):
    val_class_preds = np.apply_along_axis(lambda x: itoc(x, T), 1, val_preds)
#     val_class_preds = val_preds.map(lambda x: itoc(x, T))
    return f1_score(actual_labels, val_class_preds, average='macro')

In [55]:
validation_set_class_preds = [itoc(x, T) for x in validation_set_preds]
#validation_set_class_preds

In [56]:
from sklearn.metrics import f1_score

In [59]:
f1_score(validation_op, validation_set_class_preds, average='macro')

  'precision', 'predicted', average, warn_for)


0.27065118192827181

In [104]:
TH = ['T0', 'T1', 'T2', 'T3', 'T4']
def optimize_thresholds(T, actual, preds):
    for i in range(1, len(TH)):
        D = T[TH[i]] - T[TH[i-1]]
        while (True):
            f1_v0 = F1ScoreTH(T, preds, actual)
            delta = (0.02*D)

            if (i + 1 < len(TH) and T[TH[i]] + delta < T[TH[i+1]]):
                T[TH[i]] += delta
                f1_v1 = F1ScoreTH(T, preds, actual)
                T[TH[i]] -= delta
            else:
                f1_v1 = f1_v0
            
            if (T[TH[i]] - delta > T[TH[i-1]]):
                T[TH[i]] -= delta
                f1_v2 = F1ScoreTH(T, preds, actual)
                T[TH[i]] += delta
            else:
                f1_v2 = f1_v0
                
#             print "----- handling TH for ", TH[i], "------"
#             print f1_v0, f1_v1, f1_v2
            
            if (f1_v1 > f1_v0 and f1_v1 > f1_v2):
                T[TH[i]] += delta
#                 print "increasing threshold for ", TH[i]
            elif (f1_v1 > f1_v0 and f1_v2 > f1_v1):
                T[TH[i]] -= delta
#                 print "increasing threshold for ", TH[i]
            elif (f1_v1 < f1_v0 and f1_v2 > f1_v0):
                T[TH[i]] -= delta
#                 print "decreasing threshold for ", TH[i]
            else:
                break;
    return T

In [66]:
validation_set_preds.shape
validation_set_preds = validation_set_preds.reshape(validation_set_preds.shape[0], 1)
validation_set_preds.shape

(41521, 1)

In [106]:
T_old = deepcopy(T)
T_old

{'T0': 0.013646676378780587,
 'T1': 2.2617357718857014,
 'T2': 2.4745763855771483,
 'T3': 3.1628268058150759,
 'T4': 2.1682701317809125}

In [107]:
T_new = optimize_thresholds(T, validation_op, validation_set_preds)

In [108]:
T_old

{'T0': 0.013646676378780587,
 'T1': 2.2617357718857014,
 'T2': 2.4745763855771483,
 'T3': 3.1628268058150759,
 'T4': 2.1682701317809125}

In [109]:
T

{'T0': 0.013646676378780587,
 'T1': 2.2617357718857014,
 'T2': 2.576739880149042,
 'T3': 3.1628268058150759,
 'T4': 2.1682701317809125}