In [2]:
import pandas as pd
import numpy as np
import scipy

In [3]:
from sklearn.linear_model import LinearRegression

In [5]:
df = pd.read_csv('../data/data_with_temporal.csv')
df.drop(['Keywords','Summary.1','Status','Resolution','Changed','Opened'],axis=1,inplace=True)
print df.shape
df.dropna(inplace=True)
print df.shape
df.reset_index(inplace=True)
df.isnull().sum()

(103805, 19)
(103802, 19)


index        0
Bug ID       0
Product      0
Component    0
Assignee     0
Summary      0
Priority     0
Severity     0
TMP1         0
TMP2         0
TMP3         0
TMP4         0
TMP5         0
TMP6         0
TMP7         0
TMP8         0
TMP9         0
TMP10        0
TMP11        0
TMP12        0
dtype: int64

In [6]:
df = df.sample(frac=1.0)

In [7]:
df.reset_index(inplace=True)

In [8]:
dummy_df = pd.get_dummies( df[['Product','Component','Assignee','Severity']] )

In [9]:
df.columns[9:]

Index([u'TMP1', u'TMP2', u'TMP3', u'TMP4', u'TMP5', u'TMP6', u'TMP7', u'TMP8',
       u'TMP9', u'TMP10', u'TMP11', u'TMP12'],
      dtype='object')

In [10]:
other_features = pd.concat([dummy_df, df[df.columns[9:]]],axis=1)

In [11]:
other_features.shape

(103802, 357)

In [12]:
other_features.dropna().shape

(103802, 357)

In [14]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
import re

tokenizer = RegexpTokenizer(r'\w+')
stopwords = set(nltk.corpus.stopwords.words('english'))
stemmer = PorterStemmer()

In [15]:
# tokenizer.tokenize('Eighty-seven miles to go, yet.  Onward!')
def tokenize_stop_stem(text):
    try:
        tokens = tokenizer.tokenize(text)
        # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation) and stem
        filtered_tokens = []
        for token in tokens:
            token = token.lower()
            if token not in stopwords:
                if not re.search('[0-9]', token):
                    try:
                        token = stemmer.stem(token)
                        filtered_tokens.append(token)
                    except UnicodeDecodeError:
                        print 'illeagal token ignored:',token
                        pass
    except UnicodeDecodeError:
        print 'illeagal token ignored:',token
        pass
    return filtered_tokens

### Run sklearn countvectorizer

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
countVect = CountVectorizer(input='content',lowercase=False, max_features=395996, tokenizer=tokenize_stop_stem, decode_error='ignore')

In [18]:
%time countVector = countVect.fit_transform(df.Summary)

CPU times: user 16.6 s, sys: 36 ms, total: 16.6 s
Wall time: 16.8 s


In [19]:
countVector

<103802x17868 sparse matrix of type '<type 'numpy.int64'>'
	with 584285 stored elements in Compressed Sparse Row format>

In [34]:
def ctoi(x):
    if x=='P1':
        return 1 
    if x=='P2':
        return 2
    if x=='P3':
        return 3
    if x=='P4':
        return 4
    return 5

In [36]:
df['Priority_int'] = df['Priority'].apply(lambda x: ctoi(x))

### Make spare representation

In [30]:
from scipy import sparse
other_sparse = sparse.csr_matrix(other_features.values)
inputDF = sparse.hstack((other_features,countVector),format="csr")
inputDF

<103802x18225 sparse matrix of type '<type 'numpy.int64'>'
	with 2245117 stored elements in Compressed Sparse Row format>

In [38]:
training_ip = inputDF[0:103802/2]
training_op = df['Priority_int'][0:103802/2]
print training.shape
validation_ip = inputDF[103802/2:]
validation_op = df['Priority'][0:103802/2]
print validation.shape

(51901, 18225)
(51901, 18225)


### Train Linear Regression on Training set

In [41]:
lr = LinearRegression(n_jobs=-1)

In [None]:
for 

In [42]:
lr.fit(training_ip, training_op)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)

#### Initializing Thresholds

In [43]:
validation_set_preds = lr.predict(validation_ip)
validation_set_preds

In [82]:
p1 = int((df['Priority']=='P1').sum()/float(df['Priority'].shape[0])*100)
p2 = int((df['Priority']=='P2').sum()/float(df['Priority'].shape[0])*100)
p3 = int((df['Priority']=='P3').sum()/float(df['Priority'].shape[0])*100)
p4 = int((df['Priority']=='P4').sum()/float(df['Priority'].shape[0])*100)
print p1,p2,p3,p4

3 7 85 2


In [80]:
T0 = validation_set_preds.min()
(T1,T2,T3,T4) = np.percentile(validation_set_preds,[p1,p2,p3,p4])

In [81]:
T0,T1,T2,T3,T4

(-0.06095495679443852,
 2.2816289379769694,
 2.4837134169093837,
 3.1557069211757209,
 2.1841227691038405)

In [83]:
# Function to map regression output to class labels according to threshold

In [89]:
def itoc(x):
    if x <= T1:
        return 'P1'
    if x <= T2:
        return 'P2'
    if x <= T3:
        return 'P3'
    if x <= T4:
        return 'P4'
    return 'P5'

In [91]:
validation_set_class_preds = [itoc(x) for x in validation_set_preds]

In [92]:
from sklearn.metrics import f1_score

In [102]:
f1_score(validation_op, validation_set_class_preds, average='macro')

  'precision', 'predicted', average, warn_for)


0.1851364049520588