In [1]:
import pandas as pd
import numpy as np
import scipy
from copy import deepcopy
from sklearn.linear_model import LinearRegression
from data_preprocessing import DataProcessor

In [2]:
train_df = pd.read_csv('../data/processed/train_processed.csv')
test_df = pd.read_csv('../data/processed/test_processed.csv')

In [3]:
p = DataProcessor()
train_processed, train_original = p.fit_transform(train_df)

removed nan values
count vectorizer finished fitting
count vector finished transforming
dummy variables created
created a sparse matrix of all features


In [4]:
test_processed, test_original = p.transform(test_df)

count vector finished transforming
dummy variables created
created a sparse matrix of all features


In [7]:
train_processed

<83041x15940 sparse matrix of type '<type 'numpy.float64'>'
	with 3359482 stored elements in Compressed Sparse Row format>

### Run sklearn countvectorizer

In [10]:
def ctoi(x):
    if x=='P1':
        return 1 
    if x=='P2':
        return 2
    if x=='P3':
        return 3
    if x=='P4':
        return 4
    return 5

In [11]:
Priority_int = train_original['Priority'].apply(lambda x: ctoi(x))

### Make spare representation

In [100]:
training_ip = train_processed[0:83041/2]
training_op = Priority_int[0:83041/2]
print training_ip.shape
validation_ip = train_processed[83041/2:]
validation_op = train_original['Priority'][83041/2:]
print validation_ip.shape

test_ip = test_processed
test_op = test_original['Priority']
print test_ip.shape

(41520, 15940)
(41521, 15940)
(20761, 15940)


### Train Linear Regression on Training set

In [125]:
lr = LinearRegression(n_jobs=-1)

In [126]:
lr.fit(training_ip, training_op)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)

#### Initializing Thresholds

In [127]:
validation_set_preds = lr.predict(validation_ip)
print validation_set_preds
print validation_set_preds.shape
validation_set_preds = validation_set_preds.reshape(validation_set_preds.shape[0], 1)
print validation_set_preds.shape

[ 2.41306288  2.7908672   2.73131806 ...,  2.4883916   2.8945196
  3.63032065]
(41521,)
(41521, 1)


In [128]:
# Finding percentile of each class in training data
p1 = int((train_original['Priority']=='P1').sum()/float(train_original['Priority'].shape[0])*100)
p2 = int((train_original['Priority']=='P2').sum()/float(train_original['Priority'].shape[0])*100)
p3 = int((train_original['Priority']=='P3').sum()/float(train_original['Priority'].shape[0])*100)
p4 = int((train_original['Priority']=='P4').sum()/float(train_original['Priority'].shape[0])*100)
print p1,p2,p3,p4

3 7 85 2


In [129]:
# Create Thresholds based on these percentiles
T0 = validation_set_preds.min()
(T1,T2,T3,T4) = np.percentile(validation_set_preds,[p1,p2,p3,p4])

In [130]:
T0,T1,T2,T3,T4

(0.23618537009426799,
 2.2730375631856763,
 2.4892263957931648,
 3.1412804340464566,
 2.1795256076450489)

In [131]:
# Function to map regression output to class labels according to threshold
T = {'T0': T0, 'T1': T1, 'T2': T2, 'T3': T3, 'T4': T4}

In [132]:
# function to get class label based on threhsolds for a single test sample
def itoc(x, T):
    if x <= T['T1']:
        return 'P1'
    if x <= T['T2']:
        return 'P2'
    if x <= T['T3']:
        return 'P3'
    if x <= T['T4']:
        return 'P4'
    return 'P5'

In [133]:
# Calculating F1 Score before tweaking thresholds
from sklearn.metrics import f1_score
validation_set_class_preds = [itoc(x, T) for x in validation_set_preds]
f1_score(validation_op, validation_set_class_preds, average='macro')

0.2731744339784859

## Threshold Tweaking (Greedy) 

In [134]:
# get f1 score for given set of thresholds
def F1ScoreTH(T, val_preds, actual_labels):
    val_class_preds = np.apply_along_axis(lambda x: itoc(x, T), 1, val_preds)
#     val_class_preds = val_preds.map(lambda x: itoc(x, T))
    return f1_score(actual_labels, val_class_preds, average='macro')

In [135]:
TH = ['T0', 'T1', 'T2', 'T3', 'T4']
def optimize_thresholds(T, actual, preds, delta):
    for i in range(1, len(TH)):
        D = T[TH[i]] - T[TH[i-1]]
        while (True):
            f1_v0 = F1ScoreTH(T, preds, actual)
            delta = (delta*D)

            if (i + 1 < len(TH) and T[TH[i]] + delta < T[TH[i+1]]):
                T[TH[i]] += delta
                f1_v1 = F1ScoreTH(T, preds, actual)
                T[TH[i]] -= delta
            else:
                f1_v1 = f1_v0
            
            if (T[TH[i]] - delta > T[TH[i-1]]):
                T[TH[i]] -= delta
                f1_v2 = F1ScoreTH(T, preds, actual)
                T[TH[i]] += delta
            else:
                f1_v2 = f1_v0
                
#             print "----- handling TH for ", TH[i], "------"
#             print f1_v0, f1_v1, f1_v2
            
            if (f1_v1 > f1_v0 and f1_v1 > f1_v2):
                T[TH[i]] += delta
#                 print "increasing threshold for ", TH[i]
            elif (f1_v1 > f1_v0 and f1_v2 > f1_v1):
                T[TH[i]] -= delta
#                 print "increasing threshold for ", TH[i]
            elif (f1_v1 < f1_v0 and f1_v2 > f1_v0):
                T[TH[i]] -= delta
#                 print "decreasing threshold for ", TH[i]
            else:
                break;
    return T

In [149]:
T = {'T0': T0, 'T1': T1, 'T2': T2, 'T3': T3, 'T4': T4}
T_new = deepcopy(T)
T_new

{'T0': 0.23618537009426799,
 'T1': 2.2730375631856763,
 'T2': 2.4892263957931648,
 'T3': 3.1412804340464566,
 'T4': 2.1795256076450489}

In [150]:
optimize_thresholds(T_new, validation_op, validation_set_preds, 0.03)

{'T0': 0.23618537009426799,
 'T1': 2.2119319973929339,
 'T2': 2.5359632941355561,
 'T3': 3.1412804340464566,
 'T4': 2.1795256076450489}

In [151]:
T

{'T0': 0.23618537009426799,
 'T1': 2.2730375631856763,
 'T2': 2.4892263957931648,
 'T3': 3.1412804340464566,
 'T4': 2.1795256076450489}

In [152]:
T_new

{'T0': 0.23618537009426799,
 'T1': 2.2119319973929339,
 'T2': 2.5359632941355561,
 'T3': 3.1412804340464566,
 'T4': 2.1795256076450489}

In [153]:
F1ScoreTH(T, validation_set_preds, validation_op)

0.2731744339784859

### Cross Validation

In [71]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

In [72]:
p1 = int((train_original['Priority']=='P1').sum()/float(train_original['Priority'].shape[0])*100)
p2 = int((train_original['Priority']=='P2').sum()/float(train_original['Priority'].shape[0])*100)
p3 = int((train_original['Priority']=='P3').sum()/float(train_original['Priority'].shape[0])*100)
p4 = int((train_original['Priority']=='P4').sum()/float(train_original['Priority'].shape[0])*100)

In [73]:
print p1,p2,p3,p4

3 7 85 2


In [74]:
kf = StratifiedKFold(n_splits=10)

In [93]:
cvscores = []
average_f1_scores = []
lr = LinearRegression(n_jobs=-1)
for train,val in kf.split(train_processed, Priority_int):
    train_input = train_processed[train]
    val_input = train_processed[val]
    train_output = Priority_int[train]
    val_output = train_original['Priority'][val]
    
    vlen = len(Priority_int)
    vtraining_ip = train_input[0:vlen/2]
    vtraining_op = train_output[0:vlen/2]

    vvalidation_ip = train_input[vlen/2:]
    vvalidation_op = train_original['Priority'][train][vlen/2:]#train_output[vlen/2:]
    
    %time lr.fit(vtraining_ip,vtraining_op)
    
    val_prediction = lr.predict(vvalidation_ip)
    val_prediction = val_prediction.reshape(val_prediction.shape[0], 1)
    
    vT0 = val_prediction.min()
    (vT1,vT2,vT3,vT4) = np.percentile(val_prediction,[p1,p2,p3,p4])
    vT = {'T0': vT0, 'T1': vT1, 'T2': vT2, 'T3': vT3, 'T4': vT4}
    
    vT_new = optimize_thresholds(vT, vvalidation_op, val_prediction, 0.01)
    
    val_prediction = lr.predict(val_input)
    val_prediction = val_prediction.reshape(val_prediction.shape[0], 1)
    
    val_class_preds = np.apply_along_axis(lambda x: itoc(x, vT_new), 1, val_prediction)
    scores = f1_score(val_output, val_class_preds, average=None)
    print np.mean(scores)
    cvscores.append(scores)
    average_f1_scores.append(np.mean(scores))

CPU times: user 11.9 s, sys: 172 ms, total: 12 s
Wall time: 6.33 s
0.289009851028
CPU times: user 11.1 s, sys: 123 ms, total: 11.3 s
Wall time: 5.75 s
0.263720377792
CPU times: user 11.4 s, sys: 137 ms, total: 11.5 s
Wall time: 5.89 s
0.278433777263
CPU times: user 10.6 s, sys: 119 ms, total: 10.7 s
Wall time: 5.5 s
0.294985915249
CPU times: user 11 s, sys: 151 ms, total: 11.1 s
Wall time: 5.67 s
0.270382028769
CPU times: user 11 s, sys: 129 ms, total: 11.1 s
Wall time: 5.68 s
0.274274442385
CPU times: user 11.3 s, sys: 137 ms, total: 11.4 s
Wall time: 5.89 s
0.280422926772
CPU times: user 11 s, sys: 128 ms, total: 11.2 s
Wall time: 5.72 s
0.266208613653
CPU times: user 11 s, sys: 124 ms, total: 11.1 s
Wall time: 5.67 s
0.282754083484
CPU times: user 11 s, sys: 130 ms, total: 11.2 s
Wall time: 5.72 s
0.27321355494


In [103]:
print np.mean(cvscores), np.std(average_f1_scores)

0.277340557134 0.00933711549747


In [124]:
cvscores

[array([ 0.26579521,  0.20411817,  0.83893516,  0.        ,  0.13620072]),
 array([ 0.20806794,  0.15752742,  0.83968791,  0.        ,  0.11331862]),
 array([ 0.22065728,  0.19735683,  0.84202683,  0.        ,  0.13212796]),
 array([ 0.3018018 ,  0.19825073,  0.84932313,  0.        ,  0.12555391]),
 array([ 0.25482625,  0.12857143,  0.8463068 ,  0.        ,  0.12220566]),
 array([ 0.21444201,  0.17760618,  0.84589294,  0.        ,  0.13343109]),
 array([ 0.27165354,  0.17094017,  0.8444641 ,  0.        ,  0.11505682]),
 array([ 0.21442495,  0.14715026,  0.84392041,  0.        ,  0.12554745]),
 array([ 0.24400871,  0.18878505,  0.84646077,  0.        ,  0.13451589]),
 array([ 0.26266417,  0.14314928,  0.83956467,  0.        ,  0.12068966])]

## Differential Evolution  - Parameter tuning for threshold change delta

In [157]:
from scipy.optimize import differential_evolution

In [158]:
args = (validation_op,validation_set_preds)
bounds = [ (0.01,0.10) ] # order: percentage shift in threhsold

In [159]:
def func(parameters, *args):
    #print args[0], args[1]
    parameters = map(float,parameters)
    T = {'T0': T0, 'T1': T1, 'T2': T2, 'T3': T3, 'T4': T4}
    T_new = optimize_thresholds(T, validation_op, validation_set_preds, parameters[0])
    
    f1 = F1ScoreTH(T_new, validation_set_preds, validation_op)
    print f1, parameters
    return -1*f1

In [160]:
%time result = differential_evolution(func, bounds, args, strategy='rand2bin', popsize=10, mutation=(0.5,1.9), recombination=0.7, maxiter=2)

0.277149072335 [0.09973539259257153]
0.277022081352 [0.08255773517336724]
0.278472579163 [0.02593073196862644]
0.278407777239 [0.017478081525523034]
0.279800650358 [0.05785941050928017]
0.278802473642 [0.04123262079079123]
0.276871839905 [0.07948829524087743]
0.278962487145 [0.07046625088555572]
0.278843552298 [0.030334302424327973]
0.279841632618 [0.05346692312853821]
0.278372531909 [0.03681906283105357]
0.2785263485 [0.07649305426231383]
0.276951526228 [0.08796159969389146]
0.277856203545 [0.020950475342797516]
0.278841926406 [0.03852334867739145]
0.277149072335 [0.0994633670776305]
0.276951526228 [0.09256539116549317]
0.276871839905 [0.07890586092684372]
0.276935077067 [0.0818941899731588]
0.276998055784 [0.09135158982384911]
0.279841632618 [0.05346692312853821]
0.279841632618 [0.05346693312853821]
CPU times: user 3min 23s, sys: 1.97 s, total: 3min 25s
Wall time: 3min 27s


In [123]:
result.x

array([ 0.09540116])

#### Recalculating thresholds using this parameter

In [154]:
T = {'T0': T0, 'T1': T1, 'T2': T2, 'T3': T3, 'T4': T4}
T_new = optimize_thresholds(T, validation_op, validation_set_preds, 0.052)

### Running it finally on test data

In [155]:
test_set_preds = lr.predict(test_ip)
print test_set_preds
print test_set_preds.shape
test_set_preds = test_set_preds.reshape(test_set_preds.shape[0], 1)
print test_set_preds.shape

[ 2.8479649   2.74556011  3.10346262 ...,  3.43844443  2.98782104
  3.04460992]
(20761,)
(20761, 1)


In [156]:
from sklearn.metrics import f1_score
test_set_class_preds = [itoc(x, T) for x in test_set_preds]
f1_score(test_op, test_set_class_preds, average='macro')

0.27814561444252595