In [1]:
import pandas as pd
import numpy as np
import scipy
from copy import deepcopy
from sklearn.linear_model import LinearRegression
from data_preprocessing import DataProcessor

In [2]:
train_df = pd.read_csv('../data/processed/train_processed.csv')
test_df = pd.read_csv('../data/processed/test_processed.csv')

In [3]:
p = DataProcessor()
train_processed, train_original = p.fit_transform(train_df)

removed nan values
count vectorizer finished fitting
count vector finished transforming
dummy variables created
created a sparse matrix of all features


In [4]:
test_processed, test_original = p.transform(test_df)

count vector finished transforming
dummy variables created
created a sparse matrix of all features


In [5]:
train_processed

<83041x15940 sparse matrix of type '<type 'numpy.float64'>'
	with 3359482 stored elements in Compressed Sparse Row format>

### Run sklearn countvectorizer

In [6]:
def ctoi(x):
    if x=='P1':
        return 1 
    if x=='P2':
        return 2
    if x=='P3':
        return 3
    if x=='P4':
        return 4
    return 5

In [7]:
Priority_int = train_original['Priority'].apply(lambda x: ctoi(x))

### Make spare representation

In [8]:
training_ip = train_processed[0:83041/2]
training_op = Priority_int[0:83041/2]
print training_ip.shape
validation_ip = train_processed[83041/2:]
validation_op = train_original['Priority'][83041/2:]
print validation_ip.shape

test_ip = test_processed
test_op = test_original['Priority']
print test_ip.shape

(41520, 15940)
(41521, 15940)
(20761, 15940)


### Train Linear Regression on Training set

In [9]:
from sklearn.ensemble import RandomForestRegressor

In [10]:
#lr = LinearRegression(n_jobs=-1)
lr = RandomForestRegressor(n_jobs=-1)

In [11]:
lr.fit(training_ip, training_op)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

#### Initializing Thresholds

In [12]:
validation_set_preds = lr.predict(validation_ip)
print validation_set_preds
print validation_set_preds.shape
validation_set_preds = validation_set_preds.reshape(validation_set_preds.shape[0], 1)
print validation_set_preds.shape

[ 2.4  3.   3.  ...,  2.5  3.   2.9]
(41521,)
(41521, 1)


In [13]:
# Finding percentile of each class in training data
p1 = int((train_original['Priority']=='P1').sum()/float(train_original['Priority'].shape[0])*100)
p2 = int((train_original['Priority']=='P2').sum()/float(train_original['Priority'].shape[0])*100)
p3 = int((train_original['Priority']=='P3').sum()/float(train_original['Priority'].shape[0])*100)
p4 = int((train_original['Priority']=='P4').sum()/float(train_original['Priority'].shape[0])*100)
print p1,p2,p3,p4

3 7 85 2


In [14]:
# Create Thresholds based on these percentiles
T0 = validation_set_preds.min()
(T1,T2,T3,T4) = np.percentile(validation_set_preds,[p1,p2,p3,p4])

In [15]:
T0,T1,T2,T3,T4

(1.0, 2.1000000000000001, 2.5, 3.0, 2.0)

In [16]:
# Function to map regression output to class labels according to threshold
T = {'T0': T0, 'T1': T1, 'T2': T2, 'T3': T3, 'T4': T4}

In [17]:
# function to get class label based on threhsolds for a single test sample
def itoc(x, T):
    if x <= T['T1']:
        return 'P1'
    if x <= T['T2']:
        return 'P2'
    if x <= T['T3']:
        return 'P3'
    if x <= T['T4']:
        return 'P4'
    return 'P5'

In [19]:
# Calculating F1 Score before tweaking thresholds
from sklearn.metrics import f1_score
validation_set_class_preds = [itoc(x, T) for x in validation_set_preds]
f1_score(validation_op, validation_set_class_preds, average='macro')

0.33075243555700173

## Threshold Tweaking (Greedy) 

In [20]:
# get f1 score for given set of thresholds
def F1ScoreTH(T, val_preds, actual_labels):
    val_class_preds = np.apply_along_axis(lambda x: itoc(x, T), 1, val_preds)
#     val_class_preds = val_preds.map(lambda x: itoc(x, T))
    return f1_score(actual_labels, val_class_preds, average='macro')

In [21]:
TH = ['T0', 'T1', 'T2', 'T3', 'T4']
def optimize_thresholds(T, actual, preds, delta):
    for i in range(1, len(TH)):
        D = T[TH[i]] - T[TH[i-1]]
        while (True):
            f1_v0 = F1ScoreTH(T, preds, actual)
            delta = (delta*D)

            if (i + 1 < len(TH) and T[TH[i]] + delta < T[TH[i+1]]):
                T[TH[i]] += delta
                f1_v1 = F1ScoreTH(T, preds, actual)
                T[TH[i]] -= delta
            else:
                f1_v1 = f1_v0
            
            if (T[TH[i]] - delta > T[TH[i-1]]):
                T[TH[i]] -= delta
                f1_v2 = F1ScoreTH(T, preds, actual)
                T[TH[i]] += delta
            else:
                f1_v2 = f1_v0
                
#             print "----- handling TH for ", TH[i], "------"
#             print f1_v0, f1_v1, f1_v2
            
            if (f1_v1 > f1_v0 and f1_v1 > f1_v2):
                T[TH[i]] += delta
#                 print "increasing threshold for ", TH[i]
            elif (f1_v1 > f1_v0 and f1_v2 > f1_v1):
                T[TH[i]] -= delta
#                 print "increasing threshold for ", TH[i]
            elif (f1_v1 < f1_v0 and f1_v2 > f1_v0):
                T[TH[i]] -= delta
#                 print "decreasing threshold for ", TH[i]
            else:
                break;
    return T

In [22]:
T = {'T0': T0, 'T1': T1, 'T2': T2, 'T3': T3, 'T4': T4}
T_new = deepcopy(T)
T_new

{'T0': 1.0, 'T1': 2.1000000000000001, 'T2': 2.5, 'T3': 3.0, 'T4': 2.0}

In [23]:
optimize_thresholds(T_new, validation_op, validation_set_preds, 0.03)

{'T0': 1.0, 'T1': 2.1000000000000001, 'T2': 2.5, 'T3': 3.0, 'T4': 2.0}

In [24]:
T

{'T0': 1.0, 'T1': 2.1000000000000001, 'T2': 2.5, 'T3': 3.0, 'T4': 2.0}

In [25]:
T_new

{'T0': 1.0, 'T1': 2.1000000000000001, 'T2': 2.5, 'T3': 3.0, 'T4': 2.0}

In [26]:
F1ScoreTH(T, validation_set_preds, validation_op)

0.33075243555700173

### Cross Validation

In [27]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

In [28]:
p1 = int((train_original['Priority']=='P1').sum()/float(train_original['Priority'].shape[0])*100)
p2 = int((train_original['Priority']=='P2').sum()/float(train_original['Priority'].shape[0])*100)
p3 = int((train_original['Priority']=='P3').sum()/float(train_original['Priority'].shape[0])*100)
p4 = int((train_original['Priority']=='P4').sum()/float(train_original['Priority'].shape[0])*100)

In [29]:
print p1,p2,p3,p4

3 7 85 2


In [30]:
kf = StratifiedKFold(n_splits=10)

In [33]:
cvscores = []
average_f1_scores = []
lr = RandomForestRegressor(n_jobs=-1)
for train,val in kf.split(train_processed, Priority_int):
    train_input = train_processed[train]
    val_input = train_processed[val]
    train_output = Priority_int[train]
    val_output = train_original['Priority'][val]
    
    vlen = len(Priority_int)
    vtraining_ip = train_input[0:vlen/2]
    vtraining_op = train_output[0:vlen/2]

    vvalidation_ip = train_input[vlen/2:]
    vvalidation_op = train_original['Priority'][train][vlen/2:]#train_output[vlen/2:]
    
    %time lr.fit(vtraining_ip,vtraining_op)
    
    val_prediction = lr.predict(vvalidation_ip)
    val_prediction = val_prediction.reshape(val_prediction.shape[0], 1)
    
    vT0 = val_prediction.min()
    (vT1,vT2,vT3,vT4) = np.percentile(val_prediction,[p1,p2,p3,p4])
    vT = {'T0': vT0, 'T1': vT1, 'T2': vT2, 'T3': vT3, 'T4': vT4}
    
    vT_new = optimize_thresholds(vT, vvalidation_op, val_prediction, 0.01)
    
    val_prediction = lr.predict(val_input)
    val_prediction = val_prediction.reshape(val_prediction.shape[0], 1)
    
    val_class_preds = np.apply_along_axis(lambda x: itoc(x, vT_new), 1, val_prediction)
    scores = f1_score(val_output, val_class_preds, average=None)
    print np.mean(scores)
    cvscores.append(scores)
    average_f1_scores.append(np.mean(scores))

CPU times: user 3min 25s, sys: 416 ms, total: 3min 26s
Wall time: 41.6 s
0.316226153864
CPU times: user 3min 31s, sys: 136 ms, total: 3min 31s
Wall time: 41.9 s
0.310936724414
CPU times: user 3min 27s, sys: 456 ms, total: 3min 28s
Wall time: 41.2 s
0.319906051442
CPU times: user 3min 48s, sys: 492 ms, total: 3min 48s
Wall time: 47.7 s
0.32398468925
CPU times: user 4min 14s, sys: 236 ms, total: 4min 15s
Wall time: 51.7 s
0.335277481449
CPU times: user 3min 52s, sys: 204 ms, total: 3min 52s
Wall time: 46.8 s
0.333555468179
CPU times: user 4min 59s, sys: 220 ms, total: 4min 59s
Wall time: 1min
0.327226538537
CPU times: user 3min 55s, sys: 168 ms, total: 3min 56s
Wall time: 47.7 s
0.329626601626
CPU times: user 4min 39s, sys: 964 ms, total: 4min 40s
Wall time: 56.9 s
0.315655930949
CPU times: user 3min 39s, sys: 1.03 s, total: 3min 40s
Wall time: 44.3 s
0.333107442796


In [34]:
print np.mean(cvscores), np.std(average_f1_scores)

0.324550308251 0.00811135543327


In [36]:
cvscores

[array([ 0.33504274,  0.18518519,  0.88443226,  0.        ,  0.17647059]),
 array([ 0.29554656,  0.1986234 ,  0.88251366,  0.        ,  0.178     ]),
 array([ 0.32825719,  0.20171674,  0.88564828,  0.        ,  0.18390805]),
 array([ 0.36135957,  0.21353066,  0.88805648,  0.        ,  0.15697674]),
 array([ 0.36291913,  0.24758221,  0.88941548,  0.        ,  0.17647059]),
 array([ 0.36174636,  0.2509434 ,  0.8861015 ,  0.        ,  0.16898608]),
 array([ 0.33333333,  0.24876604,  0.88703531,  0.        ,  0.16699801]),
 array([ 0.33221477,  0.24279835,  0.8885045 ,  0.        ,  0.18461538]),
 array([ 0.32881356,  0.18796199,  0.88257767,  0.        ,  0.17892644]),
 array([ 0.39677419,  0.21107628,  0.88584409,  0.        ,  0.17184265])]

## Differential Evolution  - Parameter tuning for threshold change delta

In [37]:
from scipy.optimize import differential_evolution

In [38]:
args = (validation_op,validation_set_preds)
bounds = [ (0.01,0.10) ] # order: percentage shift in threhsold

In [39]:
def func(parameters, *args):
    #print args[0], args[1]
    parameters = map(float,parameters)
    T = {'T0': T0, 'T1': T1, 'T2': T2, 'T3': T3, 'T4': T4}
    T_new = optimize_thresholds(T, validation_op, validation_set_preds, parameters[0])
    
    f1 = F1ScoreTH(T_new, validation_set_preds, validation_op)
    print f1, parameters
    return -1*f1

In [40]:
%time result = differential_evolution(func, bounds, args, strategy='rand2bin', popsize=10, mutation=(0.5,1.9), recombination=0.7, maxiter=2)

0.330752435557 [0.08847658547469857]
0.330752435557 [0.0457725393425509]
0.337810575475 [0.09129733573351453]
0.330752435557 [0.06925096047167976]
0.330752435557 [0.07807642673240787]
0.330752435557 [0.02643104309136409]
0.330752435557 [0.058429377457218185]
0.330752435557 [0.03348321886290703]
0.330752435557 [0.016594290404370535]
0.330752435557 [0.046029896126210765]
0.330752435557 [0.06410965966777817]
0.330752435557 [0.030937913154039133]
0.330752435557 [0.03293310468917572]
0.330752435557 [0.060931724948426635]
0.330752435557 [0.08998310034159304]
0.330752435557 [0.05205466786673592]
0.330752435557 [0.039444211741080766]
0.330752435557 [0.04333032049282347]
0.330752435557 [0.060603724773334915]
0.330752435557 [0.04847243261437668]
0.337810575475 [0.09129733573351453]
0.337810575475 [0.09129734573351453]
CPU times: user 1min 20s, sys: 48 ms, total: 1min 20s
Wall time: 1min 20s


In [41]:
result.x

array([ 0.09129734])

#### Recalculating thresholds using this parameter

In [45]:
T = {'T0': T0, 'T1': T1, 'T2': T2, 'T3': T3, 'T4': T4}
T_new = optimize_thresholds(T, validation_op, validation_set_preds, 0.0912)

### Running it finally on test data

In [46]:
test_set_preds = lr.predict(test_ip)
print test_set_preds
print test_set_preds.shape
test_set_preds = test_set_preds.reshape(test_set_preds.shape[0], 1)
print test_set_preds.shape

[ 3.   2.7  3.  ...,  3.   2.8  3. ]
(20761,)
(20761, 1)


In [47]:
from sklearn.metrics import f1_score
test_set_class_preds = [itoc(x, T) for x in test_set_preds]
f1_score(test_op, test_set_class_preds, average='macro')

0.33315309320563263