In [1]:
import pandas as pd
import numpy as np

from classifier import AdaCost
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, confusion_matrix

In [2]:
train = pd.read_csv("data/claims_train.csv")
test = pd.read_csv("data/claims_test.csv")

### The target is bucket2009 which has 5 classes.

In [3]:
train.head()

Unnamed: 0,age,alzheimers,arthritis,cancer,copd,depression,diabetes,heart.failure,ihd,kidney,osteoporosis,stroke,reimbursement2008,bucket2008,reimbursement2009,bucket2009
0,85,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
1,59,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
2,52,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
3,75,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
4,89,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1


### Cost Matrix where the rows are predicted classes and columns are actual classes

In [3]:
cost_matrix = np.array([[0.0,2.0,4.0,6.0,8.0],
                        [1.0,0.0,2.0,4.0,6.0],
                        [2.0,1.0,0.0,2.0,4.0],
                        [3.0,2.0,1.0,0.0,2.0],
                        [4.0,3.0,2.0,1.0,0.0]])

In [4]:
X_train = train.drop('bucket2009',axis = 1)    
y_train = train['bucket2009']
X_test = test.drop('bucket2009', axis = 1)
y_test = test['bucket2009']

### Cost Calculation Function

In [6]:
def cost_calc(y_p,y,print_result = False):
    con_mat = confusion_matrix(y_p,y)
    cost_mat = np.multiply(con_mat,cost_matrix)
    cost = np.sum(np.multiply(con_mat,cost_matrix))/len(y) 
    if print_result:
        print "Confusion Matrix\n",con_mat
        print "Costs\n",cost_mat
        print "Total Cost = ", cost
    else:
        return cost

### Score for cross-validation

In [7]:
score = make_scorer(cost_calc,greater_is_better = False)

## Model Results

### Baseline Model - If bucket2008 was used to predict bucket2009

In [18]:
cost_calc(test['bucket2008'],test['bucket2009'],True) 

Confusion Matrix
[[110138  16000   7006   2688    293]
 [  7787  10721   4629   1943    191]
 [  3427   4629   2774   1415    160]
 [  1452   2931   1621   1539    309]
 [   174    559    360    352    104]]
Costs
[[     0.  32000.  28024.  16128.   2344.]
 [  7787.      0.   9258.   7772.   1146.]
 [  6854.   4629.      0.   2830.    640.]
 [  4356.   5862.   1621.      0.    618.]
 [   696.   1677.    720.    352.      0.]]
Total Cost =  0.738605473739


### Random Forest

In [20]:
clf = RandomForestClassifier(random_state = 100)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
cost_calc(y_pred,y_test,True)

Confusion Matrix
[[122978      0      0      1      0]
 [     0  34839      9      2      2]
 [     0      1  16380    107     29]
 [     0      0      1   7825    297]
 [     0      0      0      2    729]]
Costs
[[   0.    0.    0.    6.    0.]
 [   0.    0.   18.    8.   12.]
 [   0.    1.    0.  214.  116.]
 [   0.    0.    1.    0.  594.]
 [   0.    0.    0.    2.    0.]]
Total Cost =  0.00530561893429


### Adaboost

In [31]:
ada = AdaBoostClassifier(random_state = 100)
ada.fit(X_train,y_train)
y_pred = ada.predict(X_test)
cost_calc(y_pred,y_test,True)

Confusion Matrix
[[122978      0      0      0      0]
 [     0  34840  16390   7936      0]
 [     0      0      0      0      0]
 [     0      0      0      0      0]
 [     0      0      0      1   1057]]
Costs
[[  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   3.27800000e+04   3.17440000e+04
    0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   1.00000000e+00
    0.00000000e+00]]
Total Cost =  0.352206853637


### Adacost - SAMME.R

In [9]:
adac = AdaCost(algorithm = "SAMME.R", cost_matrix = cost_matrix, random_state = 100)
adac.fit(X_train,y_train)
y_pred = adac.predict(X_test)
cost_calc(y_pred,y_test,True)

Confusion Matrix
[[     0      0      0      0      0]
 [     0      0      0      0      0]
 [122978  34840  16390   7642      0]
 [     0      0      0      0      0]
 [     0      0      0    295   1057]]
Costs
[[      0.       0.       0.       0.       0.]
 [      0.       0.       0.       0.       0.]
 [ 245956.   34840.       0.   15284.       0.]
 [      0.       0.       0.       0.       0.]
 [      0.       0.       0.     295.       0.]]
Total Cost =  1.61774980622


### Adacost - SAMME 

In [10]:
adac = AdaCost(algorithm = "SAMME", cost_matrix = cost_matrix, random_state = 100)
adac.fit(X_train,y_train)
y_pred = adac.predict(X_test)
cost_calc(y_pred,y_test,True)

Confusion Matrix
[[     0      0      0      0      0]
 [     0      0      0      0      0]
 [     0      0      0      0      0]
 [     0      0      0      0      0]
 [122978  34840  16390   7937   1057]]
Costs
[[      0.       0.       0.       0.       0.]
 [      0.       0.       0.       0.       0.]
 [      0.       0.       0.       0.       0.]
 [      0.       0.       0.       0.       0.]
 [ 491912.  104520.   32780.    7937.       0.]]
Total Cost =  3.47784958679


## Paramter Tuning

### Adaboost - SAMME.R

In [12]:
ada = AdaBoostClassifier(algorithm = "SAMME.R",random_state = 100)
cv = GridSearchCV(ada,
                  param_grid = {'learning_rate':[0.01,0.05,0.1,0.25,0.5,1],'n_estimators':[10,20,50,100,200]},
                  scoring = score,
                  verbose = 1, 
                  n_jobs = 1)

In [13]:
cv.fit(X_train,y_train)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed: 30.8min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=100),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 20, 50, 100, 200], 'learning_rate': [0.01, 0.05, 0.1, 0.25, 0.5, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(cost_calc, greater_is_better=False), verbose=1)

In [14]:
cv.grid_scores_



[mean: -0.40822, std: 0.23420, params: {'n_estimators': 10, 'learning_rate': 0.01},
 mean: -0.33723, std: 0.29815, params: {'n_estimators': 20, 'learning_rate': 0.01},
 mean: -0.32078, std: 0.32836, params: {'n_estimators': 50, 'learning_rate': 0.01},
 mean: -0.32078, std: 0.32836, params: {'n_estimators': 100, 'learning_rate': 0.01},
 mean: -0.32078, std: 0.32836, params: {'n_estimators': 200, 'learning_rate': 0.01},
 mean: -0.32078, std: 0.32836, params: {'n_estimators': 10, 'learning_rate': 0.05},
 mean: -0.32078, std: 0.32836, params: {'n_estimators': 20, 'learning_rate': 0.05},
 mean: -0.32078, std: 0.32836, params: {'n_estimators': 50, 'learning_rate': 0.05},
 mean: -0.32078, std: 0.32836, params: {'n_estimators': 100, 'learning_rate': 0.05},
 mean: -0.32078, std: 0.32836, params: {'n_estimators': 200, 'learning_rate': 0.05},
 mean: -0.32078, std: 0.32836, params: {'n_estimators': 10, 'learning_rate': 0.1},
 mean: -0.32078, std: 0.32836, params: {'n_estimators': 20, 'learning_rat

In [15]:
cv.best_params_,cv.best_score_

({'learning_rate': 0.01, 'n_estimators': 50}, -0.32077524626732606)

In [16]:
ada_cv = cv.best_estimator_

In [17]:
y_pred = ada_cv.predict(X_test)
cost_calc(y_pred,y_test,True)

Confusion Matrix
[[122978      0      0      0      0]
 [     0  34840      0      0      0]
 [     0      0  16390      0      0]
 [     0      0      0   7937   1057]
 [     0      0      0      0      0]]
Costs
[[    0.     0.     0.     0.     0.]
 [    0.     0.     0.     0.     0.]
 [    0.     0.     0.     0.     0.]
 [    0.     0.     0.     0.  2114.]
 [    0.     0.     0.     0.     0.]]
Total Cost =  0.0115391753365


### Adaboost - SAMME

In [18]:
ada = AdaBoostClassifier(algorithm = "SAMME",random_state = 100)
cv = GridSearchCV(ada,
                  param_grid = {'learning_rate':[0.01,0.05,0.1,0.25,0.5,1],'n_estimators':[10,20,50,100,200]},
                  scoring = score,
                  verbose = 1, 
                  n_jobs = 1)

In [19]:
cv.fit(X_train,y_train)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed: 18.0min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=AdaBoostClassifier(algorithm='SAMME', base_estimator=None, learning_rate=1.0,
          n_estimators=50, random_state=100),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 20, 50, 100, 200], 'learning_rate': [0.01, 0.05, 0.1, 0.25, 0.5, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(cost_calc, greater_is_better=False), verbose=1)

In [20]:
cv.grid_scores_



[mean: -0.40822, std: 0.23420, params: {'n_estimators': 10, 'learning_rate': 0.01},
 mean: -0.40822, std: 0.23420, params: {'n_estimators': 20, 'learning_rate': 0.01},
 mean: -0.40822, std: 0.23420, params: {'n_estimators': 50, 'learning_rate': 0.01},
 mean: -0.40822, std: 0.23420, params: {'n_estimators': 100, 'learning_rate': 0.01},
 mean: -0.40822, std: 0.23420, params: {'n_estimators': 200, 'learning_rate': 0.01},
 mean: -0.40822, std: 0.23420, params: {'n_estimators': 10, 'learning_rate': 0.05},
 mean: -0.40822, std: 0.23420, params: {'n_estimators': 20, 'learning_rate': 0.05},
 mean: -0.40822, std: 0.23420, params: {'n_estimators': 50, 'learning_rate': 0.05},
 mean: -0.33723, std: 0.29815, params: {'n_estimators': 100, 'learning_rate': 0.05},
 mean: -0.32078, std: 0.32836, params: {'n_estimators': 200, 'learning_rate': 0.05},
 mean: -0.40822, std: 0.23420, params: {'n_estimators': 10, 'learning_rate': 0.1},
 mean: -0.40822, std: 0.23420, params: {'n_estimators': 20, 'learning_rat

In [21]:
cv.best_params_,cv.best_score_

({'learning_rate': 0.05, 'n_estimators': 200}, -0.32077524626732606)

In [22]:
ada_cv = cv.best_estimator_

In [23]:
y_pred = ada_cv.predict(X_test)
cost_calc(y_pred,y_test,True)

Confusion Matrix
[[122978      0      0      0      0]
 [     0  34840      0      0      0]
 [     0      0  16390      0      0]
 [     0      0      0   7937   1057]
 [     0      0      0      0      0]]
Costs
[[    0.     0.     0.     0.     0.]
 [    0.     0.     0.     0.     0.]
 [    0.     0.     0.     0.     0.]
 [    0.     0.     0.     0.  2114.]
 [    0.     0.     0.     0.     0.]]
Total Cost =  0.0115391753365


### Adacost - SAMME.R (max_depth = 1)

In [24]:
adac = AdaCost(algorithm = "SAMME.R", cost_matrix = cost_matrix, random_state = 100)
cv = GridSearchCV(adac,
                  param_grid = {'learning_rate':[0.01,0.05,0.1,0.25,0.5,1],'n_estimators':[10,20,50,100,200]},
                  scoring = score,
                  verbose = 1, 
                  n_jobs = 1)

In [25]:
cv.fit(X_train,y_train)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed: 37.4min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=AdaCost(algorithm='SAMME.R', base_estimator=None,
    cost_matrix=array([[ 0.,  2.,  4.,  6.,  8.],
       [ 1.,  0.,  2.,  4.,  6.],
       [ 2.,  1.,  0.,  2.,  4.],
       [ 3.,  2.,  1.,  0.,  2.],
       [ 4.,  3.,  2.,  1.,  0.]]),
    learning_rate=1.0, n_estimators=50, random_state=100),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 20, 50, 100, 200], 'learning_rate': [0.01, 0.05, 0.1, 0.25, 0.5, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(cost_calc, greater_is_better=False), verbose=1)

In [28]:
cv.grid_scores_



[mean: -0.40822, std: 0.23420, params: {'n_estimators': 10, 'learning_rate': 0.01},
 mean: -0.40822, std: 0.23420, params: {'n_estimators': 20, 'learning_rate': 0.01},
 mean: -0.40822, std: 0.23420, params: {'n_estimators': 50, 'learning_rate': 0.01},
 mean: -0.40822, std: 0.23420, params: {'n_estimators': 100, 'learning_rate': 0.01},
 mean: -0.40822, std: 0.23420, params: {'n_estimators': 200, 'learning_rate': 0.01},
 mean: -0.40822, std: 0.23420, params: {'n_estimators': 10, 'learning_rate': 0.05},
 mean: -0.40822, std: 0.23420, params: {'n_estimators': 20, 'learning_rate': 0.05},
 mean: -0.40822, std: 0.23420, params: {'n_estimators': 50, 'learning_rate': 0.05},
 mean: -0.40822, std: 0.23420, params: {'n_estimators': 100, 'learning_rate': 0.05},
 mean: -0.40822, std: 0.23420, params: {'n_estimators': 200, 'learning_rate': 0.05},
 mean: -0.40822, std: 0.23420, params: {'n_estimators': 10, 'learning_rate': 0.1},
 mean: -0.40822, std: 0.23420, params: {'n_estimators': 20, 'learning_rat

In [29]:
cv.best_params_,cv.best_score_

({'learning_rate': 0.5, 'n_estimators': 20}, -0.33272198629563726)

In [30]:
adac_cv = cv.best_estimator_

In [31]:
y_pred = adac_cv.predict(X_test)
cost_calc(y_pred,y_test,True)

Confusion Matrix
[[122978  22811      0      0      0]
 [     0  12029  16390   5508      0]
 [     0      0      0      0      0]
 [     0      0      0      0      0]
 [     0      0      0   2429   1057]]
Costs
[[     0.  45622.      0.      0.      0.]
 [     0.      0.  32780.  22032.      0.]
 [     0.      0.      0.      0.      0.]
 [     0.      0.      0.      0.      0.]
 [     0.      0.      0.   2429.      0.]]
Total Cost =  0.561473128023


### Adacost - SAMME (max_depth = 1)

In [33]:
adac = AdaCost(algorithm = "SAMME", cost_matrix = cost_matrix, random_state = 100)
cv = GridSearchCV(adac,
                  param_grid = {'learning_rate':[0.01,0.05,0.1,0.25,0.5,1],'n_estimators':[10,20,50,100,200]},
                  scoring = score,
                  verbose = 1, 
                  n_jobs = 1)

In [34]:
cv.fit(X_train,y_train)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed: 24.3min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=AdaCost(algorithm='SAMME', base_estimator=None,
    cost_matrix=array([[ 0.,  2.,  4.,  6.,  8.],
       [ 1.,  0.,  2.,  4.,  6.],
       [ 2.,  1.,  0.,  2.,  4.],
       [ 3.,  2.,  1.,  0.,  2.],
       [ 4.,  3.,  2.,  1.,  0.]]),
    learning_rate=1.0, n_estimators=50, random_state=100),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 20, 50, 100, 200], 'learning_rate': [0.01, 0.05, 0.1, 0.25, 0.5, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(cost_calc, greater_is_better=False), verbose=1)

In [35]:
cv.grid_scores_



[mean: -0.40822, std: 0.23420, params: {'n_estimators': 10, 'learning_rate': 0.01},
 mean: -0.40822, std: 0.23420, params: {'n_estimators': 20, 'learning_rate': 0.01},
 mean: -0.40822, std: 0.23420, params: {'n_estimators': 50, 'learning_rate': 0.01},
 mean: -0.40822, std: 0.23420, params: {'n_estimators': 100, 'learning_rate': 0.01},
 mean: -0.45909, std: 0.41105, params: {'n_estimators': 200, 'learning_rate': 0.01},
 mean: -0.40822, std: 0.23420, params: {'n_estimators': 10, 'learning_rate': 0.05},
 mean: -0.33868, std: 0.24413, params: {'n_estimators': 20, 'learning_rate': 0.05},
 mean: -1.38211, std: 0.84745, params: {'n_estimators': 50, 'learning_rate': 0.05},
 mean: -1.38633, std: 0.85961, params: {'n_estimators': 100, 'learning_rate': 0.05},
 mean: -1.38633, std: 0.85961, params: {'n_estimators': 200, 'learning_rate': 0.05},
 mean: -0.33868, std: 0.24413, params: {'n_estimators': 10, 'learning_rate': 0.1},
 mean: -0.48610, std: 0.38782, params: {'n_estimators': 20, 'learning_rat

In [36]:
cv.best_params_,cv.best_score_

({'learning_rate': 0.05, 'n_estimators': 20}, -0.3386789809427117)

In [37]:
adac_cv = cv.best_estimator_

In [38]:
y_pred = adac_cv.predict(X_test)
cost_calc(y_pred,y_test,True)

Confusion Matrix
[[122978      0      0      0      0]
 [     0  34840  16390      0      0]
 [     0      0      0      0      0]
 [     0      0      0   7937   1057]
 [     0      0      0      0      0]]
Costs
[[     0.      0.      0.      0.      0.]
 [     0.      0.  32780.      0.      0.]
 [     0.      0.      0.      0.      0.]
 [     0.      0.      0.      0.   2114.]
 [     0.      0.      0.      0.      0.]]
Total Cost =  0.190467352976


### Adacost - SAMME.R (max_depth = 2)

In [37]:
from sklearn.tree import DecisionTreeClassifier
adac = AdaCost(base_estimator = DecisionTreeClassifier(max_depth = 2),
               algorithm = "SAMME.R", 
               cost_matrix = cost_matrix, 
               random_state = 100)

cv = GridSearchCV(adac,
                  param_grid = {'learning_rate':[0.01,0.05,0.1,0.25,0.5,1],'n_estimators':[10,20,50,100,200]},
                  scoring = score,
                  verbose = 1, 
                  n_jobs = 1)

In [38]:
cv.fit(X_train,y_train)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed: 43.9min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=AdaCost(algorithm='SAMME.R',
    base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0...  2.],
       [ 4.,  3.,  2.,  1.,  0.]]),
    learning_rate=1.0, n_estimators=50, random_state=100),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 20, 50, 100, 200], 'learning_rate': [0.01, 0.05, 0.1, 0.25, 0.5, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(cost_calc, greater_is_better=False), verbose=1)

In [39]:
cv.grid_scores_



[mean: -0.33723, std: 0.29815, params: {'n_estimators': 10, 'learning_rate': 0.01},
 mean: -0.33723, std: 0.29815, params: {'n_estimators': 20, 'learning_rate': 0.01},
 mean: -0.33723, std: 0.29815, params: {'n_estimators': 50, 'learning_rate': 0.01},
 mean: -0.33723, std: 0.29815, params: {'n_estimators': 100, 'learning_rate': 0.01},
 mean: -0.33723, std: 0.29815, params: {'n_estimators': 200, 'learning_rate': 0.01},
 mean: -0.33723, std: 0.29815, params: {'n_estimators': 10, 'learning_rate': 0.05},
 mean: -0.33723, std: 0.29815, params: {'n_estimators': 20, 'learning_rate': 0.05},
 mean: -0.33723, std: 0.29815, params: {'n_estimators': 50, 'learning_rate': 0.05},
 mean: -0.33723, std: 0.29815, params: {'n_estimators': 100, 'learning_rate': 0.05},
 mean: -0.33723, std: 0.29815, params: {'n_estimators': 200, 'learning_rate': 0.05},
 mean: -0.33723, std: 0.29815, params: {'n_estimators': 10, 'learning_rate': 0.1},
 mean: -0.33723, std: 0.29815, params: {'n_estimators': 20, 'learning_rat

In [40]:
cv.best_params_,cv.best_score_

({'learning_rate': 0.01, 'n_estimators': 10}, -0.33723067069864593)

In [41]:
adac_cv = cv.best_estimator_

In [42]:
y_pred = adac_cv.predict(X_test)
cost_calc(y_pred,y_test,True)

Confusion Matrix
[[122978      0      0      0      0]
 [     0  34840      0      0      0]
 [     0      0  16390   7937   1057]
 [     0      0      0      0      0]
 [     0      0      0      0      0]]
Costs
[[     0.      0.      0.      0.      0.]
 [     0.      0.      0.      0.      0.]
 [     0.      0.      0.  15874.   4228.]
 [     0.      0.      0.      0.      0.]
 [     0.      0.      0.      0.      0.]]
Total Cost =  0.109725876355


### Adacost - SAMME (max_depth = 2)

In [29]:
from sklearn.tree import DecisionTreeClassifier
adac = AdaCost(base_estimator = DecisionTreeClassifier(max_depth = 2),
               algorithm = "SAMME", 
               cost_matrix = cost_matrix, 
               random_state = 100)

cv = GridSearchCV(adac,
                  param_grid = {'learning_rate':[0.01,0.05,0.1,0.25,0.5,1],'n_estimators':[10,20,50,100,200]},
                  scoring = score,
                  verbose = 1, 
                  n_jobs = 1)

In [30]:
cv.fit(X_train,y_train)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed: 31.5min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=AdaCost(algorithm='SAMME',
    base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0...  2.],
       [ 4.,  3.,  2.,  1.,  0.]]),
    learning_rate=1.0, n_estimators=50, random_state=100),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 20, 50, 100, 200], 'learning_rate': [0.01, 0.05, 0.1, 0.25, 0.5, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(cost_calc, greater_is_better=False), verbose=1)

In [31]:
cv.grid_scores_



[mean: -0.33723, std: 0.29815, params: {'n_estimators': 10, 'learning_rate': 0.01},
 mean: -0.33723, std: 0.29815, params: {'n_estimators': 20, 'learning_rate': 0.01},
 mean: -0.33723, std: 0.29815, params: {'n_estimators': 50, 'learning_rate': 0.01},
 mean: -0.48304, std: 0.39004, params: {'n_estimators': 100, 'learning_rate': 0.01},
 mean: -0.32499, std: 0.34072, params: {'n_estimators': 200, 'learning_rate': 0.01},
 mean: -0.33723, std: 0.29815, params: {'n_estimators': 10, 'learning_rate': 0.05},
 mean: -0.49031, std: 0.40030, params: {'n_estimators': 20, 'learning_rate': 0.05},
 mean: -0.32499, std: 0.34072, params: {'n_estimators': 50, 'learning_rate': 0.05},
 mean: -0.32499, std: 0.34072, params: {'n_estimators': 100, 'learning_rate': 0.05},
 mean: -0.74872, std: 0.57612, params: {'n_estimators': 200, 'learning_rate': 0.05},
 mean: -0.32499, std: 0.34072, params: {'n_estimators': 10, 'learning_rate': 0.1},
 mean: -0.32499, std: 0.34072, params: {'n_estimators': 20, 'learning_rat

In [32]:
cv.best_params_,cv.best_score_

({'learning_rate': 0.01, 'n_estimators': 200}, -0.32498917406287414)

In [34]:
adac_cv = cv.best_estimator_

In [35]:
y_pred = adac_cv.predict(X_test)
cost_calc(y_pred,y_test,True)

Confusion Matrix
[[122978      0      0      0      0]
 [     0  34840      0      0      0]
 [     0      0  16390      0      0]
 [     0      0      0   7936      0]
 [     0      0      0      1   1057]]
Costs
[[ 0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.]]
Total Cost =  5.45845569372e-06


### Random

In [39]:
ada = AdaCost(algorithm = "SAMME", 
              learning_rate = 0.1,
              n_estimators = 50, 
              cost_matrix = cost_matrix, 
              random_state = 100)
ada.fit(X_train,y_train) 
y_pred = ada.predict(X_test)
cost_calc(y_pred,y_test,True)

Confusion Matrix
[[122978      0      0      0      0]
 [     0  34840      0      0      0]
 [     0      0      0      0      0]
 [     0      0  16390   7937   1057]
 [     0      0      0      0      0]]
Costs
[[     0.      0.      0.      0.      0.]
 [     0.      0.      0.      0.      0.]
 [     0.      0.      0.      0.      0.]
 [     0.      0.  16390.      0.   2114.]
 [     0.      0.      0.      0.      0.]]
Total Cost =  0.101003264157


In [9]:
ada = AdaCost(algorithm = "SAMME",
              max_depth = 2,
              learning_rate = 0.1,
              n_estimators =50, 
              cost_matrix = cost_matrix, 
              random_state = 100)
ada.fit(X_train,y_train) 
y_pred = ada.predict(X_test)
cost_calc(y_pred,y_test,True)

Confusion Matrix
[[122978      0      0      0      0]
 [     0  34840      0      0      0]
 [     0      0  16390      0      0]
 [     0      0      0   7936      0]
 [     0      0      0      1   1057]]
Costs
[[ 0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.]]
Total Cost =  5.45845569372e-06


In [19]:
ada = AdaCost(algorithm = "SAMME.R",
              max_depth = 3,
              learning_rate = 0.01,
              n_estimators =50, 
              cost_matrix = cost_matrix, 
              random_state = 100)
ada.fit(X_train,y_train) 
y_pred = ada.predict(X_test)
cost_calc(y_pred,y_test,True)

Confusion Matrix
[[122978      0      0      0      0]
 [     0  34840      0      0      0]
 [     0      0  16390      0      0]
 [     0      0      0   7937   1057]
 [     0      0      0      0      0]]
Costs
[[    0.     0.     0.     0.     0.]
 [    0.     0.     0.     0.     0.]
 [    0.     0.     0.     0.     0.]
 [    0.     0.     0.     0.  2114.]
 [    0.     0.     0.     0.     0.]]
Total Cost =  0.0115391753365
