In [14]:
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn2pmml import make_pmml_pipeline, sklearn2pmml
from sklearn.metrics import roc_auc_score, log_loss

In [6]:
%run '../../code/constants.py'
%run '../../code/feature_selection.py'
%run '../../code/preprocessing.py'

In [5]:
df = pd.read_pickle(path + fin_mod_df)

In [7]:
X, y = get_x_y(df)

In [8]:
pl = Pipeline([
    ('impute', Imputer(strategy='median'))
    , ('standardize', StandardScaler())
#     , ('interactions', PolynomialFeatures(include_bias=False))
    , ('clf', SGDClassifier())
])

In [9]:
alpha = [0.0001, 0.001, 0.1]

l1_ratio = [0.1, 0.2, 0.4]

C_options = [0.2, 0.4, 0.6, 0.8]


In [10]:
param_grid =  [
    {
    'clf': [SGDClassifier(penalty='elasticnet', max_iter=500, loss='log')]
    , 'clf__l1_ratio': l1_ratio
    , 'clf__alpha': alpha
    }, 
    {'clf': [LogisticRegression(penalty='l2', max_iter=500)]
    , 'clf__C': C_options
    }
]

In [11]:
grid = GridSearchCV(pl, param_grid=param_grid, scoring={'auc': auc_scorer, 'log': log_scorer}
                    , refit='log', cv=6, verbose=5, return_train_score=True, n_jobs=2)

In [12]:
grid.fit(X, y)

Fitting 6 folds for each of 13 candidates, totalling 78 fits
[CV] clf=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=500, n_iter=None,
       n_jobs=1, penalty='elasticnet', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False), clf__alpha=0.0001, clf__l1_ratio=0.1 
[CV] clf=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=500, n_iter=None,
       n_jobs=1, penalty='elasticnet', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False), clf__alpha=0.0001, clf__l1_ratio=0.1 
[CV]  clf=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_

       shuffle=True, tol=None, verbose=0, warm_start=False), clf__alpha=0.0001, clf__l1_ratio=0.2, auc=0.6176489085707664, log=-0.5118641888860858, total=   4.6s
[CV] clf=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=500, n_iter=None,
       n_jobs=1, penalty='elasticnet', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False), clf__alpha=0.0001, clf__l1_ratio=0.2 
[CV]  clf=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=500, n_iter=None,
       n_jobs=1, penalty='elasticnet', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False), clf__alpha=0.0001, clf__l1_ratio=0.2, auc=0.601152616685245, log=-0.5168039102060457, total=   4.7s
[CV] clf=SGDClassifie

[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:   33.5s


[CV]  clf=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=500, n_iter=None,
       n_jobs=1, penalty='elasticnet', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False), clf__alpha=0.0001, clf__l1_ratio=0.4, auc=0.6069499080056745, log=-0.5134814831040797, total=   4.4s
[CV] clf=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=500, n_iter=None,
       n_jobs=1, penalty='elasticnet', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False), clf__alpha=0.0001, clf__l1_ratio=0.4 
[CV]  clf=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', m

       shuffle=True, tol=None, verbose=0, warm_start=False), clf__alpha=0.001, clf__l1_ratio=0.2, auc=0.6165608370554382, log=-0.5211578772928956, total=   4.3s
[CV] clf=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=500, n_iter=None,
       n_jobs=1, penalty='elasticnet', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False), clf__alpha=0.001, clf__l1_ratio=0.2 
[CV]  clf=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=500, n_iter=None,
       n_jobs=1, penalty='elasticnet', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False), clf__alpha=0.001, clf__l1_ratio=0.2, auc=0.6550381606090829, log=-0.5016819297050756, total=   4.3s
[CV] clf=SGDClassifier(

       shuffle=True, tol=None, verbose=0, warm_start=False), clf__alpha=0.001, clf__l1_ratio=0.4, auc=0.5997868198201491, log=-0.5167170507679995, total=   4.3s
[CV] clf=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=500, n_iter=None,
       n_jobs=1, penalty='elasticnet', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False), clf__alpha=0.1, clf__l1_ratio=0.1 
[CV]  clf=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=500, n_iter=None,
       n_jobs=1, penalty='elasticnet', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False), clf__alpha=0.001, clf__l1_ratio=0.4, auc=0.5948018239127377, log=-0.5163710008274518, total=   4.3s
[CV] clf=SGDClassifier(al

       shuffle=True, tol=None, verbose=0, warm_start=False), clf__alpha=0.1, clf__l1_ratio=0.2, auc=0.6027446522490599, log=-0.5182687731148864, total=   4.6s
[CV] clf=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=500, n_iter=None,
       n_jobs=1, penalty='elasticnet', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False), clf__alpha=0.1, clf__l1_ratio=0.2 
[CV]  clf=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=500, n_iter=None,
       n_jobs=1, penalty='elasticnet', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False), clf__alpha=0.1, clf__l1_ratio=0.2, auc=0.60001019274512, log=-0.5190265156664297, total=   4.7s
[CV] clf=SGDClassifier(alpha=0.

          verbose=0, warm_start=False), clf__C=0.2 
[CV]  clf=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=500, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), clf__C=0.2, auc=0.6550031594010767, log=-0.5015689995903658, total=   0.2s
[CV] clf=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=500, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), clf__C=0.2 
[CV]  clf=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=500, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), clf__C=0.2, auc=0.6071083426773136

          verbose=0, warm_start=False), clf__C=0.6 
[CV]  clf=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=500, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), clf__C=0.6, auc=0.6071032318814543, log=-0.5135434735271449, total=   0.2s
[CV] clf=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=500, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), clf__C=0.6 


[Parallel(n_jobs=2)]: Done  68 tasks      | elapsed:  2.1min


[CV]  clf=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=500, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), clf__C=0.6, auc=0.6195373547069105, log=-0.5111791660819417, total=   0.2s
[CV] clf=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=500, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), clf__C=0.6 
[CV]  clf=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=500, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), clf__C=0.6, auc=0.600042412972855, log=-0.5167108462065821, total=   0.2s
[CV] clf=Log

[Parallel(n_jobs=2)]: Done  78 out of  78 | elapsed:  2.1min finished


GridSearchCV(cv=6, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('impute', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)), ('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_rat...='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=2,
       param_grid=[{'clf': [SGDClassifier(alpha=0.001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.4, learning_rate='optimal',
       loss='log', max_iter=500, n_iter=None, n_jobs=1,
       penalty='elasticnet', power_t=0.5, random_state=None, shuffle=True...r='liblinear', tol=0.0001,
          verbose=0, warm_start=False)], 'clf__C': [0.2, 0.4, 0.6, 0.8]}],
       pre_dispatch='2*n_jobs', refit='log', return_tr

In [13]:
cv_df = pd.DataFrame(grid.cv_results_)
cv_df

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_auc,mean_test_log,mean_train_auc,mean_train_log,param_clf,param_clf__C,param_clf__alpha,param_clf__l1_ratio,...,split5_test_auc,split5_test_log,split5_train_auc,split5_train_log,std_fit_time,std_score_time,std_test_auc,std_test_log,std_train_auc,std_train_log
0,4.722055,0.010922,0.6144,-0.514005,0.616287,-0.511555,"SGDClassifier(alpha=0.001, average=False, clas...",,0.0001,0.1,...,0.593126,-0.516942,0.620537,-0.510772,0.08541,0.002446,0.019804,0.006393,0.004584,0.00121
1,4.681118,0.009633,0.615305,-0.514566,0.616365,-0.511647,"SGDClassifier(alpha=0.001, average=False, clas...",,0.0001,0.2,...,0.593698,-0.516686,0.620203,-0.510888,0.076046,0.000368,0.01942,0.007558,0.004287,0.00113
2,4.496573,0.009712,0.614601,-0.514096,0.616101,-0.511693,"SGDClassifier(alpha=0.001, average=False, clas...",,0.0001,0.4,...,0.596252,-0.516009,0.619866,-0.510894,0.101484,0.000686,0.019813,0.007305,0.004514,0.001255
3,4.368736,0.009014,0.615518,-0.513442,0.616801,-0.511398,"SGDClassifier(alpha=0.001, average=False, clas...",,0.001,0.1,...,0.594747,-0.516407,0.620554,-0.510707,0.072741,0.000449,0.01971,0.006128,0.0044,0.001198
4,4.309065,0.009658,0.615469,-0.513455,0.6168,-0.5114,"SGDClassifier(alpha=0.001, average=False, clas...",,0.001,0.2,...,0.594727,-0.51642,0.62059,-0.510707,0.037185,0.000856,0.019685,0.006091,0.004412,0.001198
5,4.310617,0.00916,0.615411,-0.513427,0.616794,-0.511408,"SGDClassifier(alpha=0.001, average=False, clas...",,0.001,0.4,...,0.594802,-0.516371,0.620575,-0.510715,0.018626,0.000262,0.019574,0.005931,0.004412,0.001199
6,4.448951,0.009136,0.610784,-0.516399,0.609652,-0.515593,"SGDClassifier(alpha=0.001, average=False, clas...",,0.1,0.1,...,0.584956,-0.519203,0.613509,-0.514872,0.027896,0.00051,0.021393,0.002647,0.004711,0.001022
7,4.640309,0.009161,0.605521,-0.51915,0.603721,-0.518836,"SGDClassifier(alpha=0.001, average=False, clas...",,0.1,0.2,...,0.584295,-0.520903,0.608134,-0.518198,0.011166,0.000646,0.020367,0.001587,0.004166,0.000759
8,4.707424,0.009072,0.569861,-0.525305,0.562437,-0.525229,"SGDClassifier(alpha=0.001, average=False, clas...",,0.1,0.4,...,0.559313,-0.525537,0.577145,-0.525187,0.027801,0.000474,0.039232,0.000648,0.028219,0.000836
9,0.162854,0.01079,0.615476,-0.513485,0.616822,-0.511396,"LogisticRegression(C=1.0, class_weight=None, d...",0.2,,,...,0.59489,-0.516377,0.620586,-0.510705,0.005832,0.000479,0.019629,0.006209,0.004407,0.001198


In [15]:
pmml_pipe = make_pmml_pipeline(grid, X.columns.values, y.name)

In [None]:
sklearn2pmml(pmml_pipe, 'pmml_model.pmml', with_repr=True)