In [1]:
from sklearn import feature_selection, pipeline, preprocessing, model_selection, impute, metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


import pandas as pd
import numpy as np

In [2]:
df_train_org = pd.read_csv("data/train.csv")
df_test_org = pd.read_csv("data/test.csv")

tr_X = df_train_org.loc[:, df_train_org.columns != 'area']
ts_X = df_test_org.loc[:, tr_X.columns]
tr_y = (df_train_org['area'] > 0)*1
ts_y = (df_test_org['area'] > 0)*1

In [3]:
tr_X.shape

(466, 12)

In [5]:
steps = [
    ('imputation', impute.SimpleImputer(strategy='median')),
    ('normalizing', preprocessing.Normalizer()),
    ('feature_selection', feature_selection.SelectKBest()),
    ('knn', KNeighborsClassifier())
]

model_pipeline = pipeline.Pipeline(steps=steps)

# model_pipeline.fit(tr_X, tr_y)
# (model_pipeline.predict(ts_X) == ts_y).mean()

In [14]:
param_grid = \
    {
        'feature_selection__k': range(1, 13),
        #     'normalizing__norm': ['l1', 'l2'],
        'knn__n_neighbors': range(20, 50, 2),
        'knn__weights': ['uniform', 'distance'],
        'knn__p': np.arange(1, 2.01, 0.25)
    }

model_cv = model_selection.GridSearchCV(estimator=model_pipeline,
                                        param_grid=param_grid,
                                        scoring='accuracy',
                                        n_jobs=-1,
                                        cv=10,
                                        refit=True,
                                        verbose=True,
                                        iid=True)
model_cv.fit(tr_X, tr_y)

print(model_cv.best_params_, model_cv.best_score_)

Fitting 10 folds for each of 1800 candidates, totalling 18000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 1047 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done 3047 tasks      | elapsed:   17.1s
[Parallel(n_jobs=-1)]: Done 5847 tasks      | elapsed:   32.4s
[Parallel(n_jobs=-1)]: Done 9447 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 13847 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 16927 tasks      | elapsed:  2.6min


{'feature_selection__k': 4, 'knn__n_neighbors': 48, 'knn__p': 1.25, 'knn__weights': 'uniform'} 0.5257510729613734


[Parallel(n_jobs=-1)]: Done 18000 out of 18000 | elapsed:  2.9min finished


In [23]:
model_cv.score(ts_X, ts_y)

0.6470588235294118

In [13]:
# import inspect, pprint
# pprint.pprint(inspect.getsource(preprocessing.Normalizer))

In [120]:
class EstimatorSelectionHelper:

    def __init__(self, models, params):
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=3, verbose=1, scoring=None, refit=False):
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X,y)
            self.grid_searches[key] = gs    

    def score_summary(self, sort_by='mean_score'):
        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': min(scores),
                 'max_score': max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores),
            }
            return pd.Series({**params,**d})

        rows = []
        for k in self.grid_searches:
            print(k)
            params = self.grid_searches[k].cv_results_['params']
            scores = []
            for i in range(self.grid_searches[k].cv):
                key = "split{}_test_score".format(i)
                r = self.grid_searches[k].cv_results_[key]        
                scores.append(r.reshape(len(params),1))

            all_scores = np.hstack(scores)
            for p, s in zip(params,all_scores):
                rows.append((row(k, s, p)))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]
    
models1 = {
#     'ExtraTreesClassifier': ExtraTreesClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
#     'AdaBoostClassifier': AdaBoostClassifier(),
#     'GradientBoostingClassifier': GradientBoostingClassifier(),
#     'SVC': SVC(),
    'KNN': model_pipeline
}

params1 = {
#     'ExtraTreesClassifier': { 'n_estimators': [16, 32] },
    'RandomForestClassifier': { 'n_estimators': [16, 32] },
#     'AdaBoostClassifier':  { 'n_estimators': [16, 32] },
#     'GradientBoostingClassifier': { 'n_estimators': [16, 32], 'learning_rate': [0.8, 1.0] },
#     'SVC': [
#         {'kernel': ['linear'], 'C': [1, 10]},
#         {'kernel': ['rbf'], 'C': [1, 10], 'gamma': [0.001, 0.0001]},
#     ],
    'KNN': {
        'feature_selection__k': range(6, 13),
    #     'normalizing__norm': ['l1', 'l2'],
        'knn__n_neighbors': range(1, 10, 2),
        'knn__weights': ['uniform', 'distance'],
        'knn__p': np.arange(1, 2.01, 0.5)
    }
}

helper1 = EstimatorSelectionHelper(models1, params1)
helper1.fit(tr_X, tr_y, scoring='accuracy', n_jobs=-1)

Running GridSearchCV for RandomForestClassifier.
Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    1.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Running GridSearchCV for KNN.
Fitting 3 folds for each of 210 candidates, totalling 630 fits


[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 630 out of 630 | elapsed:    2.5s finished


In [124]:
helper1.score_summary(sort_by='max_score')

RandomForestClassifier
KNN


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Unnamed: 0,estimator,min_score,mean_score,max_score,std_score,feature_selection__k,knn__n_neighbors,knn__p,knn__weights,n_estimators
4,KNN,0.445161,0.506259,0.589744,0.0611115,6,1,1.5,uniform,
5,KNN,0.445161,0.506259,0.589744,0.0611115,6,1,1.5,distance,
6,KNN,0.458065,0.501958,0.589744,0.0620741,6,1,2,uniform,
7,KNN,0.458065,0.501958,0.589744,0.0620741,6,1,2,distance,
2,KNN,0.43871,0.491219,0.583333,0.0653476,6,1,1,uniform,
3,KNN,0.43871,0.491219,0.583333,0.0653476,6,1,1,distance,
27,KNN,0.393548,0.478315,0.583333,0.0787915,6,9,1,distance,
15,KNN,0.393548,0.476179,0.576923,0.0759503,6,5,1,distance,
21,KNN,0.4,0.471891,0.570513,0.072132,6,7,1,distance,
25,KNN,0.380645,0.463303,0.564103,0.0759868,6,7,2,distance,
