In [14]:
import pandas as pd
import numpy as np
import feature_engine
from sklearn.model_selection import train_test_split
import seaborn as sns

In [15]:
data = pd.read_csv(r'../../../data/train.csv')
data.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,1,2596,51,3,258,0,510,221,232,148,...,0,0,0,0,0,0,0,0,0,5
1,2,2590,56,2,212,-6,390,220,235,151,...,0,0,0,0,0,0,0,0,0,5
2,3,2804,139,9,268,65,3180,234,238,135,...,0,0,0,0,0,0,0,0,0,2
3,4,2785,155,18,242,118,3090,238,238,122,...,0,0,0,0,0,0,0,0,0,2
4,5,2595,45,2,153,-1,391,220,234,150,...,0,0,0,0,0,0,0,0,0,5


## Split of data

In [20]:
y_train.value_counts(normalize=True) * 100, y_test.value_counts(normalize=True) * 100

(2    14.699074
 7    14.393188
 4    14.318783
 3    14.310516
 6    14.136905
 1    14.112103
 5    14.029431
 Name: Cover_Type, dtype: float64,
 5    15.310847
 1    14.980159
 6    14.880952
 3    14.186508
 4    14.153439
 7    13.855820
 2    12.632275
 Name: Cover_Type, dtype: float64)

##  Transformations

### Pipeline 1

In this pipe line some basic Feature engineering steps are made, the creation of a Euclidian Distance is set here.

In [40]:
from sklearn.pipeline import Pipeline
pipe_1 = Pipeline([
    ('dropuniquefeatures', DropIdentifierFeatures()),
    ('du', FromDummiesToCategories(cols_to_operate=soil_columns)),
    ('dp', DropConstantFeatures(tol=0.99)),
    ('dd', DropDuplicateFeatures()),
    ('dt', DistanceTransformer()),
    ('dcf', DropCorrelatedFeatures(variables=None, method='pearson', threshold=0.80)),
    ('mxe', MinMaxScaler())
])
X_train_pipe_1 = pipe_1.fit_transform(X_train)

### Pipeline 2
This pipeline is exactly as the firstone except that I create a bins for Aspect and Hillshade.

In [25]:
pipe_2 = Pipeline([
    ('dropuniquefeatures', DropIdentifierFeatures()),
    ('du', FromDummiesToCategories(cols_to_operate=soil_columns)),
    ('dp', DropConstantFeatures(tol=0.99)),
    ('dd', DropDuplicateFeatures()),
    ('dt', DistanceTransformer()),
    ('dteq', EqualWidthDiscretiser(bins=15, variables=['EuclideanDistanceHidroloy'])),
    ('ed', EqualWidthDiscretiser(bins=36, variables=['Aspect'])),
    ('edh', EqualWidthDiscretiser(bins=26, variables=['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm'])),
    ('dcf', DropCorrelatedFeatures(variables=None, method='pearson', threshold=0.80)),
])
X_train_pipe_2 = pipe_2.fit_transform(X_train)
X_train_pipe_2.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area1,Wilderness_Area2,Wilderness_Area3,Wilderness_Area4,EuclideanDistanceHidroloy
7711,3151,2,19,2828,21,16,12,2200,0,0,1,0,11
1466,3101,30,14,3399,18,22,20,631,1,0,0,0,3
12128,2079,32,32,330,12,15,20,908,0,0,0,1,4
6301,2314,12,15,644,24,21,11,702,0,0,0,1,0
9822,3164,11,4,872,23,22,14,2536,1,0,0,0,0


### Pipeline 3
This is the most simple of all the pipe lines just droping duplicates, correlated and constant features

In [26]:
pipe_3 = Pipeline([
    ('dropuniquefeatures', DropIdentifierFeatures()),
    ('dp', DropConstantFeatures(tol=0.99)),
    ('dd', DropDuplicateFeatures()),
    ('dcf', DropCorrelatedFeatures(variables=None, method='pearson', threshold=0.80)),
])
X_train_pipe_3 = pipe_3.fit_transform(X_train)
X_train_pipe_3.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type23,Soil_Type24,Soil_Type29,Soil_Type30,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type38,Soil_Type39,Soil_Type40
7711,3151,23,19,1047,107,2828,206,197,122,2200,...,0,1,0,0,0,0,0,0,0,0
1466,3101,303,14,277,1,3399,181,233,192,631,...,0,0,1,0,0,0,0,0,0,0
12128,2079,324,32,360,230,330,126,189,193,908,...,0,0,0,0,0,0,0,0,0,0
6301,2314,121,15,0,0,644,244,227,107,702,...,0,0,0,0,0,0,0,0,0,0
9822,3164,117,4,0,0,872,227,236,143,2536,...,1,0,0,0,0,0,0,0,0,0


### Pipeline 4
This pipeline takes ito count the recommendations of Data analysis.

In [27]:
from feature_engine.transformation import YeoJohnsonTransformer
from sklearn.preprocessing import QuantileTransformer

In [28]:
pipe_4 = Pipeline([
    ('dropuniquefeatures', DropIdentifierFeatures()),
    ('dp', DropConstantFeatures(tol=0.99)),
    ('dd', DropDuplicateFeatures()),
    ('dcf', DropCorrelatedFeatures(variables=None, method='pearson', threshold=0.80)),
    ('yj', YeoJohnsonTransformer(variables=['Slope']))
])
X_train_pipe_4 = pipe_4.fit_transform(X_train)
X_train_pipe_4.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type23,Soil_Type24,Soil_Type29,Soil_Type30,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type38,Soil_Type39,Soil_Type40
7711,3151,23,6.874278,1047,107,2828,206,197,122,2200,...,0,1,0,0,0,0,0,0,0,0
1466,3101,303,5.694497,277,1,3399,181,233,192,631,...,0,0,1,0,0,0,0,0,0,0
12128,2079,324,9.374313,360,230,330,126,189,193,908,...,0,0,0,0,0,0,0,0,0,0
6301,2314,121,5.94476,0,0,644,244,227,107,702,...,0,0,0,0,0,0,0,0,0,0
9822,3164,117,2.459879,0,0,872,227,236,143,2536,...,1,0,0,0,0,0,0,0,0,0


### Pipeline 5
Applying MinMaxScaler

In [29]:
from sklearn.preprocessing import MinMaxScaler

In [30]:
from sklearn.pipeline import Pipeline
pipe_5 = Pipeline([
    ('dropuniquefeatures', DropIdentifierFeatures()),
    ('dp', DropConstantFeatures(tol=0.99)),
    ('dd', DropDuplicateFeatures()),
    ('dt', DistanceTransformer()),
    ('dcf', DropCorrelatedFeatures(variables=None, method='pearson', threshold=0.80)),
    ('minmaxscaler', MinMaxScaler()),
])
X_train_pipe_5 = pipe_5.fit_transform(X_train)
X_train_pipe_5 = pd.DataFrame(data=X_train_pipe_5, columns=pipe_5.named_steps['dcf'].variables)
X_train_pipe_5.head(5)

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area1,Wilderness_Area2,...,Soil_Type24,Soil_Type29,Soil_Type30,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type38,Soil_Type39,Soil_Type40,EuclideanDistanceHidroloy
0,0.64854,0.063889,0.365385,0.41045,0.811024,0.632258,0.491935,0.3146,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.775608
1,0.623364,0.841667,0.269231,0.493324,0.712598,0.864516,0.774194,0.090233,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.204137
2,0.108761,0.9,0.615385,0.047896,0.496063,0.580645,0.778226,0.129844,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.314826
3,0.22709,0.336111,0.288462,0.093469,0.96063,0.825806,0.431452,0.100386,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.655086,0.325,0.076923,0.12656,0.893701,0.883871,0.576613,0.362648,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Comparing simple model performaces with pipelines

### Pipeline 1

In [31]:
from lazypredict.Supervised import LazyClassifier



In [32]:
clf_p1 = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
X_test_pipe_1 = pipe_1.transform(X_test)
X_test_pipe_1 = pd.DataFrame(data=X_test_pipe_1, columns=pipe_1.named_steps['dcf'].variables)
models_1, predictions_1 = clf_p1.fit(X_train_pipe_1, X_test_pipe_1, y_train, y_test)
models_1.head(5)

 90%|█████████████████████████████████████████████████████████████████████████▊        | 27/30 [01:02<00:04,  1.56s/it]



100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [01:09<00:00,  2.33s/it]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ExtraTreesClassifier,0.84,0.84,,0.84,1.42
RandomForestClassifier,0.84,0.84,,0.84,2.64
XGBClassifier,0.84,0.83,,0.83,6.06
LGBMClassifier,0.83,0.83,,0.83,1.44
BaggingClassifier,0.82,0.82,,0.82,0.77


### Pipeline 2

In [None]:
clf_p2 = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
X_test_pipe_2 = pipe_2.transform(X_test)
models_2, predictions_2 = clf_p2.fit(X_train_pipe_2, X_test_pipe_2, y_train, y_test)
models_2.head(5)

In [None]:
X_test_pipe_2

### Pipeline 3

In [None]:
clf_p3 = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
X_test_pipe_3 = pipe_3.transform(X_test)
models_3, predictions_3 = clf_p3.fit(X_train_pipe_3, X_test_pipe_3, y_train, y_test)
models_3.head(5)

### Pipeline 4

In [None]:
clf_p4 = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
X_test_pipe_4 = pipe_4.transform(X_test)
models_4, predictions_4 = clf_p4.fit(X_train_pipe_4, X_test_pipe_4, y_train, y_test)
models_4.head(5)

### Pipeline 5

In [None]:
clf_p5 = LazyClassifier(verbose=0,custom_metric=None)
X_test_pipe_5 = pipe_5.transform(X_test)
X_test_pipe_5 = pd.DataFrame(data=X_test_pipe_5, columns=pipe_5.named_steps['dcf'].variables)
models_5, predictions_5 = clf_p5.fit(X_train_pipe_5, X_test_pipe_5, y_train, y_test)
models_5.head(5)

## Cross Validation With different Pipelines


### Pipeline 2

### Modification of Pipeline to Classify


In [36]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
pipe_2_rf = Pipeline([
    ('dropuniquefeatures', DropIdentifierFeatures()),
    ('du', FromDummiesToCategories(cols_to_operate=soil_columns)),
    ('dp', DropConstantFeatures(tol=0.99)),
    ('dd', DropDuplicateFeatures()),
    ('dt', DistanceTransformer()),
    ('dteq', EqualWidthDiscretiser(bins=15, variables=['EuclideanDistanceHidroloy'])),
    ('ed', EqualWidthDiscretiser(bins=36, variables=['Aspect'])),
    ('edh', EqualWidthDiscretiser(bins=26, variables=['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm'])),
    ('dcf', DropCorrelatedFeatures(variables=None, method='pearson', threshold=0.80)),
    ('rf', RandomForestClassifier())
])

In [37]:
rfpg1 ={
    'rf__n_estimators': [100, 150, 200],
    'rf__max_depth': [20,40,60,80],
    'rf__min_samples_split': [1, 2, 3, 4, 5],
}

In [38]:
grid1_pipe_2 = GridSearchCV(pipe_2_rf, param_grid=rfpg1, cv=5, n_jobs=-1, verbose=3, scoring='precision')
grid1_pipe_2.fit(X_train, y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 285 out of 300 | elapsed:    3.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    3.1s finished


ValueError: Need to specify at least one of 'labels', 'index' or 'columns'

In [None]:
print("Best cross-validation accuracy: {:.2f}".format(grid1_pipe_2.best_score_)) 
print("Test set score: {:.2f}".format(grid1_pipe_2.score(X_test, y_test))) 
print("Best parameters: {}".format(grid1_pipe_2.best_params_))

In [117]:
pd.DataFrame(grid1_pipe_2.cv_results_).sort_values(['rank_test_score']).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_rf__max_depth,param_rf__min_samples_split,param_rf__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
20,6.36,0.09,0.32,0.04,40,2,200,"{'rf__max_depth': 40, 'rf__min_samples_split':...",0.85,0.85,0.85,0.84,0.86,0.85,0.01,1
50,6.1,0.14,0.34,0.06,80,2,200,"{'rf__max_depth': 80, 'rf__min_samples_split':...",0.85,0.85,0.85,0.84,0.86,0.85,0.01,2
52,5.11,0.16,0.27,0.01,80,3,150,"{'rf__max_depth': 80, 'rf__min_samples_split':...",0.85,0.85,0.85,0.84,0.86,0.85,0.01,3
22,4.91,0.14,0.27,0.01,40,3,150,"{'rf__max_depth': 40, 'rf__min_samples_split':...",0.85,0.86,0.85,0.84,0.86,0.85,0.01,4
41,6.57,0.2,0.34,0.02,60,4,200,"{'rf__max_depth': 60, 'rf__min_samples_split':...",0.85,0.85,0.85,0.84,0.86,0.85,0.01,5


In [118]:
grid1_pipe_2.best_estimator_['rf'].feature_importances_

array([0.2563412 , 0.04942836, 0.04839844, 0.11164728, 0.04397283,
       0.04132529, 0.04239986, 0.0905162 , 0.01856196, 0.00362147,
       0.01620046, 0.04685506, 0.00197226, 0.00727582, 0.01881156,
       0.01364721, 0.00155334, 0.00362392, 0.02466717, 0.00354617,
       0.0044374 , 0.00527022, 0.00096646, 0.00679133, 0.00505615,
       0.00521017, 0.00250736, 0.00574143, 0.0082223 , 0.00273638,
       0.00591706, 0.00415678, 0.01836003, 0.01866751, 0.0093477 ,
       0.05224587])

In [119]:
grid1_pipe_2

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('dropuniquefeatures',
                                        DropIdentifierFeatures()),
                                       ('du',
                                        FromDummiesToCategories(cols_to_operate=None)),
                                       ('dp', DropConstantFeatures(tol=0.99)),
                                       ('dd', DropDuplicateFeatures()),
                                       ('dt', DistanceTransformer()),
                                       ('dteq',
                                        EqualWidthDiscretiser(bins=15,
                                                              variables=['EuclideanDistanceHidroloy'])),
                                       ('ed',
                                        EqualWidthDiscretiser(bins=36,
                                                              variables=['Aspect'])),
                                       ('edh',
                       

## GridSearch Pipeline 3

In [120]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
pipe_3_rf = Pipeline([
    ('dropuniquefeatures', DropIdentifierFeatures()),
    ('dp', DropConstantFeatures(tol=0.99)),
    ('dd', DropDuplicateFeatures()),
    ('dcf', DropCorrelatedFeatures(variables=None, method='pearson', threshold=0.80)),
    ('rf', RandomForestClassifier()),
])

In [121]:
rfpg2 ={
    'rf__n_estimators': [100, 150, 200],
    'rf__max_depth': [20,40,60,80],
    'rf__min_samples_split': [1, 2, 3, 4, 5],
}
grid1_pipe_3 = GridSearchCV(pipe_3_rf, param_grid=rfpg2, cv=5, n_jobs=-1, verbose=3)
grid1_pipe_3.fit(X_train, y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   52.8s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  2.5min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('dropuniquefeatures',
                                        DropIdentifierFeatures()),
                                       ('dp', DropConstantFeatures(tol=0.99)),
                                       ('dd', DropDuplicateFeatures()),
                                       ('dcf', DropCorrelatedFeatures()),
                                       ('rf', RandomForestClassifier())]),
             n_jobs=-1,
             param_grid={'rf__max_depth': [20, 40, 60, 80],
                         'rf__min_samples_split': [1, 2, 3, 4, 5],
                         'rf__n_estimators': [100, 150, 200]},
             verbose=3)

In [122]:
print("Best cross-validation accuracy: {:.2f}".format(grid1_pipe_3.best_score_)) 
print("Test set score: {:.2f}".format(grid1_pipe_3.score(X_test, y_test))) 
print("Best parameters: {}".format(grid1_pipe_3.best_params_))

Best cross-validation accuracy: 0.86
Test set score: 0.87
Best parameters: {'rf__max_depth': 40, 'rf__min_samples_split': 2, 'rf__n_estimators': 200}


In [123]:
pd.DataFrame(grid1_pipe_3.cv_results_).sort_values(['rank_test_score']).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_rf__max_depth,param_rf__min_samples_split,param_rf__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
20,6.14,0.06,0.25,0.01,40,2,200,"{'rf__max_depth': 40, 'rf__min_samples_split':...",0.86,0.86,0.85,0.85,0.87,0.86,0.01,1
49,4.65,0.1,0.18,0.01,80,2,150,"{'rf__max_depth': 80, 'rf__min_samples_split':...",0.86,0.86,0.85,0.85,0.86,0.86,0.01,2
38,6.05,0.06,0.24,0.01,60,3,200,"{'rf__max_depth': 60, 'rf__min_samples_split':...",0.86,0.85,0.85,0.85,0.87,0.85,0.01,3
50,6.1,0.06,0.25,0.01,80,2,200,"{'rf__max_depth': 80, 'rf__min_samples_split':...",0.86,0.86,0.84,0.85,0.86,0.85,0.01,4
4,4.76,0.1,0.19,0.01,20,2,150,"{'rf__max_depth': 20, 'rf__min_samples_split':...",0.86,0.86,0.85,0.85,0.86,0.85,0.01,5


## Pipeline 4

In [124]:
pipe_4_rf = Pipeline([
    ('dropuniquefeatures', DropIdentifierFeatures()),
    ('dp', DropConstantFeatures(tol=0.99)),
    ('dd', DropDuplicateFeatures()),
    ('dcf', DropCorrelatedFeatures(variables=None, method='pearson', threshold=0.80)),
    ('yj', YeoJohnsonTransformer(variables=['Slope'])),
    ('rf', RandomForestClassifier()),
])

In [125]:
rfpg3 ={
    'rf__n_estimators': [100, 150, 200],
    'rf__max_depth': [20,40,60,80],
    'rf__min_samples_split': [1, 2, 3, 4, 5],
}
grid1_pipe_4 = GridSearchCV(pipe_4_rf, param_grid=rfpg3, cv=5, n_jobs=-1, verbose=3, scoring='precision')
grid1_pipe_4.fit(X_train, y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   52.1s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  2.5min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('dropuniquefeatures',
                                        DropIdentifierFeatures()),
                                       ('dp', DropConstantFeatures(tol=0.99)),
                                       ('dd', DropDuplicateFeatures()),
                                       ('dcf', DropCorrelatedFeatures()),
                                       ('yj',
                                        YeoJohnsonTransformer(variables=['Slope'])),
                                       ('rf', RandomForestClassifier())]),
             n_jobs=-1,
             param_grid={'rf__max_depth': [20, 40, 60, 80],
                         'rf__min_samples_split': [1, 2, 3, 4, 5],
                         'rf__n_estimators': [100, 150, 200]},
             verbose=3)

In [127]:
print("Best cross-validation accuracy: {:.2f}".format(grid1_pipe_4.best_score_)) 
print("Test set score: {:.2f}".format(grid1_pipe_4.score(X_test, y_test))) 
print("Best parameters: {}".format(grid1_pipe_4.best_params_))
pd.DataFrame(grid1_pipe_4.cv_results_).sort_values(['rank_test_score']).head()

Best cross-validation accuracy: 0.86
Test set score: 0.86
Best parameters: {'rf__max_depth': 40, 'rf__min_samples_split': 2, 'rf__n_estimators': 200}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_rf__max_depth,param_rf__min_samples_split,param_rf__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
20,6.07,0.11,0.25,0.01,40,2,200,"{'rf__max_depth': 40, 'rf__min_samples_split':...",0.87,0.86,0.85,0.85,0.86,0.86,0.01,1
35,6.02,0.09,0.24,0.01,60,2,200,"{'rf__max_depth': 60, 'rf__min_samples_split':...",0.87,0.86,0.85,0.85,0.86,0.86,0.01,2
50,6.08,0.14,0.25,0.01,80,2,200,"{'rf__max_depth': 80, 'rf__min_samples_split':...",0.86,0.86,0.85,0.84,0.86,0.86,0.01,3
18,3.18,0.1,0.13,0.02,40,2,100,"{'rf__max_depth': 40, 'rf__min_samples_split':...",0.86,0.86,0.85,0.85,0.86,0.86,0.01,4
19,4.53,0.13,0.18,0.01,40,2,150,"{'rf__max_depth': 40, 'rf__min_samples_split':...",0.86,0.86,0.85,0.84,0.86,0.86,0.01,5


In [128]:
test = pd.read_csv(r'../../../data/test.csv')

In [129]:
test.shape

(565892, 55)

In [130]:
test['Cover_Type'] = grid1_pipe_4.predict(test)

In [131]:
to_kaggle = test[['Id', 'Cover_Type']]


In [132]:
to_kaggle.to_csv('pipe4_predictions', index=False)

ValueError: The number of columns in this dataset is different from the one used to fit this transformer (when using the fit() method).

In [136]:
test['Cover_Type'] = grid1_pipe_2.predict(test.drop(['Cover_Type'], axis=1))

In [137]:
to_kaggle = test[['Id', 'Cover_Type']]
to_kaggle.to_csv('pipe2_predictions', index=False)

In [139]:
test['Cover_Type'] = grid1_pipe_3.predict(test.drop(['Cover_Type'], axis=1))
to_kaggle = test[['Id', 'Cover_Type']]
to_kaggle.to_csv('pipe3_predictions', index=False)