In [100]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns

from feature_engine.discretisation import EqualWidthDiscretiser, EqualFrequencyDiscretiser
from feature_engine.selection  import DropConstantFeatures, DropDuplicateFeatures, DropCorrelatedFeatures, SelectBySingleFeaturePerformance, RecursiveFeatureElimination, SmartCorrelatedSelection, DropFeatures 
from feature_engine.encoding import RareLabelEncoder, OneHotEncoder, OrdinalEncoder
from feature_engine.outliers import Winsorizer
from feature_engine.creation import MathematicalCombination
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import metrics

# Custom Transformers

In [2]:
class DistanceTransformer(BaseEstimator, TransformerMixin):
    # TODO create a tranformer that do its for any numeric variables in a pandas dataframe
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        X['EuclideanDistanceHidroloy'] = np.around(
            np.sqrt(X['Horizontal_Distance_To_Hydrology'] **2 +
                    X['Vertical_Distance_To_Hydrology'] **2), 
            4)
        X['Elevation_Vertical_Hydro_Minus'] = X['Elevation'] - X['Vertical_Distance_To_Hydrology']
        X['Elevation_Vertical_Hydro_Plus'] = X['Elevation'] + X['Vertical_Distance_To_Hydrology']
        X['Elevation_Vertical_Hydro_Prod'] = X['Elevation'] * X['Vertical_Distance_To_Hydrology']
        
        X['Elevation_Horizontal_Hydro_Minus'] = X['Elevation'] - X['Horizontal_Distance_To_Hydrology']
        X['Elevation_Horizontal_Hydro_Plus'] = X['Elevation'] + X['Horizontal_Distance_To_Hydrology']
        X['Elevation_Horizontal_Hydro_Prod'] = X['Elevation'] * X['Horizontal_Distance_To_Hydrology']
        
        X['Elevation_Horizontal_Fire_Minus'] = X['Elevation'] - X['Horizontal_Distance_To_Fire_Points']
        X['Elevation_Horizontal_Fire_Plus'] = X['Elevation'] + X['Horizontal_Distance_To_Fire_Points']
        X['Elevation_Horizontal_Fire_Prod'] = X['Elevation'] * X['Horizontal_Distance_To_Fire_Points']
        
        X['Elevation_Horizontal_Roadways_Minus'] = X['Elevation'] - X['Horizontal_Distance_To_Roadways']
        X['Elevation_Horizontal_Roadways_Plus'] = X['Elevation'] + X['Horizontal_Distance_To_Roadways']
        X['Elevation_Horizontal_Roadways_Prod'] = X['Elevation'] * X['Horizontal_Distance_To_Roadways']
                
        X['Hidrology_Horizonal_Fire_Minus'] = X['Horizontal_Distance_To_Hydrology'] - X['Horizontal_Distance_To_Fire_Points']
        X['Hidrology_Horizonal_Fire_Plus'] = X['Horizontal_Distance_To_Hydrology'] + X['Horizontal_Distance_To_Fire_Points']
        X['Hidrology_Horizonal_Fire_Prod'] = X['Horizontal_Distance_To_Hydrology'] * X['Horizontal_Distance_To_Fire_Points']
        
        X['Hidrology_Horizonal_Roadways_Minus'] = X['Horizontal_Distance_To_Hydrology'] - X['Horizontal_Distance_To_Roadways']
        X['Hidrology_Horizonal_Roadways_Plus'] = X['Horizontal_Distance_To_Hydrology'] + X['Horizontal_Distance_To_Roadways']
        X['Hidrology_Horizonal_Roadways_Prod'] = X['Horizontal_Distance_To_Hydrology'] * X['Horizontal_Distance_To_Roadways']
        
        X['Hidrology_Horizonal_Vertical_Minus'] = X['Horizontal_Distance_To_Hydrology'] - X['Vertical_Distance_To_Hydrology']
        X['Hidrology_Horizonal_Vertical_Plus'] = X['Horizontal_Distance_To_Hydrology'] + X['Vertical_Distance_To_Hydrology']
        X['Hidrology_Horizonal_Vertical_Prod'] = X['Horizontal_Distance_To_Hydrology'] * X['Vertical_Distance_To_Hydrology']
        
        X['Hidrology_Vertical_Fire_Minus'] = X['Vertical_Distance_To_Hydrology'] - X['Horizontal_Distance_To_Fire_Points']
        X['Hidrology_Vertical_Fire_Plus'] = X['Vertical_Distance_To_Hydrology'] + X['Horizontal_Distance_To_Fire_Points']
        X['Hidrology_Vertical_Fire_Prod'] = X['Vertical_Distance_To_Hydrology'] * X['Horizontal_Distance_To_Fire_Points']
        
        X['Hidrology_Vertical_Fire_Minus'] = X['Vertical_Distance_To_Hydrology'] - X['Horizontal_Distance_To_Roadways']
        X['Hidrology_Vertical_Fire_Plus'] = X['Vertical_Distance_To_Hydrology'] + X['Horizontal_Distance_To_Roadways']
        X['Hidrology_Vertical_Fire_Prod'] = X['Vertical_Distance_To_Hydrology'] * X['Horizontal_Distance_To_Roadways']
        
        X.drop(['Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways', 'Horizontal_Distance_To_Fire_Points'], axis=1, inplace=True)
        return X

In [3]:
class DropIdentifierFeatures(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        X.drop('Id', axis=1, inplace=True)
        return X

In [4]:
from sklearn.utils.validation import check_is_fitted

class FromDummiesToCategories(BaseEstimator, TransformerMixin):
    def __init__(self, cols_to_operate, new_column_name):
        self.cols_to_operate = cols_to_operate
        self.new_column_name = new_column_name
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        X1 = pd.DataFrame(X[self.cols_to_operate])
        serie = X1.columns[np.where(X1!=0)[1]]
        X[self.new_column_name] = serie
        X.drop(self.cols_to_operate, axis=1, inplace=True)
        return X
    

In [5]:
class AspectTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, column, new_name):
        self.column = column
        self.new_name = new_name
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        X[self.new_name] = X[self.column].copy().apply(self._reclassify_aspect)
        X.drop(self.column, axis=1, inplace=True)
        return X
        
    def _reclassify_aspect(self, x):
        if x<0:
            return 'Flat'
        if  0 >= x < 45:
            return 'North'
        if 45 >= x < 90:
            return 'North_East'
        if 90 >= x < 135:
            return 'East'
        if 135 >= x < 180:
            return 'South_East'
        if 180 >= x < 225:
            return 'South'
        if 225 >= x < 270:
            return 'South_West'
        if 270 >= x < 315:
            return 'West'
        if 315 >= x <360:
            return 'North_West'
        if 360 >= x :
            return 'North_West'
        

# Load Data

In [6]:
data = pd.read_csv(r'../../../data/train.csv')
data.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,1,2596,51,3,258,0,510,221,232,148,...,0,0,0,0,0,0,0,0,0,5
1,2,2590,56,2,212,-6,390,220,235,151,...,0,0,0,0,0,0,0,0,0,5
2,3,2804,139,9,268,65,3180,234,238,135,...,0,0,0,0,0,0,0,0,0,2
3,4,2785,155,18,242,118,3090,238,238,122,...,0,0,0,0,0,0,0,0,0,2
4,5,2595,45,2,153,-1,391,220,234,150,...,0,0,0,0,0,0,0,0,0,5


## Data split

In [7]:
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

In [38]:
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
X = data.drop('Cover_Type', axis=1).copy()
y = data['Cover_Type'].copy()
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [9]:
soil_columns = [x for x in X.columns if x.startswith('Soil_Type')]
wilder_columns = [x for x in X.columns if x.startswith('Wilder')]

## Pipeline 1

In [52]:
pipeline_list_1 = [
    ('dropuniquefeatures', DropIdentifierFeatures()),
    ('aspect', AspectTransformer(column='Aspect', new_name='Orientation')),
    ('soil_columns_dummies', FromDummiesToCategories(cols_to_operate=soil_columns, new_column_name='Soil_Type')),
    ('wilder_columns_dummies', FromDummiesToCategories(cols_to_operate=wilder_columns, new_column_name='Wilderness')),
    ('soil_rare', RareLabelEncoder(tol=0.05, variables=['Soil_Type'])),
    ('dp', DropConstantFeatures(tol=0.99)),
    ('dd', DropDuplicateFeatures()),
    ('dt', DistanceTransformer()),
     ('one_hot',OneHotEncoder(variables=['Soil_Type', 'Wilderness', 'Orientation'])),
    ('dteq', EqualFrequencyDiscretiser(q=10, variables=['EuclideanDistanceHidroloy',                                                   
                                                       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm'])),
    ('dcf', DropCorrelatedFeatures(variables=None, method='pearson', threshold=0.80)),    
]   
pipeline_1 = Pipeline(pipeline_list_1)
X_train_pipe_1 = pipeline_1.fit_transform(X_train)

Unnamed: 0,Elevation,Slope,Hillshade_9am,Hillshade_Noon,EuclideanDistanceHidroloy,Elevation_Vertical_Hydro_Prod,Elevation_Horizontal_Fire_Minus,Elevation_Horizontal_Fire_Plus,Elevation_Horizontal_Roadways_Minus,Hidrology_Horizonal_Fire_Prod,...,Wilderness_Wilderness_Area4,Wilderness_Wilderness_Area2,Orientation_South_East,Orientation_South,Orientation_East,Orientation_North_East,Orientation_South_West,Orientation_North_West,Orientation_West,Orientation_North
520,3047,3,5,7,8,429627,-544,6638,-2853,3346812,...,0,0,1,0,0,0,0,0,0,0
9491,3166,20,6,8,4,0,2805,3527,1004,69312,...,0,0,0,1,0,0,0,0,0,0
12076,3372,10,7,6,5,128136,775,5969,990,628474,...,0,0,1,0,0,0,0,0,0,0
10197,2863,17,5,1,3,188958,584,5142,675,398825,...,0,0,0,0,1,0,0,0,0,0
4775,2408,21,1,1,7,529760,1266,3550,1451,465936,...,1,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14984,2491,14,7,8,2,7473,1830,3152,1431,88574,...,0,0,0,1,0,0,0,0,0,0
4940,2289,22,0,3,6,249501,1082,3496,1228,385033,...,1,0,0,0,0,0,0,1,0,0
5458,2299,9,4,4,0,0,1034,3564,1026,0,...,1,0,0,0,0,1,0,0,0,0
14996,2472,10,7,5,0,12360,1801,3143,942,28182,...,0,0,1,0,0,0,0,0,0,0


In [83]:
clf_p1 = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
X_test_pipe_1 = pipeline_1.transform(X_test)
X_test_pipe_1 = pd.DataFrame(data=X_test_pipe_1, columns=pipeline_1.named_steps['dcf'].variables)
models_1, predictions_1 = clf_p1.fit(X_train_pipe_1, X_test_pipe_1, y_train, y_test)
models_1.head(5)

 90%|█████████████████████████████████████████████████████████████████████████▊        | 27/30 [01:31<00:07,  2.41s/it]



100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [01:39<00:00,  3.31s/it]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
RandomForestClassifier,0.87,0.87,,0.86,2.41
ExtraTreesClassifier,0.86,0.86,,0.86,1.62
LGBMClassifier,0.86,0.86,,0.85,1.84
XGBClassifier,0.85,0.85,,0.85,5.42
BaggingClassifier,0.84,0.84,,0.84,1.14


## Pipeline 2

In this pipeline I will change some transformers as :
* Outliers in Elevation will be windorized
* EuclidianDiscanteHidrology will not be Discretized, its outliers willbe windorized.
* Hillshade_9am will not be Discretized, its outliers will be windorized.
* Hillshade_Noon will not be Discretized, its outliers will be windorized.
* Hillshade_3pm will not be Discretized, its outliers will be windorized

In [85]:
pipeline_list_2 = [
    ('dropuniquefeatures', DropIdentifierFeatures()),
    ('aspect', AspectTransformer(column='Aspect', new_name='Orientation')),
    ('soil_columns_dummies', FromDummiesToCategories(cols_to_operate=soil_columns, new_column_name='Soil_Type')),
    ('wilder_columns_dummies', FromDummiesToCategories(cols_to_operate=wilder_columns, new_column_name='Wilderness')),
    ('soil_rare', RareLabelEncoder(tol=0.05, variables=['Soil_Type'])),
    ('dp', DropConstantFeatures(tol=0.99)),
    ('dd', DropDuplicateFeatures()),
    ('dt', DistanceTransformer()),
     ('one_hot',OneHotEncoder(variables=['Soil_Type', 'Wilderness', 'Orientation'])),
    ('winds', Winsorizer(variables=['EuclideanDistanceHidroloy',  'Elevation',                                                 
                                                       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm'])),    
    ('dcf', DropCorrelatedFeatures(variables=None, method='pearson', threshold=0.80)),    
]   
pipeline_2 = Pipeline(pipeline_list_2)
X_train_pipe_2 = pipeline_2.fit_transform(X_train)
X_train_pipe_2.head()

Unnamed: 0,Elevation,Slope,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,EuclideanDistanceHidroloy,Elevation_Vertical_Hydro_Prod,Elevation_Horizontal_Fire_Minus,Elevation_Horizontal_Fire_Plus,Elevation_Horizontal_Roadways_Minus,...,Wilderness_Wilderness_Area4,Wilderness_Wilderness_Area2,Orientation_South_East,Orientation_South,Orientation_East,Orientation_North_East,Orientation_South_West,Orientation_North_West,Orientation_West,Orientation_North
520,3047.0,3,224.0,238.0,149.0,888.24,429627,-544,6638,-2853,...,0,0,1,0,0,0,0,0,0,0
9491,3166.0,20,227.0,246.0,141.0,192.0,0,2805,3527,1004,...,0,0,0,1,0,0,0,0,0,0
12076,3372.0,10,238.0,229.0,121.0,244.97,128136,775,5969,990,...,0,0,1,0,0,0,0,0,0,0
10197,2863.0,17,226.0,202.0,104.0,187.03,188958,584,5142,675,...,0,0,0,0,1,0,0,0,0,0
4775,2408.0,21,184.0,195.0,144.0,463.53,529760,1266,3550,1451,...,1,0,0,0,0,1,0,0,0,0


In [87]:
X_test_pipe_2 = pipeline_2.transform(X_test)
X_test_pipe_2.head(5)

Unnamed: 0,Elevation,Slope,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,EuclideanDistanceHidroloy,Elevation_Vertical_Hydro_Prod,Elevation_Horizontal_Fire_Minus,Elevation_Horizontal_Fire_Plus,Elevation_Horizontal_Roadways_Minus,...,Wilderness_Wilderness_Area4,Wilderness_Wilderness_Area2,Orientation_South_East,Orientation_South,Orientation_East,Orientation_North_East,Orientation_South_West,Orientation_North_West,Orientation_West,Orientation_North
9553,3332.0,28,146.0,192.0,178.0,154.5,123284,968,5696,-52,...,0,0,0,0,0,0,0,1,0,0
12001,2743.0,40,83.0,207.0,246.0,610.17,713180,971,4515,1843,...,0,0,0,0,0,0,0,1,0,0
10321,3328.0,22,239.0,193.0,73.0,419.84,329472,2534,4122,2704,...,0,1,0,0,1,0,0,0,0,0
4602,2195.0,21,248.0,225.0,94.0,43.42,24145,965,3425,1134,...,1,0,1,0,0,0,0,0,0,0
4311,2311.0,7,218.0,226.0,143.0,0.0,0,2161,2461,1081,...,1,0,0,0,0,1,0,0,0,0


In [89]:
clf_p2 = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models_2, predictions_2 = clf_p2.fit(X_train_pipe_2, X_test_pipe_2, y_train, y_test)
models_2.head(5)

 90%|█████████████████████████████████████████████████████████████████████████▊        | 27/30 [01:35<00:07,  2.61s/it]



100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [01:43<00:00,  3.44s/it]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
RandomForestClassifier,0.87,0.87,,0.86,3.04
ExtraTreesClassifier,0.86,0.86,,0.86,1.66
LGBMClassifier,0.85,0.85,,0.85,1.91
XGBClassifier,0.85,0.85,,0.85,5.47
BaggingClassifier,0.83,0.83,,0.83,1.15


## Pipeline 3
This pipeline is similar to pipeline2 except it uses SmartcorrelatedSelection.

In [93]:
pipeline_list_3 = [
    ('dropuniquefeatures', DropIdentifierFeatures()),
    ('aspect', AspectTransformer(column='Aspect', new_name='Orientation')),
    ('soil_columns_dummies', FromDummiesToCategories(cols_to_operate=soil_columns, new_column_name='Soil_Type')),
    ('wilder_columns_dummies', FromDummiesToCategories(cols_to_operate=wilder_columns, new_column_name='Wilderness')),
    ('soil_rare', RareLabelEncoder(tol=0.05, variables=['Soil_Type'])),
    ('dp', DropConstantFeatures(tol=0.99)),
    ('dt', DistanceTransformer()),
     ('one_hot',OneHotEncoder(variables=['Soil_Type', 'Wilderness', 'Orientation'])),
    ('winds', Winsorizer(variables=['EuclideanDistanceHidroloy',  'Elevation',                                                 
                                                       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm'])),    
    ('dd', SmartCorrelatedSelection()),  
]   
pipeline_3 = Pipeline(pipeline_list_3)
X_train_pipe_3 = pipeline_3.fit_transform(X_train)
X_train_pipe_3.head()

Unnamed: 0,Slope,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,EuclideanDistanceHidroloy,Elevation_Vertical_Hydro_Prod,Elevation_Horizontal_Hydro_Minus,Elevation_Horizontal_Fire_Plus,Elevation_Horizontal_Roadways_Prod,Hidrology_Horizonal_Fire_Minus,...,Wilderness_Wilderness_Area4,Wilderness_Wilderness_Area2,Orientation_South_East,Orientation_South,Orientation_East,Orientation_North_East,Orientation_South_West,Orientation_North_West,Orientation_West,Orientation_North
520,3,224.0,238.0,149.0,888.24,429627,2115,6638,17977300,-2659,...,0,0,1,0,0,0,0,0,0,0
9491,20,227.0,246.0,141.0,192.0,0,2974,3527,6844892,-169,...,0,0,0,1,0,0,0,0,0,0
12076,10,238.0,229.0,121.0,244.97,128136,3130,5969,8032104,-2355,...,0,0,1,0,0,0,0,0,0,0
10197,17,226.0,202.0,104.0,187.03,188958,2688,5142,6264244,-2104,...,0,0,0,0,1,0,0,0,0,0
4775,21,184.0,195.0,144.0,463.53,529760,2000,3550,2304456,-734,...,1,0,0,0,0,1,0,0,0,0


In [94]:
X_test_pipe_3 = pipeline_3.transform(X_test)
X_test_pipe_3.head(5)

Unnamed: 0,Slope,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,EuclideanDistanceHidroloy,Elevation_Vertical_Hydro_Prod,Elevation_Horizontal_Hydro_Minus,Elevation_Horizontal_Fire_Plus,Elevation_Horizontal_Roadways_Prod,Hidrology_Horizonal_Fire_Minus,...,Wilderness_Wilderness_Area4,Wilderness_Wilderness_Area2,Orientation_South_East,Orientation_South,Orientation_East,Orientation_North_East,Orientation_South_West,Orientation_North_West,Orientation_West,Orientation_North
9553,28,146.0,192.0,178.0,154.5,123284,3182,5696,11275488,-2214,...,0,0,0,0,0,0,0,1,0,0
12001,40,83.0,207.0,246.0,610.17,713180,2191,4515,2468700,-1220,...,0,0,0,0,0,0,0,1,0,0
10321,22,239.0,193.0,73.0,419.84,329472,2920,4122,2076672,-386,...,0,1,0,0,1,0,0,0,0,0
4602,21,248.0,225.0,94.0,43.42,24145,2153,3425,2328895,-1188,...,1,0,1,0,0,0,0,0,0,0
4311,7,218.0,226.0,143.0,0.0,0,2311,2461,2842530,-150,...,1,0,0,0,0,1,0,0,0,0


In [95]:
clf_p3 = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models_3, predictions_3 = clf_p3.fit(X_train_pipe_3, X_test_pipe_3, y_train, y_test)


 90%|█████████████████████████████████████████████████████████████████████████▊        | 27/30 [01:19<00:06,  2.10s/it]



100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [01:25<00:00,  2.86s/it]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
RandomForestClassifier,0.85,0.85,,0.85,2.4
LGBMClassifier,0.85,0.85,,0.84,1.44
XGBClassifier,0.84,0.84,,0.84,4.87
ExtraTreesClassifier,0.84,0.84,,0.84,1.52
BaggingClassifier,0.82,0.82,,0.82,1.09


In [96]:
models_3.head(10)

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
RandomForestClassifier,0.85,0.85,,0.85,2.4
LGBMClassifier,0.85,0.85,,0.84,1.44
XGBClassifier,0.84,0.84,,0.84,4.87
ExtraTreesClassifier,0.84,0.84,,0.84,1.52
BaggingClassifier,0.82,0.82,,0.82,1.09
LabelSpreading,0.76,0.76,,0.76,13.62
LabelPropagation,0.76,0.76,,0.76,10.4
DecisionTreeClassifier,0.75,0.75,,0.75,0.22
SVC,0.75,0.75,,0.74,5.04
KNeighborsClassifier,0.74,0.74,,0.73,1.14


## Pipeline 4
This pipeline will drop Soil and wilderness columns

In [101]:
pipeline_list_4 = [
    ('dropuniquefeatures', DropIdentifierFeatures()),
    ('dropwild_soil',DropFeatures(soil_columns + wilder_columns)),
    ('aspect', AspectTransformer(column='Aspect', new_name='Orientation')),
    ('dp', DropConstantFeatures(tol=0.99)),
    ('dt', DistanceTransformer()),
    ('one_hot',OneHotEncoder(variables=['Orientation'])),
    ('winds', Winsorizer(variables=[
        'EuclideanDistanceHidroloy',  'Elevation', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm'])),    
    ('dd', SmartCorrelatedSelection()),  
]   
pipeline_4 = Pipeline(pipeline_list_4)
X_train_pipe_4 = pipeline_4.fit_transform(X_train)
X_train_pipe_4.head()

Unnamed: 0,Slope,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,EuclideanDistanceHidroloy,Elevation_Vertical_Hydro_Prod,Elevation_Horizontal_Hydro_Minus,Elevation_Horizontal_Fire_Plus,Elevation_Horizontal_Roadways_Prod,Hidrology_Horizonal_Fire_Minus,Hidrology_Horizonal_Fire_Prod,Hidrology_Vertical_Fire_Prod,Orientation_South_East,Orientation_South,Orientation_East,Orientation_North_East,Orientation_South_West,Orientation_North_West,Orientation_West,Orientation_North
520,3,224.0,238.0,149.0,888.24,429627,2115,6638,17977300,-2659,3346812,831900,1,0,0,0,0,0,0,0
9491,20,227.0,246.0,141.0,192.0,0,2974,3527,6844892,-169,69312,0,0,1,0,0,0,0,0,0
12076,10,238.0,229.0,121.0,244.97,128136,3130,5969,8032104,-2355,628474,90516,1,0,0,0,0,0,0,0
10197,17,226.0,202.0,104.0,187.03,188958,2688,5142,6264244,-2104,398825,144408,0,0,1,0,0,0,0,0
4775,21,184.0,195.0,144.0,463.53,529760,2000,3550,2304456,-734,465936,210540,0,0,0,1,0,0,0,0


In [102]:
X_test_pipe_4 = pipeline_4.transform(X_test)
X_test_pipe_4.head(5)

Unnamed: 0,Slope,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,EuclideanDistanceHidroloy,Elevation_Vertical_Hydro_Prod,Elevation_Horizontal_Hydro_Minus,Elevation_Horizontal_Fire_Plus,Elevation_Horizontal_Roadways_Prod,Hidrology_Horizonal_Fire_Minus,Hidrology_Horizonal_Fire_Prod,Hidrology_Vertical_Fire_Prod,Orientation_South_East,Orientation_South,Orientation_East,Orientation_North_East,Orientation_South_West,Orientation_North_West,Orientation_West,Orientation_North
9553,28,146.0,192.0,178.0,154.5,123284,3182,5696,11275488,-2214,354600,125208,0,0,0,0,0,1,0,0
12001,40,83.0,207.0,246.0,610.17,713180,2191,4515,2468700,-1220,978144,234000,0,0,0,0,0,1,0,0
10321,22,239.0,193.0,73.0,419.84,329472,2920,4122,2076672,-386,323952,61776,0,0,1,0,0,0,0,0
4602,21,248.0,225.0,94.0,43.42,24145,2153,3425,2328895,-1188,51660,11671,1,0,0,0,0,0,0,0
4311,7,218.0,226.0,143.0,0.0,0,2311,2461,2842530,-150,0,0,0,0,0,1,0,0,0,0


In [103]:
clf_p4 = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models_4, predictions_4 = clf_p4.fit(X_train_pipe_4, X_test_pipe_4, y_train, y_test)

 90%|█████████████████████████████████████████████████████████████████████████▊        | 27/30 [01:23<00:06,  2.06s/it]



100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [01:30<00:00,  3.01s/it]


In [104]:
models_4.head(10)

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LGBMClassifier,0.83,0.83,,0.83,1.54
RandomForestClassifier,0.83,0.83,,0.82,2.99
XGBClassifier,0.82,0.82,,0.82,5.22
ExtraTreesClassifier,0.82,0.82,,0.81,1.63
BaggingClassifier,0.79,0.79,,0.79,1.01
LabelPropagation,0.73,0.73,,0.73,9.32
LabelSpreading,0.73,0.73,,0.73,12.17
SVC,0.73,0.73,,0.72,4.7
DecisionTreeClassifier,0.72,0.72,,0.72,0.22
KNeighborsClassifier,0.71,0.71,,0.7,0.88


## Pipeline 5
This pipeline will use again the coluns of wilderness and soiltype, but will not make the aspect transformation.

In [106]:
pipeline_list_5 = [
    ('dropuniquefeatures', DropIdentifierFeatures()),
    ('soil_columns_dummies', FromDummiesToCategories(cols_to_operate=soil_columns, new_column_name='Soil_Type')),
    ('wilder_columns_dummies', FromDummiesToCategories(cols_to_operate=wilder_columns, new_column_name='Wilderness')),
    ('soil_rare', RareLabelEncoder(tol=0.05, variables=['Soil_Type'])),
    ('dp', DropConstantFeatures(tol=0.99)),
    ('dt', DistanceTransformer()),
     ('one_hot',OneHotEncoder(variables=['Soil_Type', 'Wilderness'])),
    ('winds', Winsorizer(variables=['EuclideanDistanceHidroloy',  'Elevation',                                                 
                                                       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm'])),    
    ('dd', SmartCorrelatedSelection()),  
]   
pipeline_5 = Pipeline(pipeline_list_5)
X_train_pipe_5 = pipeline_5.fit_transform(X_train)
X_train_pipe_5.head()

Unnamed: 0,Aspect,Slope,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,EuclideanDistanceHidroloy,Elevation_Vertical_Hydro_Prod,Elevation_Horizontal_Hydro_Minus,Elevation_Horizontal_Fire_Plus,Elevation_Horizontal_Roadways_Prod,...,Hidrology_Vertical_Fire_Prod,Soil_Type_Soil_Type29,Soil_Type_Rare,Soil_Type_Soil_Type10,Soil_Type_Soil_Type3,Soil_Type_Soil_Type4,Wilderness_Wilderness_Area1,Wilderness_Wilderness_Area3,Wilderness_Wilderness_Area4,Wilderness_Wilderness_Area2
520,135,3,224.0,238.0,149.0,888.24,429627,2115,6638,17977300,...,831900,1,0,0,0,0,1,0,0,0
9491,174,20,227.0,246.0,141.0,192.0,0,2974,3527,6844892,...,0,0,1,0,0,0,0,1,0,0
12076,112,10,238.0,229.0,121.0,244.97,128136,3130,5969,8032104,...,90516,0,1,0,0,0,1,0,0,0
10197,51,17,226.0,202.0,104.0,187.03,188958,2688,5142,6264244,...,144408,0,1,0,0,0,0,1,0,0
4775,1,21,184.0,195.0,144.0,463.53,529760,2000,3550,2304456,...,210540,0,1,0,0,0,0,0,1,0


In [107]:
X_test_pipe_5 = pipeline_5.transform(X_test)
X_test_pipe_5.head(5)

Unnamed: 0,Aspect,Slope,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,EuclideanDistanceHidroloy,Elevation_Vertical_Hydro_Prod,Elevation_Horizontal_Hydro_Minus,Elevation_Horizontal_Fire_Plus,Elevation_Horizontal_Roadways_Prod,...,Hidrology_Vertical_Fire_Prod,Soil_Type_Soil_Type29,Soil_Type_Rare,Soil_Type_Soil_Type10,Soil_Type_Soil_Type3,Soil_Type_Soil_Type4,Wilderness_Wilderness_Area1,Wilderness_Wilderness_Area3,Wilderness_Wilderness_Area4,Wilderness_Wilderness_Area2
9553,334,28,146.0,192.0,178.0,154.5,123284,3182,5696,11275488,...,125208,0,1,0,0,0,0,1,0,0
12001,282,40,83.0,207.0,246.0,610.17,713180,2191,4515,2468700,...,234000,0,1,0,0,0,0,1,0,0
10321,73,22,239.0,193.0,73.0,419.84,329472,2920,4122,2076672,...,61776,0,1,0,0,0,0,0,0,1
4602,134,21,248.0,225.0,94.0,43.42,24145,2153,3425,2328895,...,11671,0,0,0,1,0,0,0,1,0
4311,30,7,218.0,226.0,143.0,0.0,0,2311,2461,2842530,...,0,0,1,0,0,0,0,0,1,0


In [108]:
clf_p5 = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models_5, predictions_5 = clf_p5.fit(X_train_pipe_5, X_test_pipe_5, y_train, y_test)

 90%|█████████████████████████████████████████████████████████████████████████▊        | 27/30 [01:19<00:06,  2.04s/it]



100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [01:25<00:00,  2.86s/it]


In [109]:
models_5.head(10)

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ExtraTreesClassifier,0.86,0.86,,0.86,2.36
RandomForestClassifier,0.85,0.85,,0.85,2.93
LGBMClassifier,0.85,0.85,,0.84,1.34
XGBClassifier,0.84,0.84,,0.84,5.47
BaggingClassifier,0.82,0.82,,0.81,1.18
LabelSpreading,0.78,0.78,,0.78,13.29
LabelPropagation,0.78,0.78,,0.78,10.44
KNeighborsClassifier,0.78,0.78,,0.77,0.85
SVC,0.76,0.76,,0.76,4.63
DecisionTreeClassifier,0.76,0.76,,0.75,0.47


# Evaluation Pipeline 1

In [114]:
pipe_list_1_rf = [('rf', RandomForestClassifier(random_state=42))]
pipe_1_rf = Pipeline(pipeline_list_1 + pipe_list_1_rf)
rfpg1 ={
    'rf__n_estimators': [100, 150, 200],
    'rf__max_depth': [60,80, None],
    'rf__min_samples_split': [2, 3, 4],
}
grid1_pipe_1 = GridSearchCV(pipe_1_rf, param_grid=rfpg1, cv=sss, n_jobs=-1, verbose=3)
grid1_pipe_1.fit(X_train, y_train)
print("Best cross-validation accuracy: {:.2f}".format(grid1_pipe_1.best_score_)) 
print("Test set score: {:.2f}".format(grid1_pipe_1.score(X_test, y_test))) 
print("Best parameters: {}".format(grid1_pipe_1.best_params_))

Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   16.8s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:  1.8min finished


Best cross-validation accuracy: 0.86
Test set score: 0.87
Best parameters: {'rf__max_depth': 60, 'rf__min_samples_split': 2, 'rf__n_estimators': 200}


In [115]:
y_predict_pipe_1 = grid1_pipe_1.predict(X_test)

In [116]:
pd.DataFrame(metrics.confusion_matrix(y_test, y_predict_pipe_1, labels=y_test.unique().tolist()), columns =y_test.unique().tolist(), index = y_test.unique().tolist() )

Unnamed: 0,7,2,4,6,3,5,1
7,428,0,0,0,0,0,4
2,1,300,0,10,12,36,73
4,0,0,420,6,6,0,0
6,0,3,15,378,32,4,0
3,0,0,9,52,365,6,0
5,0,6,0,1,3,422,0
1,31,74,0,3,1,9,314


In [117]:
print(metrics.classification_report(y_test, y_predict_pipe_1, digits=3))

              precision    recall  f1-score   support

           1      0.803     0.727     0.763       432
           2      0.783     0.694     0.736       432
           3      0.871     0.845     0.858       432
           4      0.946     0.972     0.959       432
           5      0.885     0.977     0.928       432
           6      0.840     0.875     0.857       432
           7      0.930     0.991     0.960       432

    accuracy                          0.869      3024
   macro avg      0.866     0.869     0.866      3024
weighted avg      0.866     0.869     0.866      3024



### Analysis of Classifier
This classifier has problems with cover type 2 as it miss classify a large portion of the samples, specially it gets confusse class 2 with class 1

# Evaluation pipeline 2

In [119]:
pipe_list_2_rf = [('rf', RandomForestClassifier(random_state=42))]
pipe_2_rf = Pipeline(pipeline_list_2 + pipe_list_2_rf)
rfpg2 ={
    'rf__n_estimators': [100, 150, 200],
    'rf__max_depth': [60,80, None],
    'rf__min_samples_split': [2, 3, 4],
}
grid1_pipe_2 = GridSearchCV(pipe_2_rf, param_grid=rfpg2, cv=sss, n_jobs=-1, verbose=3)
grid1_pipe_2.fit(X_train, y_train)
print("Best cross-validation accuracy: {:.2f}".format(grid1_pipe_2.best_score_)) 
print("Test set score: {:.2f}".format(grid1_pipe_2.score(X_test, y_test))) 
print("Best parameters: {}".format(grid1_pipe_2.best_params_))

Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:  1.8min finished


Best cross-validation accuracy: 0.85
Test set score: 0.86
Best parameters: {'rf__max_depth': 60, 'rf__min_samples_split': 3, 'rf__n_estimators': 200}


In [120]:
y_predict_pipe_2 = grid1_pipe_2.predict(X_test)

In [121]:
pd.DataFrame(metrics.confusion_matrix(y_test, y_predict_pipe_2, labels=y_test.unique().tolist()), columns =y_test.unique().tolist(), index = y_test.unique().tolist() )

Unnamed: 0,7,2,4,6,3,5,1
7,426,0,0,0,0,1,5
2,2,289,0,12,10,40,79
4,0,0,418,8,6,0,0
6,0,3,15,375,35,4,0
3,0,1,12,52,361,6,0
5,0,8,0,3,4,416,1
1,29,76,0,3,1,11,312


In [122]:
print(metrics.classification_report(y_test, y_predict_pipe_2, digits=3))

              precision    recall  f1-score   support

           1      0.786     0.722     0.753       432
           2      0.767     0.669     0.714       432
           3      0.866     0.836     0.850       432
           4      0.939     0.968     0.953       432
           5      0.870     0.963     0.914       432
           6      0.828     0.868     0.847       432
           7      0.932     0.986     0.958       432

    accuracy                          0.859      3024
   macro avg      0.855     0.859     0.856      3024
weighted avg      0.855     0.859     0.856      3024



### Analysis of Classifier
This classifier and pipeline behavies worst that the first one.


# Conclusion
Im not happy with the results in this one, but im gonna stay with pipeline number 1, if in the future I have a better idea or if in my study I see somthing that is worth to try on Data Analysis I will do it in this dataset as i Really enjoy working on it. Now TO KAGGLE

In [123]:
test = pd.read_csv(r'../../../data/test.csv')

In [124]:
test['Cover_Type'] = grid1_pipe_1.predict(test)

In [125]:
to_kaggle = test[['Id', 'Cover_Type']]


In [127]:
to_kaggle.to_csv('grid1_pipe_1', index=False)