In [54]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
import seaborn as sns

from feature_engine.discretisation import EqualWidthDiscretiser, EqualFrequencyDiscretiser
from feature_engine.selection  import DropConstantFeatures, DropDuplicateFeatures, DropCorrelatedFeatures
from feature_engine.encoding import RareLabelEncoder, OneHotEncoder, OrdinalEncoder
from feature_engine.creation import MathematicalCombination
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV
from lazypredict.Supervised import LazyClassifier

# Custom Transformers

In [45]:
class DistanceTransformer(BaseEstimator, TransformerMixin):
    # TODO create a tranformer that do its for any numeric variables in a pandas dataframe
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        X['EuclideanDistanceHidroloy'] = np.around(
            np.sqrt(X['Horizontal_Distance_To_Hydrology'] **2 +
                    X['Vertical_Distance_To_Hydrology'] **2), 
            4)
        X['Elevation_Vertical_Hydro_Minus'] = X['Elevation'] - X['Vertical_Distance_To_Hydrology']
        X['Elevation_Vertical_Hydro_Plus'] = X['Elevation'] + X['Vertical_Distance_To_Hydrology']
        X['Elevation_Vertical_Hydro_Prod'] = X['Elevation'] * X['Vertical_Distance_To_Hydrology']
        
        X['Elevation_Horizontal_Hydro_Minus'] = X['Elevation'] - X['Horizontal_Distance_To_Hydrology']
        X['Elevation_Horizontal_Hydro_Plus'] = X['Elevation'] + X['Horizontal_Distance_To_Hydrology']
        X['Elevation_Horizontal_Hydro_Prod'] = X['Elevation'] * X['Horizontal_Distance_To_Hydrology']
        
        X['Elevation_Horizontal_Fire_Minus'] = X['Elevation'] - X['Horizontal_Distance_To_Fire_Points']
        X['Elevation_Horizontal_Fire_Plus'] = X['Elevation'] + X['Horizontal_Distance_To_Fire_Points']
        X['Elevation_Horizontal_Fire_Prod'] = X['Elevation'] * X['Horizontal_Distance_To_Fire_Points']
        
        X['Elevation_Horizontal_Roadways_Minus'] = X['Elevation'] - X['Horizontal_Distance_To_Roadways']
        X['Elevation_Horizontal_Roadways_Plus'] = X['Elevation'] + X['Horizontal_Distance_To_Roadways']
        X['Elevation_Horizontal_Roadways_Prod'] = X['Elevation'] * X['Horizontal_Distance_To_Roadways']
                
        X['Hidrology_Horizonal_Fire_Minus'] = X['Horizontal_Distance_To_Hydrology'] - X['Horizontal_Distance_To_Fire_Points']
        X['Hidrology_Horizonal_Fire_Plus'] = X['Horizontal_Distance_To_Hydrology'] + X['Horizontal_Distance_To_Fire_Points']
        X['Hidrology_Horizonal_Fire_Prod'] = X['Horizontal_Distance_To_Hydrology'] * X['Horizontal_Distance_To_Fire_Points']
        
        X['Hidrology_Horizonal_Roadways_Minus'] = X['Horizontal_Distance_To_Hydrology'] - X['Horizontal_Distance_To_Roadways']
        X['Hidrology_Horizonal_Roadways_Plus'] = X['Horizontal_Distance_To_Hydrology'] + X['Horizontal_Distance_To_Roadways']
        X['Hidrology_Horizonal_Roadways_Prod'] = X['Horizontal_Distance_To_Hydrology'] * X['Horizontal_Distance_To_Roadways']
        
        X['Hidrology_Horizonal_Vertical_Minus'] = X['Horizontal_Distance_To_Hydrology'] - X['Vertical_Distance_To_Hydrology']
        X['Hidrology_Horizonal_Vertical_Plus'] = X['Horizontal_Distance_To_Hydrology'] + X['Vertical_Distance_To_Hydrology']
        X['Hidrology_Horizonal_Vertical_Prod'] = X['Horizontal_Distance_To_Hydrology'] * X['Vertical_Distance_To_Hydrology']
        
        X['Hidrology_Vertical_Fire_Minus'] = X['Vertical_Distance_To_Hydrology'] - X['Horizontal_Distance_To_Fire_Points']
        X['Hidrology_Vertical_Fire_Plus'] = X['Vertical_Distance_To_Hydrology'] + X['Horizontal_Distance_To_Fire_Points']
        X['Hidrology_Vertical_Fire_Prod'] = X['Vertical_Distance_To_Hydrology'] * X['Horizontal_Distance_To_Fire_Points']
        
        X['Hidrology_Vertical_Fire_Minus'] = X['Vertical_Distance_To_Hydrology'] - X['Horizontal_Distance_To_Roadways']
        X['Hidrology_Vertical_Fire_Plus'] = X['Vertical_Distance_To_Hydrology'] + X['Horizontal_Distance_To_Roadways']
        X['Hidrology_Vertical_Fire_Prod'] = X['Vertical_Distance_To_Hydrology'] * X['Horizontal_Distance_To_Roadways']
        
        X.drop(['Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology', 'Elevation', 'Horizontal_Distance_To_Roadways', 'Horizontal_Distance_To_Fire_Points'], axis=1, inplace=True)
        return X

In [3]:
class DropIdentifierFeatures(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        X.drop('Id', axis=1, inplace=True)
        return X

In [4]:
from sklearn.utils.validation import check_is_fitted

class FromDummiesToCategories(BaseEstimator, TransformerMixin):
    def __init__(self, cols_to_operate):
        self.cols_to_operate = cols_to_operate

    def fit(self, X, y=None):
        self.columns_to_keep_ = list(set(X.columns.tolist()) - set(self.cols_to_operate))
        return self

    def transform(self, X):
        check_is_fitted(self, 'columns_to_keep_')
        X = X.copy()
        X = X[self.columns_to_keep_]
        return X
    

# Load Data

In [5]:
data = pd.read_csv(r'../../../data/train.csv')
data.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,1,2596,51,3,258,0,510,221,232,148,...,0,0,0,0,0,0,0,0,0,5
1,2,2590,56,2,212,-6,390,220,235,151,...,0,0,0,0,0,0,0,0,0,5
2,3,2804,139,9,268,65,3180,234,238,135,...,0,0,0,0,0,0,0,0,0,2
3,4,2785,155,18,242,118,3090,238,238,122,...,0,0,0,0,0,0,0,0,0,2
4,5,2595,45,2,153,-1,391,220,234,150,...,0,0,0,0,0,0,0,0,0,5


## Data split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(data.drop('Cover_Type', axis=1),
                                                    data['Cover_Type'],
                                                    test_size=.2,
                                                    random_state =42)

In [7]:
soil_columns = [x for x in data.columns if x.startswith('Soil_Type')]

In [78]:
pipeline_list_1 = [
    ('dropuniquefeatures', DropIdentifierFeatures()),
    ('du', FromDummiesToCategories(cols_to_operate=soil_columns)),
    ('dp', DropConstantFeatures(tol=0.99)),
    ('dd', DropDuplicateFeatures()),
    ('dt', DistanceTransformer()),
    ('dteq', EqualFrequencyDiscretiser(q=10, variables=['EuclideanDistanceHidroloy',   'Aspect',                                                     
                                                       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm'])),
    ('dcf', DropCorrelatedFeatures(variables=None, method='pearson', threshold=0.80)),
]   
pipeline_1 = Pipeline(pipeline_list_1)
X_train_pipe_1 = pipeline_1.fit_transform(X_train)
X_train_pipe_1.head(5)

Unnamed: 0,Wilderness_Area3,Hillshade_Noon,Wilderness_Area1,Hillshade_9am,Wilderness_Area4,Elevation,Aspect,Wilderness_Area2,Slope,EuclideanDistanceHidroloy,Hidrology_Horizonal_Fire_Minus,Hidrology_Horizonal_Fire_Prod,Hidrology_Horizonal_Roadways_Minus,Hidrology_Horizonal_Vertical_Prod
4679,0,0,0,0,1,2364,0,0,30,7,309,38380,-703,61408
6634,1,4,0,6,0,2883,2,0,10,6,-1040,512825,-325,18615
11459,0,8,1,7,0,2972,5,0,15,2,-5172,570240,-4087,2160
15053,1,0,0,0,0,2395,9,0,32,2,-780,83125,-512,2565
8222,1,1,0,7,0,3244,2,0,21,2,-300,44064,-2413,2700


In [79]:
clf_p1 = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
X_test_pipe_1 = pipeline_1.transform(X_test)
X_test_pipe_1 = pd.DataFrame(data=X_test_pipe_1, columns=pipeline_1.named_steps['dcf'].variables)
models_1, predictions_1 = clf_p1.fit(X_train_pipe_1, X_test_pipe_1, y_train, y_test)
models_1.head(5)

 90%|█████████████████████████████████████████████████████████████████████████▊        | 27/30 [01:01<00:04,  1.60s/it]



100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [01:08<00:00,  2.28s/it]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
RandomForestClassifier,0.86,0.86,,0.86,2.26
ExtraTreesClassifier,0.85,0.85,,0.85,1.49
LGBMClassifier,0.85,0.85,,0.85,1.47
XGBClassifier,0.85,0.85,,0.84,5.04
BaggingClassifier,0.82,0.82,,0.82,0.85


In [80]:
pipe_list_1_rf = [('rf', RandomForestClassifier(random_state=42))]
pipe_2_rf = Pipeline(pipeline_list_1 + pipe_list_1_rf)
rfpg1 ={
    'dcf__threshold': [.75, .80, ],
    'rf__n_estimators': [100, 150, 200],
    'rf__max_depth': [60,80, None],
    'rf__min_samples_split': [2, 3, 4],
}
grid1_pipe_2 = GridSearchCV(pipe_2_rf, param_grid=rfpg1, cv=5, n_jobs=-1, verbose=3)
grid1_pipe_2.fit(X_train, y_train)
print("Best cross-validation accuracy: {:.2f}".format(grid1_pipe_2.best_score_)) 
print("Test set score: {:.2f}".format(grid1_pipe_2.score(X_test, y_test))) 
print("Best parameters: {}".format(grid1_pipe_2.best_params_))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed:  2.5min finished


Best cross-validation accuracy: 0.84
Test set score: 0.86
Best parameters: {'dcf__threshold': 0.8, 'rf__max_depth': None, 'rf__min_samples_split': 2, 'rf__n_estimators': 200}


In [81]:
test = pd.read_csv(r'../../../data/test.csv')

In [82]:
test['Cover_Type'] = grid1_pipe_2.predict(test)

In [83]:
to_kaggle = test[['Id', 'Cover_Type']]


In [84]:
to_kaggle.to_csv('new_transformations', index=False)