In [42]:
import pandas as pd
import numpy as np
import feature_engine
from sklearn.model_selection import train_test_split
import seaborn as sns

In [43]:
data = pd.read_csv(r'../../../data/train.csv')
data.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,1,2596,51,3,258,0,510,221,232,148,...,0,0,0,0,0,0,0,0,0,5
1,2,2590,56,2,212,-6,390,220,235,151,...,0,0,0,0,0,0,0,0,0,5
2,3,2804,139,9,268,65,3180,234,238,135,...,0,0,0,0,0,0,0,0,0,2
3,4,2785,155,18,242,118,3090,238,238,122,...,0,0,0,0,0,0,0,0,0,2
4,5,2595,45,2,153,-1,391,220,234,150,...,0,0,0,0,0,0,0,0,0,5


In [44]:
from sklearn.base import BaseEstimator, TransformerMixin

class DistanceTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        X['EuclideanDistanceHidroloy'] = np.around(np.sqrt(X['Horizontal_Distance_To_Hydrology'] **2 + X['Vertical_Distance_To_Hydrology'] **2), 4)
        X.drop(['Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology'], axis=1, inplace=True)
        return X

In [45]:
class DropIdentifierFeatures(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        X.drop('Id', axis=1, inplace=True)
        return X

In [46]:
X_train, X_test, y_train, y_test = train_test_split(data.drop('Cover_Type', axis=1),
                                                    data['Cover_Type'],
                                                    test_size=.2,
                                                    random_state =123)

In [47]:
y_train.value_counts(normalize=True) * 100, y_test.value_counts(normalize=True) * 100

(2   14.70
 7   14.39
 4   14.32
 3   14.31
 6   14.14
 1   14.11
 5   14.03
 Name: Cover_Type, dtype: float64,
 5   15.31
 1   14.98
 6   14.88
 3   14.19
 4   14.15
 7   13.86
 2   12.63
 Name: Cover_Type, dtype: float64)

In [48]:
from feature_engine.discretisation import EqualWidthDiscretiser
from feature_engine.selection  import DropConstantFeatures, DropDuplicateFeatures, DropCorrelatedFeatures

##  Pipeline

In [49]:
from sklearn.pipeline import Pipeline
pipe = Pipeline([
    ('dropuniquefeatures', DropIdentifierFeatures()),
    ('dp', DropConstantFeatures(tol=0.99)),
    ('dd', DropDuplicateFeatures()),
    ('dt', DistanceTransformer()),
    #('ed', EqualWidthDiscretiser(bins=36, variables=['Aspect'])),
    #('edh', EqualWidthDiscretiser(bins=26, variables=['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm'])),
])
new = pipe.fit_transform(X_train)
new.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area1,Wilderness_Area2,...,Soil_Type24,Soil_Type29,Soil_Type30,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type38,Soil_Type39,Soil_Type40,EuclideanDistanceHidroloy
7711,3151,23,19,2828,206,197,122,2200,0,0,...,1,0,0,0,0,0,0,0,0,1052.45
1466,3101,303,14,3399,181,233,192,631,1,0,...,0,1,0,0,0,0,0,0,0,277.0
12128,2079,324,32,330,126,189,193,908,0,0,...,0,0,0,0,0,0,0,0,0,427.2
6301,2314,121,15,644,244,227,107,702,0,0,...,0,0,0,0,0,0,0,0,0,0.0
9822,3164,117,4,872,227,236,143,2536,1,0,...,0,0,0,0,0,0,0,0,0,0.0


In [50]:
X_train.shape, new.shape

((12096, 55), (12096, 36))

In [51]:
new.columns

Index(['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Roadways',
       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
       'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area1',
       'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4',
       'Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5',
       'Soil_Type6', 'Soil_Type10', 'Soil_Type11', 'Soil_Type12',
       'Soil_Type13', 'Soil_Type14', 'Soil_Type17', 'Soil_Type22',
       'Soil_Type23', 'Soil_Type24', 'Soil_Type29', 'Soil_Type30',
       'Soil_Type31', 'Soil_Type32', 'Soil_Type33', 'Soil_Type38',
       'Soil_Type39', 'Soil_Type40', 'EuclideanDistanceHidroloy'],
      dtype='object')

In [52]:
tr = DropCorrelatedFeatures(variables=None, method='pearson', threshold=0.80)
new = tr.fit_transform(new)
new.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area1,Wilderness_Area2,...,Soil_Type24,Soil_Type29,Soil_Type30,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type38,Soil_Type39,Soil_Type40,EuclideanDistanceHidroloy
7711,3151,23,19,2828,206,197,122,2200,0,0,...,1,0,0,0,0,0,0,0,0,1052.45
1466,3101,303,14,3399,181,233,192,631,1,0,...,0,1,0,0,0,0,0,0,0,277.0
12128,2079,324,32,330,126,189,193,908,0,0,...,0,0,0,0,0,0,0,0,0,427.2
6301,2314,121,15,644,244,227,107,702,0,0,...,0,0,0,0,0,0,0,0,0,0.0
9822,3164,117,4,872,227,236,143,2536,1,0,...,0,0,0,0,0,0,0,0,0,0.0


In [53]:
new.columns

Index(['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Roadways',
       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
       'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area1',
       'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4',
       'Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5',
       'Soil_Type6', 'Soil_Type10', 'Soil_Type11', 'Soil_Type12',
       'Soil_Type13', 'Soil_Type14', 'Soil_Type17', 'Soil_Type22',
       'Soil_Type23', 'Soil_Type24', 'Soil_Type29', 'Soil_Type30',
       'Soil_Type31', 'Soil_Type32', 'Soil_Type33', 'Soil_Type38',
       'Soil_Type39', 'Soil_Type40', 'EuclideanDistanceHidroloy'],
      dtype='object')

In [54]:
new.shape, y_train.shape

((12096, 36), (12096,))

## Comparing model performaces with engineered data vs Not enginereed

In [55]:
from lazypredict.Supervised import LazyClassifier

In [56]:
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)

In [57]:
X_train_t, X_test_t, y_train_t, y_test_t = train_test_split(new, y_train, test_size=.2,random_state =123)
models,predictions = clf.fit(X_train_t, X_test_t, y_train_t, y_test_t )
predictions

 90%|█████████████████████████████████████████████████████████████████████████▊        | 27/30 [01:10<00:05,  1.67s/it]



100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [01:17<00:00,  2.60s/it]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
RandomForestClassifier,0.85,0.85,,0.85,1.82
ExtraTreesClassifier,0.85,0.85,,0.85,1.5
LGBMClassifier,0.85,0.85,,0.84,1.66
XGBClassifier,0.84,0.85,,0.84,5.29
BaggingClassifier,0.83,0.83,,0.83,0.67
LabelPropagation,0.8,0.8,,0.79,7.77
LabelSpreading,0.8,0.8,,0.79,9.62
DecisionTreeClassifier,0.77,0.77,,0.76,0.18
KNeighborsClassifier,0.76,0.76,,0.75,1.85
SVC,0.72,0.73,,0.72,3.86


In [58]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from lightgbm import LGBMClassifier

## Random Forest


### Iteration 1

In [70]:
rfpg = {
    'n_estimators': [12,18,24,48,89,100],
    'max_depth': [2,4,6,8,10,12,14,16,18,20],
    'min_samples_split': [5,8,10,15],
}

In [71]:
rfgs = GridSearchCV(RandomForestClassifier(), rfpg, cv=5, n_jobs=-1, verbose=2)

In [72]:
rfgs.fit(new, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
                         'min_samples_split': [5, 8, 10, 15],
                         'n_estimators': [12, 18, 24, 48, 89, 100]})

In [73]:
rfgs.best_params_

{'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 100}

In [74]:
rfgs.best_score_

0.8451550227366681

In [76]:
pd.DataFrame(rfgs.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.06,0.01,0.01,0.00,2,5,12,"{'max_depth': 2, 'min_samples_split': 5, 'n_es...",0.57,0.49,0.56,0.54,0.60,0.55,0.04,233
1,0.08,0.00,0.01,0.00,2,5,18,"{'max_depth': 2, 'min_samples_split': 5, 'n_es...",0.52,0.58,0.55,0.51,0.54,0.54,0.02,237
2,0.11,0.01,0.01,0.00,2,5,24,"{'max_depth': 2, 'min_samples_split': 5, 'n_es...",0.55,0.50,0.58,0.52,0.55,0.54,0.03,236
3,0.20,0.01,0.01,0.00,2,5,48,"{'max_depth': 2, 'min_samples_split': 5, 'n_es...",0.58,0.57,0.57,0.58,0.57,0.57,0.01,227
4,0.35,0.00,0.02,0.00,2,5,89,"{'max_depth': 2, 'min_samples_split': 5, 'n_es...",0.60,0.59,0.56,0.60,0.60,0.59,0.02,217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,0.26,0.01,0.01,0.00,20,15,18,"{'max_depth': 20, 'min_samples_split': 15, 'n_...",0.83,0.82,0.82,0.81,0.83,0.82,0.01,59
236,0.34,0.00,0.01,0.00,20,15,24,"{'max_depth': 20, 'min_samples_split': 15, 'n_...",0.83,0.82,0.82,0.82,0.83,0.83,0.01,46
237,0.66,0.01,0.03,0.00,20,15,48,"{'max_depth': 20, 'min_samples_split': 15, 'n_...",0.82,0.83,0.83,0.82,0.84,0.83,0.01,36
238,1.21,0.01,0.05,0.00,20,15,89,"{'max_depth': 20, 'min_samples_split': 15, 'n_...",0.83,0.84,0.83,0.82,0.84,0.83,0.01,28


As the gridsearch foun that the min_samples_split was the lower value, and n estimators an max depth where the highest another gridsearch can be created with different hyperparameters

### Iteration 2

In [78]:
rfpg_2 = {
    'n_estimators': [100, 150, 200],
    'max_depth': [20,40,60,80],
    'min_samples_split': [1, 2, 3, 4, 5],
}
rfgs2 = GridSearchCV(RandomForestClassifier(), rfpg_2, cv=5, n_jobs=-1, verbose=2)
rfgs2.fit(new, y_train)


Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  2.3min finished


{'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 100}
0.8451550227366681


In [80]:
print(rfgs2.best_params_)
print(rfgs2.best_score_)

{'max_depth': 80, 'min_samples_split': 2, 'n_estimators': 200}
0.8547448744273127


In [82]:
results2 = pd.DataFrame(rfgs2.cv_results_)
results2.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.1,0.01,0.0,0.0,20,1,100,"{'max_depth': 20, 'min_samples_split': 1, 'n_e...",,,,,,,,60
1,0.12,0.02,0.0,0.0,20,1,150,"{'max_depth': 20, 'min_samples_split': 1, 'n_e...",,,,,,,,50
2,0.19,0.01,0.0,0.0,20,1,200,"{'max_depth': 20, 'min_samples_split': 1, 'n_e...",,,,,,,,49
3,2.88,0.21,0.12,0.01,20,2,100,"{'max_depth': 20, 'min_samples_split': 2, 'n_e...",0.85,0.85,0.85,0.84,0.86,0.85,0.01,32
4,3.97,0.22,0.18,0.02,20,2,150,"{'max_depth': 20, 'min_samples_split': 2, 'n_e...",0.85,0.85,0.84,0.84,0.86,0.85,0.01,21


#### Conclusion Iteration 2
As the result was that both n_estimator, and max_depth was at the maximum of the Grid i will increase those values

### Iteration 3


In [83]:
rfpg_3 = {
    'n_estimators': [200, 400, 600],
    'max_depth': [80, 160, 240],
}
rfgs3 = GridSearchCV(RandomForestClassifier(), rfpg_3, cv=5, n_jobs=-1, verbose=2)
rfgs3.fit(new, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   42.4s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  1.3min finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [80, 160, 240],
                         'n_estimators': [200, 400, 600]},
             verbose=2)

In [85]:
print(rfgs3.best_params_)
print(rfgs3.best_score_)

{'max_depth': 160, 'n_estimators': 600}
0.8554062706056392


In [86]:
pd.DataFrame(rfgs3.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,5.96,0.11,0.25,0.04,80,200,"{'max_depth': 80, 'n_estimators': 200}",0.86,0.85,0.86,0.84,0.86,0.85,0.01,5
1,12.39,0.66,0.48,0.05,80,400,"{'max_depth': 80, 'n_estimators': 400}",0.86,0.85,0.85,0.84,0.86,0.85,0.01,6
2,18.39,0.58,0.77,0.11,80,600,"{'max_depth': 80, 'n_estimators': 600}",0.86,0.86,0.85,0.84,0.87,0.86,0.01,2
3,6.12,0.71,0.22,0.02,160,200,"{'max_depth': 160, 'n_estimators': 200}",0.86,0.85,0.85,0.84,0.87,0.85,0.01,4
4,11.42,0.5,0.51,0.04,160,400,"{'max_depth': 160, 'n_estimators': 400}",0.86,0.86,0.84,0.84,0.86,0.85,0.01,8
5,16.98,0.29,0.78,0.04,160,600,"{'max_depth': 160, 'n_estimators': 600}",0.86,0.86,0.85,0.84,0.86,0.86,0.01,1
6,5.46,0.14,0.23,0.02,240,200,"{'max_depth': 240, 'n_estimators': 200}",0.86,0.85,0.84,0.84,0.87,0.85,0.01,9
7,11.02,0.12,0.48,0.02,240,400,"{'max_depth': 240, 'n_estimators': 400}",0.86,0.86,0.84,0.84,0.86,0.85,0.01,7
8,14.58,0.93,0.53,0.07,240,600,"{'max_depth': 240, 'n_estimators': 600}",0.86,0.86,0.85,0.84,0.87,0.85,0.01,3


####  Conclusion Iteration 3
After checkih the table its posible to see that there 2 posible solutions with a score of 85.54%, lets see if I can increase a little bit more the precision

### Iteration 4

In [87]:
rfpg_4 = {
    'n_estimators': [600,800,1000,1500],
    'max_depth': [160,180,200,220],
}
rfpg_4 = GridSearchCV(RandomForestClassifier(), rfpg_4, cv=5, n_jobs=-1, verbose=2)
rfpg_4.fit(new, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  5.0min finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [160, 180, 200, 220],
                         'n_estimators': [600, 800, 1000, 1500]},
             verbose=2)

In [91]:
print(rfpg_4.best_params_)
print(rfpg_4.best_score_)
pd.DataFrame(rfpg_4.cv_results_).sort_values(['rank_test_score'])

{'max_depth': 160, 'n_estimators': 1500}
0.8565638078708844


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
3,42.34,0.36,2.57,0.23,160,1500,"{'max_depth': 160, 'n_estimators': 1500}",0.86,0.86,0.85,0.84,0.87,0.86,0.01,1
13,22.02,0.97,0.98,0.04,220,800,"{'max_depth': 220, 'n_estimators': 800}",0.86,0.86,0.85,0.85,0.87,0.86,0.01,2
1,22.6,0.98,1.01,0.05,160,800,"{'max_depth': 160, 'n_estimators': 800}",0.86,0.86,0.85,0.84,0.87,0.86,0.01,3
2,27.85,0.69,1.32,0.06,160,1000,"{'max_depth': 160, 'n_estimators': 1000}",0.86,0.86,0.85,0.85,0.87,0.86,0.01,4
8,17.05,0.77,0.75,0.06,200,600,"{'max_depth': 200, 'n_estimators': 600}",0.86,0.86,0.85,0.85,0.86,0.86,0.01,5
15,37.73,2.68,1.57,0.21,220,1500,"{'max_depth': 220, 'n_estimators': 1500}",0.86,0.86,0.85,0.84,0.87,0.86,0.01,5
11,42.57,0.72,2.42,0.19,200,1500,"{'max_depth': 200, 'n_estimators': 1500}",0.86,0.86,0.85,0.84,0.87,0.86,0.01,7
6,26.98,0.83,1.32,0.08,180,1000,"{'max_depth': 180, 'n_estimators': 1000}",0.86,0.86,0.85,0.84,0.87,0.86,0.01,8
5,22.45,0.45,1.03,0.04,180,800,"{'max_depth': 180, 'n_estimators': 800}",0.86,0.86,0.85,0.84,0.87,0.86,0.01,9
14,28.73,1.15,1.41,0.18,220,1000,"{'max_depth': 220, 'n_estimators': 1000}",0.86,0.86,0.85,0.84,0.87,0.86,0.01,10


####  Conclusion Iteration 4
There is no significant difference for the model of Iteration 3, so the Model 3 Is the chossen one for Random Forests


## LGBMClassifier

### Iteration 1

In [89]:
lgbmg_1 = {
    'boosting_type': ['goss', 'dart', 'gbdt'],
    'num_leaves': [10, 20, 40],
    'class_weight': ['balanced']
}
lgbmg_1 = GridSearchCV(LGBMClassifier(), lgbmg_1, cv=5, n_jobs=-1, verbose=2)
lgbmg_1.fit(new, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   18.1s finished


GridSearchCV(cv=5, estimator=LGBMClassifier(), n_jobs=-1,
             param_grid={'boosting_type': ['goss', 'dart', 'gbdt'],
                         'class_weight': ['balanced'],
                         'num_leaves': [10, 20, 40]},
             verbose=2)

In [92]:
print(lgbmg_1.best_params_)
print(lgbmg_1.best_score_)
pd.DataFrame(lgbmg_1.cv_results_).sort_values('rank_test_score')

{'boosting_type': 'gbdt', 'class_weight': 'balanced', 'num_leaves': 40}
0.8530914010638917


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_boosting_type,param_class_weight,param_num_leaves,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
8,1.95,0.18,0.19,0.02,gbdt,balanced,40,"{'boosting_type': 'gbdt', 'class_weight': 'bal...",0.86,0.85,0.85,0.85,0.86,0.85,0.0,1
2,2.9,0.09,0.26,0.01,goss,balanced,40,"{'boosting_type': 'goss', 'class_weight': 'bal...",0.85,0.85,0.84,0.83,0.85,0.84,0.01,2
5,4.63,0.2,0.22,0.04,dart,balanced,40,"{'boosting_type': 'dart', 'class_weight': 'bal...",0.85,0.84,0.84,0.84,0.85,0.84,0.01,3
7,1.63,0.05,0.18,0.02,gbdt,balanced,20,"{'boosting_type': 'gbdt', 'class_weight': 'bal...",0.85,0.84,0.83,0.83,0.85,0.84,0.01,4
1,1.92,0.07,0.21,0.01,goss,balanced,20,"{'boosting_type': 'goss', 'class_weight': 'bal...",0.84,0.83,0.83,0.82,0.85,0.83,0.01,5
4,3.3,0.15,0.15,0.0,dart,balanced,20,"{'boosting_type': 'dart', 'class_weight': 'bal...",0.83,0.82,0.82,0.81,0.83,0.82,0.01,6
6,1.09,0.04,0.15,0.01,gbdt,balanced,10,"{'boosting_type': 'gbdt', 'class_weight': 'bal...",0.81,0.81,0.81,0.81,0.82,0.81,0.01,7
0,1.34,0.12,0.18,0.02,goss,balanced,10,"{'boosting_type': 'goss', 'class_weight': 'bal...",0.8,0.8,0.81,0.81,0.82,0.81,0.01,8
3,2.5,0.16,0.12,0.0,dart,balanced,10,"{'boosting_type': 'dart', 'class_weight': 'bal...",0.79,0.78,0.8,0.78,0.8,0.79,0.01,9


#### Conclusion Iteration 1
There is a minor performance in This model Compare with the one selected on RandomForest, lets play a little bit more with its hyperparameters

### Iteration 2

In [93]:
lgbmg_2 = {
    'boosting_type': ['goss', 'dart', 'gbdt'],
    'num_leaves': [40, 80, 120],
    'class_weight': ['balanced']
}
lgbmg_2 = GridSearchCV(LGBMClassifier(), lgbmg_2, cv=5, n_jobs=-1, verbose=2)
lgbmg_2.fit(new, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   19.7s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   34.6s finished


GridSearchCV(cv=5, estimator=LGBMClassifier(), n_jobs=-1,
             param_grid={'boosting_type': ['goss', 'dart', 'gbdt'],
                         'class_weight': ['balanced'],
                         'num_leaves': [40, 80, 120]},
             verbose=2)

In [94]:
print(lgbmg_2.best_params_)
print(lgbmg_2.best_score_)
pd.DataFrame(lgbmg_2.cv_results_).sort_values('rank_test_score')

{'boosting_type': 'gbdt', 'class_weight': 'balanced', 'num_leaves': 120}
0.8628468836586389


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_boosting_type,param_class_weight,param_num_leaves,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
8,5.89,0.15,0.54,0.21,gbdt,balanced,120,"{'boosting_type': 'gbdt', 'class_weight': 'bal...",0.87,0.87,0.85,0.86,0.87,0.86,0.01,1
7,4.13,0.16,0.67,0.16,gbdt,balanced,80,"{'boosting_type': 'gbdt', 'class_weight': 'bal...",0.87,0.86,0.85,0.85,0.87,0.86,0.01,2
5,9.07,0.3,0.59,0.12,dart,balanced,120,"{'boosting_type': 'dart', 'class_weight': 'bal...",0.86,0.86,0.85,0.85,0.86,0.86,0.01,3
6,2.5,0.07,0.24,0.02,gbdt,balanced,40,"{'boosting_type': 'gbdt', 'class_weight': 'bal...",0.86,0.85,0.85,0.85,0.86,0.85,0.0,4
4,7.02,0.22,0.35,0.06,dart,balanced,80,"{'boosting_type': 'dart', 'class_weight': 'bal...",0.86,0.85,0.85,0.85,0.86,0.85,0.01,5
1,4.59,0.31,0.7,0.11,goss,balanced,80,"{'boosting_type': 'goss', 'class_weight': 'bal...",0.86,0.85,0.84,0.84,0.86,0.85,0.01,6
2,5.85,0.46,0.72,0.1,goss,balanced,120,"{'boosting_type': 'goss', 'class_weight': 'bal...",0.85,0.86,0.84,0.85,0.86,0.85,0.01,7
0,2.92,0.26,0.35,0.04,goss,balanced,40,"{'boosting_type': 'goss', 'class_weight': 'bal...",0.85,0.85,0.84,0.83,0.85,0.84,0.01,8
3,4.9,0.35,0.23,0.03,dart,balanced,40,"{'boosting_type': 'dart', 'class_weight': 'bal...",0.85,0.84,0.84,0.84,0.85,0.84,0.01,9


### Conclusion Iteration 2
There is an Increase in gbdt booting type, now lets see if we can find a significance improvement. Score: 86.28%

### Iteration 3

In [95]:
lgbmg_3 = {
    'boosting_type': ['goss', 'dart', 'gbdt'],
    'num_leaves': [120, 240, 360],
    'class_weight': ['balanced']
}
lgbmg_3 = GridSearchCV(LGBMClassifier(), lgbmg_3, cv=5, n_jobs=-1, verbose=2)
lgbmg_3.fit(new, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   40.2s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  1.2min finished


GridSearchCV(cv=5, estimator=LGBMClassifier(), n_jobs=-1,
             param_grid={'boosting_type': ['goss', 'dart', 'gbdt'],
                         'class_weight': ['balanced'],
                         'num_leaves': [120, 240, 360]},
             verbose=2)

In [97]:
print(lgbmg_3.best_params_)
print(lgbmg_3.best_score_)
pd.DataFrame(lgbmg_3.cv_results_).sort_values('rank_test_score').head()

{'boosting_type': 'gbdt', 'class_weight': 'balanced', 'num_leaves': 240}
0.8637561112268918


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_boosting_type,param_class_weight,param_num_leaves,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,9.61,0.33,1.47,0.28,gbdt,balanced,240,"{'boosting_type': 'gbdt', 'class_weight': 'bal...",0.87,0.86,0.86,0.85,0.87,0.86,0.01,1
8,11.14,0.69,0.97,0.2,gbdt,balanced,360,"{'boosting_type': 'gbdt', 'class_weight': 'bal...",0.87,0.86,0.86,0.85,0.87,0.86,0.01,2
6,5.19,0.17,0.55,0.14,gbdt,balanced,120,"{'boosting_type': 'gbdt', 'class_weight': 'bal...",0.87,0.87,0.85,0.86,0.87,0.86,0.01,3
4,15.8,0.59,1.15,0.15,dart,balanced,240,"{'boosting_type': 'dart', 'class_weight': 'bal...",0.86,0.86,0.86,0.85,0.86,0.86,0.01,4
5,21.03,0.6,1.78,0.14,dart,balanced,360,"{'boosting_type': 'dart', 'class_weight': 'bal...",0.86,0.86,0.86,0.85,0.87,0.86,0.01,5


#### Conclusion Iteration 3
This model seems to have a better Performance

##### 86.37

## Prediction

### Feature Engineering of Test 

In [100]:
transformed_test = pipe.transform(X_test)
transformed_test = tr.transform(transformed_test)

In [103]:
prediction = lgbmg_3.predict(transformed_test)

In [107]:
from sklearn.metrics import accuracy_score, confusion_matrix

### Confusion Matrix

In [118]:
lgbmg_3.score(transformed_test, y_test)

0.8736772486772487

In [108]:
accuracy_score(y_test, prediction)

0.8736772486772487

In [111]:
confusion_matrix(y_test, prediction)

array([[342,  74,   0,   0,   9,   3,  25],
       [ 57, 271,  12,   0,  34,   6,   2],
       [  0,   5, 366,  13,   1,  44,   0],
       [  0,   0,  11, 417,   0,   0,   0],
       [  0,  14,   8,   0, 434,   7,   0],
       [  0,   4,  26,  11,   2, 407,   0],
       [ 13,   1,   0,   0,   0,   0, 405]], dtype=int64)

## Checking with test of kaggle

In [114]:
final_test = pd.read_csv(r'../../../data/test.csv')
final_test.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
0,15121,2680,354,14,0,0,2684,196,214,156,...,0,0,0,0,0,0,0,0,0,0
1,15122,2683,0,13,0,0,2654,201,216,152,...,0,0,0,0,0,0,0,0,0,0
2,15123,2713,16,15,0,0,2980,206,208,137,...,0,0,0,0,0,0,0,0,0,0
3,15124,2709,24,17,0,0,2950,208,201,125,...,0,0,0,0,0,0,0,0,0,0
4,15125,2706,29,19,0,0,2920,210,195,115,...,0,0,0,0,0,0,0,0,0,0


In [116]:
final_test_features = pipe.transform(final_test.drop('Cover_Type', axis=1))

KeyError: "['Cover_Type'] not found in axis"

In [16]:
X_train_um, X_test_um, y_train_um, y_test_um = train_test_split(data.drop('Cover_Type', axis=1),
                                                    data['Cover_Type'],
                                                    test_size=.1,
                                                    random_state =123)
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)

 90%|█████████████████████████████████████████████████████████████████████████▊        | 27/30 [02:19<00:11,  3.68s/it]



100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [02:27<00:00,  4.92s/it]


In [17]:
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
RandomForestClassifier,0.89,0.88,,0.89,2.38
LGBMClassifier,0.89,0.88,,0.88,1.78
XGBClassifier,0.88,0.88,,0.88,6.15
ExtraTreesClassifier,0.88,0.88,,0.88,1.97
BaggingClassifier,0.85,0.85,,0.85,1.37
LabelPropagation,0.81,0.8,,0.81,12.76
LabelSpreading,0.81,0.8,,0.81,15.38
KNeighborsClassifier,0.79,0.79,,0.79,2.09
DecisionTreeClassifier,0.79,0.78,,0.79,0.24
SVC,0.76,0.76,,0.76,9.12


[[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]]### Results Iteration 1

The engineered features perform better, at least with a quick check on XGBoosts, and the not engineered features perform well with RandomForest. Nevertheless the time performance in later features is worst than the one with the second models check.