In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, StackingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import warnings
# Blanket filter: every warning category is silenced for the rest of the
# session, including anything scikit-learn emits while fitting.
# NOTE(review): this also hides failed/non-converged fits inside the grid
# searches below -- consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Location of the training CSV on the mounted Google Drive (Colab layout).
TRAIN_CSV_PATH = r"/content/drive/MyDrive/Colab Notebooks/forest-cover-type-prediction/train.csv"

# Load the raw training table; later cells derive features from it.
train_data = pd.read_csv(TRAIN_CSV_PATH)

# **Feature Engineering**

In [None]:
# Work on an independent copy of the loaded frame.
# Plain assignment (`copy_train_data = train_data`) would only bind a second
# name to the same DataFrame, so the in-place column additions and drops in the
# feature-engineering cells below would silently modify `train_data` as well.
copy_train_data = train_data.copy()

# Display the available columns.
copy_train_data.columns

Index(['Id', 'Elevation', 'Aspect', 'Slope',
       'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
       'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon',
       'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points',
       'Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3',
       'Wilderness_Area4', 'Soil_Type1', 'Soil_Type2', 'Soil_Type3',
       'Soil_Type4', 'Soil_Type5', 'Soil_Type6', 'Soil_Type7', 'Soil_Type8',
       'Soil_Type9', 'Soil_Type10', 'Soil_Type11', 'Soil_Type12',
       'Soil_Type13', 'Soil_Type14', 'Soil_Type15', 'Soil_Type16',
       'Soil_Type17', 'Soil_Type18', 'Soil_Type19', 'Soil_Type20',
       'Soil_Type21', 'Soil_Type22', 'Soil_Type23', 'Soil_Type24',
       'Soil_Type25', 'Soil_Type26', 'Soil_Type27', 'Soil_Type28',
       'Soil_Type29', 'Soil_Type30', 'Soil_Type31', 'Soil_Type32',
       'Soil_Type33', 'Soil_Type34', 'Soil_Type35', 'Soil_Type36',
       'Soil_Type37', 'Soil_Type38', 'Soil_Type39', 'Soil_

In [None]:
# Hillshade features: derive a mean, per-reading squares and pairwise products
# from the three raw hillshade readings, then remove the raw columns.
shade_9am = copy_train_data['Hillshade_9am']
shade_noon = copy_train_data['Hillshade_Noon']
shade_3pm = copy_train_data['Hillshade_3pm']

# Average of the 9AM, Noon and 3PM readings.
copy_train_data['Hillshade_mean'] = (shade_9am + shade_noon + shade_3pm) / 3

# Each reading squared.
copy_train_data['Hillshade_9am_sq'] = shade_9am ** 2
copy_train_data['Hillshade_Noon_sq'] = shade_noon ** 2
copy_train_data['Hillshade_3pm_sq'] = shade_3pm ** 2

# Pairwise products of the readings:
#   interaction_9amnoon -> 9AM  x Noon
#   interaction_noon3pm -> Noon x 3PM
#   interaction_9am3pm  -> 3PM  x 9AM
copy_train_data['interaction_9amnoon'] = shade_9am * shade_noon
copy_train_data['interaction_noon3pm'] = shade_noon * shade_3pm
copy_train_data['interaction_9am3pm'] = shade_3pm * shade_9am

# The raw readings are dropped in place once the derived columns exist.
copy_train_data.drop(
    columns=['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm'],
    inplace=True,
)

In [None]:
# Straight-line distance to water: sqrt(horizontal^2 + vertical^2).
horizontal_to_water = copy_train_data['Horizontal_Distance_To_Hydrology']
vertical_to_water = copy_train_data['Vertical_Distance_To_Hydrology']

sum_of_squared_distances = np.square(horizontal_to_water) + np.square(vertical_to_water)
copy_train_data['Euclidean_Distance_To_Hydrology'] = np.sqrt(sum_of_squared_distances)

In [None]:
# Natural logarithm of the elevation column.
elevation = copy_train_data['Elevation']
copy_train_data['log_elevation'] = np.log(elevation)

# Cosine of the slope. np.radians converts degrees to radians first
# (assumes the Slope column is stored in degrees -- TODO confirm).
slope_in_radians = np.radians(copy_train_data['Slope'])
copy_train_data['cosine_slope'] = np.cos(slope_in_radians)

In [None]:
# Columns excluded from the feature matrix: the row id, the target itself and
# two soil-type indicators (the reason for dropping those two is not stated here).
non_feature_columns = ['Id', 'Soil_Type7', 'Soil_Type15', 'Cover_Type']

# Target vector and feature matrix.
y = copy_train_data['Cover_Type']
X = copy_train_data.drop(columns=non_feature_columns)

# 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Logistic regression: tune with 5-fold grid search on the training split,
# then score the tuned model on the hold-out split.
lr = LogisticRegression(random_state=42)

# Settings shared by every sub-grid.
lr_shared_params = {
    'class_weight': ['balanced'],
    'max_iter': [100, 200, 500],
}

# Only solver/penalty pairs that LogisticRegression accepts are searched.
# A full cross product of penalties and solvers contains many invalid pairs
# (e.g. 'l1' with 'lbfgs'); those fits fail, score NaN and waste CV time.
lr_params = [
    {**lr_shared_params,
     'solver': ['lbfgs', 'newton-cg', 'newton-cholesky', 'sag'],
     'penalty': ['l2', None]},
    {**lr_shared_params,
     'solver': ['liblinear'],
     'penalty': ['l1', 'l2']},
    {**lr_shared_params,
     'solver': ['saga'],
     'penalty': ['l1', 'l2', None]},
    # 'elasticnet' is only supported by 'saga' and requires an explicit l1_ratio.
    {**lr_shared_params,
     'solver': ['saga'],
     'penalty': ['elasticnet'],
     'l1_ratio': [0.5]},
]

grid_search = GridSearchCV(estimator=lr, param_grid=lr_params, cv=5, n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

print(grid_search.best_params_)

# GridSearchCV (refit=True by default) has already refit the best candidate on
# the full training split. Reusing it keeps random_state=42 -- rebuilding the
# model from best_params_ alone would drop the seed -- and avoids a second fit.
lr = grid_search.best_estimator_

y_pred = lr.predict(X_test)

lr_accuracy = accuracy_score(y_test, y_pred)
lr_conf_matrix = confusion_matrix(y_test, y_pred)
lr_classification_report = classification_report(y_test, y_pred)

print('LR Accuracy: \n', lr_accuracy)
print('LR Confusion matrix: \n', lr_conf_matrix)
print('LR Classification report: \n', lr_classification_report)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
{'class_weight': 'balanced', 'max_iter': 200, 'penalty': 'l1', 'solver': 'liblinear'}
LR Accuracy: 
 0.6820987654320988
LR Confusion matrix: 
 [[402  97   5   0  55   4  57]
 [149 319  23   1 131  26   9]
 [  0   4 350  86  23 182   0]
 [  0   0  40 596   0  25   0]
 [ 17  80  64   0 459  30   0]
 [  0  18 110  73  59 390   0]
 [ 67   3   2   0   2   0 578]]
LR Classification report: 
               precision    recall  f1-score   support

           1       0.63      0.65      0.64       620
           2       0.61      0.48      0.54       658
           3       0.59      0.54      0.56       645
           4       0.79      0.90      0.84       661
           5       0.63      0.71      0.67       650
           6       0.59      0.60      0.60       650
           7       0.90      0.89      0.89       652

    accuracy                           0.68      4536
   macro avg       0.68      0.68      0.68      4536
weighte

In [None]:
# Random forest: tune with 5-fold grid search on the training split,
# then score the tuned model on the hold-out split.
rfc = RandomForestClassifier(random_state=42)

rfc_params = {
    'n_estimators': [50, 100, 200, 500],
    'criterion': ['gini', 'entropy', 'log_loss'],
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3, where it
    # makes every fit fail. For classifiers it meant 'sqrt', so this is the
    # same search on any version.
    'max_features': ['sqrt'],
    'bootstrap': [True],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight' : ['balanced', 'balanced_subsample']
}

grid_search = GridSearchCV(estimator=rfc, param_grid=rfc_params, cv=5, n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

print(grid_search.best_params_)

# GridSearchCV (refit=True by default) has already refit the best candidate on
# the full training split. Reusing it keeps random_state=42 -- rebuilding the
# forest from best_params_ alone would drop the seed -- and avoids refitting.
rfc = grid_search.best_estimator_

y_pred = rfc.predict(X_test)
rfc_accuracy = accuracy_score(y_test, y_pred)
rfc_conf_matrix = confusion_matrix(y_test, y_pred)
rfc_classification_report = classification_report(y_test, y_pred)

print('RFC Accuracy: \n', rfc_accuracy)
print('RFC Confusion matrix: \n', rfc_conf_matrix)
print('RFC Classification report: \n', rfc_classification_report)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
{'bootstrap': True, 'class_weight': 'balanced', 'criterion': 'entropy', 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}
RFC Accuracy: 
 0.8450176366843033
RFC Confusion matrix: 
 [[461  94   1   0  21   1  42]
 [124 427  20   0  68  13   6]
 [  0   1 496  38   5 105   0]
 [  0   0   8 649   0   4   0]
 [  1  18  14   0 615   2   0]
 [  0   4  67  22   7 550   0]
 [ 17   0   0   0   0   0 635]]
RFC Classification report: 
               precision    recall  f1-score   support

           1       0.76      0.74      0.75       620
           2       0.78      0.65      0.71       658
           3       0.82      0.77      0.79       645
           4       0.92      0.98      0.95       661
           5       0.86      0.95      0.90       650
           6       0.81      0.85      0.83       650
           7       0.93      0.97      0.95       652

    accuracy                    

In [None]:
# Extra-trees: tune with 5-fold grid search on the training split,
# then score the tuned model on the hold-out split.
extra_trees = ExtraTreesClassifier(random_state=42)

extr_params = {
    'n_estimators': [50, 100, 200, 500],
    'criterion' : ['gini', 'entropy', 'log_loss'],
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3, where it
    # makes every fit fail. For classifiers it meant 'sqrt', so this is the
    # same search on any version.
    'max_features': ['sqrt'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True],
    'class_weight' : ['balanced', 'balanced_subsample']
}

grid_search = GridSearchCV(estimator=extra_trees, param_grid=extr_params, cv=5, n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

print(grid_search.best_params_)

# GridSearchCV (refit=True by default) has already refit the best candidate on
# the full training split. Reusing it keeps random_state=42 -- rebuilding the
# ensemble from best_params_ alone would drop the seed -- and avoids refitting.
extr = grid_search.best_estimator_

y_pred = extr.predict(X_test)

extr_accuracy = accuracy_score(y_test, y_pred)
extr_conf_matrix = confusion_matrix(y_test, y_pred)
extr_extree_classification_report = classification_report(y_test, y_pred)

print('EXTR Accuracy: \n', extr_accuracy)
print('EXTR Confusion matrix: \n', extr_conf_matrix)
print('EXTR Classification report: \n', extr_extree_classification_report)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
{'bootstrap': True, 'class_weight': 'balanced', 'criterion': 'entropy', 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}
EXTR Accuracy: 
 0.8434744268077602
EXTR Confusion matrix: 
 [[453 107   1   0  25   1  33]
 [121 437  22   0  64  11   3]
 [  0   1 499  46   3  96   0]
 [  0   0   7 649   0   5   0]
 [  3  16  15   0 612   4   0]
 [  0   3  69  25   5 548   0]
 [ 21   3   0   0   0   0 628]]
EXTR Classification report: 
               precision    recall  f1-score   support

           1       0.76      0.73      0.74       620
           2       0.77      0.66      0.71       658
           3       0.81      0.77      0.79       645
           4       0.90      0.98      0.94       661
           5       0.86      0.94      0.90       650
           6       0.82      0.84      0.83       650
           7       0.95      0.96      0.95       652

    accuracy                 

In [None]:
# Decision tree: tune with 5-fold grid search on the training split,
# then score the tuned model on the hold-out split.
dec_trees = DecisionTreeClassifier(random_state=42)

dec_params = {
    'criterion' : ['gini', 'entropy', 'log_loss'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=dec_trees, param_grid=dec_params, cv=5, n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

print(grid_search.best_params_)

# GridSearchCV (refit=True by default) has already refit the best candidate on
# the full training split. Reusing it keeps random_state=42 -- rebuilding the
# tree from best_params_ alone would drop the seed -- and avoids a second fit.
dec = grid_search.best_estimator_

y_pred = dec.predict(X_test)

dec_accuracy = accuracy_score(y_test, y_pred)
dec_conf_matrix = confusion_matrix(y_test, y_pred)
dec_extree_classification_report = classification_report(y_test, y_pred)

print('DEC Accuracy: \n', dec_accuracy)
print('DEC Confusion matrix: \n', dec_conf_matrix)
print('DEC Classification report: \n', dec_extree_classification_report)

In [None]:
# Gradient boosting: tune with 5-fold grid search on the training split,
# then score the tuned model on the hold-out split.
gb = GradientBoostingClassifier(random_state=42)

gb_params = {
    'n_estimators': [50, 100, 200, 500],
    'learning_rate': [0.1, 0.01, 0.001],
    'criterion': ['friedman_mse', 'squared_error'],
    'max_features': ['sqrt', 'log2', None]
}

grid_search = GridSearchCV(estimator=gb, param_grid=gb_params, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)

# GridSearchCV (refit=True by default) has already refit the best candidate on
# the full training split. Reusing it keeps random_state=42 -- rebuilding the
# model from best_params_ alone would drop the seed -- and avoids a second fit.
gb = grid_search.best_estimator_

y_pred = gb.predict(X_test)

gb_accuracy = accuracy_score(y_test, y_pred)
gb_conf_matrix = confusion_matrix(y_test, y_pred)
gb_extree_classification_report = classification_report(y_test, y_pred)

print('GB Accuracy: \n', gb_accuracy)
print('GB Confusion matrix: \n', gb_conf_matrix)
print('GB Classification report: \n', gb_extree_classification_report)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
{'criterion': 'friedman_mse', 'learning_rate': 0.1, 'max_features': None, 'n_estimators': 500}
GB Accuracy: 
 0.8271604938271605
GB Confusion matrix: 
 [[456 114   1   0  16   2  31]
 [137 418  24   0  64  12   3]
 [  0   5 490  31   6 112   1]
 [  0   0  12 642   0   7   0]
 [  3  25  18   0 602   2   0]
 [  0  16  90  16   7 521   0]
 [ 28   1   0   0   0   0 623]]
GB Classification report: 
               precision    recall  f1-score   support

           1       0.73      0.74      0.73       620
           2       0.72      0.64      0.68       658
           3       0.77      0.76      0.77       645
           4       0.93      0.97      0.95       661
           5       0.87      0.93      0.90       650
           6       0.79      0.80      0.80       650
           7       0.95      0.96      0.95       652

    accuracy                           0.83      4536
   macro avg       0.82      0.83      0.82      453