In [2]:
import pandas as pd

# Load the three per-region datasets (East Slopes North, West Slopes North,
# West Slopes South).
ESN = pd.read_csv('../content/csv/FINALDATASET_EastSlopesNorth.csv')
WSN = pd.read_csv('../content/csv/FINALDATASET_WestSlopesNorth.csv')
WSS = pd.read_csv('../content/csv/FINALDATASET_WestSlopesSouth.csv')

# Compare the actual column names. (`all(a.columns) == all(b.columns)` only
# compares two booleans and therefore never detects a mismatch.)
# pd.concat aligns columns by name, so ordering is not required to match.
assert set(ESN.columns) == set(WSN.columns) == set(WSS.columns), \
    'All dataframes should have the same columns'

# Combine dataframes; ignore_index gives the result a fresh 0..n-1 index.
DATA = pd.concat([ESN, WSN, WSS], axis=0, ignore_index=True)

# Rows with any NaN, captured before imputation so they can be inspected
# (a bare expression in the middle of a cell is not displayed).
ROWS_WITH_NAN = DATA[DATA.isna().any(axis=1)]

# Todo: should we try to use these?
DATA = DATA.drop(['Date', 'Area'], axis=1)

# NOTE(review): the original note here read "Can uncomment to make prediction
# better"; presumably keeping yesterday's danger ratings as features improves
# accuracy -- confirm. The columns are currently dropped.
DATA = DATA.drop(
    [
        'Yesterday Danger Above Treeline',
        'Yesterday Danger At Treeline',
        'Yesterday Danger Below Treeline',
    ],
    axis=1,
)

# Todo: try dropping NaN columns instead
# Impute once, after the unused columns are gone. numeric_only=True keeps
# DataFrame.mean() from raising on any remaining non-numeric column.
DATA = DATA.fillna(DATA.mean(numeric_only=True))


In [3]:
import numpy as np
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingClassifier

# Split the frame into the target ('At Treeline') and the feature matrix.
y = DATA['At Treeline']
X = DATA.drop(columns='At Treeline')

kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Walk through every fold; the indices left bound after the loop are those of
# the LAST fold, which is the single train/validation split used from here on.
for train_index, val_index in kf.split(X):
    pass

X_train, y_train = X.iloc[train_index], y.iloc[train_index]
X_val, y_val = X.iloc[val_index], y.iloc[val_index]

In [4]:
# Baseline gradient-boosting classifier; every hyper-parameter not listed here
# keeps the library default.
gradient_booster = GradientBoostingClassifier(
    random_state=42,
    learning_rate=0.1,
)

# Cell output: the complete hyper-parameter dictionary of the estimator.
gradient_booster.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'log_loss',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': 42,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [5]:
# Train the baseline model on the training split, then report per-class
# precision / recall / F1 on the validation split.
gradient_booster.fit(X_train, y_train)

val_predictions = gradient_booster.predict(X_val)
print(classification_report(y_val, val_predictions))

              precision    recall  f1-score   support

           1       0.50      0.45      0.48        11
           2       0.88      0.82      0.85        55
           3       0.78      0.95      0.86        19
           4       0.75      1.00      0.86         3

    accuracy                           0.81        88
   macro avg       0.73      0.81      0.76        88
weighted avg       0.81      0.81      0.80        88



In [6]:
def _best_by_validation(param_name, candidates, default, **fixed_params):
    """Sweep one hyper-parameter and return the value with the best validation accuracy.

    For each candidate a ``GradientBoostingClassifier(random_state=0)`` is fit
    on ``X_train``/``y_train`` with ``fixed_params`` plus
    ``{param_name: candidate}`` and scored on ``X_val``/``y_val``.

    Parameters
    ----------
    param_name : str
        Keyword argument of ``GradientBoostingClassifier`` being swept.
    candidates : iterable
        Values to try, in order.
    default
        Value returned when no candidate scores above 0.
    **fixed_params
        Remaining keyword arguments held constant during the sweep.

    Returns
    -------
    tuple
        ``(best_value, best_score)``. Ties keep the earliest candidate because
        only a strictly better score replaces the current best.
    """
    best_value, best_score = default, 0
    for candidate in candidates:
        model = GradientBoostingClassifier(
            random_state=0, **fixed_params, **{param_name: candidate}
        )
        model.fit(X_train, y_train)

        validation = model.score(X_val, y_val)
        if validation > best_score:
            best_value, best_score = candidate, validation
    return best_value, best_score


# Greedy one-parameter-at-a-time search: each sweep reuses the best values
# found by the previous sweeps instead of numbers copied from an earlier run.

lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]
best_learning_rate, best_score = _best_by_validation(
    'learning_rate', lr_list, 0.1,
    n_estimators=20, max_features=2, max_depth=2,
)
print('Best Learning Rate for Validation:', best_learning_rate)

n_est_list = list(range(1, 100))
best_n_estimator, best_score = _best_by_validation(
    'n_estimators', n_est_list, 20,
    learning_rate=best_learning_rate, max_features=2, max_depth=2,
)
print('Best N Estimators for Validation:', best_n_estimator)

features_list = list(range(1, 20))
best_features_estimator, best_score = _best_by_validation(
    'max_features', features_list, 2,
    n_estimators=best_n_estimator, learning_rate=best_learning_rate, max_depth=2,
)
print('Best Max Features for Validation:', best_features_estimator)

depth_list = list(range(1, 20))
best_depth_estimator, best_score = _best_by_validation(
    'max_depth', depth_list, 2,
    n_estimators=best_n_estimator, learning_rate=best_learning_rate,
    max_features=best_features_estimator,
)
print('Best Max Depth for Validation:', best_depth_estimator)

Best Learning Rate for Validation: 0.1
Best N Estimators for Validation: 34
Best Max Features for Validation: 13
Best Max Depth for Validation: 2


In [7]:
# Final model: built from the hyper-parameters selected by the search cell
# rather than from literals copied out of its printed output, so it stays in
# sync if the data or the search changes.
gb_clf = GradientBoostingClassifier(
    n_estimators=best_n_estimator,
    learning_rate=best_learning_rate,
    max_features=best_features_estimator,
    max_depth=best_depth_estimator,
    random_state=0,
)
gb_clf.fit(X_train, y_train)

# Mean accuracy on the training split and on the validation split.
print("\tAccuracy score (training): \t{0:.3f}".format(gb_clf.score(X_train, y_train)))
print("\tAccuracy score (validation): \t{0:.3f}".format(gb_clf.score(X_val, y_val)))

	Accuracy score (training): 	0.890
	Accuracy score (validation): 	0.852


In [8]:
# Break down the prediction errors by direction and size over every row of DATA.
# NOTE(review): DATA contains the rows the model was fit on, so these figures
# are not a held-out estimate -- confirm this is intended.
predicted = gb_clf.predict(DATA.drop('At Treeline', axis=1))
actual = DATA['At Treeline']
assert len(predicted) == len(actual)

# Compare positionally on plain arrays: `predicted` is positional, so indexing
# `actual` by label (actual[i]) is only correct while DATA has a 0..n-1 index.
actual_values = actual.to_numpy()
diff = actual_values - predicted  # > 0: model predicted lower than actual

under_predict = int((diff == 1).sum())
over_predict = int((diff == -1).sum())

# Anything more than one level away from the recorded value.
way_off_mask = (diff < -1) | (diff > 1)
way_off = int(way_off_mask.sum())
for p, a in zip(predicted[way_off_mask], actual_values[way_off_mask]):
    print('WAY off: predicted:', p, 'actual:', a)

total = len(actual)
under_pct = '{0:.2f}'.format(under_predict * 100.0 / total)
over_pct = '{0:.2f}'.format(over_predict * 100.0 / total)
way_pct = '{0:.2f}'.format(way_off * 100.0 / total)
print('Predicted less dangerous than actual:', under_predict, f'{under_pct}%')
print('Predicted more dangerous than actual:', over_predict, f'{over_pct}%')
print('WAY off:', way_off, f'{way_pct}%')

Predicted less dangerous than actual: 0 0.00%
Predicted more dangerous than actual: 52 11.79%
WAY off: 0 0.00%
