# Gradient Boosted Trees
---
Gradient Boosting  is a tree-based algorithm which uses **Regression** instead of classification trees. it gives us a prediction model in the form of an ensemble of weak prediction model.

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier

import plotly.express as px
import plotly.graph_objects as go 


numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject



In [2]:
df = pd.read_csv('../data/weatherAUS.csv')

df = df[pd.isnull(df['RainTomorrow'])==False]
df = df.fillna(df.mean())

df['RainTodayFlag'] = df['RainToday'].apply(lambda x: 1 if x=='Yes' else 0)
df['RainTomorrowFlag'] = df['RainTomorrow'].apply( lambda x: 1 if x =='Yes' else 0)

In [5]:
def model_training(X, y, n_trees):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    model = GradientBoostingClassifier(loss='deviance',
                                        criterion='mse',
                                        learning_rate=0.1,
                                        subsample=1.0,
                                        random_state=0,
                                        max_features='sqrt',
                                        min_samples_leaf=1000,
                                        max_depth=3,
                                        n_estimators=n_trees)
    
    clf = model.fit(X_train, y_train)

    pred_labels_tr = model.predict(X_train)
    pred_labels_te = model.predict(X_test)

    print('*************** Tree Summary ***************')
    print('No. of classes: ', clf.n_classes_)
    print('Classes: ', clf.classes_)
    print('No. of features: ', clf.n_features_)
    print('No. of Estimators: ', len(clf.estimators_))
    print('--------------------------------------------------------')
    print("")

    print('*************** Evaluation on Test Data ***************')
    score_te = model.score(X_test, y_test)
    print('Accuracy Score: ', score_te)
    print(classification_report(y_test, pred_labels_te))
    print('--------------------------------------------------------')
    print("")

    print('*************** Evaluation on Training Data ***************')
    score_tr = model.score(X_train, y_train)
    print('Accuracy Score: ', score_tr)
    print(classification_report(y_train, pred_labels_tr))
    print('--------------------------------------------------------')
    
    return clf, X_test, y_test

In [6]:
X = df[['WindGustSpeed', 'Humidity3pm']]
y = df['RainTomorrowFlag'].values

clf, X_test, y_test = model_training(X, y, n_trees=500)


Criterion 'mse' was deprecated in v1.0 and will be removed in version 1.2. Use `criterion='squared_error'` which is equivalent.


Criterion 'mse' was deprecated in v1.0 and will be removed in version 1.2. Use `criterion='squared_error'` which is equivalent.


Criterion 'mse' was deprecated in v1.0 and will be removed in version 1.2. Use `criterion='squared_error'` which is equivalent.


Criterion 'mse' was deprecated in v1.0 and will be removed in version 1.2. Use `criterion='squared_error'` which is equivalent.


Criterion 'mse' was deprecated in v1.0 and will be removed in version 1.2. Use `criterion='squared_error'` which is equivalent.


Criterion 'mse' was deprecated in v1.0 and will be removed in version 1.2. Use `criterion='squared_error'` which is equivalent.


Criterion 'mse' was deprecated in v1.0 and will be removed in version 1.2. Use `criterion='squared_error'` which is equivalent.


Criterion 'mse' was deprecated in v1.0 and will be removed in version 1.2. Use `criterion

*************** Tree Summary ***************
No. of classes:  2
Classes:  [0 1]
No. of features:  2
No. of Estimators:  500
--------------------------------------------------------

*************** Evaluation on Test Data ***************
Accuracy Score:  0.834980132916066
              precision    recall  f1-score   support

           0       0.85      0.95      0.90     22067
           1       0.73      0.42      0.53      6372

    accuracy                           0.83     28439
   macro avg       0.79      0.69      0.72     28439
weighted avg       0.82      0.83      0.82     28439

--------------------------------------------------------

*************** Evaluation on Training Data ***************



Attribute `n_features_` was deprecated in version 1.0 and will be removed in 1.2. Use `n_features_in_` instead.



Accuracy Score:  0.8328586247516571
              precision    recall  f1-score   support

           0       0.85      0.95      0.90     88249
           1       0.72      0.41      0.53     25505

    accuracy                           0.83    113754
   macro avg       0.79      0.68      0.71    113754
weighted avg       0.82      0.83      0.81    113754

--------------------------------------------------------


In [7]:
def Plot_3D(X, X_test, y_test, clf, x1, x2, mesh_size, margin, xe, ye, ze):
            
    mesh_size=mesh_size
    margin=margin

    x_min, x_max = X.iloc[:, 0].fillna(X.mean()).min() - margin, X.iloc[:, 0].fillna(X.mean()).max() + margin
    y_min, y_max = X.iloc[:, 1].fillna(X.mean()).min() - margin, X.iloc[:, 1].fillna(X.mean()).max() + margin
    xrange = np.arange(x_min, x_max, mesh_size)
    yrange = np.arange(y_min, y_max, mesh_size)
    xx, yy = np.meshgrid(xrange, yrange)
            
    Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
    Z = Z.reshape(xx.shape)

    fig = px.scatter_3d(x=[], y=[], z=[], 
                        width=600, height=600)

    fig.update_layout(paper_bgcolor = 'white',
                      scene_camera=dict(up=dict(x=0, y=0, z=1), 
                                        center=dict(x=-0.1, y=0, z=-0.18),
                                        eye=dict(x=xe, y=ye, z=ze)),
                      scene = dict(xaxis=dict(title=x1,
                                              backgroundcolor='white',
                                              color='black',
                                              gridcolor='#f0f0f0',
                                              title_font=dict(size=10),
                                              tickfont=dict(size=10),
                                             ),
                                   yaxis=dict(title=x2,
                                              backgroundcolor='white',
                                              color='black',
                                              gridcolor='#f0f0f0',
                                              title_font=dict(size=10),
                                              tickfont=dict(size=10),
                                              ),
                                   zaxis=dict(title='Probability of Rain Tomorrow',
                                              backgroundcolor='lightgrey',
                                              color='black', 
                                              gridcolor='#f0f0f0',
                                              range=[0.0, 1.0],
                                              title_font=dict(size=10),
                                              tickfont=dict(size=10),
                                              )))

    fig.update_traces(marker=dict(size=1))

    fig.add_traces(go.Surface(x=xrange, y=yrange, z=Z, name='Gradient Boost Prediction',
                              colorscale='Turbo',
                              reversescale=True,
                              showscale=False, 
                              contours = {"z": {"show": True, "start": 0.5, "end": 0.9, 
                                                "size": 0.5, "color":"white"}}))
    fig.show()
    fig.update_layout(margin=dict(l=0, r=0, t=1, b=1)
                     )
    return fig
  
  
fig = Plot_3D(X, X_test, y_test, clf, x1='WindGustSpeed', x2='Humidity3pm', mesh_size=1, margin=10, 
                  xe=-1.7, ye=-1.5, ze=0.4)


X does not have valid feature names, but GradientBoostingClassifier was fitted with feature names

