In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

## Day 35 Lecture 1 Assignment

In this assignment, we will learn about gradient boosting. We will use a dataset (loaded below) describing patient survival after breast cancer surgery, fit gradient-boosted models to it, and analyze the models generated for this dataset.

In [36]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV

from xgboost import XGBClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
    make_scorer,
    fbeta_score,
)

<IPython.core.display.Javascript object>

In [3]:
# Haberman survival dataset -- attributes, in file order:
#   age      : age of the patient at the time of the operation (numerical)
#   op_year  : patient's year of operation, stored as (year - 1900) (numerical)
#   nodes    : number of positive axillary nodes detected (numerical)
#   survival : survival status (class attribute)
#                1 = the patient survived 5 years or longer
#                2 = the patient died within 5 years

# `names=` is given without `header=`, so the first line of the file is read
# as data and the column labels come from `cols`.
cols = ["age", "op_year", "nodes", "survival"]
cancer = pd.read_csv(
    filepath_or_buffer="https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/haberman.data",
    names=cols,
)

<IPython.core.display.Javascript object>

In [4]:
# Preview the first rows to confirm the file parsed into the four named columns.
cancer.head()

Unnamed: 0,age,op_year,nodes,survival
0,30,64,1,1
1,30,62,3,1
2,30,65,0,1
3,31,59,2,1
4,31,65,4,1


<IPython.core.display.Javascript object>

Check for missing data and remove all rows containing missing data

In [5]:
# answer below:
# Record the fraction of missing values per column (the "check"), then
# remove every row that contains a missing value so later steps never see
# NaNs. The pre-removal fractions are what gets displayed.
missing_fraction = cancer.isna().mean()
cancer = cancer.dropna()
missing_fraction


age         0.0
op_year     0.0
nodes       0.0
survival    0.0
dtype: float64

<IPython.core.display.Javascript object>

Adjust the target variable so that it has values of either 0 or 1

In [6]:
# answer below:
# Tally the raw class labels of the target before recoding it.
cancer['survival'].value_counts()


1    225
2     81
Name: survival, dtype: int64

<IPython.core.display.Javascript object>

In [7]:
# Recode the target into a single 0/1 indicator column named `survival_2`
# (1 where the original label is 2, 0 otherwise) and drop the original
# `survival` column.
# The explicit comparison keeps an integer dtype rather than relying on the
# dummy-encoding default, and the guard makes the cell safe to re-run once
# `survival` has already been replaced.
if "survival" in cancer.columns:
    cancer = cancer.assign(
        survival_2=cancer["survival"].eq(2).astype("uint8")
    ).drop(columns="survival")

<IPython.core.display.Javascript object>

Split the data into train and test (20% in test)

In [8]:
# List the column labels to find the name of the recoded target column.
cancer.columns

Index(['age', 'op_year', 'nodes', 'survival_2'], dtype='object')

<IPython.core.display.Javascript object>

In [9]:
# answer below:
# Separate the features from the 0/1 target, then hold out 20% of the rows
# for testing. The fixed seed keeps the split reproducible.
target_col = "survival_2"
y = cancer[target_col]
X = cancer.drop(columns=[target_col])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=27,
    test_size=0.2,
)

<IPython.core.display.Javascript object>

Create a gradient boosted classification algorithm with a learning rate of 0.01 and max depth of 5. Report the accuracy.

In [10]:
# Count how many patients have each number of positive nodes.
cancer["nodes"].value_counts()

0     136
1      41
2      20
3      20
4      13
6       7
7       7
8       7
5       6
9       6
13      5
14      4
11      4
10      3
15      3
19      3
22      3
23      3
12      2
20      2
46      1
16      1
17      1
18      1
21      1
24      1
25      1
28      1
30      1
35      1
52      1
Name: nodes, dtype: int64

<IPython.core.display.Javascript object>

In [11]:
# Keep only the rows whose node count occurs more than 4 times in the data.
# NOTE(review): X / y and the train/test split were derived from `cancer`
# in an earlier cell, so this filter changes `cancer` only -- it does not
# alter the data the models below are fitted on. Confirm that is intended.
nodes_counts = cancer["nodes"].value_counts()
keep = nodes_counts.loc[nodes_counts.gt(4)].index
cancer = cancer.loc[cancer["nodes"].isin(keep)]

<IPython.core.display.Javascript object>

In [12]:
# Re-count the node values after the rare ones were filtered out of `cancer`.
cancer["nodes"].value_counts()

0     136
1      41
3      20
2      20
4      13
8       7
7       7
6       7
9       6
5       6
13      5
Name: nodes, dtype: int64

<IPython.core.display.Javascript object>

In [18]:
# answer below:
# Standardize the numeric feature columns; any column not listed in
# `num_cols` is passed through unchanged.
num_cols = ["age", "op_year", "nodes"]

scale_step = ("scale", StandardScaler(), num_cols)
preprocessing = ColumnTransformer(
    transformers=[scale_step],
    remainder="passthrough",
)

<IPython.core.display.Javascript object>

In [20]:
# Baseline model: the preprocessing step followed by an XGBoost classifier
# with learning rate 0.01 and maximum tree depth 5.
xgb_baseline = XGBClassifier(max_depth=5, learning_rate=0.01, random_state=27)
pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessing),
        ("xgb", xgb_baseline),
    ]
)

<IPython.core.display.Javascript object>

In [22]:
# Fit the preprocessing step and the classifier on the training split.
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessing',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('scale',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                                                                 with_std=True),
                                                  ['age', 'op_year', 'nodes'])],
                                   verbose=False)),
                ('xgb',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_...
                               interaction_constraints='', learning_rate=0.01,
                               max_delta_step=0, max_depth=5,
                           

<IPython.core.display.Javascript object>

In [23]:
# Score the fitted pipeline on the training split (the classifier's default
# score, i.e. accuracy).
pipeline.score(X_train, y_train)

0.8360655737704918

<IPython.core.display.Javascript object>

In [24]:
# Score the fitted pipeline on the held-out test split (accuracy).
pipeline.score(X_test, y_test)

0.7096774193548387

<IPython.core.display.Javascript object>

Print the confusion matrix for the test data. What do you notice about our predictions?

In [26]:
# answer below:
# Predict the test split and tabulate the results: rows are the true
# classes, columns are the predicted classes.
y_pred = pipeline.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred)


array([[42,  6],
       [12,  2]], dtype=int64)

<IPython.core.display.Javascript object>

Print the confusion matrix for a learning rate of 1 and a learning rate of 0.5. What do you see now that stands out to you in the confusion matrix?

In [27]:
# answer below:
# Same pipeline layout as the baseline, but with the learning rate raised to 1.
xgb_lr_1 = XGBClassifier(max_depth=5, learning_rate=1, random_state=27)
pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessing),
        ("xgb", xgb_lr_1),
    ]
)
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessing',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('scale',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                                                                 with_std=True),
                                                  ['age', 'op_year', 'nodes'])],
                                   verbose=False)),
                ('xgb',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_...
                               importance_type='gain',
                               interaction_constraints='', learning_rate=1,
                               max_de

<IPython.core.display.Javascript object>

In [28]:
# Confusion matrix on the test split for the learning-rate-1 model
# (rows: true classes, columns: predicted classes).
y_pred = pipeline.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred)


array([[36, 12],
       [10,  4]], dtype=int64)

<IPython.core.display.Javascript object>

In [29]:
# Same pipeline layout again, this time with a learning rate of 0.5.
xgb_lr_half = XGBClassifier(max_depth=5, learning_rate=0.5, random_state=27)
pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessing),
        ("xgb", xgb_lr_half),
    ]
)
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessing',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('scale',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                                                                 with_std=True),
                                                  ['age', 'op_year', 'nodes'])],
                                   verbose=False)),
                ('xgb',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_...
                               interaction_constraints='', learning_rate=0.5,
                               max_delta_step=0, max_depth=5,
                            

<IPython.core.display.Javascript object>

In [30]:
# Confusion matrix on the test split for the learning-rate-0.5 model
# (rows: true classes, columns: predicted classes).
y_pred = pipeline.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred)


array([[35, 13],
       [12,  2]], dtype=int64)

<IPython.core.display.Javascript object>

Perform a grid search for the optimal learning rate. Instead of accuracy, use a metric that will help your model predict the positive class.

In [57]:
# answer below:
params = {
    "xgb__learning_rate": [0.001, 0.01, 0.015,0.10],
    "xgb__max_depth": [3, 5, 7, 10],
    'xgb__subsample': [0.5, 0.75, 1.0]
}


<IPython.core.display.Javascript object>

In [67]:
# Rank candidates by F-beta with beta=3, which weights recall of the
# positive class more heavily than precision, instead of by accuracy.
f3_scorer = make_scorer(fbeta_score, beta=3)

pipeline_cv = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring=f3_scorer,
    cv=2,
    n_jobs=-1,
    verbose=1,
)
pipeline_cv.fit(X_train, y_train)

pipeline_cv.best_params_

Fitting 2 folds for each of 48 candidates, totalling 96 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed:    4.6s finished


{'xgb__learning_rate': 0.1, 'xgb__max_depth': 7, 'xgb__subsample': 1.0}

<IPython.core.display.Javascript object>

In [69]:
# The search was built with `scoring=make_scorer(fbeta_score, beta=3)`, so
# this is the F-beta (beta=3) score on the training split, not accuracy.
pipeline_cv.score(X_train, y_train)

0.8333333333333335

<IPython.core.display.Javascript object>

In [70]:
# F-beta (beta=3) score on the held-out test split, using the search's scorer.
pipeline_cv.score(X_test, y_test)

0.21739130434782608

<IPython.core.display.Javascript object>

In [71]:
# Confusion matrix on the test split for the estimator selected by the grid
# search (rows: true classes, columns: predicted classes).
y_pred = pipeline_cv.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred)


array([[39,  9],
       [11,  3]], dtype=int64)

<IPython.core.display.Javascript object>

In [62]:
# Refit a standalone pipeline with the hyper-parameters chosen by the grid
# search. They are read from `pipeline_cv.best_params_` rather than typed in
# by hand, so this cell cannot drift out of sync when the grid or the data
# changes. The keys already carry the `xgb__` step prefix that
# `Pipeline.set_params` expects.
pipeline = Pipeline(
    [
        ("preprocessing", preprocessing),
        ("xgb", XGBClassifier(random_state=27)),
    ]
)
pipeline.set_params(**pipeline_cv.best_params_)
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessing',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('scale',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                                                                 with_std=True),
                                                  ['age', 'op_year', 'nodes'])],
                                   verbose=False)),
                ('xgb',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_...
                               interaction_constraints='', learning_rate=0.1,
                               max_delta_step=0, max_depth=7,
                            

<IPython.core.display.Javascript object>

In [63]:
# Accuracy of the refit pipeline on the training split.
pipeline.score(X_train, y_train)

0.9426229508196722

<IPython.core.display.Javascript object>

In [64]:
# Accuracy of the refit pipeline on the held-out test split.
pipeline.score(X_test, y_test)

0.6774193548387096

<IPython.core.display.Javascript object>

In [65]:
# Confusion matrix on the test split for the refit pipeline
# (rows: true classes, columns: predicted classes).
y_pred = pipeline.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[39,  9],
       [11,  3]], dtype=int64)

<IPython.core.display.Javascript object>

List the feature importances for the model with the optimal learning rate.

In [66]:
# answer below:
importance_df = pd.DataFrame()
importance_df["feat"] = ["age", "op_year", "nodes"]
importance_df["importance"] = pipeline_cv.best_estimator_["xgb"].feature_importances_
importance_df = importance_df.sort_values("importance", ascending=False)

importance_df

Unnamed: 0,feat,importance
2,nodes,0.429878
0,age,0.314243
1,op_year,0.255879


<IPython.core.display.Javascript object>