# Decision Tree Practice

In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, plot_roc_curve
from sklearn import tree

In [2]:
# Grabbing our data, then exploring it a bit
# Data source: https://www.kaggle.com/ronitf/heart-disease-uci
df = pd.read_csv("heart.csv")

# EDA

In [3]:
df.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')

In [4]:
X = df.drop(columns = 'target')

In [5]:
y=df.target

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=7, stratify=y)
#stratify makes sure target equally represented as % between train and test

In [8]:
y_train.value_counts(normalize=True)

1    0.545455
0    0.454545
Name: target, dtype: float64

In [9]:
y_test.value_counts(normalize=True)

1    0.540984
0    0.459016
Name: target, dtype: float64

In [11]:
X_train.head()
#only look at head here because me as human dont want to be biased by test
#whatsoever

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
170,56,1,2,130,256,1,0,142,1,0.6,1,1,1
124,39,0,2,94,199,0,1,179,0,0.0,2,0,2
134,41,0,1,126,306,0,1,163,0,0.0,2,0,2
28,65,0,2,140,417,1,0,157,0,0.8,2,1,2
208,49,1,2,120,188,0,1,139,0,2.0,1,3,3


In [12]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 242 entries, 170 to 11
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       242 non-null    int64  
 1   sex       242 non-null    int64  
 2   cp        242 non-null    int64  
 3   trestbps  242 non-null    int64  
 4   chol      242 non-null    int64  
 5   fbs       242 non-null    int64  
 6   restecg   242 non-null    int64  
 7   thalach   242 non-null    int64  
 8   exang     242 non-null    int64  
 9   oldpeak   242 non-null    float64
 10  slope     242 non-null    int64  
 11  ca        242 non-null    int64  
 12  thal      242 non-null    int64  
dtypes: float64(1), int64(12)
memory usage: 26.5 KB


In [13]:
X_train.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
count,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0
mean,54.086777,0.68595,1.016529,131.677686,242.590909,0.173554,0.570248,149.785124,0.338843,1.016116,1.42562,0.780992,2.297521
std,9.217334,0.465098,1.0544,18.005861,47.020191,0.37951,0.528467,23.236724,0.474297,1.153503,0.61503,1.053334,0.633041
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0
25%,47.0,0.0,0.0,120.0,210.25,0.0,0.0,134.5,0.0,0.0,1.0,0.0,2.0
50%,55.0,1.0,1.0,130.0,239.0,0.0,1.0,152.0,0.0,0.65,1.0,0.0,2.0
75%,60.75,1.0,2.0,140.0,269.0,0.0,1.0,169.0,1.0,1.6,2.0,1.0,3.0
max,77.0,1.0,3.0,200.0,417.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0


In [15]:
for column in X_train.columns:
    print(X_train[column].value_counts())
    #trying to see which are categorical - how to visualize 

58    18
54    15
57    14
52    11
44    10
59     9
51     9
56     9
43     8
63     8
62     8
60     8
64     7
65     7
53     7
42     7
66     6
45     6
41     6
67     6
61     6
49     5
46     5
47     5
48     4
39     4
55     4
50     3
71     3
38     3
69     3
68     3
40     3
35     3
37     2
34     2
77     1
76     1
70     1
74     1
29     1
Name: age, dtype: int64
1    166
0     76
Name: sex, dtype: int64
0    112
2     76
1     34
3     20
Name: cp, dtype: int64
130    28
120    28
140    25
110    15
150    14
138    11
128    10
112     9
125     8
132     7
160     7
118     7
124     6
108     6
134     5
152     5
100     4
145     3
126     3
170     3
105     3
136     3
180     3
135     3
115     2
94      2
142     2
178     2
146     2
129     1
114     1
102     1
174     1
172     1
165     1
164     1
117     1
156     1
192     1
155     1
148     1
122     1
123     1
144     1
200     1
Name: trestbps, dtype: int64
234    6
204    6
197    5


# Modeling 

## Logistic Regression

### Preprocessing

What kind of preprocessing steps are there?
- Imputing
- Scaling
- Encoding

In [16]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline

### Logistic Regression Model

In [21]:
from sklearn.linear_model import LogisticRegression
# Lets set up a Logistic Regression Pipeline
encoder = OneHotEncoder(drop='first')

catvar = ['restecg','slope','ca','thal']
catvarid = [X_train.columns.tolist().index(column) for colum in catvar]

pipe_lr = make_pipeline(SimpleImputer(),ColumnTransformer([('encoder',
                                                            encoder,catvarid)], 
                                                         remainder = 'passthrough',
                                                         sparse_threshold=0), StandardScaler(), LogisticRegression())

pipe_lr.steps


[('simpleimputer', SimpleImputer()),
 ('columntransformer',
  ColumnTransformer(remainder='passthrough', sparse_threshold=0,
                    transformers=[('encoder', OneHotEncoder(drop='first'),
                                   [12, 12, 12, 12])])),
 ('standardscaler', StandardScaler()),
 ('logisticregression', LogisticRegression())]

In [23]:
# Lets fit the LogReg on our training and calculate our roc_auc_scores
train_pred = pipe_lr.predict_proba(X_train)
test_pred = pipe_lr.predict_proba(X_test)

NotFittedError: This SimpleImputer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

How'd we do?

## Decision Trees

#### let's do a basic decision tree with just the default values

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Fit the model and lets see the tree


In [None]:
# Lets calculate the roc_auc_score


#### A simple decision tree with a single split

In [None]:
# lets set up the Decision Tree with a single split


In [None]:
# Visualizing the single-split decision tree
plt.figure(figsize=(10,10))
tree.plot_tree(dt_maxdepth2)

In [None]:
# Lets calculate the roc_auc_score


#### Try something Random!

In [None]:
# Okay - time to change things!
# Replace None with relevant code
dt_2 = DecisionTreeClassifier(max_depth= None, criterion=None, 
                              min_samples_split=None)

dt_2.fit(X_train, y_train)

In [None]:
# Visualizing the decision tree
plt.figure(figsize=(20,20))
tree.plot_tree(dt_2)

In [None]:
# Lets calculate the roc_auc_score
train_preds = dt_2.predict_proba(X_train)
test_preds = dt_2.predict_proba(X_test)

train_score = roc_auc_score(y_train, train_preds[:,1])
test_score = roc_auc_score(y_test, test_preds[:,1])

print(f"Train ROC-AUC: {train_score}")
print(f"Test ROC-AUC: {test_score}")

## [GridSearch](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)

Lets start by taking a look at the documentation

# Vizualizing all the models scores

In [None]:
# Visualizing the ROCs for the models we've done
fig, ax = plt.subplots()
plot_roc_curve(pipe_lr, X_test, y_test, name="Baseline Log Reg", ax=ax)
plot_roc_curve(dt, X_test, y_test, name="Default DT", ax=ax)
plot_roc_curve(dt_maxdepth1, X_test, y_test, name="DT with Max Depth = 1", ax=ax)
plot_roc_curve(dt_maxdepth2, X_test, y_test, name="DT with Max Depth = 2", ax=ax)
plot_roc_curve(dt_2, X_test, y_test, name="DT with Hyper Parameters", ax=ax)
plot_roc_curve(dt_grid, X_test, y_test, name="DT after Grid Search", ax=ax)

ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
         label='Chance', alpha=.8)
plt.title("Receiver Operating Characteristic Curves\n(Evaluated on Test Set)")
ax.legend()
plt.show()

What can we learn from this visualization? Which is the best model - and how could you tell without looking at the scores?

- Baseline is still best - from both the visualization (closest to the '1' corner) and from the score

