In [None]:
#packages
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import train_test_split, cross_val_score,validation_curve
from sklearn.base import ClassifierMixin
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier

In [None]:
# data
stroke=pd.read_csv('brain_stroke.csv')
stroke.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [None]:
stroke.stroke.value_counts()

0    4733
1     248
Name: stroke, dtype: int64

In [None]:
# need to mention we balanced our data in report
stroke_bal=stroke.groupby(['stroke'],group_keys=False).apply(lambda x: x.sample(n=1500,replace=True,
                                                                                random_state=0))

In [None]:
stroke_bal.stroke.value_counts()

0    1500
1    1500
Name: stroke, dtype: int64

In [None]:
stroke_bal.shape

(3000, 11)

In [None]:
stroke_bal.describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,54.911533,0.170333,0.129667,119.77799,29.4121,0.5
std,21.856151,0.375988,0.335992,55.908581,6.2465,0.500083
min,0.24,0.0,0.0,55.12,14.3,0.0
25%,41.0,0.0,0.0,78.46,25.4,0.0
50%,59.0,0.0,0.0,97.76,28.8,0.5
75%,74.0,0.0,0.0,151.16,32.6,1.0
max,82.0,1.0,1.0,271.74,48.9,1.0


## Comparing stroke versus no stroke in data

In [None]:
# describe for stroke
yesstroke=stroke_bal[stroke_bal['stroke']==1]
yesstroke.describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
mean,67.4184,0.256667,0.204667,133.89854,30.286733,1.0
std,12.50128,0.43694,0.403592,61.984224,5.460197,0.0
min,1.32,0.0,0.0,56.11,16.9,1.0
25%,59.0,0.0,0.0,80.355,27.0,1.0
50%,70.0,0.0,0.0,106.41,29.6,1.0
75%,78.0,1.0,0.0,197.28,32.6,1.0
max,82.0,1.0,1.0,271.74,48.9,1.0


In [None]:
# describe for no stroke
nostroke=stroke_bal[stroke_bal['stroke']==0]
nostroke.describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
mean,42.404667,0.084,0.054667,105.65744,28.537467,0.0
std,22.053736,0.27748,0.227404,44.861279,6.835117,0.0
min,0.24,0.0,0.0,55.12,14.3,0.0
25%,25.0,0.0,0.0,77.29,23.5,0.0
50%,43.0,0.0,0.0,91.745,28.1,0.0
75%,59.0,0.0,0.0,114.915,32.5,0.0
max,82.0,1.0,1.0,267.6,48.8,0.0


In [None]:
#checking for missing
print(stroke_bal.isnull().all())
#no NA

gender               False
age                  False
hypertension         False
heart_disease        False
ever_married         False
work_type            False
Residence_type       False
avg_glucose_level    False
bmi                  False
smoking_status       False
stroke               False
dtype: bool


# start of predictive analytics

## Here we are:
    - Splitting data into feature and test sets
    - Then, creating train and test
    - Lastly, creating dummy variables for our training set

In [None]:
# make features and target sets
X_stroke, y_stroke=stroke_bal.iloc[:,:-1],stroke_bal['stroke']

In [None]:
X_stroke.head(3)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
2940,Male,56.0,0,0,Yes,Private,Urban,78.93,31.1,Unknown
2815,Female,78.0,0,0,Yes,Private,Urban,93.15,23.6,Unknown
1861,Male,69.0,1,0,Yes,Private,Urban,229.21,30.1,smokes


In [None]:
y_stroke.head(3)

2940    0
2815    0
1861    0
Name: stroke, dtype: int64

In [None]:
y_stroke=y_stroke.astype(str)

In [None]:
y_stroke.dtypes

dtype('O')

In [None]:
X_stroke['age']=X_stroke['age'].astype(int)

In [None]:
X_stroke.dtypes

gender                object
age                    int64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
dtype: object

In [None]:
# creating dummy variables
X_dummy=pd.get_dummies(X_stroke)
X_dummy.head(3)

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Female,gender_Male,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
2940,56,0,0,78.93,31.1,0,1,0,1,0,1,0,0,0,1,1,0,0,0
2815,78,0,0,93.15,23.6,1,0,0,1,0,1,0,0,0,1,1,0,0,0
1861,69,1,0,229.21,30.1,0,1,0,1,0,1,0,0,0,1,0,0,0,1


In [None]:
# split train and test (75/25)
X_train, X_test, y_train, y_test=train_test_split(X_dummy,y_stroke,random_state=0,test_size=.25)

## KNN-Classification Model

### First we need to create a validation curve to select the optimal parameter:
    - For KNN-Classification, n_neighbors = 1 produces the best result

In [None]:
# validation curve
train_scores_knn,test_scores_knn=validation_curve(KNeighborsClassifier(),X_dummy.values,y_stroke.values,
                                          param_name='n_neighbors',
                                          param_range=[1,5,10,15,20,25,50,100],cv=5,scoring="accuracy")

In [None]:
train_scores_knn.round(5)

array([[1.     , 1.     , 1.     , 1.     , 1.     ],
       [0.905  , 0.89542, 0.89792, 0.90375, 0.89958],
       [0.84042, 0.83583, 0.82458, 0.84208, 0.83458],
       [0.81625, 0.805  , 0.81417, 0.80833, 0.81   ],
       [0.81667, 0.79333, 0.79375, 0.79125, 0.81167],
       [0.80208, 0.79542, 0.79958, 0.79583, 0.79833],
       [0.77542, 0.76958, 0.76292, 0.775  , 0.77958],
       [0.765  , 0.76042, 0.75792, 0.76333, 0.76625]])

In [None]:
test_scores_knn.round(5)

array([[0.93167, 0.96667, 0.95333, 0.94833, 0.94167],
       [0.83833, 0.875  , 0.855  , 0.845  , 0.84333],
       [0.78167, 0.8    , 0.815  , 0.78667, 0.805  ],
       [0.77167, 0.81167, 0.79667, 0.76833, 0.79   ],
       [0.77167, 0.81167, 0.79333, 0.775  , 0.805  ],
       [0.76167, 0.80333, 0.8    , 0.78667, 0.775  ],
       [0.75   , 0.795  , 0.78167, 0.76667, 0.74833],
       [0.725  , 0.78333, 0.785  , 0.76833, 0.73667]])

In [None]:
#find best
print('mean train scores across 5 folds',train_scores_knn.mean(axis=1).round(4))
print('mean test scores across 5 folds',test_scores_knn.mean(axis=1).round(4))
# n_neighbors = 1 is best

mean train scores across 5 folds [1.     0.9003 0.8355 0.8108 0.8013 0.7983 0.7725 0.7626]
mean test scores across 5 folds [0.9483 0.8513 0.7977 0.7877 0.7913 0.7853 0.7683 0.7597]


### Now we create and fit our model:
    - Our model holds a 100% accuracy on train and a 93.60% accuracy on test

In [None]:
# fit model
knn=KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train.values,y_train.values)

In [None]:
# evaluate model
print('knn acc on train: {:.2%}'.format(knn.score(X_train.values,y_train.values)))
print('knn acc on test: {:.2%}'.format(knn.score(X_test.values,y_test.values)))

knn acc on train: 100.00%
knn acc on test: 93.60%


## Logistic Regression

### First we need to create a validation curve to select the optimal parameter:
    - Before

In [None]:
# define and fit scaler
scaler1=MinMaxScaler()
X_stroke_trns=scaler1.fit_transform(X_dummy)
X_stroke_trns

array([[0.68292683, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.95121951, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.84146341, 1.        , 0.        , ..., 0.        , 0.        ,
        1.        ],
       ...,
       [0.97560976, 1.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.96341463, 0.        , 1.        , ..., 0.        , 0.        ,
        1.        ],
       [0.69512195, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [None]:
# validation curve
train_scores,test_scores=validation_curve(LogisticRegression(max_iter=100000),X_stroke_trns,y_stroke,
                                          param_name='C',
                                          param_range=[0.01,0.1,0.5,0.75,1,5,10,15,20],cv=5)

In [None]:
train_scores.round(4)

array([[0.7596, 0.7425, 0.7517, 0.7462, 0.7554],
       [0.7837, 0.7775, 0.7762, 0.7738, 0.7817],
       [0.785 , 0.7829, 0.7729, 0.7754, 0.7788],
       [0.7875, 0.7829, 0.7733, 0.7762, 0.7767],
       [0.7908, 0.7833, 0.7733, 0.775 , 0.7775],
       [0.79  , 0.7812, 0.7696, 0.7717, 0.7821],
       [0.7904, 0.7812, 0.7696, 0.7717, 0.7821],
       [0.7904, 0.7812, 0.7696, 0.7717, 0.7821],
       [0.7908, 0.7812, 0.7696, 0.7717, 0.7821]])

In [None]:
test_scores.round(4)

array([[0.7217, 0.765 , 0.7417, 0.7533, 0.7533],
       [0.7583, 0.785 , 0.7983, 0.7683, 0.76  ],
       [0.75  , 0.78  , 0.7967, 0.7767, 0.7667],
       [0.75  , 0.7767, 0.8   , 0.775 , 0.76  ],
       [0.7533, 0.78  , 0.7967, 0.7733, 0.76  ],
       [0.7517, 0.7833, 0.7933, 0.7733, 0.7667],
       [0.7517, 0.7833, 0.795 , 0.7733, 0.7667],
       [0.7533, 0.7833, 0.795 , 0.7733, 0.7667],
       [0.7533, 0.7833, 0.795 , 0.7733, 0.7667]])

In [None]:
#find best model
print('mean train scores across 5 folds',train_scores.mean(axis=1).round(4))
print('mean test scores across 5 folds',test_scores.mean(axis=1).round(4))
#C=0.75 be best?

mean train scores across 5 folds [0.7511 0.7786 0.779  0.7793 0.78   0.7789 0.779  0.779  0.7791]
mean test scores across 5 folds [0.747  0.774  0.774  0.7723 0.7727 0.7737 0.774  0.7743 0.7743]


In [None]:
# make the model
logreg=LogisticRegression(C=0.75,max_iter=100000,random_state=0)

In [None]:
# fit the model
logreg.fit(X_train,y_train)

In [None]:
# check accuracy
print('logreg acc in train: {:.2%}'.format(logreg.score(X_train,y_train)))
print('logreg acc in test: {:.2%}'.format(logreg.score(X_test,y_test)))


logreg acc in train: 78.09%
logreg acc in test: 76.27%


## SVM

In [None]:
from sklearn.svm import LinearSVC

In [None]:
# define and fit scaler
scaler1=MinMaxScaler()
X_stroke_trns=scaler1.fit_transform(X_dummy)
X_stroke_trns

array([[0.68292683, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.95121951, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.84146341, 1.        , 0.        , ..., 0.        , 0.        ,
        1.        ],
       ...,
       [0.97560976, 1.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.96341463, 0.        , 1.        , ..., 0.        , 0.        ,
        1.        ],
       [0.69512195, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [None]:
#validation curve
train_scores_svm,test_scores_svm=validation_curve(LinearSVC(random_state=0, max_iter=1000000),X_stroke_trns,y_stroke,
                                          param_name='C',
                                          param_range=[.1,.25,.5,.75,1,5,10,15],cv=5)





In [None]:
train_scores_svm.round(4)

array([[0.785 , 0.7842, 0.7729, 0.775 , 0.7821],
       [0.7871, 0.7829, 0.77  , 0.7729, 0.7867],
       [0.7879, 0.7821, 0.7671, 0.7738, 0.7837],
       [0.7883, 0.7783, 0.7671, 0.7738, 0.7842],
       [0.7879, 0.7783, 0.7671, 0.7738, 0.7842],
       [0.7883, 0.7792, 0.7671, 0.7746, 0.7842],
       [0.7883, 0.7775, 0.7679, 0.7746, 0.7842],
       [0.7883, 0.7775, 0.7679, 0.7746, 0.7837]])

In [None]:
test_scores_svm.round(4)

array([[0.7467, 0.775 , 0.7983, 0.7733, 0.7617],
       [0.7567, 0.78  , 0.7967, 0.77  , 0.7733],
       [0.7583, 0.78  , 0.795 , 0.77  , 0.7717],
       [0.7583, 0.7783, 0.7933, 0.7683, 0.7717],
       [0.7583, 0.7783, 0.7933, 0.7683, 0.7717],
       [0.7583, 0.78  , 0.7933, 0.7683, 0.7717],
       [0.7583, 0.7767, 0.7933, 0.7683, 0.7717],
       [0.7583, 0.7767, 0.7933, 0.7683, 0.7717]])

In [None]:
#find best
print('mean train scores across 5 folds',train_scores_svm.mean(axis=1).round(4))
print('mean test scores across 5 folds',test_scores_svm.mean(axis=1).round(4))
# C=0.1 is best

mean train scores across 5 folds [0.7798 0.7799 0.7789 0.7783 0.7783 0.7787 0.7785 0.7784]
mean test scores across 5 folds [0.771  0.7753 0.775  0.774  0.774  0.7743 0.7737 0.7737]


In [None]:
# define and fit the model
svm=LinearSVC(C=.1,max_iter=1000000,random_state=0)
svm.fit(X_train,y_train)



In [None]:
print('svm acc on train: {:.2%}'.format(svm.score(X_train,y_train)))
print('svm acc on test: {:.2%}'.format(svm.score(X_test,y_test)))


svm acc on train: 78.00%
svm acc on test: 76.67%


## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz

In [None]:
#validation curve for max leaf node
train_scores_dt,test_scores_dt=validation_curve(DecisionTreeClassifier(random_state=0),
                                         X_train,y_train,param_name='max_leaf_nodes',
                                          param_range=[10,20,30,40,50,75,100,150,200,250],cv=5)

In [None]:
#max leaf nodes
print('avg train acc for each param val:', train_scores_dt.mean(axis=1).round(3))
print('avg test acc for each param val:', test_scores_dt.mean(axis=1).round(3))

#max_leaf_nodes=200 is the optimized parameter but in favor of readability we have
#sacrificed accuracy by selecting max_leaf_nodes=30 for our decision tree

avg train acc for each param val: [0.802 0.844 0.866 0.883 0.898 0.928 0.947 0.982 1.    1.   ]
avg test acc for each param val: [0.781 0.816 0.834 0.846 0.857 0.868 0.887 0.909 0.924 0.924]


In [None]:
#create model
dt=DecisionTreeClassifier(max_leaf_nodes=30,random_state=0)


In [None]:
# fit model(s)
dt.fit(X_train,y_train)

In [None]:
# evaluate and pick between the two
print('dt acc on train: {:.2%}'.format(dt.score(X_train,y_train)))
print('dt acc on test: {:.2%}'.format(dt.score(X_test,y_test)))

dt acc on train: 85.42%
dt acc on test: 78.93%


In [None]:
# feature importance
feat_imp=pd.DataFrame(data=dt.feature_importances_.round(3),
                      index=X_train.columns,
                     columns=['importance'])
feat_imp.sort_values('importance',ascending=False)

Unnamed: 0,importance
age,0.736
avg_glucose_level,0.097
bmi,0.066
smoking_status_never smoked,0.025
ever_married_Yes,0.017
smoking_status_formerly smoked,0.017
smoking_status_smokes,0.011
gender_Female,0.011
work_type_Govt_job,0.01
work_type_Private,0.009


In [None]:
#webgraphviz.com
export_graphviz(dt, out_file='dectree_vis.dot', feature_names=X_train.columns,filled=True,)

## Random Forest

In [None]:
#validation curve
train_scores_rf,test_scores_rf=validation_curve(RandomForestClassifier(random_state=0),X_train,y_train,
                                         param_name='n_estimators',param_range=[50,100,150,200,250],cv=5)

In [None]:
#selecting optimal parameter
print('avg train acc for each param val:', train_scores_rf.mean(axis=1).round(3))
print('avg test acc for each param val:', test_scores_rf.mean(axis=1).round(3))

#n_estimators = 50 is best

avg train acc for each param val: [1. 1. 1. 1. 1.]
avg test acc for each param val: [0.932 0.928 0.928 0.928 0.929]


In [None]:
# create and fit the model (use best from decision tree)
rf=RandomForestClassifier(n_estimators=50,random_state=0)
rf.fit(X_train,y_train)

In [None]:
# evaluate the performance
print('rf acc on train: {:.2%}'.format(rf.score(X_train,y_train)))
print('rf acc on test: {:.2%}'.format(rf.score(X_test,y_test)))

rf acc on train: 100.00%
rf acc on test: 96.27%


In [None]:
# feature importance
rf_feat_imp=pd.DataFrame(data=rf.feature_importances_.round(3), index=X_dummy.columns, columns=['importance'])
rf_feat_imp.sort_values(by=['importance'], ascending=False)

Unnamed: 0,importance
age,0.344
avg_glucose_level,0.2
bmi,0.169
hypertension,0.033
ever_married_Yes,0.024
ever_married_No,0.023
heart_disease,0.022
smoking_status_formerly smoked,0.02
smoking_status_never smoked,0.019
work_type_Private,0.018


## Comparing Accuracy for all

In [None]:
print('knn acc on train: {:.2%}'.format(knn.score(X_train.values,y_train.values)))
print('knn acc on test: {:.2%}'.format(knn.score(X_test.values,y_test.values)))

print('logreg acc in train: {:.2%}'.format(logreg.score(X_train,y_train)))
print('logreg acc in test: {:.2%}'.format(logreg.score(X_test,y_test)))

print('svm acc on train: {:.2%}'.format(svm.score(X_train,y_train)))
print('svm acc on test: {:.2%}'.format(svm.score(X_test,y_test)))

print('dt acc on train: {:.2%}'.format(dt.score(X_train,y_train)))
print('dt acc on test: {:.2%}'.format(dt.score(X_test,y_test)))

print('rf acc on train: {:.2%}'.format(rf.score(X_train,y_train)))
print('rf acc on test: {:.2%}'.format(rf.score(X_test,y_test)))

knn acc on train: 100.00%
knn acc on test: 93.60%
logreg acc in train: 78.09%
logreg acc in test: 76.27%
svm acc on train: 78.00%
svm acc on test: 76.67%
dt acc on train: 85.42%
dt acc on test: 78.93%
rf acc on train: 100.00%
rf acc on test: 96.27%


### Selecting best models:
    - The RandomForest model is the most accurate on test with a rating of 96.27%
    - The KNN model is close being with an accuracy rate of 93.60%

## Prediction Time
    - Will be conducting predictions for KNN and RandomForest
    - Will use 3 patients from the test data

In [None]:
# randomizing order of test to select 3 patients for prediction data at random
X_test_sample=X_test.sample(frac=1,random_state=0)
X_test_sample.head(3)

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Female,gender_Male,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
520,54,0,0,107.47,29.3,1,0,0,1,0,0,1,0,0,1,0,1,0,0
114,76,1,0,194.37,27.0,0,1,0,1,0,0,1,0,1,0,0,1,0,0
2031,31,0,0,206.59,41.4,1,0,0,1,0,0,1,0,0,1,0,0,0,1


In [None]:
# creating p's
p1=[54,0,0,107.47,29.3,1,0,0,1,0,0,1,0,0,1,0,1,0,0]

p2=[76,1,0,194.37,27.0,0,1,0,1,0,0,1,0,1,0,0,1,0,0]

p3=[31,0,0,206.59,41.4,1,0,0,1,0,0,1,0,0,1,0,0,0,1]

In [None]:
# checking for p1 real result
stroke_bal.loc[[520]].head(1)
#stroke=0

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
520,Female,54.0,0,0,Yes,Self-employed,Urban,107.47,29.3,formerly smoked,0


In [None]:
# checking for p2 real result
stroke_bal.loc[[114]].head(1)
#stroke=1

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
114,Male,76.0,1,0,Yes,Self-employed,Rural,194.37,27.0,formerly smoked,1


In [None]:
# checking for p3 real result
stroke_bal.loc[[2031]]
#stroke=0

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
2031,Female,31.0,0,0,Yes,Self-employed,Urban,206.59,41.4,smokes,0


### KNN prediction

In [None]:
# prediction for p1
knn.predict([p1])
#correct

array(['0'], dtype=object)

In [None]:
# prediction for p2
knn.predict([p2])
#correct

array(['1'], dtype=object)

In [None]:
# prediction for p3
knn.predict([p3])
#correct

array(['0'], dtype=object)

### RandomForest Prediction

In [None]:
# prediction for p1
rf.predict([p1])
#correct



array(['0'], dtype=object)

In [None]:
# prediction for p2
rf.predict([p2])
#correct



array(['1'], dtype=object)

In [None]:
# prediction for p3
rf.predict([p3])
#correct



array(['0'], dtype=object)

    - next predictions are not necessary just comparing

### SVM prediction

In [None]:
# prediction for p1
svm.predict([p1])
#correct



array(['0'], dtype=object)

In [None]:
# prediction for p2
svm.predict([p2])
#correct



array(['1'], dtype=object)

In [None]:
# prediction for p3
svm.predict([p3])
#correct



array(['0'], dtype=object)

### Logistic Regression prediction

In [None]:
logreg.predict([p1])
#correct



array(['0'], dtype=object)

In [None]:
logreg.predict([p2])
#correct



array(['1'], dtype=object)

In [None]:
# prediction for p3
logreg.predict([p3])
#correct



array(['0'], dtype=object)

### Decision Tree prediction

In [None]:
dt.predict([p1])
#incorrect



array(['1'], dtype=object)

In [None]:
dt.predict([p2])
#correct



array(['1'], dtype=object)

In [None]:
# prediction for p3
dt.predict([p3])
#correct



array(['0'], dtype=object)