In [89]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import graphviz
from graphviz import Graph

In [8]:
import acquire

In [11]:
import prep_titanic

In [12]:
from acquire import get_titanic_data
from prep_titanic import prep_titanic_data

In [13]:
df = acquire.get_titanic_data()
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [14]:
df = prep_titanic_data(df)
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,embarked_encode
0,0,0,3,male,22.0,1,0,7.25,S,Third,Southampton,0,3
1,1,1,1,female,38.0,1,0,71.2833,C,First,Cherbourg,0,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,Southampton,1,3
3,3,1,1,female,35.0,1,0,53.1,S,First,Southampton,0,3
4,4,0,3,male,35.0,0,0,8.05,S,Third,Southampton,1,3


In [17]:
def encode_sex(df):
    encoder = LabelEncoder()
    encoder.fit(df.sex)
    return df.assign(sex_encode = encoder.transform(df.sex))
df = encode_sex(df)

In [18]:
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,embarked_encode,sex_encode
0,0,0,3,male,22.0,1,0,7.25,S,Third,Southampton,0,3,1
1,1,1,1,female,38.0,1,0,71.2833,C,First,Cherbourg,0,0,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,Southampton,1,3,0
3,3,1,1,female,35.0,1,0,53.1,S,First,Southampton,0,3,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,Southampton,1,3,1


In [32]:
df_drop_age_na = df.dropna()

In [33]:
df_drop_age_na.isnull().sum()

passenger_id       0
survived           0
pclass             0
sex                0
age                0
sibsp              0
parch              0
fare               0
embarked           0
class              0
embark_town        0
alone              0
embarked_encode    0
sex_encode         0
dtype: int64

In [94]:
X = df_drop_age_na[['pclass', 'age', 'sibsp', 'parch', 'sex_encode']]
y = df_drop_age_na[['survived']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

X_train.head()

Unnamed: 0,pclass,age,sibsp,parch,sex_encode
60,3,22.0,0,0,1
348,3,3.0,1,1,1
606,3,30.0,0,0,1
195,1,58.0,0,0,0
56,2,21.0,0,0,0


In [95]:
logit = LogisticRegression(C=1, class_weight={1:2}, random_state = 123, solver='saga')

In [96]:
logit.fit(X_train, y_train)

LogisticRegression(C=1, class_weight={1: 2}, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=123, solver='saga',
          tol=0.0001, verbose=0, warm_start=False)

In [97]:
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[-0.25329634  0.01860232 -0.04746574  0.4788608  -1.28895344]]
Intercept: 
 [0.77998646]


In [98]:
y_pred = logit.predict(X_train)

In [99]:
y_pred_proba = logit.predict_proba(X_train)

In [100]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.76


In [101]:
print(confusion_matrix(y_train, y_pred))

[[221  72]
 [ 48 158]]


In [112]:
cm = pd.DataFrame(confusion_matrix(y_train, y_pred),
             columns=['Pred -', 'Pred +'], index=['Actual -', 'Actual +'])

cm

Unnamed: 0,Pred -,Pred +
Actual -,221,72
Actual +,48,158


In [42]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.75      0.79       293
           1       0.69      0.77      0.72       206

   micro avg       0.76      0.76      0.76       499
   macro avg       0.75      0.76      0.76       499
weighted avg       0.77      0.76      0.76       499



For the sample that excludes the 177 passengers without an age:

206 passengers actually survived. Of those that did actually survive, this model was able to predict 77% of them accurately based on the recall.

230 passengers were predicted to have survived with this model.
Out of those that were predicted to have survived, our precision for TP's, were 69%.
This would mean that out of predicted positives, 31% were false positives.

293 did not survive.  Of those that did not actually survive, this model was able to predict 75% of them accurately based on the recall.

269 passengers were predicted to not have survived.
Out of those that were predicted to have survived, our precision for TP's, was 82%.
This would mean that out of predicted positives, 18% were false negatives.

In [222]:
X1 = df_drop_age_na[['pclass', 'age', 'parch', 'sex_encode']]
y1 = df_drop_age_na[['survived']]

def analyze_log_reg_model(X_df, y_df, solver_name):
    print('Results using ' + str(solver_name) + ' as the solver.')
    print('-----')
    
    X_df_train, X_df_test, y_df_train, y_df_test = train_test_split(X_df, y_df, test_size = .30, random_state = 123)
    print(X_df_train.head())
    
    logit = LogisticRegression(C=1, class_weight={1:2}, random_state = 123, solver=solver_name)
    logit.fit(X_df_train, y_df_train)
    print('-----')
    
    print('Coefficient: \n', logit.coef_)
    print('Intercept: \n', logit.intercept_)
    print('-----')
    
    y_df_pred = logit.predict(X_df_train)
    y_df_pred_proba = logit.predict_proba(X_df_train)
    print('Accuracy of Logistic Regression classifier on training set: {:.5f}'
     .format(logit.score(X_df_train, y_df_train)))
    print('-----')
    
    print('The results of running the model on the test sample:')
    
    cm = pd.DataFrame(confusion_matrix(y_df_train, y_df_pred),
             columns=['Pred -', 'Pred +'], index=['Actual -', 'Actual +'])
    print(cm)
    print('-----')
    
    print(classification_report(y_df_train, y_df_pred))
    print('-----')
    
    y_df_pred_test = logit.predict(X_df_test)
    y_df_pred_proba_test = logit.predict_proba(X_df_test)
    print('Accuracy of Logistic Regression classifier on test set: {:.5f}'
     .format(logit.score(X_df_test, y_df_test)))
    print('-----')
    
    print('-----')
    print('For the sample that excludes the 177 passengers without an age:')


#     print('{:.0f} passengers actually survived. Of those that did not actually survive, \
#     this model was able to predict {:.2f}% of them accurately based on the recall.'.format((cm.iloc[0][0]) + (cm.iloc[0][1])), #need equation for percentage)

#     230 passengers were predicted to have survived with this model.
#     Out of those that were predicted to have survived, our precision for TP's, was 69%.
#     This would mean that out of predicted positives, 31% were false positives.

#     293 did not survive.  Of those that did not actually survive, this model was able to predict 75% of them accurately based on the recall.

#     269 passengers were predicted to not have survived.
#     Out of those that were predicted to have survived, our precision for TP's, was 82%.
#     This would mean that out of predicted positives, 18% were false negatives.


In [223]:
analyze_log_reg_model(X1, y1, 'saga')

Results using saga as the solver.
-----
     pclass   age  parch  sex_encode
60        3  22.0      0           1
348       3   3.0      1           1
606       3  30.0      0           1
195       1  58.0      0           0
56        2  21.0      0           0
-----
Coefficient: 
 [[-0.26035825  0.01864     0.47079312 -1.29017474]]
Intercept: 
 [0.77681435]
-----
Accuracy of Logistic Regression classifier on training set: 0.75551
-----
The results of running the model on the test sample:
          Pred -  Pred +
Actual -     218      75
Actual +      47     159
-----
              precision    recall  f1-score   support

           0       0.82      0.74      0.78       293
           1       0.68      0.77      0.72       206

   micro avg       0.76      0.76      0.76       499
   macro avg       0.75      0.76      0.75       499
weighted avg       0.76      0.76      0.76       499

-----
Accuracy of Logistic Regression classifier on test set: 0.66512
-----
-----
For the sample t

In [157]:
analyze_log_reg_model(X1, y1, 'liblinear')

Results using liblinear as the solver.
-----
     pclass   age  parch  sex_encode
60        3  22.0      0           1
348       3   3.0      1           1
606       3  30.0      0           1
195       1  58.0      0           0
56        2  21.0      0           0
-----
Coefficient: 
 [[-0.92706373 -0.01779685  0.09747168 -2.29284896]]
Intercept: 
 [4.20419336]
-----
Accuracy of Logistic Regression classifier on training set: 0.78958
-----
The results of running the model on the test sample:
          Pred -  Pred +
Actual -     219      74
Actual +      31     175
-----
              precision    recall  f1-score   support

           0       0.88      0.75      0.81       293
           1       0.70      0.85      0.77       206

   micro avg       0.79      0.79      0.79       499
   macro avg       0.79      0.80      0.79       499
weighted avg       0.80      0.79      0.79       499

-----
Accuracy of Logistic Regression classifier on test set: 0.75814
-----
-----
For the sam

In [158]:
X2 = df_drop_age_na[['pclass', 'age', 'parch', 'sex_encode', 'sibsp']]
y2 = df_drop_age_na[['survived']]

In [159]:
analyze_log_reg_model(X2, y2, 'saga')

Results using saga as the solver.
-----
     pclass   age  parch  sex_encode  sibsp
60        3  22.0      0           1      0
348       3   3.0      1           1      1
606       3  30.0      0           1      0
195       1  58.0      0           0      0
56        2  21.0      0           0      0
-----
Coefficient: 
 [[-0.25329634  0.01860232  0.4788608  -1.28895344 -0.04746574]]
Intercept: 
 [0.77998646]
-----
Accuracy of Logistic Regression classifier on training set: 0.75952
-----
The results of running the model on the test sample:
          Pred -  Pred +
Actual -     221      72
Actual +      48     158
-----
              precision    recall  f1-score   support

           0       0.82      0.75      0.79       293
           1       0.69      0.77      0.72       206

   micro avg       0.76      0.76      0.76       499
   macro avg       0.75      0.76      0.76       499
weighted avg       0.77      0.76      0.76       499

-----
Accuracy of Logistic Regression classi

In [161]:
analyze_log_reg_model(X2, y2, 'liblinear')

Results using liblinear as the solver.
-----
     pclass   age  parch  sex_encode  sibsp
60        3  22.0      0           1      0
348       3   3.0      1           1      1
606       3  30.0      0           1      0
195       1  58.0      0           0      0
56        2  21.0      0           0      0
-----
Coefficient: 
 [[-0.93271629 -0.02030594  0.18904603 -2.29565391 -0.26499086]]
Intercept: 
 [4.3924343]
-----
Accuracy of Logistic Regression classifier on training set: 0.77956
-----
The results of running the model on the test sample:
          Pred -  Pred +
Actual -     222      71
Actual +      39     167
-----
              precision    recall  f1-score   support

           0       0.85      0.76      0.80       293
           1       0.70      0.81      0.75       206

   micro avg       0.78      0.78      0.78       499
   macro avg       0.78      0.78      0.78       499
weighted avg       0.79      0.78      0.78       499

-----
Accuracy of Logistic Regression cl

# Decision Tree

## Iris

In [43]:
from acquire import get_iris_data

In [44]:
from prep_iris import prep_iris_data

In [248]:
df = prep_iris_data(get_iris_data())
df.head()

Unnamed: 0,measurement_id,sepal_length,sepal_width,petal_length,petal_width,species,species_enc
0,1,5.1,3.5,1.4,0.2,setosa,0
1,2,4.9,3.0,1.4,0.2,setosa,0
2,3,4.7,3.2,1.3,0.2,setosa,0
3,4,4.6,3.1,1.5,0.2,setosa,0
4,5,5.0,3.6,1.4,0.2,setosa,0


In [249]:
df.isnull().sum()

measurement_id    0
sepal_length      0
sepal_width       0
petal_length      0
petal_width       0
species           0
species_enc       0
dtype: int64

In [250]:
X3 = df.drop(['measurement_id', 'species', 'species_enc'], axis=1)
y3 = df[['species']]
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size = .30, random_state=123)
X3_train.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
114,5.8,2.8,5.1,2.4
136,6.3,3.4,5.6,2.4
53,5.5,2.3,4.0,1.3
19,5.1,3.8,1.5,0.3
38,4.4,3.0,1.3,0.2


In [251]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=123)

In [252]:
clf.fit(X3_train, y3_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=123,
            splitter='best')

In [253]:
y3_pred = clf.predict(X3_train)
y3_pred[0:5]

array(['virginica', 'virginica', 'versicolor', 'setosa', 'setosa'],
      dtype=object)

In [254]:
y3_pred_proba = clf.predict_proba(X3_train)
y3_pred_proba[0:5]

array([[0.   , 0.   , 1.   ],
       [0.   , 0.   , 1.   ],
       [0.   , 0.975, 0.025],
       [1.   , 0.   , 0.   ],
       [1.   , 0.   , 0.   ]])

In [255]:
print('Accuracy of Decision Tree classifier on training set: {:.5f}'.format(clf.score(X3_train, y3_train)))

Accuracy of Decision Tree classifier on training set: 0.98095


In [256]:
cm = pd.DataFrame(confusion_matrix(y3_train, y3_pred),
             columns=['Pred Virginica', 'Pred Versicolor', 'Predicted Setosa'], index=['Actual Virginica', 'Actual Versicolor', 'Actual Setosa'])
cm

Unnamed: 0,Pred Virginica,Pred Versicolor,Predicted Setosa
Actual Virginica,32,0,0
Actual Versicolor,0,40,0
Actual Setosa,0,2,31


In [257]:
print(classification_report(y3_train, y3_pred))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        32
  versicolor       0.95      1.00      0.98        40
   virginica       1.00      0.94      0.97        33

   micro avg       0.98      0.98      0.98       105
   macro avg       0.98      0.98      0.98       105
weighted avg       0.98      0.98      0.98       105



In [258]:
print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(clf.score(X3_test, y3_test)))

Accuracy of Decision Tree classifier on test set: 0.93


In [90]:
from sklearn.datasets import load_iris

iris = load_iris()
clf = tree.DecisionTreeClassifier()
clf = clf.fit(iris.data, iris.target)

dot_data = tree.export_graphviz(clf, out_file=None)
graph = graphviz.Source(dot_data)

graph.render('iris_decision_tree2', view=True)

'iris_decision_tree2.pdf'

In [180]:
X4 = df.drop(['measurement_id', 'species', 'species_enc'], axis=1)
y4 = df[['species']]

def analyze_decision_tree(X_df, y_df, string_criterion, max_depth_input):
    print('Results using ' + str(string_criterion) + ' as the measure of impurity and ' + str(max_depth_input) + ' as the depth.')
    print('-----')
    X_df_train, X_df_test, y_df_train, y_df_test = train_test_split(X_df, y_df, test_size = .30, random_state=123)
    X_df_train.head()

    clf = DecisionTreeClassifier(criterion=string_criterion, max_depth=max_depth_input, random_state=123)

    clf.fit(X_df_train, y_df_train)

    y_df_pred = clf.predict(X_df_train)
    print('Head of predicted on X_train:')
    print(y_df_pred[0:5])
    print('-----')

    y_df_pred_proba = clf.predict_proba(X_df_train)
    print('Head of probabilities on X_train:')
    print(y_df_pred_proba[0:5])
    print('-----')

    print('Accuracy of Decision Tree classifier on training set: {:.8f}'.format(clf.score(X_df_train, y_df_train)))

In [182]:
analyze_decision_tree(X4, y4, 'entropy', 3)

Results using entropy as the measure of impurity and 3 as the depth.
-----
Head of predicted on X_train:
['virginica' 'virginica' 'versicolor' 'setosa' 'setosa']
-----
Head of probabilities on X_train:
[[0.    0.    1.   ]
 [0.    0.    1.   ]
 [0.    0.975 0.025]
 [1.    0.    0.   ]
 [1.    0.    0.   ]]
-----
Accuracy of Decision Tree classifier on training set: 0.98095238


In [170]:
analyze_decision_tree(X4, y4, 'gini', 3)

Results using gini as the measure of impurity and 3 as the depth.
-----
Head of predicted on X_train:
['virginica' 'virginica' 'versicolor' 'setosa' 'setosa']
-----
Head of probabilities on X_train:
[[0.    0.    1.   ]
 [0.    0.    1.   ]
 [0.    0.975 0.025]
 [1.    0.    0.   ]
 [1.    0.    0.   ]]
-----
Accuracy of Decision Tree classifier on training set: 0.98095238


In [172]:
analyze_decision_tree(X4, y4, 'gini', 4)

Results using gini as the measure of impurity and 4 as the depth.
-----
Head of predicted on X_train:
['virginica' 'virginica' 'versicolor' 'setosa' 'setosa']
-----
Head of probabilities on X_train:
[[0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]
-----
Accuracy of Decision Tree classifier on training set: 0.99047619


In [173]:
analyze_decision_tree(X4, y4, 'entropy', 4)

Results using entropy as the measure of impurity and 4 as the depth.
-----
Head of predicted on X_train:
['virginica' 'virginica' 'versicolor' 'setosa' 'setosa']
-----
Head of probabilities on X_train:
[[0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]
-----
Accuracy of Decision Tree classifier on training set: 0.99047619


In [174]:
analyze_decision_tree(X4, y4, 'entropy', 2)

Results using entropy as the measure of impurity and 2 as the depth.
-----
Head of predicted on X_train:
['virginica' 'virginica' 'versicolor' 'setosa' 'setosa']
-----
Head of probabilities on X_train:
[[0.         0.03225806 0.96774194]
 [0.         0.03225806 0.96774194]
 [0.         0.92857143 0.07142857]
 [1.         0.         0.        ]
 [1.         0.         0.        ]]
-----
Accuracy of Decision Tree classifier on training set: 0.96190476


In [175]:
analyze_decision_tree(X4, y4, 'gini', 2)

Results using gini as the measure of impurity and 2 as the depth.
-----
Head of predicted on X_train:
['virginica' 'virginica' 'versicolor' 'setosa' 'setosa']
-----
Head of probabilities on X_train:
[[0.         0.03225806 0.96774194]
 [0.         0.03225806 0.96774194]
 [0.         0.92857143 0.07142857]
 [1.         0.         0.        ]
 [1.         0.         0.        ]]
-----
Accuracy of Decision Tree classifier on training set: 0.96190476


In [232]:
analyze_decision_tree(X4, y4, 'entropy', 3)

Results using entropy as the measure of impurity and 3 as the depth.
-----
Head of predicted on X_train:
['virginica' 'virginica' 'versicolor' 'setosa' 'setosa']
-----
Head of probabilities on X_train:
[[0.    0.    1.   ]
 [0.    0.    1.   ]
 [0.    0.975 0.025]
 [1.    0.    0.   ]
 [1.    0.    0.   ]]
-----
Accuracy of Decision Tree classifier on training set: 0.98095238


In [176]:
analyze_decision_tree(X4, y4, 'gini', 6)

Results using gini as the measure of impurity and 6 as the depth.
-----
Head of predicted on X_train:
['virginica' 'virginica' 'versicolor' 'setosa' 'setosa']
-----
Head of probabilities on X_train:
[[0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]
-----
Accuracy of Decision Tree classifier on training set: 1.00000000


In [177]:
analyze_decision_tree(X4, y4, 'entropy', 6)

Results using entropy as the measure of impurity and 6 as the depth.
-----
Head of predicted on X_train:
['virginica' 'virginica' 'versicolor' 'setosa' 'setosa']
-----
Head of probabilities on X_train:
[[0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]
-----
Accuracy of Decision Tree classifier on training set: 1.00000000


In [183]:
X5 = df_drop_age_na[['pclass', 'age', 'parch', 'sex_encode']]
y5 = df_drop_age_na[['survived']]

In [211]:
analyze_decision_tree(X5, y5, 'entropy', 5)

Results using entropy as the measure of impurity and 5 as the depth.
-----
Head of predicted on X_train:
[0 0 0 1 1]
-----
Head of probabilities on X_train:
[[0.9        0.1       ]
 [0.61111111 0.38888889]
 [0.9        0.1       ]
 [0.03030303 0.96969697]
 [0.         1.        ]]
-----
Accuracy of Decision Tree classifier on training set: 0.83567134


In [212]:
analyze_decision_tree(X5, y5, 'gini', 5)

Results using gini as the measure of impurity and 5 as the depth.
-----
Head of predicted on X_train:
[0 0 0 1 1]
-----
Head of probabilities on X_train:
[[0.9        0.1       ]
 [0.61111111 0.38888889]
 [0.9        0.1       ]
 [0.         1.        ]
 [0.13333333 0.86666667]]
-----
Accuracy of Decision Tree classifier on training set: 0.84168337


In [226]:
X6 = df_drop_age_na[['pclass', 'age', 'parch', 'sex_encode', 'sibsp']]
y6 = df_drop_age_na[['survived']]

In [227]:
analyze_decision_tree(X6, y6, 'gini', 5)

Results using gini as the measure of impurity and 5 as the depth.
-----
Head of predicted on X_train:
[0 1 0 1 1]
-----
Head of probabilities on X_train:
[[0.9        0.1       ]
 [0.         1.        ]
 [0.9        0.1       ]
 [0.         1.        ]
 [0.13333333 0.86666667]]
-----
Accuracy of Decision Tree classifier on training set: 0.86372745


In [228]:
analyze_decision_tree(X6, y6, 'entropy', 5)

Results using entropy as the measure of impurity and 5 as the depth.
-----
Head of predicted on X_train:
[0 1 0 1 1]
-----
Head of probabilities on X_train:
[[0.88541667 0.11458333]
 [0.         1.        ]
 [0.88541667 0.11458333]
 [0.03030303 0.96969697]
 [0.         1.        ]]
-----
Accuracy of Decision Tree classifier on training set: 0.85971944


In [224]:
def test_classification_model(X_df, y_df, string_criterion, max_depth_input):   
    
    print('Results using ' + str(string_criterion) + ' as the measure of impurity and ' + str(max_depth_input) + ' as the depth.')
    print('-----')
    X_df_train, X_df_test, y_df_train, y_df_test = train_test_split(X_df, y_df, test_size = .30, random_state=123)
    X_df_train.head()

    clf = DecisionTreeClassifier(criterion=string_criterion, max_depth=max_depth_input, random_state=123)

    clf.fit(X_df_train, y_df_train)

    y_df_pred = clf.predict(X_df_train)
    print('Head of predicted on X_train:')
    print(y_df_pred[0:5])
    print('-----')

    y_df_pred_proba = clf.predict_proba(X_df_train)
    print('Head of probabilities on X_train:')
    print(y_df_pred_proba[0:5])
    print('-----')

    print('Accuracy of Decision Tree classifier on training set: {:.8f}'.format(clf.score(X_df_train, y_df_train)))
    print('-----')
    
    
    print('The results of running the model on the test sample:')
    
    cm = pd.DataFrame(confusion_matrix(y_df_train, y_df_pred),
             columns=['Pred -', 'Pred +'], index=['Actual -', 'Actual +'])
    print(cm)
    print('-----')
    
    print(classification_report(y_df_train, y_df_pred))
    print('-----')
    
    y_df_pred_test = clf.predict(X_df_test)
    y_df_pred_proba_test = clf.predict_proba(X_df_test)
    print('Accuracy of Logistic Regression classifier on test set: {:.6f}'
     .format(clf.score(X_df_test, y_df_test)))
    print('-----')

In [230]:
test_classification_model(X5, y5, 'gini', 5)

Results using gini as the measure of impurity and 5 as the depth.
-----
Head of predicted on X_train:
[0 0 0 1 1]
-----
Head of probabilities on X_train:
[[0.9        0.1       ]
 [0.61111111 0.38888889]
 [0.9        0.1       ]
 [0.         1.        ]
 [0.13333333 0.86666667]]
-----
Accuracy of Decision Tree classifier on training set: 0.84168337
-----
The results of running the model on the test sample:
          Pred -  Pred +
Actual -     258      35
Actual +      44     162
-----
              precision    recall  f1-score   support

           0       0.85      0.88      0.87       293
           1       0.82      0.79      0.80       206

   micro avg       0.84      0.84      0.84       499
   macro avg       0.84      0.83      0.84       499
weighted avg       0.84      0.84      0.84       499

-----
Accuracy of Logistic Regression classifier on test set: 0.800000
-----


In [231]:
test_classification_model(X6, y6, 'gini', 6)

Results using gini as the measure of impurity and 6 as the depth.
-----
Head of predicted on X_train:
[0 1 0 1 1]
-----
Head of probabilities on X_train:
[[0.87857143 0.12142857]
 [0.         1.        ]
 [0.87857143 0.12142857]
 [0.         1.        ]
 [0.09090909 0.90909091]]
-----
Accuracy of Decision Tree classifier on training set: 0.86973948
-----
The results of running the model on the test sample:
          Pred -  Pred +
Actual -     275      18
Actual +      47     159
-----
              precision    recall  f1-score   support

           0       0.85      0.94      0.89       293
           1       0.90      0.77      0.83       206

   micro avg       0.87      0.87      0.87       499
   macro avg       0.88      0.86      0.86       499
weighted avg       0.87      0.87      0.87       499

-----
Accuracy of Logistic Regression classifier on test set: 0.827907
-----
