In [8]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree

import graphviz
from graphviz import Graph

In [9]:
import acquire

In [10]:
import prep_titanic

In [11]:
from acquire import get_titanic_data
from prep_titanic import prep_titanic_data

In [12]:
df = acquire.get_titanic_data()
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [13]:
df = prep_titanic_data(df)
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,embarked_encode
0,0,0,3,male,22.0,1,0,7.25,S,Third,Southampton,0,3
1,1,1,1,female,38.0,1,0,71.2833,C,First,Cherbourg,0,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,Southampton,1,3
3,3,1,1,female,35.0,1,0,53.1,S,First,Southampton,0,3
4,4,0,3,male,35.0,0,0,8.05,S,Third,Southampton,1,3


In [14]:
def encode_sex(df):
    encoder = LabelEncoder()
    encoder.fit(df.sex)
    return df.assign(sex_encode = encoder.transform(df.sex))
df = encode_sex(df)

In [15]:
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,embarked_encode,sex_encode
0,0,0,3,male,22.0,1,0,7.25,S,Third,Southampton,0,3,1
1,1,1,1,female,38.0,1,0,71.2833,C,First,Cherbourg,0,0,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,Southampton,1,3,0
3,3,1,1,female,35.0,1,0,53.1,S,First,Southampton,0,3,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,Southampton,1,3,1


In [16]:
df_drop_age_na = df.dropna()

In [17]:
df_drop_age_na.isnull().sum()

passenger_id       0
survived           0
pclass             0
sex                0
age                0
sibsp              0
parch              0
fare               0
embarked           0
class              0
embark_town        0
alone              0
embarked_encode    0
sex_encode         0
dtype: int64

In [18]:
X = df_drop_age_na[['pclass', 'age', 'sibsp', 'parch', 'sex_encode']]
y = df_drop_age_na[['survived']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

X_train.head()

Unnamed: 0,pclass,age,sibsp,parch,sex_encode
60,3,22.0,0,0,1
348,3,3.0,1,1,1
606,3,30.0,0,0,1
195,1,58.0,0,0,0
56,2,21.0,0,0,0


In [19]:
logit = LogisticRegression(C=1, class_weight={1:2}, random_state = 123, solver='saga')

In [20]:
logit.fit(X_train, y_train)

LogisticRegression(C=1, class_weight={1: 2}, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=123, solver='saga',
          tol=0.0001, verbose=0, warm_start=False)

In [21]:
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[-0.25329634  0.01860232 -0.04746574  0.4788608  -1.28895344]]
Intercept: 
 [0.77998646]


In [22]:
y_pred = logit.predict(X_train)

In [23]:
y_pred_proba = logit.predict_proba(X_train)

In [24]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.76


In [25]:
print(confusion_matrix(y_train, y_pred))

[[221  72]
 [ 48 158]]


In [26]:
cm = pd.DataFrame(confusion_matrix(y_train, y_pred),
             columns=['Pred -', 'Pred +'], index=['Actual -', 'Actual +'])

cm

Unnamed: 0,Pred -,Pred +
Actual -,221,72
Actual +,48,158


In [27]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.75      0.79       293
           1       0.69      0.77      0.72       206

   micro avg       0.76      0.76      0.76       499
   macro avg       0.75      0.76      0.76       499
weighted avg       0.77      0.76      0.76       499



For the sample that excludes the 177 passengers without an age:

206 passengers actually survived. Of those that did actually survive, this model was able to predict 77% of them accurately based on the recall.

230 passengers were predicted to have survived with this model.
Out of those that were predicted to have survived, our precision for TP's, were 69%.
This would mean that out of predicted positives, 31% were false positives.

293 did not survive.  Of those that did not actually survive, this model was able to predict 75% of them accurately based on the recall.

269 passengers were predicted to not have survived.
Out of those that were predicted to have survived, our precision for TP's, was 82%.
This would mean that out of predicted positives, 18% were false negatives.

In [28]:
X1 = df_drop_age_na[['pclass', 'age', 'parch', 'sex_encode']]
y1 = df_drop_age_na[['survived']]

def analyze_log_reg_model(X_df, y_df, solver_name):
    print('Results using ' + str(solver_name) + ' as the solver.')
    print('-----')
    
    X_df_train, X_df_test, y_df_train, y_df_test = train_test_split(X_df, y_df, test_size = .30, random_state = 123)
    print(X_df_train.head())
    
    logit = LogisticRegression(C=1, class_weight={1:2}, random_state = 123, solver=solver_name)
    logit.fit(X_df_train, y_df_train)
    print('-----')
    
    print('Coefficient: \n', logit.coef_)
    print('Intercept: \n', logit.intercept_)
    print('-----')
    
    y_df_pred = logit.predict(X_df_train)
    y_df_pred_proba = logit.predict_proba(X_df_train)
    print('Accuracy of Logistic Regression classifier on training set: {:.5f}'
     .format(logit.score(X_df_train, y_df_train)))
    print('-----')
    
    print('The results of running the model on the test sample:')
    
    cm = pd.DataFrame(confusion_matrix(y_df_train, y_df_pred),
             columns=['Pred -', 'Pred +'], index=['Actual -', 'Actual +'])
    print(cm)
    print('-----')
    
    print(classification_report(y_df_train, y_df_pred, digits=4))
    print('-----')
    
    y_df_pred_test = logit.predict(X_df_test)
    y_df_pred_proba_test = logit.predict_proba(X_df_test)
    print('Accuracy of Logistic Regression classifier on test set: {:.5f}'
     .format(logit.score(X_df_test, y_df_test)))
    print('-----')
    
    print('-----')
    print('For the sample that excludes the 177 passengers without an age:')


#     print('{:.0f} passengers actually survived. Of those that did not actually survive, \
#     this model was able to predict {:.2f}% of them accurately based on the recall.'.format((cm.iloc[0][0]) + (cm.iloc[0][1])), #need equation for percentage)

#     230 passengers were predicted to have survived with this model.
#     Out of those that were predicted to have survived, our precision for TP's, was 69%.
#     This would mean that out of predicted positives, 31% were false positives.

#     293 did not survive.  Of those that did not actually survive, this model was able to predict 75% of them accurately based on the recall.

#     269 passengers were predicted to not have survived.
#     Out of those that were predicted to have survived, our precision for TP's, was 82%.
#     This would mean that out of predicted positives, 18% were false negatives.


In [359]:
analyze_log_reg_model(X1, y1, 'saga')

Results using saga as the solver.
-----
     pclass   age  parch  sex_encode
60        3  22.0      0           1
348       3   3.0      1           1
606       3  30.0      0           1
195       1  58.0      0           0
56        2  21.0      0           0
-----
Coefficient: 
 [[-0.26035825  0.01864     0.47079312 -1.29017474]]
Intercept: 
 [0.77681435]
-----
Accuracy of Logistic Regression classifier on training set: 0.75551
-----
The results of running the model on the test sample:
          Pred -  Pred +
Actual -     218      75
Actual +      47     159
-----
              precision    recall  f1-score   support

           0     0.8226    0.7440    0.7814       293
           1     0.6795    0.7718    0.7227       206

   micro avg     0.7555    0.7555    0.7555       499
   macro avg     0.7511    0.7579    0.7520       499
weighted avg     0.7635    0.7555    0.7572       499

-----
Accuracy of Logistic Regression classifier on test set: 0.66512
-----
-----
For the sample t

In [360]:
analyze_log_reg_model(X1, y1, 'liblinear')

Results using liblinear as the solver.
-----
     pclass   age  parch  sex_encode
60        3  22.0      0           1
348       3   3.0      1           1
606       3  30.0      0           1
195       1  58.0      0           0
56        2  21.0      0           0
-----
Coefficient: 
 [[-0.92706373 -0.01779685  0.09747168 -2.29284896]]
Intercept: 
 [4.20419336]
-----
Accuracy of Logistic Regression classifier on training set: 0.78958
-----
The results of running the model on the test sample:
          Pred -  Pred +
Actual -     219      74
Actual +      31     175
-----
              precision    recall  f1-score   support

           0     0.8760    0.7474    0.8066       293
           1     0.7028    0.8495    0.7692       206

   micro avg     0.7896    0.7896    0.7896       499
   macro avg     0.7894    0.7985    0.7879       499
weighted avg     0.8045    0.7896    0.7912       499

-----
Accuracy of Logistic Regression classifier on test set: 0.75814
-----
-----
For the sam

In [361]:
X2 = df_drop_age_na[['pclass', 'age', 'parch', 'sex_encode', 'sibsp']]
y2 = df_drop_age_na[['survived']]

In [362]:
analyze_log_reg_model(X2, y2, 'saga')

Results using saga as the solver.
-----
     pclass   age  parch  sex_encode  sibsp
60        3  22.0      0           1      0
348       3   3.0      1           1      1
606       3  30.0      0           1      0
195       1  58.0      0           0      0
56        2  21.0      0           0      0
-----
Coefficient: 
 [[-0.25329634  0.01860232  0.4788608  -1.28895344 -0.04746574]]
Intercept: 
 [0.77998646]
-----
Accuracy of Logistic Regression classifier on training set: 0.75952
-----
The results of running the model on the test sample:
          Pred -  Pred +
Actual -     221      72
Actual +      48     158
-----
              precision    recall  f1-score   support

           0     0.8216    0.7543    0.7865       293
           1     0.6870    0.7670    0.7248       206

   micro avg     0.7595    0.7595    0.7595       499
   macro avg     0.7543    0.7606    0.7556       499
weighted avg     0.7660    0.7595    0.7610       499

-----
Accuracy of Logistic Regression classi

In [363]:
analyze_log_reg_model(X2, y2, 'liblinear')

Results using liblinear as the solver.
-----
     pclass   age  parch  sex_encode  sibsp
60        3  22.0      0           1      0
348       3   3.0      1           1      1
606       3  30.0      0           1      0
195       1  58.0      0           0      0
56        2  21.0      0           0      0
-----
Coefficient: 
 [[-0.93271629 -0.02030594  0.18904603 -2.29565391 -0.26499086]]
Intercept: 
 [4.3924343]
-----
Accuracy of Logistic Regression classifier on training set: 0.77956
-----
The results of running the model on the test sample:
          Pred -  Pred +
Actual -     222      71
Actual +      39     167
-----
              precision    recall  f1-score   support

           0     0.8506    0.7577    0.8014       293
           1     0.7017    0.8107    0.7523       206

   micro avg     0.7796    0.7796    0.7796       499
   macro avg     0.7761    0.7842    0.7768       499
weighted avg     0.7891    0.7796    0.7811       499

-----
Accuracy of Logistic Regression cl

In [29]:
X1_df_train, X1_df_test, y1_df_train, y1_df_test = train_test_split(X1, y1, test_size = .30, random_state = 123)
print(X1_df_train.head())

     pclass   age  parch  sex_encode
60        3  22.0      0           1
348       3   3.0      1           1
606       3  30.0      0           1
195       1  58.0      0           0
56        2  21.0      0           0


In [32]:
logit_fit = LogisticRegression(C=1, class_weight={1:2}, random_state = 123, solver='liblinear')
logit_fit.fit(X1_df_train, y1_df_train)

LogisticRegression(C=1, class_weight={1: 2}, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=123, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

# Decision Tree

## Iris

In [33]:
from acquire import get_iris_data

In [34]:
from prep_iris import prep_iris_data

In [35]:
df = prep_iris_data(get_iris_data())
df.head()

Unnamed: 0,measurement_id,sepal_length,sepal_width,petal_length,petal_width,species,species_enc
0,1,5.1,3.5,1.4,0.2,setosa,0
1,2,4.9,3.0,1.4,0.2,setosa,0
2,3,4.7,3.2,1.3,0.2,setosa,0
3,4,4.6,3.1,1.5,0.2,setosa,0
4,5,5.0,3.6,1.4,0.2,setosa,0


In [36]:
df.isnull().sum()

measurement_id    0
sepal_length      0
sepal_width       0
petal_length      0
petal_width       0
species           0
species_enc       0
dtype: int64

In [37]:
X3 = df.drop(['measurement_id', 'species', 'species_enc'], axis=1)
y3 = df[['species']]
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size = .30, random_state=123)
X3_train.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
114,5.8,2.8,5.1,2.4
136,6.3,3.4,5.6,2.4
53,5.5,2.3,4.0,1.3
19,5.1,3.8,1.5,0.3
38,4.4,3.0,1.3,0.2


In [38]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=123)

In [252]:
clf.fit(X3_train, y3_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=123,
            splitter='best')

In [253]:
y3_pred = clf.predict(X3_train)
y3_pred[0:5]

array(['virginica', 'virginica', 'versicolor', 'setosa', 'setosa'],
      dtype=object)

In [254]:
y3_pred_proba = clf.predict_proba(X3_train)
y3_pred_proba[0:5]

array([[0.   , 0.   , 1.   ],
       [0.   , 0.   , 1.   ],
       [0.   , 0.975, 0.025],
       [1.   , 0.   , 0.   ],
       [1.   , 0.   , 0.   ]])

In [255]:
print('Accuracy of Decision Tree classifier on training set: {:.5f}'.format(clf.score(X3_train, y3_train)))

Accuracy of Decision Tree classifier on training set: 0.98095


In [277]:
y3_train.head()

Unnamed: 0,species
114,virginica
136,virginica
53,versicolor
19,setosa
38,setosa


In [289]:
y3_train['species'].value_counts()


versicolor    40
virginica     33
setosa        32
Name: species, dtype: int64

In [290]:
cm = pd.DataFrame(confusion_matrix(y3_train, y3_pred),
             columns=['Pred Setosa', 'Pred Versicolor', 'Predicted Virginica'], index=['Actual Setosa', 'Actual Versicolor', 'Actual Virginica'])
cm

Unnamed: 0,Pred Setosa,Pred Versicolor,Predicted Virginica
Actual Setosa,32,0,0
Actual Versicolor,0,40,0
Actual Virginica,0,2,31


In [257]:
print(classification_report(y3_train, y3_pred))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        32
  versicolor       0.95      1.00      0.98        40
   virginica       1.00      0.94      0.97        33

   micro avg       0.98      0.98      0.98       105
   macro avg       0.98      0.98      0.98       105
weighted avg       0.98      0.98      0.98       105



In [258]:
print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(clf.score(X3_test, y3_test)))

Accuracy of Decision Tree classifier on test set: 0.93


In [90]:
from sklearn.datasets import load_iris

iris = load_iris()
clf = tree.DecisionTreeClassifier()
clf = clf.fit(iris.data, iris.target)

dot_data = tree.export_graphviz(clf, out_file=None)
graph = graphviz.Source(dot_data)

graph.render('iris_decision_tree2', view=True)

'iris_decision_tree2.pdf'

In [42]:
X4 = df.drop(['measurement_id', 'species', 'species_enc'], axis=1)
y4 = df[['species']]

def analyze_decision_tree(X_df, y_df, string_criterion, max_depth_input):
    features = list(X_df)
    
    print('Results using ' + str(string_criterion) + ' as the measure of impurity and ' + str(max_depth_input) + ' as the depth.')
    print('The features being used: ' + str(features))
    print('-----')
    X_df_train, X_df_test, y_df_train, y_df_test = train_test_split(X_df, y_df, test_size = .30, random_state=123)
    X_df_train.head()

    clf = DecisionTreeClassifier(criterion=string_criterion, max_depth=max_depth_input, random_state=123)

    clf.fit(X_df_train, y_df_train)

    y_df_pred = clf.predict(X_df_train)
    print('Head of predicted on X_train:')
    print(y_df_pred[0:5])
    print('-----')

    y_df_pred_proba = clf.predict_proba(X_df_train)
    print('Head of probabilities on X_train:')
    print(y_df_pred_proba[0:5])
    print('-----')

    print('Accuracy of Decision Tree classifier on training set: {:.8f}'.format(clf.score(X_df_train, y_df_train)))

In [267]:
analyze_decision_tree(X4, y4, 'entropy', 3)

Results using entropy as the measure of impurity and 3 as the depth.
The features being used: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
-----
Head of predicted on X_train:
['virginica' 'virginica' 'versicolor' 'setosa' 'setosa']
-----
Head of probabilities on X_train:
[[0.    0.    1.   ]
 [0.    0.    1.   ]
 [0.    0.975 0.025]
 [1.    0.    0.   ]
 [1.    0.    0.   ]]
-----
Accuracy of Decision Tree classifier on training set: 0.98095238


In [291]:
analyze_decision_tree(X4, y4, 'gini', 3)

Results using gini as the measure of impurity and 3 as the depth.
The features being used: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
-----
Head of predicted on X_train:
['virginica' 'virginica' 'versicolor' 'setosa' 'setosa']
-----
Head of probabilities on X_train:
[[0.    0.    1.   ]
 [0.    0.    1.   ]
 [0.    0.975 0.025]
 [1.    0.    0.   ]
 [1.    0.    0.   ]]
-----
Accuracy of Decision Tree classifier on training set: 0.98095238


In [292]:
analyze_decision_tree(X4, y4, 'gini', 4)

Results using gini as the measure of impurity and 4 as the depth.
The features being used: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
-----
Head of predicted on X_train:
['virginica' 'virginica' 'versicolor' 'setosa' 'setosa']
-----
Head of probabilities on X_train:
[[0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]
-----
Accuracy of Decision Tree classifier on training set: 0.99047619


In [293]:
analyze_decision_tree(X4, y4, 'entropy', 4)

Results using entropy as the measure of impurity and 4 as the depth.
The features being used: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
-----
Head of predicted on X_train:
['virginica' 'virginica' 'versicolor' 'setosa' 'setosa']
-----
Head of probabilities on X_train:
[[0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]
-----
Accuracy of Decision Tree classifier on training set: 0.99047619


In [294]:
analyze_decision_tree(X4, y4, 'entropy', 2)

Results using entropy as the measure of impurity and 2 as the depth.
The features being used: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
-----
Head of predicted on X_train:
['virginica' 'virginica' 'versicolor' 'setosa' 'setosa']
-----
Head of probabilities on X_train:
[[0.         0.03225806 0.96774194]
 [0.         0.03225806 0.96774194]
 [0.         0.92857143 0.07142857]
 [1.         0.         0.        ]
 [1.         0.         0.        ]]
-----
Accuracy of Decision Tree classifier on training set: 0.96190476


In [295]:
analyze_decision_tree(X4, y4, 'gini', 2)

Results using gini as the measure of impurity and 2 as the depth.
The features being used: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
-----
Head of predicted on X_train:
['virginica' 'virginica' 'versicolor' 'setosa' 'setosa']
-----
Head of probabilities on X_train:
[[0.         0.03225806 0.96774194]
 [0.         0.03225806 0.96774194]
 [0.         0.92857143 0.07142857]
 [1.         0.         0.        ]
 [1.         0.         0.        ]]
-----
Accuracy of Decision Tree classifier on training set: 0.96190476


In [296]:
analyze_decision_tree(X4, y4, 'entropy', 3)

Results using entropy as the measure of impurity and 3 as the depth.
The features being used: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
-----
Head of predicted on X_train:
['virginica' 'virginica' 'versicolor' 'setosa' 'setosa']
-----
Head of probabilities on X_train:
[[0.    0.    1.   ]
 [0.    0.    1.   ]
 [0.    0.975 0.025]
 [1.    0.    0.   ]
 [1.    0.    0.   ]]
-----
Accuracy of Decision Tree classifier on training set: 0.98095238


In [297]:
analyze_decision_tree(X4, y4, 'gini', 6)

Results using gini as the measure of impurity and 6 as the depth.
The features being used: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
-----
Head of predicted on X_train:
['virginica' 'virginica' 'versicolor' 'setosa' 'setosa']
-----
Head of probabilities on X_train:
[[0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]
-----
Accuracy of Decision Tree classifier on training set: 1.00000000


In [40]:
analyze_decision_tree(X4, y4, 'entropy', 6)

Results using entropy as the measure of impurity and 6 as the depth.
The features being used: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
-----
Head of predicted on X_train:
['virginica' 'virginica' 'versicolor' 'setosa' 'setosa']
-----
Head of probabilities on X_train:
[[0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]
-----
Accuracy of Decision Tree classifier on training set: 1.00000000


In [43]:
X4_df_train, X4_df_test, y4_df_train, y4_df_test = train_test_split(X4, y4, test_size = .30, random_state=123)
X4_df_train.head()

tree_fit = DecisionTreeClassifier(criterion='entropy', max_depth=6, random_state=123)

tree_fit.fit(X4_df_train, y4_df_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=6,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=123,
            splitter='best')

## Titanic

In [299]:
X5 = df_drop_age_na[['pclass', 'age', 'parch', 'sex_encode']]
y5 = df_drop_age_na[['survived']]

In [300]:
analyze_decision_tree(X5, y5, 'entropy', 5)

Results using entropy as the measure of impurity and 5 as the depth.
The features being used: ['pclass', 'age', 'parch', 'sex_encode']
-----
Head of predicted on X_train:
[0 0 0 1 1]
-----
Head of probabilities on X_train:
[[0.9        0.1       ]
 [0.61111111 0.38888889]
 [0.9        0.1       ]
 [0.03030303 0.96969697]
 [0.         1.        ]]
-----
Accuracy of Decision Tree classifier on training set: 0.83567134


In [301]:
analyze_decision_tree(X5, y5, 'gini', 5)

Results using gini as the measure of impurity and 5 as the depth.
The features being used: ['pclass', 'age', 'parch', 'sex_encode']
-----
Head of predicted on X_train:
[0 0 0 1 1]
-----
Head of probabilities on X_train:
[[0.9        0.1       ]
 [0.61111111 0.38888889]
 [0.9        0.1       ]
 [0.         1.        ]
 [0.13333333 0.86666667]]
-----
Accuracy of Decision Tree classifier on training set: 0.84168337


In [302]:
X6 = df_drop_age_na[['pclass', 'age', 'parch', 'sex_encode', 'sibsp']]
y6 = df_drop_age_na[['survived']]

In [303]:
analyze_decision_tree(X6, y6, 'gini', 5)

Results using gini as the measure of impurity and 5 as the depth.
The features being used: ['pclass', 'age', 'parch', 'sex_encode', 'sibsp']
-----
Head of predicted on X_train:
[0 1 0 1 1]
-----
Head of probabilities on X_train:
[[0.9        0.1       ]
 [0.         1.        ]
 [0.9        0.1       ]
 [0.         1.        ]
 [0.13333333 0.86666667]]
-----
Accuracy of Decision Tree classifier on training set: 0.86372745


In [304]:
analyze_decision_tree(X6, y6, 'entropy', 5)

Results using entropy as the measure of impurity and 5 as the depth.
The features being used: ['pclass', 'age', 'parch', 'sex_encode', 'sibsp']
-----
Head of predicted on X_train:
[0 1 0 1 1]
-----
Head of probabilities on X_train:
[[0.88541667 0.11458333]
 [0.         1.        ]
 [0.88541667 0.11458333]
 [0.03030303 0.96969697]
 [0.         1.        ]]
-----
Accuracy of Decision Tree classifier on training set: 0.85971944


In [364]:
def test_classification_model(X_df, y_df, string_criterion, max_depth_input):   
    
    print('Results using ' + str(string_criterion) + ' as the measure of impurity and ' + str(max_depth_input) + ' as the depth.')
    print('-----')
    X_df_train, X_df_test, y_df_train, y_df_test = train_test_split(X_df, y_df, test_size = .30, random_state=123)
    X_df_train.head()

    clf = DecisionTreeClassifier(criterion=string_criterion, max_depth=max_depth_input, random_state=123)

    clf.fit(X_df_train, y_df_train)

    y_df_pred = clf.predict(X_df_train)
    print('Head of predicted on X_train:')
    print(y_df_pred[0:5])
    print('-----')

    y_df_pred_proba = clf.predict_proba(X_df_train)
    print('Head of probabilities on X_train:')
    print(y_df_pred_proba[0:5])
    print('-----')

    print('Accuracy of Decision Tree classifier on training set: {:.8f}'.format(clf.score(X_df_train, y_df_train)))
    print('-----')
    
    
    print('The results of running the model on the test sample:')
    
    cm = pd.DataFrame(confusion_matrix(y_df_train, y_df_pred),
             columns=['Pred -', 'Pred +'], index=['Actual -', 'Actual +'])
    print(cm)
    print('-----')
    
    print(classification_report(y_df_train, y_df_pred, digits=4))
    print('-----')
    
    y_df_pred_test = clf.predict(X_df_test)
    y_df_pred_proba_test = clf.predict_proba(X_df_test)
    print('Accuracy of Logistic Regression classifier on test set: {:.6f}'
     .format(clf.score(X_df_test, y_df_test)))
    print('-----')

In [365]:
test_classification_model(X5, y5, 'gini', 5)

Results using gini as the measure of impurity and 5 as the depth.
-----
Head of predicted on X_train:
[0 0 0 1 1]
-----
Head of probabilities on X_train:
[[0.9        0.1       ]
 [0.61111111 0.38888889]
 [0.9        0.1       ]
 [0.         1.        ]
 [0.13333333 0.86666667]]
-----
Accuracy of Decision Tree classifier on training set: 0.84168337
-----
The results of running the model on the test sample:
          Pred -  Pred +
Actual -     258      35
Actual +      44     162
-----
              precision    recall  f1-score   support

           0     0.8543    0.8805    0.8672       293
           1     0.8223    0.7864    0.8040       206

   micro avg     0.8417    0.8417    0.8417       499
   macro avg     0.8383    0.8335    0.8356       499
weighted avg     0.8411    0.8417    0.8411       499

-----
Accuracy of Logistic Regression classifier on test set: 0.800000
-----


In [366]:
test_classification_model(X6, y6, 'gini', 6)

Results using gini as the measure of impurity and 6 as the depth.
-----
Head of predicted on X_train:
[0 1 0 1 1]
-----
Head of probabilities on X_train:
[[0.87857143 0.12142857]
 [0.         1.        ]
 [0.87857143 0.12142857]
 [0.         1.        ]
 [0.09090909 0.90909091]]
-----
Accuracy of Decision Tree classifier on training set: 0.86973948
-----
The results of running the model on the test sample:
          Pred -  Pred +
Actual -     275      18
Actual +      47     159
-----
              precision    recall  f1-score   support

           0     0.8540    0.9386    0.8943       293
           1     0.8983    0.7718    0.8303       206

   micro avg     0.8697    0.8697    0.8697       499
   macro avg     0.8762    0.8552    0.8623       499
weighted avg     0.8723    0.8697    0.8679       499

-----
Accuracy of Logistic Regression classifier on test set: 0.827907
-----


## KNN

## Iris

In [44]:
def analyze_knn_iris(X_df, y_df, n_neighbor, weight):
    features = list(X_df)
    
    print('Results using ' + str(weight) + ' as the measure of impurity and ' + str(n_neighbor) + ' as the number of neighbors.')
    print('The features being used: ' + str(features))
    print('-----')
    X_df_train, X_df_test, y_df_train, y_df_test = train_test_split(X_df, y_df, test_size = .30, random_state=123)
    X_df_train.head()

    knn = KNeighborsClassifier(n_neighbors=n_neighbor, weights=weight)

    knn.fit(X_df_train, y_df_train)

    y_df_pred = knn.predict(X_df_train)
    print('Head of predicted on X_train:')
    print(y_df_pred[0:5])
    print('-----')

    y_df_pred_proba = knn.predict_proba(X_df_train)
    print('Head of probabilities on X_train:')
    print(y_df_pred_proba[0:5])
    print('-----')

    print('Accuracy of KNN classifier on training set: {:.8f}'.format(knn.score(X_df_train, y_df_train)))
    print('-----')
    
    cm = pd.DataFrame(confusion_matrix(y_df_train, y_df_pred),
             columns=['Pred Setosa', 'Pred Versicolor', 'Predicted Virginica'], index=['Actual Setosa', 'Actual Versicolor', 'Actual Virginica'])
    
    print(cm)
    print('-----')
    
    print(classification_report(y_df_train, y_df_pred, digits=4))
    print('-----')
    
    y_df_pred_test = knn.predict(X_df_test)
    y_df_pred_proba_test = knn.predict_proba(X_df_test)
    print('Accuracy of KNN classifier on train set: {:.6f}'
     .format(knn.score(X_df_train, y_df_train)))
    print('-----')
    

In [45]:
df.head()

Unnamed: 0,measurement_id,sepal_length,sepal_width,petal_length,petal_width,species,species_enc
0,1,5.1,3.5,1.4,0.2,setosa,0
1,2,4.9,3.0,1.4,0.2,setosa,0
2,3,4.7,3.2,1.3,0.2,setosa,0
3,4,4.6,3.1,1.5,0.2,setosa,0
4,5,5.0,3.6,1.4,0.2,setosa,0


In [47]:
X_iris_KNN_1 = df.drop(['measurement_id', 'species', 'species_enc'], axis=1)
y_iris_KNN_1 = df[['species']]

In [371]:
analyze_knn_iris(X_iris_KNN_1, y_iris_KNN_1, 5, 'uniform')

Results using uniform as the measure of impurity and 5 as the number of neighbors.
The features being used: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
-----
Head of predicted on X_train:
['virginica' 'virginica' 'versicolor' 'setosa' 'setosa']
-----
Head of probabilities on X_train:
[[0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]
-----
Accuracy of KNN classifier on training set: 0.98095238
-----
                   Pred Setosa  Pred Versicolor  Predicted Virginica
Actual Setosa               32                0                    0
Actual Versicolor            0               39                    1
Actual Virginica             0                1                   32
-----
              precision    recall  f1-score   support

      setosa     1.0000    1.0000    1.0000        32
  versicolor     0.9750    0.9750    0.9750        40
   virginica     0.9697    0.9697    0.9697        33

   micro avg     0.9810    0.9810    0.9810       105
   macro avg  

### Summary
The accuracy for the model was 0.980952.  
Precision for Versicolor indicates that of the 40 times this model predicted Versicolor, it was correct on 39 instances, 97.5%.
Precision for Virginica indicates that of the 33 times this model predicted Versicolor, it was correct on 32 instances, 96.97%.
Recall for Versicolor indicates that out of the actual Versicolor (40) the model predicted 39 correctly, 97.5%
Recall for Virginica indicates that out of the actual Virginica (33) the model predicted 32 correctly, 96.97%
F1-score is the average of the precision and recall for each actual value.
Support indicates that there are a total of 32 Setosa, 40 Versicolor, and Virginica actuals.

In [48]:
analyze_knn_iris(X_iris_KNN_1, y_iris_KNN_1, 5, 'distance')

Results using distance as the measure of impurity and 5 as the number of neighbors.
The features being used: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
-----
Head of predicted on X_train:
['virginica' 'virginica' 'versicolor' 'setosa' 'setosa']
-----
Head of probabilities on X_train:
[[0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]
-----
Accuracy of KNN classifier on training set: 1.00000000
-----
                   Pred Setosa  Pred Versicolor  Predicted Virginica
Actual Setosa               32                0                    0
Actual Versicolor            0               40                    0
Actual Virginica             0                0                   33
-----
              precision    recall  f1-score   support

      setosa     1.0000    1.0000    1.0000        32
  versicolor     1.0000    1.0000    1.0000        40
   virginica     1.0000    1.0000    1.0000        33

   micro avg     1.0000    1.0000    1.0000       105
   macro avg 

In [373]:
analyze_knn_iris(X_iris_KNN_1, y_iris_KNN_1, 10, 'uniform')

Results using uniform as the measure of impurity and 10 as the number of neighbors.
The features being used: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
-----
Head of predicted on X_train:
['virginica' 'virginica' 'versicolor' 'setosa' 'setosa']
-----
Head of probabilities on X_train:
[[0.  0.1 0.9]
 [0.  0.  1. ]
 [0.  1.  0. ]
 [1.  0.  0. ]
 [1.  0.  0. ]]
-----
Accuracy of KNN classifier on training set: 0.97142857
-----
                   Pred Setosa  Pred Versicolor  Predicted Virginica
Actual Setosa               32                0                    0
Actual Versicolor            0               39                    1
Actual Virginica             0                2                   31
-----
              precision    recall  f1-score   support

      setosa     1.0000    1.0000    1.0000        32
  versicolor     0.9512    0.9750    0.9630        40
   virginica     0.9688    0.9394    0.9538        33

   micro avg     0.9714    0.9714    0.9714       10

In [374]:
analyze_knn_iris(X_iris_KNN_1, y_iris_KNN_1, 20, 'uniform')

Results using uniform as the measure of impurity and 20 as the number of neighbors.
The features being used: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
-----
Head of predicted on X_train:
['virginica' 'virginica' 'versicolor' 'setosa' 'setosa']
-----
Head of probabilities on X_train:
[[0.   0.15 0.85]
 [0.   0.05 0.95]
 [0.   0.95 0.05]
 [1.   0.   0.  ]
 [1.   0.   0.  ]]
-----
Accuracy of KNN classifier on training set: 0.96190476
-----
                   Pred Setosa  Pred Versicolor  Predicted Virginica
Actual Setosa               32                0                    0
Actual Versicolor            0               39                    1
Actual Virginica             0                3                   30
-----
              precision    recall  f1-score   support

      setosa     1.0000    1.0000    1.0000        32
  versicolor     0.9286    0.9750    0.9512        40
   virginica     0.9677    0.9091    0.9375        33

   micro avg     0.9619    0.9619    

### The best KNN model out of the 4 above was the model using 'Distance' as weights and setting K=5.  As K was increased, the model seemed to prone to being overfit and less acurate.

In [51]:
X_iris_KNN_train, X_iris_KNN_test, y_iris_KNN_train, y_iris_KNN_test = train_test_split(X_iris_KNN_1, y_iris_KNN_1, test_size = .30, random_state=123)
X_iris_KNN_train.head()

knn_fit = KNeighborsClassifier(n_neighbors=5, weights='distance')

knn_fit.fit(X_iris_KNN_train, y_iris_KNN_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='distance')

## Titanic

In [385]:
def analyze_knn_binomial(X_df, y_df, n_neighbor, weight):
    features = list(X_df)
    
    print('Results using ' + str(weight) + ' as the measure of impurity and ' + str(n_neighbor) + ' as the number of neighbors.')
    print('The features being used: ' + str(features))
    print('-----')
    X_df_train, X_df_test, y_df_train, y_df_test = train_test_split(X_df, y_df, test_size = .30, random_state=123)
    X_df_train.head()

    knn = KNeighborsClassifier(n_neighbors=n_neighbor, weights=weight)

    knn.fit(X_df_train, y_df_train)

    y_df_pred = knn.predict(X_df_train)
    print('Head of predicted on X_train:')
    print(y_df_pred[0:5])
    print('-----')

    y_df_pred_proba = knn.predict_proba(X_df_train)
    print('Head of probabilities on X_train:')
    print(y_df_pred_proba[0:5])
    print('-----')

    print('Accuracy of KNN classifier on training set: {:.8f}'.format(knn.score(X_df_train, y_df_train)))
    print('-----')
    
    cm = pd.DataFrame(confusion_matrix(y_df_train, y_df_pred),
             columns=['Pred -', 'Pred +'], index=['Actual -', 'Actual +'])
    
    print(cm)
    print('-----')
    
    print(classification_report(y_df_train, y_df_pred, digits=4))
    print('-----')
    
    y_df_pred_test = knn.predict(X_df_test)
    y_df_pred_proba_test = knn.predict_proba(X_df_test)
    print('Accuracy of KNN classifier on train set: {:.6f}'
     .format(knn.score(X_df_train, y_df_train)))
    print('-----')
    

In [53]:
X_titanic_KNN_1 = df_drop_age_na[['pclass', 'age', 'sibsp', 'parch', 'sex_encode']]
y_titanic_KNN_1 = df_drop_age_na[['survived']]

In [386]:
analyze_knn_binomial(X_titanic_KNN_1, y_titanic_KNN_1, 5, 'uniform')

Results using uniform as the measure of impurity and 5 as the number of neighbors.
The features being used: ['pclass', 'age', 'sibsp', 'parch', 'sex_encode']
-----
Head of predicted on X_train:
[0 1 0 1 1]
-----
Head of probabilities on X_train:
[[1.  0. ]
 [0.2 0.8]
 [1.  0. ]
 [0.2 0.8]
 [0.4 0.6]]
-----
Accuracy of KNN classifier on training set: 0.84368737
-----
          Pred -  Pred +
Actual -     267      26
Actual +      52     154
-----
              precision    recall  f1-score   support

           0     0.8370    0.9113    0.8725       293
           1     0.8556    0.7476    0.7979       206

   micro avg     0.8437    0.8437    0.8437       499
   macro avg     0.8463    0.8294    0.8352       499
weighted avg     0.8447    0.8437    0.8417       499

-----
Accuracy of KNN classifier on train set: 0.843687
-----


In [387]:
analyze_knn_binomial(X_titanic_KNN_1, y_titanic_KNN_1, 5, 'distance')

Results using distance as the measure of impurity and 5 as the number of neighbors.
The features being used: ['pclass', 'age', 'sibsp', 'parch', 'sex_encode']
-----
Head of predicted on X_train:
[0 1 0 1 1]
-----
Head of probabilities on X_train:
[[1. 0.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]]
-----
Accuracy of KNN classifier on training set: 0.94589178
-----
          Pred -  Pred +
Actual -     290       3
Actual +      24     182
-----
              precision    recall  f1-score   support

           0     0.9236    0.9898    0.9555       293
           1     0.9838    0.8835    0.9309       206

   micro avg     0.9459    0.9459    0.9459       499
   macro avg     0.9537    0.9366    0.9432       499
weighted avg     0.9484    0.9459    0.9454       499

-----
Accuracy of KNN classifier on train set: 0.945892
-----


In [390]:
analyze_knn_binomial(X_titanic_KNN_1, y_titanic_KNN_1, 10, 'uniform')

Results using uniform as the measure of impurity and 10 as the number of neighbors.
The features being used: ['pclass', 'age', 'sibsp', 'parch', 'sex_encode']
-----
Head of predicted on X_train:
[0 1 0 0 0]
-----
Head of probabilities on X_train:
[[0.9 0.1]
 [0.2 0.8]
 [0.9 0.1]
 [0.5 0.5]
 [0.7 0.3]]
-----
Accuracy of KNN classifier on training set: 0.78356713
-----
          Pred -  Pred +
Actual -     280      13
Actual +      95     111
-----
              precision    recall  f1-score   support

           0     0.7467    0.9556    0.8383       293
           1     0.8952    0.5388    0.6727       206

   micro avg     0.7836    0.7836    0.7836       499
   macro avg     0.8209    0.7472    0.7555       499
weighted avg     0.8080    0.7836    0.7700       499

-----
Accuracy of KNN classifier on train set: 0.783567
-----


In [391]:
analyze_knn_binomial(X_titanic_KNN_1, y_titanic_KNN_1, 20, 'uniform')

Results using uniform as the measure of impurity and 20 as the number of neighbors.
The features being used: ['pclass', 'age', 'sibsp', 'parch', 'sex_encode']
-----
Head of predicted on X_train:
[0 1 0 0 0]
-----
Head of probabilities on X_train:
[[0.85 0.15]
 [0.2  0.8 ]
 [0.8  0.2 ]
 [0.55 0.45]
 [0.75 0.25]]
-----
Accuracy of KNN classifier on training set: 0.72745491
-----
          Pred -  Pred +
Actual -     271      22
Actual +     114      92
-----
              precision    recall  f1-score   support

           0     0.7039    0.9249    0.7994       293
           1     0.8070    0.4466    0.5750       206

   micro avg     0.7275    0.7275    0.7275       499
   macro avg     0.7555    0.6858    0.6872       499
weighted avg     0.7465    0.7275    0.7068       499

-----
Accuracy of KNN classifier on train set: 0.727455
-----


### The most accurate model was using distance and 5 max levels deep.  While using uniform as the weight, there was a significant lack in accuracy as max level increased.

# Random Forest

## Titanic

In [52]:
def analyze_rf_binomial(X_df, y_df, string_criterion, min_sample_leaf_input, max_depth_input):
    features = list(X_df)
    
    print('Results using ' + str(string_criterion) + ' as the measure of impurity and ' + str(max_depth_input) + ' as max depth level and ' +str(min_sample_leaf_input) + ' as the min_sample_leaf.')
    print('The features being used: ' + str(features))
    print('-----')
    X_df_train, X_df_test, y_df_train, y_df_test = train_test_split(X_df, y_df, test_size = .30, random_state=123)
    X_df_train.head()

    rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion=string_criterion,
                            min_samples_leaf=min_sample_leaf_input,
                            n_estimators=100,
                            max_depth=max_depth_input, 
                            random_state=123)

    rf.fit(X_df_train, y_df_train)

    y_df_pred = rf.predict(X_df_train)
    print('Head of predicted on X_train:')
    print(y_df_pred[0:5])
    print('-----')

    y_df_pred_proba = rf.predict_proba(X_df_train)
    print('Head of probabilities on X_train:')
    print(y_df_pred_proba[0:5])
    print('-----')

    print('Accuracy of rf classifier on training set: {:.8f}'.format(rf.score(X_df_train, y_df_train)))
    print('-----')
    
    cm = pd.DataFrame(confusion_matrix(y_df_train, y_df_pred),
             columns=['Pred -', 'Pred +'], index=['Actual -', 'Actual +'])
    
    print(cm)
    print('-----')
    
    print(classification_report(y_df_train, y_df_pred, digits=4))
    print('-----')
    
    y_df_pred_test = rf.predict(X_df_test)
    y_df_pred_proba_test = rf.predict_proba(X_df_test)
    print('Accuracy of RF classifier on train set: {:.6f}'
     .format(rf.score(X_df_train, y_df_train)))
    print('-----')
    

In [401]:
analyze_rf_binomial(X_titanic_KNN_1, y_titanic_KNN_1, 'gini', 1, 20)

Results using gini as the measure of impurity and 20 as max depth level and 1 as the min_sample_leaf.
The features being used: ['pclass', 'age', 'sibsp', 'parch', 'sex_encode']
-----
Head of predicted on X_train:
[0 1 0 1 1]
-----
Head of probabilities on X_train:
[[0.87633067 0.12366933]
 [0.03       0.97      ]
 [1.         0.        ]
 [0.         1.        ]
 [0.02       0.98      ]]
-----
Accuracy of rf classifier on training set: 0.94589178
-----
          Pred -  Pred +
Actual -     284       9
Actual +      18     188
-----
              precision    recall  f1-score   support

           0     0.9404    0.9693    0.9546       293
           1     0.9543    0.9126    0.9330       206

   micro avg     0.9459    0.9459    0.9459       499
   macro avg     0.9474    0.9410    0.9438       499
weighted avg     0.9461    0.9459    0.9457       499

-----
Accuracy of RF classifier on train set: 0.945892
-----


In [402]:
analyze_rf_binomial(X_titanic_KNN_1, y_titanic_KNN_1, 'entropy', 1, 20)

Results using entropy as the measure of impurity and 20 as max depth level and 1 as the min_sample_leaf.
The features being used: ['pclass', 'age', 'sibsp', 'parch', 'sex_encode']
-----
Head of predicted on X_train:
[0 1 0 1 1]
-----
Head of probabilities on X_train:
[[0.87633067 0.12366933]
 [0.06666667 0.93333333]
 [1.         0.        ]
 [0.         1.        ]
 [0.02       0.98      ]]
-----
Accuracy of rf classifier on training set: 0.94589178
-----
          Pred -  Pred +
Actual -     284       9
Actual +      18     188
-----
              precision    recall  f1-score   support

           0     0.9404    0.9693    0.9546       293
           1     0.9543    0.9126    0.9330       206

   micro avg     0.9459    0.9459    0.9459       499
   macro avg     0.9474    0.9410    0.9438       499
weighted avg     0.9461    0.9459    0.9457       499

-----
Accuracy of RF classifier on train set: 0.945892
-----


In [404]:
analyze_rf_binomial(X_titanic_KNN_1, y_titanic_KNN_1, 'gini', 5, 3)

Results using gini as the measure of impurity and 3 as max depth level and 5 as the min_sample_leaf.
The features being used: ['pclass', 'age', 'sibsp', 'parch', 'sex_encode']
-----
Head of predicted on X_train:
[0 1 0 1 1]
-----
Head of probabilities on X_train:
[[0.85118297 0.14881703]
 [0.49624102 0.50375898]
 [0.83718195 0.16281805]
 [0.14479103 0.85520897]
 [0.22619012 0.77380988]]
-----
Accuracy of rf classifier on training set: 0.83567134
-----
          Pred -  Pred +
Actual -     282      11
Actual +      71     135
-----
              precision    recall  f1-score   support

           0     0.7989    0.9625    0.8731       293
           1     0.9247    0.6553    0.7670       206

   micro avg     0.8357    0.8357    0.8357       499
   macro avg     0.8618    0.8089    0.8201       499
weighted avg     0.8508    0.8357    0.8293       499

-----
Accuracy of RF classifier on train set: 0.835671
-----


In [403]:
analyze_rf_binomial(X_titanic_KNN_1, y_titanic_KNN_1, 'entropy', 5, 3)

Results using entropy as the measure of impurity and 3 as max depth level and 5 as the min_sample_leaf.
The features being used: ['pclass', 'age', 'sibsp', 'parch', 'sex_encode']
-----
Head of predicted on X_train:
[0 1 0 1 1]
-----
Head of probabilities on X_train:
[[0.85085113 0.14914887]
 [0.49409506 0.50590494]
 [0.83759766 0.16240234]
 [0.13541377 0.86458623]
 [0.20952814 0.79047186]]
-----
Accuracy of rf classifier on training set: 0.83567134
-----
          Pred -  Pred +
Actual -     273      20
Actual +      62     144
-----
              precision    recall  f1-score   support

           0     0.8149    0.9317    0.8694       293
           1     0.8780    0.6990    0.7784       206

   micro avg     0.8357    0.8357    0.8357       499
   macro avg     0.8465    0.8154    0.8239       499
weighted avg     0.8410    0.8357    0.8318       499

-----
Accuracy of RF classifier on train set: 0.835671
-----


### For training the model, increasing max depth level and going down to leaves that have one instance is much more accurate.  There is a possibility that running this on the test sample would lead to poor results as the model could be overfit.

In [55]:
X_titanic_rf_train, X_titanic_rf_test, y_titanic_rf_train, y_titanic_rf_test = train_test_split(X_titanic_KNN_1, y_titanic_KNN_1, test_size = .30, random_state=123)
X_titanic_rf_train.head()

forest_fit = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=20, 
                            random_state=123)

forest_fit.fit(X_titanic_rf_train, y_titanic_rf_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=123, verbose=0, warm_start=False)