In [1]:
import graphviz
from graphviz import Graph

In [2]:
import numpy as np
import pandas as pd
from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

df = pd.read_csv('titanic.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [3]:
def split(df, stratify_by=None):
    """
    Crude train, validate, test split
    To stratify, send in a column name for the stratify_by argument
    """

    if stratify_by == None:
        train, test = train_test_split(df, test_size=.2, random_state=123)
        train, validate = train_test_split(train, test_size=.3, random_state=123)
    else:
        train, test = train_test_split(df, test_size=.2, random_state=123, stratify=df[stratify_by])
        train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=train[stratify_by])

    return train, validate, test

# Decision Tree

## 1. What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [4]:
df.survived.value_counts()

0    549
1    342
Name: survived, dtype: int64

In [5]:
df["baseline_prediction"] = 0
# baseline is 0 for not survived, this is the majority 

baseline_accuracy = (df.survived == df.baseline_prediction).mean()
baseline_accuracy

0.6161616161616161

# Planning

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone,baseline_prediction
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1,0
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1,0


In [7]:
df.shape

(891, 15)

In [8]:
df.dtypes

Unnamed: 0               int64
passenger_id             int64
survived                 int64
pclass                   int64
sex                     object
age                    float64
sibsp                    int64
parch                    int64
fare                   float64
embarked                object
class                   object
deck                    object
embark_town             object
alone                    int64
baseline_prediction      int64
dtype: object

In [9]:
df["is_female"] = df.sex == "female"

df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone,baseline_prediction,is_female
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0,0,False
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0,0,True
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1,0,True
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0,0,True
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1,0,False


In [10]:
df = pd.concat([df, (pd.get_dummies(df[["class"]], drop_first=True))], axis=1)

df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone,baseline_prediction,is_female,class_Second,class_Third
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0,0,False,0,1
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0,0,True,0,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1,0,True,0,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0,0,True,0,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1,0,False,0,1


In [11]:
df = pd.concat([df, (pd.get_dummies(df[["embark_town"]], drop_first=True))], axis=1)

df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone,baseline_prediction,is_female,class_Second,class_Third,embark_town_Queenstown,embark_town_Southampton
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0,0,False,0,1,0,1
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0,0,True,0,0,0,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1,0,True,0,1,0,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0,0,True,0,0,0,1
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1,0,False,0,1,0,1


In [12]:
df = df.drop(columns=["Unnamed: 0", 'pclass', 'sex', 'embarked', 'class', 'deck', 'embark_town',])
df.head()

Unnamed: 0,passenger_id,survived,age,sibsp,parch,fare,alone,baseline_prediction,is_female,class_Second,class_Third,embark_town_Queenstown,embark_town_Southampton
0,0,0,22.0,1,0,7.25,0,0,False,0,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,True,0,0,0,0
2,2,1,26.0,0,0,7.925,1,0,True,0,1,0,1
3,3,1,35.0,1,0,53.1,0,0,True,0,0,0,1
4,4,0,35.0,0,0,8.05,1,0,False,0,1,0,1


In [13]:
# Split the data
# stratifying means we're making representative datasets between train, validate, test

train, validate, test = split(df, stratify_by="survived")
train.head()

Unnamed: 0,passenger_id,survived,age,sibsp,parch,fare,alone,baseline_prediction,is_female,class_Second,class_Third,embark_town_Queenstown,embark_town_Southampton
583,583,0,36.0,0,0,40.125,1,0,False,0,0,0,0
165,165,1,9.0,0,2,20.525,0,0,False,0,1,0,1
50,50,0,7.0,4,1,39.6875,0,0,False,0,1,0,1
259,259,1,50.0,0,1,26.0,0,0,True,1,0,0,1
306,306,1,,0,0,110.8833,1,0,True,0,0,0,0


In [14]:
train.isna().sum()

passenger_id                0
survived                    0
age                        97
sibsp                       0
parch                       0
fare                        0
alone                       0
baseline_prediction         0
is_female                   0
class_Second                0
class_Third                 0
embark_town_Queenstown      0
embark_town_Southampton     0
dtype: int64

In [15]:
#Calculate our fill value using train dataset only.

avg_age = int(train.age.mean())

# Fill null values in all of our datasets using our hardcoded value.

train.age = train.age.fillna(avg_age)

validate.age = validate.age.fillna(avg_age)

test.age = test.age.fillna(avg_age)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [118]:
train.isna().sum()

passenger_id               0
survived                   0
age                        0
sibsp                      0
parch                      0
fare                       0
alone                      0
baseline_prediction        0
is_female                  0
class_Second               0
class_Third                0
embark_town_Queenstown     0
embark_town_Southampton    0
dtype: int64

In [16]:
# Setup our X inputs and y target variable for each split
X_train = train.drop(columns=['survived'])
y_train = train.survived # labeled data == supervise algorithm

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [17]:
train.head()

Unnamed: 0,passenger_id,survived,age,sibsp,parch,fare,alone,baseline_prediction,is_female,class_Second,class_Third,embark_town_Queenstown,embark_town_Southampton
583,583,0,36.0,0,0,40.125,1,0,False,0,0,0,0
165,165,1,9.0,0,2,20.525,0,0,False,0,1,0,1
50,50,0,7.0,4,1,39.6875,0,0,False,0,1,0,1
259,259,1,50.0,0,1,26.0,0,0,True,1,0,0,1
306,306,1,29.0,0,0,110.8833,1,0,True,0,0,0,0


In [18]:
train.shape

(498, 13)

In [19]:
train.head()

Unnamed: 0,passenger_id,survived,age,sibsp,parch,fare,alone,baseline_prediction,is_female,class_Second,class_Third,embark_town_Queenstown,embark_town_Southampton
583,583,0,36.0,0,0,40.125,1,0,False,0,0,0,0
165,165,1,9.0,0,2,20.525,0,0,False,0,1,0,1
50,50,0,7.0,4,1,39.6875,0,0,False,0,1,0,1
259,259,1,50.0,0,1,26.0,0,0,True,1,0,0,1
306,306,1,29.0,0,0,110.8833,1,0,True,0,0,0,0


In [20]:
train.dtypes

passenger_id                 int64
survived                     int64
age                        float64
sibsp                        int64
parch                        int64
fare                       float64
alone                        int64
baseline_prediction          int64
is_female                     bool
class_Second                 uint8
class_Third                  uint8
embark_town_Queenstown       uint8
embark_town_Southampton      uint8
dtype: object

## 2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [21]:
# Let's generate a blank, new Decision Tree model
# Be sure to set the max_depth argument
# clf = DecisionTreeClassifier(max_depth=3, random_state=123)

clf = DecisionTreeClassifier(max_depth=3, random_state=319)

In [22]:
# Now let's train our model on the training data
# fitting == training the model
clf = clf.fit(X_train, y_train)
clf

DecisionTreeClassifier(max_depth=3, random_state=319)

In [23]:
# Visualize the model so it can explain itself!
dot_data = export_graphviz(clf, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True, format="pdf")

'titanic_decision_tree.pdf'

In [24]:
# Now we'll make a set of predictions using this trained model
y_pred = clf.predict(X_train)
y_pred[0:3]

array([0, 0, 0])

In [25]:
# Estimate the probabilities for each class
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:3]

array([[0.62222222, 0.37777778],
       [0.62222222, 0.37777778],
       [0.89285714, 0.10714286]])

In [26]:
y_train.head(3)

583    0
165    1
50     0
Name: survived, dtype: int64

## 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [27]:
confusion_matrix(y_train, y_pred)

array([[277,  30],
       [ 57, 134]])

In [28]:
labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,277,30
1,57,134


## 4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [29]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.90      0.86       307
           1       0.82      0.70      0.75       191

    accuracy                           0.83       498
   macro avg       0.82      0.80      0.81       498
weighted avg       0.82      0.83      0.82       498



## 5. Run through steps 2-4 using a different max_depth value.

### first we'll try max depth 2

In [30]:
clf = DecisionTreeClassifier(max_depth=2, random_state=319)

In [31]:
clf = clf.fit(X_train, y_train)
clf

DecisionTreeClassifier(max_depth=2, random_state=319)

In [32]:
dot_data = export_graphviz(clf, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True, format="pdf")

'titanic_decision_tree.pdf'

In [33]:
y_pred = clf.predict(X_train)
y_pred[0:3]

array([0, 0, 0])

In [34]:
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:3]

array([[0.68644068, 0.31355932],
       [0.68644068, 0.31355932],
       [0.68644068, 0.31355932]])

In [35]:
y_train.head(3)

583    0
165    1
50     0
Name: survived, dtype: int64

In [36]:
confusion_matrix(y_train, y_pred)

array([[265,  42],
       [ 58, 133]])

In [37]:
labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,265,42
1,58,133


In [38]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.86      0.84       307
           1       0.76      0.70      0.73       191

    accuracy                           0.80       498
   macro avg       0.79      0.78      0.78       498
weighted avg       0.80      0.80      0.80       498



### now we'll try max depth of 4

In [39]:
clf = DecisionTreeClassifier(max_depth=4, random_state=319)

In [40]:
clf = clf.fit(X_train, y_train)
clf

DecisionTreeClassifier(max_depth=4, random_state=319)

In [41]:
dot_data = export_graphviz(clf, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True, format="pdf")

'titanic_decision_tree.pdf'

In [42]:
y_pred = clf.predict(X_train)
y_pred[0:3]

array([0, 0, 0])

In [43]:
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:3]

array([[0.6626506, 0.3373494],
       [0.6626506, 0.3373494],
       [1.       , 0.       ]])

In [44]:
y_train.head(3)

583    0
165    1
50     0
Name: survived, dtype: int64

In [45]:
confusion_matrix(y_train, y_pred)

array([[279,  28],
       [ 48, 143]])

In [46]:
labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,279,28
1,48,143


In [47]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.91      0.88       307
           1       0.84      0.75      0.79       191

    accuracy                           0.85       498
   macro avg       0.84      0.83      0.84       498
weighted avg       0.85      0.85      0.85       498



## 6. Which model performs better on your in-sample data?

### The model with max depth of 4 appears to perform the best

## 7. Which model performs best on your out-of-sample data, the validate set?

### Max depth of 2

In [48]:
y_pred = clf.predict(X_validate)
y_pred[0:3]

array([0, 0, 0])

In [49]:
y_validate.head(3)

610    0
424    0
568    0
Name: survived, dtype: int64

In [50]:
y_pred_proba = clf.predict_proba(X_validate)
y_pred_proba[0:3]

array([[1.        , 0.        ],
       [0.6626506 , 0.3373494 ],
       [0.91919192, 0.08080808]])

In [51]:
labels = sorted(y_validate.unique())

pd.DataFrame(confusion_matrix(y_validate, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,112,20
1,25,57


In [52]:
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.85      0.83       132
           1       0.74      0.70      0.72        82

    accuracy                           0.79       214
   macro avg       0.78      0.77      0.77       214
weighted avg       0.79      0.79      0.79       214



### max depth 3

In [53]:
y_pred = clf.predict(X_validate)
y_pred[0:3]

array([0, 0, 0])

In [54]:
y_validate.head(3)

610    0
424    0
568    0
Name: survived, dtype: int64

In [55]:
y_pred_proba = clf.predict_proba(X_validate)
y_pred_proba[0:3]

array([[1.        , 0.        ],
       [0.6626506 , 0.3373494 ],
       [0.91919192, 0.08080808]])

In [56]:
labels = sorted(y_validate.unique())

pd.DataFrame(confusion_matrix(y_validate, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,112,20
1,25,57


In [57]:
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.85      0.83       132
           1       0.74      0.70      0.72        82

    accuracy                           0.79       214
   macro avg       0.78      0.77      0.77       214
weighted avg       0.79      0.79      0.79       214



### max depth 4

In [58]:
y_pred = clf.predict(X_validate)
y_pred[0:3]

array([0, 0, 0])

In [59]:
y_validate.head(3)

610    0
424    0
568    0
Name: survived, dtype: int64

In [60]:
y_pred_proba = clf.predict_proba(X_validate)
y_pred_proba[0:3]

array([[1.        , 0.        ],
       [0.6626506 , 0.3373494 ],
       [0.91919192, 0.08080808]])

In [61]:
labels = sorted(y_validate.unique())

pd.DataFrame(confusion_matrix(y_validate, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,112,20
1,25,57


In [62]:
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.85      0.83       132
           1       0.74      0.70      0.72        82

    accuracy                           0.79       214
   macro avg       0.78      0.77      0.77       214
weighted avg       0.79      0.79      0.79       214



# Random Forest

## 1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [63]:
from sklearn.ensemble import RandomForestClassifier

In [64]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=10, 
                            random_state=319)

In [65]:
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, random_state=319)

In [66]:
print(rf.feature_importances_)

[0.15338898 0.15520168 0.04421496 0.02948743 0.18816377 0.01770791
 0.         0.28987571 0.01987658 0.06722058 0.011318   0.0235444 ]


In [67]:
y_pred = rf.predict(X_train)

In [68]:
y_pred_proba = rf.predict_proba(X_train)

## 2. Evaluate your results using the model score, confusion matrix, and classification report.

In [69]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.97


In [70]:
print(confusion_matrix(y_train, y_pred))

[[307   0]
 [ 13 178]]


In [71]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       307
           1       1.00      0.93      0.96       191

    accuracy                           0.97       498
   macro avg       0.98      0.97      0.97       498
weighted avg       0.97      0.97      0.97       498



## 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [85]:
# crate a function to calculate these metrics
def get_metrics_binary(rf):
    '''
    get_metrics_binary takes in a confusion matrix (cnf) for a binary classifier and prints out metrics based on
    values in variables named X_train, y_train, and y_pred.
    
    return: a classification report as a transposed DataFrame
    '''
    accuracy = rf.score(X_train, y_train)
    class_report = pd.DataFrame(classification_report(y_train, y_pred, output_dict=True)).T
    conf = confusion_matrix(y_train, y_pred)
    tpr = conf[1][1] / conf[1].sum()
    fpr = conf[0][1] / conf[0].sum()
    tnr = conf[0][0] / conf[0].sum()
    fnr = conf[1][0] / conf[1].sum()
    print(f'''
    The accuracy for our model is {accuracy:.4}
    The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
    The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
    ''')
    return class_report

In [86]:
get_metrics_binary(rf)


    The accuracy for our model is 0.9739
    The True Positive Rate is 0.696, The False Positive Rate is 0.0358,
    The True Negative Rate is 0.964, and the False Negative Rate is 0.304
    


Unnamed: 0,precision,recall,f1-score,support
0,0.836158,0.964169,0.895613,307.0
1,0.923611,0.696335,0.79403,191.0
accuracy,0.861446,0.861446,0.861446,0.861446
macro avg,0.879885,0.830252,0.844821,498.0
weighted avg,0.869699,0.861446,0.856652,498.0


## 4. Run through steps increasing your min_samples_leaf and decreasing your max_depth

In [87]:
rf2 = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=5, 
                            random_state=319)

In [88]:
rf2.fit(X_train, y_train)

RandomForestClassifier(max_depth=5, min_samples_leaf=3, random_state=319)

In [89]:
print(rf2.feature_importances_)

[0.07581927 0.08563903 0.04714623 0.02562955 0.14975149 0.01783644
 0.         0.43389568 0.01994868 0.11514895 0.00925593 0.01992875]


In [90]:
y_pred = rf2.predict(X_train)

In [91]:
y_pred_proba = rf2.predict_proba(X_train)

In [92]:
get_metrics_binary(rf2)


    The accuracy for our model is 0.8614
    The True Positive Rate is 0.696, The False Positive Rate is 0.0358,
    The True Negative Rate is 0.964, and the False Negative Rate is 0.304
    


Unnamed: 0,precision,recall,f1-score,support
0,0.836158,0.964169,0.895613,307.0
1,0.923611,0.696335,0.79403,191.0
accuracy,0.861446,0.861446,0.861446,0.861446
macro avg,0.879885,0.830252,0.844821,498.0
weighted avg,0.869699,0.861446,0.856652,498.0


## 5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

In [93]:
print('Accuracy of random forest classifier on test set: {:.2f}'
     .format(rf.score(X_validate, y_validate)))

Accuracy of random forest classifier on test set: 0.80


In [94]:
print('Accuracy of random forest classifier on test set: {:.2f}'
     .format(rf2.score(X_validate, y_validate)))

Accuracy of random forest classifier on test set: 0.81


# K-Nearest Neighbors 

## 1. Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [96]:
from sklearn.neighbors import KNeighborsClassifier



In [97]:
# weights = ['uniform', 'density']
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')

In [98]:
knn.fit(X_train, y_train)

KNeighborsClassifier()

In [99]:
y_pred = knn.predict(X_train)

In [100]:
y_pred_proba = knn.predict_proba(X_train)

## 2. Evaluate your results using the model score, confusion matrix, and classification report.

In [101]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))

Accuracy of KNN classifier on training set: 0.74


In [102]:
print(confusion_matrix(y_train, y_pred))

[[266  41]
 [ 88 103]]


In [103]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.87      0.80       307
           1       0.72      0.54      0.61       191

    accuracy                           0.74       498
   macro avg       0.73      0.70      0.71       498
weighted avg       0.74      0.74      0.73       498



## 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [104]:
get_metrics_binary(knn)


    The accuracy for our model is 0.741
    The True Positive Rate is 0.539, The False Positive Rate is 0.134,
    The True Negative Rate is 0.866, and the False Negative Rate is 0.461
    


Unnamed: 0,precision,recall,f1-score,support
0,0.751412,0.86645,0.804841,307.0
1,0.715278,0.539267,0.614925,191.0
accuracy,0.740964,0.740964,0.740964,0.740964
macro avg,0.733345,0.702858,0.709883,498.0
weighted avg,0.737554,0.740964,0.732002,498.0


## 4. Run through steps 2-4 setting k to 10

In [105]:
knn2 = KNeighborsClassifier(n_neighbors=10, weights='uniform')

In [106]:
knn2.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=10)

In [107]:
y_pred = knn2.predict(X_train)

In [108]:
y_pred_proba = knn2.predict_proba(X_train)

In [110]:
get_metrics_binary(knn2)


    The accuracy for our model is 0.6988
    The True Positive Rate is 0.335, The False Positive Rate is 0.0749,
    The True Negative Rate is 0.925, and the False Negative Rate is 0.665
    


Unnamed: 0,precision,recall,f1-score,support
0,0.690998,0.925081,0.791086,307.0
1,0.735632,0.335079,0.460432,191.0
accuracy,0.698795,0.698795,0.698795,0.698795
macro avg,0.713315,0.63008,0.625759,498.0
weighted avg,0.708116,0.698795,0.664269,498.0


## 5. Run through setps 2-4 setting k to 20

In [111]:
knn3 = KNeighborsClassifier(n_neighbors=20, weights='uniform')

In [112]:
knn3.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=20)

In [113]:
y_pred = knn3.predict(X_train)

In [114]:
y_pred_proba = knn3.predict_proba(X_train)

In [115]:
get_metrics_binary(knn3)


    The accuracy for our model is 0.6707
    The True Positive Rate is 0.236, The False Positive Rate is 0.0586,
    The True Negative Rate is 0.941, and the False Negative Rate is 0.764
    


Unnamed: 0,precision,recall,f1-score,support
0,0.664368,0.941368,0.778976,307.0
1,0.714286,0.235602,0.354331,191.0
accuracy,0.670683,0.670683,0.670683,0.670683
macro avg,0.689327,0.588485,0.566653,498.0
weighted avg,0.683513,0.670683,0.61611,498.0


## 6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

In [116]:
## the model that works best on evaluating the test data appears to be the k-5 model

## 7. Which model performs best on our out-of-sample data from validate?

In [117]:
print('Accuracy of KNN classifier on test set: {:.2f}'
     .format(knn.score(X_validate, y_validate)))

print('Accuracy of KNN classifier on test set: {:.2f}'
     .format(knn2.score(X_validate, y_validate)))

print('Accuracy of KNN classifier on test set: {:.2f}'
     .format(knn3.score(X_validate, y_validate)))

Accuracy of KNN classifier on test set: 0.61
Accuracy of KNN classifier on test set: 0.62
Accuracy of KNN classifier on test set: 0.65


# Logistic Regression