In [51]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import acquire
import prepare

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

import matplotlib.pyplot as plt
import seaborn as sns

import graphviz
from graphviz import Graph

## Using titanic data

In [2]:
# pulling in titanic data with function
df = acquire.get_titanic_data()
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [3]:
# splitting titanic data into train, test, validate with function
train, validate, test = prepare.prep_titanic_data(df)
train.shape

(498, 14)

In [4]:
validate.shape

(214, 14)

In [5]:
test.shape

(179, 14)

In [6]:
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone,sex_female,sex_male,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
583,0,1,male,36.0,0,0,40.125,Cherbourg,1,0,1,1,0,0
165,1,3,male,9.0,0,2,20.525,Southampton,0,0,1,0,0,1
50,0,3,male,7.0,4,1,39.6875,Southampton,0,0,1,0,0,1
259,1,2,female,50.0,0,1,26.0,Southampton,0,1,0,0,0,1
306,1,1,female,,0,0,110.8833,Cherbourg,1,1,0,1,0,0


In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 498 entries, 583 to 744
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   survived                 498 non-null    int64  
 1   pclass                   498 non-null    int64  
 2   sex                      498 non-null    object 
 3   age                      401 non-null    float64
 4   sibsp                    498 non-null    int64  
 5   parch                    498 non-null    int64  
 6   fare                     498 non-null    float64
 7   embark_town              498 non-null    object 
 8   alone                    498 non-null    int64  
 9   sex_female               498 non-null    uint8  
 10  sex_male                 498 non-null    uint8  
 11  embark_town_Cherbourg    498 non-null    uint8  
 12  embark_town_Queenstown   498 non-null    uint8  
 13  embark_town_Southampton  498 non-null    uint8  
dtypes: float64(2), int64(5),

## Exercise 1

What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [8]:
train.survived.value_counts()

0    307
1    191
Name: survived, dtype: int64

**Baseline is 0 (did not survive) since that is the most prevalent value**

In [9]:
# creating a baseline column to compare to actual
train['baseline'] = 0
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone,sex_female,sex_male,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,baseline
583,0,1,male,36.0,0,0,40.125,Cherbourg,1,0,1,1,0,0,0
165,1,3,male,9.0,0,2,20.525,Southampton,0,0,1,0,0,1,0
50,0,3,male,7.0,4,1,39.6875,Southampton,0,0,1,0,0,1,0
259,1,2,female,50.0,0,1,26.0,Southampton,0,1,0,0,0,1,0
306,1,1,female,,0,0,110.8833,Cherbourg,1,1,0,1,0,0,0


In [10]:
# calculating baseline accuracy
(train.baseline == train.survived).mean()

0.6164658634538153

**Baseline accuracy is 62% so, to add value, a model needs to have greater accuracy**

## Exercise 2 

Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [11]:
# creating x and y version of train where x includes everything but the target variable and y contains only the target variable
x_train = train.drop(columns=['survived', 'baseline', 'sex', 'embark_town', 'sex_female', 'age'])
y_train = train.survived

x_validate = validate.drop(columns=['survived', 'sex', 'embark_town', 'sex_female', 'age'])
y_validate = validate.survived

x_test = test.drop(columns=['survived', 'sex', 'embark_town', 'sex_female', 'age'])
y_test = test.survived

In [12]:
# creating the Decision Tree object with desired hyper-parameters
clf = DecisionTreeClassifier(max_depth=3, random_state=123)

In [13]:
# fitting the algorithm to the training data
clf = clf.fit(x_train, y_train)

In [14]:
# creating visualization of tree
dot_data = export_graphviz(clf, feature_names= x_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True)

'titanic_decision_tree.pdf'

In [15]:
# making prediction on observations
y_pred = clf.predict(x_train)
y_pred[0:5]

array([0, 0, 0, 1, 1])

In [16]:
# estimating probability of each species
y_pred_proba = clf.predict_proba(x_train)
y_pred_proba[0:5]

array([[0.69827586, 0.30172414],
       [0.69827586, 0.30172414],
       [0.69827586, 0.30172414],
       [0.07142857, 0.92857143],
       [0.01923077, 0.98076923]])

## Exercise 3. 

Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [17]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(x_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.82


In [18]:
# creating confusion matrix
confusion_matrix(y_train, y_pred)

array([[276,  31],
       [ 57, 134]])

In [19]:
# adding clarity
labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,276,31
1,57,134


In [20]:
# creating classification report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.90      0.86       307
           1       0.81      0.70      0.75       191

    accuracy                           0.82       498
   macro avg       0.82      0.80      0.81       498
weighted avg       0.82      0.82      0.82       498



## Exercise 4 

Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [21]:
# confusion matrix with tp (upper left), fp (lower left), tn (lower right), & fn (upper right)
pd.DataFrame(confusion_matrix(y_train, y_pred, normalize='true'), index=labels, columns=labels)

Unnamed: 0,0,1
0,0.899023,0.100977
1,0.298429,0.701571


In [22]:
# classification report with accuracy, precision, recall, f1-score, & support
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.90      0.86       307
           1       0.81      0.70      0.75       191

    accuracy                           0.82       498
   macro avg       0.82      0.80      0.81       498
weighted avg       0.82      0.82      0.82       498



## Exercise 5, Step II

Run through steps 2-4 using a different max_depth value.

In [23]:
# creating the Decision Tree object with desired hyper-parameters
clf = DecisionTreeClassifier(max_depth=5, random_state=123)

In [24]:
# fitting the algorithm to the training data
clf = clf.fit(x_train, y_train)

In [25]:
# creating visualization of tree
dot_data = export_graphviz(clf, feature_names= x_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True)

'titanic_decision_tree.pdf'

In [26]:
# making prediction on observations
y_pred = clf.predict(x_train)
y_pred[0:5]

array([0, 0, 0, 1, 1])

In [27]:
# estimating probability of each species
y_pred_proba = clf.predict_proba(x_train)
y_pred_proba[0:5]

array([[0.62025316, 0.37974684],
       [0.82608696, 0.17391304],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.        , 1.        ]])

## Exercise 5, Step III

Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [28]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(x_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.84


In [29]:
# creating confusion matrix
confusion_matrix(y_train, y_pred)

array([[303,   4],
       [ 77, 114]])

In [30]:
# adding clarity
labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,303,4
1,77,114


In [31]:
# creating classification report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.99      0.88       307
           1       0.97      0.60      0.74       191

    accuracy                           0.84       498
   macro avg       0.88      0.79      0.81       498
weighted avg       0.86      0.84      0.83       498



## Exercise 5, Step IV 

Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [32]:
# confusion matrix with tp (upper left), fp (lower left), tn (lower right), & fn (upper right)
pd.DataFrame(confusion_matrix(y_train, y_pred, normalize='true'), index=labels, columns=labels)

Unnamed: 0,0,1
0,0.986971,0.013029
1,0.403141,0.596859


In [33]:
# classification report with accuracy, precision, recall, f1-score, & support
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.99      0.88       307
           1       0.97      0.60      0.74       191

    accuracy                           0.84       498
   macro avg       0.88      0.79      0.81       498
weighted avg       0.86      0.84      0.83       498



## Exercise 6

Which model performs better on your in-sample data?

**Model with max_depth_value of 5 performed slightly better on in-sample data (0.84 accuracy) than that with max_depth_value of 3 (0.82 accuracy)**

## Exercise 7, 

Which model performs best on your out-of-sample data, the validate set?

In [34]:
# running this after creating and running Decision Tree classifier with desired hyper-parameters and fitting to the data
# first with max_depth set to 3, then with max_depth set to 5
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf.score(x_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.75


**Model with max_depth set to 3 performs better on the validate set with accuracy of 0.79**

**Model with max_depth set to 5 has accuracy of 0.75**

# Random Forest Exercises

### Exercise 1

Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [35]:
# creating random forest object with desired hyper-parameters
rf = RandomForestClassifier(max_depth=10, 
                            random_state=369, min_samples_leaf=1)
rf

RandomForestClassifier(max_depth=10, random_state=369)

In [36]:
# fitting model to train set
rf.fit(x_train, y_train)

RandomForestClassifier(max_depth=10, random_state=369)

In [37]:
# evaluating importance of each feature, higher score = more importance 
print(rf.feature_importances_)

[0.10270126 0.06734553 0.04841256 0.36655503 0.02072547 0.344281
 0.01592391 0.01512833 0.0189269 ]


In [38]:
x_train.head()

Unnamed: 0,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
583,1,0,0,40.125,1,1,1,0,0
165,3,0,2,20.525,0,1,0,0,1
50,3,4,1,39.6875,0,1,0,0,1
259,2,0,1,26.0,0,0,0,0,1
306,1,0,0,110.8833,1,0,1,0,0


In [39]:
# classifying each observation as survived or not survived
y_pred = rf.predict(x_train)
y_pred

array([0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,

In [40]:
# estimating the probability of those classifications
y_pred_proba = rf.predict_proba(x_train)
y_pred_proba[0:5]

array([[0.93859479, 0.06140521],
       [0.31441667, 0.68558333],
       [0.98071429, 0.01928571],
       [0.02132669, 0.97867331],
       [0.        , 1.        ]])

## Exercise 2

Evaluate your results using the model score, confusion matrix, and classification report.

In [41]:
# calculating accuracy of the model
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(x_train, y_train)))

Accuracy of random forest classifier on training set: 0.94


In [42]:
# creating confusion matrix
print(confusion_matrix(y_train, y_pred))

[[302   5]
 [ 23 168]]


In [43]:
# creating classification report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.98      0.96       307
           1       0.97      0.88      0.92       191

    accuracy                           0.94       498
   macro avg       0.95      0.93      0.94       498
weighted avg       0.95      0.94      0.94       498



## Exercise 3

Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

**In this case, positive = 0 (did not survive), negative = 1 (survived)**

In [44]:
tp = 302
fp = 23
fn = 5
tn = 168
all_4 = tp + fp + fn + tn

accuracy = (tp + tn) / all_4
true_positive_rate = tp / (tp + fn)
false_positive_rate = fp / (fp + tn)
true_negative_rate = tn / (tn + fp)
false_negative_rate = fn /(fn + tp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2 * ((precision * recall) / (precision + recall))
support_pos = tp + fn
support_neg = fp + tn

In [45]:
print(f'accuracy: {accuracy}')
print(f'true_positive_rate: {true_positive_rate}')
print(f'false_positive_rate: {false_positive_rate}')
print(f'true_negative_rate: {true_negative_rate}')
print(f'false_negative_rate: {false_negative_rate}')
print(f'precision: {precision}')
print(f'recall: {recall}')
print(f'f1_score: {f1_score}')
print(f'support: {support}')

accuracy: 0.9437751004016064
true_positive_rate: 0.9837133550488599
false_positive_rate: 0.12041884816753927
true_negative_rate: 0.8795811518324608
false_negative_rate: 0.016286644951140065
precision: 0.9292307692307692
recall: 0.9837133550488599
f1_score: 0.9556962025316454


NameError: name 'support' is not defined

## Exercise 4

Run through steps increasing your min_samples_leaf and decreasing your max_depth.

In [None]:
# running through 15 models with min_samples_leaf set to 1 and increasing by 1 each time, max_depth starting at 15 and decreasing by 1 each time

metrics = []
max_depth = 16

# setting range from 1 - 15
for i in range(1, max_depth):
    # used to set max_depth to 16 - number of current iteration
    depth = max_depth - i
    # used to set min_samples_leaf to number of current iteration
    n_samples = i
    forest = RandomForestClassifier(max_depth=depth, min_samples_leaf=n_samples, random_state=123)

    # fitting model on train
    forest = forest.fit(x_train, y_train)

    # checking accuracy of model on the train data set
    train_accuracy = forest.score(x_train, y_train)
    # checking accuracy of model on the validate data set
    validate_accuracy = forest.score(x_validate, y_validate)
    
    # creating output of # min_samples_leaf, max_depth, train accuracy, and validate accuracy 
    output = {
        "min_samples_per_leaf": n_samples,
        "max_depth": depth,
        "train_accuracy": train_accuracy,
        "validate_accuracy": validate_accuracy
    }
    # appending output to metrics list
    metrics.append(output)
# once all iterations have run, creating df from metrics list, adding a column that calculates the difference between trainn and validate accuracy, and prints out df   
df = pd.DataFrame(metrics)
df["difference"] = df.train_accuracy - df.validate_accuracy
df

## Exercise 5

What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

**The model with the highest max_depth and lowest min_samples_per_leaf (15, 1) performs best on the in-sample data because it has the ability to ask more questions and detect finer differences.**

**This does, however, decrease the accuracy of the model on the out-of-sample data as it is overfitted to the train data**

**The difference column shows us that the largest difference in accuracy between the in-sample and out-of-sample data is for the model that performed best on the in-sample data**

**To get a good balance of accuracy without overfitting, we would want to choose models for which the difference is less dramatic**

**Models with indexes 4 - 6 would likely be good choices as they have the best accuracy with less difference between the two predictions**

# KNN Exercises

## Exercise 1

Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [None]:
x_train.head()

In [None]:
y_train.head()

In [None]:
# creating knn object
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')

In [None]:
# fitting the model
knn.fit(x_train, y_train)

In [None]:
# making predictions
y_pred = knn.predict(x_train)
y_pred[0:5]

In [None]:
# estimating probability
y_pred_proba = knn.predict_proba(x_train)
y_pred_proba[0:5]

In [None]:
# checking classes
knn.classes_

## Exercise 2

Evaluate your results using the model score, confusion matrix, and classification report.

In [None]:
# computing accuracy
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(x_train, y_train)))

In [None]:
# creating confusion matrix
print(confusion_matrix(y_train, y_pred))

In [None]:
# finding a better way to make sense of confusion matrices
a = confusion_matrix(y_train, y_pred)

In [None]:
disp = ConfusionMatrixDisplay(a, display_labels=None)

In [None]:
disp.plot()

In [None]:
# creating classification report
print(classification_report(y_train, y_pred))

## Exercise 3

Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [None]:
tp = 266
fp = 52
fn = 41
tn = 139
all_4 = tp + fp + fn + tn

print(f'accuracy: {(tp + tn) / all_4}')
print(f'true_positive_rate: {tp / (tp + fn)}')
print(f'false_positive_rate: {fp / (fp + tn)}')
print(f'true_negative_rate: {tn / (tn + fp)}')
print(f'false_negative_rate: {fn /(fn + tp)}')
print(f'precision: {tp / (tp + fp)}')
print(f'recall: {tp / (tp + fn)}')
print(f'f1_score: {2 * ((precision * recall) / (precision + recall))}')
print(f'support_pos: {tp + fn}')
print(f'support_neg: {fp + tn}')

## Exercise 4

Run through steps 2-4 setting k to 10

In [None]:
# creating knn object
knn = KNeighborsClassifier(n_neighbors=10, weights='uniform')

In [None]:
# fitting the model
knn.fit(x_train, y_train)

In [None]:
# making predictions
y_pred = knn.predict(x_train)
y_pred[0:5]

In [None]:
# estimating probability
y_pred_proba = knn.predict_proba(x_train)
y_pred_proba[0:5]

In [None]:
# computing accuracy
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(x_train, y_train)))

In [None]:
# finding a better way to make sense of confusion matrices
a = confusion_matrix(y_train, y_pred)
disp = ConfusionMatrixDisplay(a, display_labels=None)
disp.plot()

In [None]:
# creating classification report
print(classification_report(y_train, y_pred))

In [None]:
tp = 267
fp = 69
fn = 40
tn = 122
all_4 = tp + fp + fn + tn

print(f'accuracy: {(tp + tn) / all_4}')
print(f'true_positive_rate: {tp / (tp + fn)}')
print(f'false_positive_rate: {fp / (fp + tn)}')
print(f'true_negative_rate: {tn / (tn + fp)}')
print(f'false_negative_rate: {fn /(fn + tp)}')
print(f'precision: {tp / (tp + fp)}')
print(f'recall: {tp / (tp + fn)}')
print(f'f1_score: {2 * ((precision * recall) / (precision + recall))}')
print(f'support_pos: {tp + fn}')
print(f'support_neg: {fp + tn}')

## Exercise 5

Run through setps 2-4 setting k to 20

In [None]:
# creating knn object
knn = KNeighborsClassifier(n_neighbors=20, weights='uniform')

In [None]:
# fitting the model
knn.fit(x_train, y_train)

In [None]:
# making predictions
y_pred = knn.predict(x_train)
y_pred[0:5]

In [None]:
# estimating probability
y_pred_proba = knn.predict_proba(x_train)
y_pred_proba[0:5]

In [None]:
# computing accuracy
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(x_train, y_train)))

In [None]:
# finding a better way to make sense of confusion matrices
a = confusion_matrix(y_train, y_pred)
disp = ConfusionMatrixDisplay(a, display_labels=None)
disp.plot()

In [None]:
# creating classification report
print(classification_report(y_train, y_pred))

In [None]:
tp = 263
fp = 86
fn = 44
tn = 105
all_4 = tp + fp + fn + tn

print(f'accuracy: {(tp + tn) / all_4}')
print(f'true_positive_rate: {tp / (tp + fn)}')
print(f'false_positive_rate: {fp / (fp + tn)}')
print(f'true_negative_rate: {tn / (tn + fp)}')
print(f'false_negative_rate: {fn /(fn + tp)}')
print(f'precision: {tp / (tp + fp)}')
print(f'recall: {tp / (tp + fn)}')
print(f'f1_score: {2 * ((precision * recall) / (precision + recall))}')
print(f'support_pos: {tp + fn}')
print(f'support_neg: {fp + tn}')

## Exercise 6

What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

**The difference in accuracy between the two models is 0.04 with n_neighbors set to 10 performing slightly better on the in-sample data**

**With lower n_neighbors, the model distinguishes more of the distinctive characteristics of each of the neighbors which improves accuracy for the in-sample data but may overfit the model which decreases the accuracy on the out-of-sample data**

**With higher n_neighbors, the model generalizes more to accommodate more data points which may decrease accuracy on the in-sample data but may increase the accuracy on the out-of-sample data**

## Exercise 7

Which model performs best on our out-of-sample data from validate?

In [None]:
# computing accuracy
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(x_validate, y_validate)))

**Model with n_neighbors set to 10 has accuracy of 0.72**

**Model with n_nighbors set to 20 has accuracy of 0.67**

**In this case, n_neighbors set to 10, is not overfit and performs best on the out-of-sample data**


# Logistic Regression Exercises

## Exercise 1

Create a model that includes age in addition to fare and pclass. Does this model perform better than your baseline?

In [52]:
# pulling in, cleaning, and splitting titanic data
train, validate, test = prepare.prep_titanic_data(acquire.get_titanic_data())
train.shape, validate.shape, test.shape

((498, 14), (214, 14), (179, 14))

In [53]:
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone,sex_female,sex_male,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
583,0,1,male,36.0,0,0,40.125,Cherbourg,1,0,1,1,0,0
165,1,3,male,9.0,0,2,20.525,Southampton,0,0,1,0,0,1
50,0,3,male,7.0,4,1,39.6875,Southampton,0,0,1,0,0,1
259,1,2,female,50.0,0,1,26.0,Southampton,0,1,0,0,0,1
306,1,1,female,,0,0,110.8833,Cherbourg,1,1,0,1,0,0


In [56]:
train, validate, test = prepare.impute_mean_age(train, validate, test)
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone,sex_female,sex_male,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
583,0,1,male,36.0,0,0,40.125,Cherbourg,1,0,1,1,0,0
165,1,3,male,9.0,0,2,20.525,Southampton,0,0,1,0,0,1
50,0,3,male,7.0,4,1,39.6875,Southampton,0,0,1,0,0,1
259,1,2,female,50.0,0,1,26.0,Southampton,0,1,0,0,0,1
306,1,1,female,29.678105,0,0,110.8833,Cherbourg,1,1,0,1,0,0


In [57]:
# designating which features I want to use for modeling
x_cols = ['pclass', 'age', 'fare']
y_col = 'survived'

X_train, y_train = train[x_cols], train[y_col]
X_validate, y_validate = validate[x_cols], validate[y_col]
X_test, y_test = test[x_cols], test[y_col]

In [58]:
X_train.head()

Unnamed: 0,pclass,age,fare
583,1,36.0,40.125
165,3,9.0,20.525
50,3,7.0,39.6875
259,2,50.0,26.0
306,1,29.678105,110.8833


In [59]:
# defining the logistic regression model
logit = LogisticRegression(C=1, random_state=123)

In [60]:
#  fitting the model on train data
logit.fit(X_train, y_train)

LogisticRegression(C=1, random_state=123)

In [63]:
# using model to make predictions
y_pred = logit.predict(X_train)
y_pred[0:5]

array([1, 0, 0, 0, 1])

In [65]:
# checking probability of those predictions
y_pred_proba = logit.predict_proba(X_train)
y_pred_proba[0:5]

array([[0.36988206, 0.63011794],
       [0.63810638, 0.36189362],
       [0.61748053, 0.38251947],
       [0.70385285, 0.29614715],
       [0.30445826, 0.69554174]])

In [67]:
# creating classification report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.87      0.78       307
           1       0.67      0.44      0.53       191

    accuracy                           0.70       498
   macro avg       0.69      0.65      0.66       498
weighted avg       0.70      0.70      0.69       498

