In [1]:
import numpy as np
import pandas as pd 

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

import graphviz
from graphviz import Graph

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

import acquire
import prepare

## Decision Tree - Exercises
### Using the titanic data, in your classification-exercises repository, create a notebook, model.ipynb where you will do the following:

Acquire the data

In [2]:
titanic_df = acquire.get_titanic_data()
titanic_df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [3]:
titanic_df.shape

(891, 13)

Prepare the data

In [4]:
def prep_titanic(titanic_df):
    '''
    This function will clean the titanic data...
    '''
    titanic_df = titanic_df.drop_duplicates()
    cols_to_drop = ['deck', 'embarked', 'class', 'age']
    titanic_df = titanic_df.drop(columns=cols_to_drop)
    titanic_df.embark_town = titanic_df.embark_town.fillna(value=titanic_df.embark_town.mode())
    dummy_df = pd.get_dummies(titanic_df[['sex', 'embark_town']], dummy_na=False, drop_first=[True, True])
    titanic_df = pd.concat([titanic_df, dummy_df], axis=1)
    return titanic_df

In [5]:
titanic_df = prep_titanic(titanic_df)
titanic_df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,0,3,male,1,0,7.25,Southampton,0,1,0,1
1,1,1,1,female,1,0,71.2833,Cherbourg,0,0,0,0
2,2,1,3,female,0,0,7.925,Southampton,1,0,0,1
3,3,1,1,female,1,0,53.1,Southampton,0,0,0,1
4,4,0,3,male,0,0,8.05,Southampton,1,1,0,1


In [6]:
titanic_df = titanic_df.drop(columns=['sex', 'embark_town'])

In [7]:
titanic_df.shape

(891, 10)

In [8]:
titanic_df.head()

Unnamed: 0,passenger_id,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,0,3,1,0,7.25,0,1,0,1
1,1,1,1,1,0,71.2833,0,0,0,0
2,2,1,3,0,0,7.925,1,0,0,1
3,3,1,1,1,0,53.1,0,0,0,1
4,4,0,3,0,0,8.05,1,1,0,1


Split the data

In [9]:
def split_data(titanic_df):
    '''
    Takes in a dataframe and return train, validate, test subset dataframes
    '''
    train, test = train_test_split(titanic_df, test_size = .2, random_state=123, stratify=titanic_df.survived)
    train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=train.survived)
    return train, validate, test

In [10]:
train, validate, test = split_data(titanic_df)

In [11]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
583,583,0,1,0,0,40.125,1,1,0,0
165,165,1,3,0,2,20.525,0,1,0,1
50,50,0,3,4,1,39.6875,0,1,0,1
259,259,1,2,0,1,26.0,0,0,0,1
306,306,1,1,0,0,110.8833,1,0,0,0


In [12]:
X_train = train.drop(columns=['survived'])
y_train = train.survived

x_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [13]:
train.shape

(498, 10)

In [14]:
validate.shape

(214, 10)

In [15]:
test.shape

(179, 10)

In [16]:
X_train.shape

(498, 9)

### 1. What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

Find base line prediction which is the most frequently occuring common value


In [17]:
baseline = y_train.mode()
baseline

0    0
dtype: int64

In [18]:
match_bsl_prediction = y_train == 0


In [19]:
baseline_accuracy = match_bsl_prediction.mean()
baseline_accuracy


0.6164658634538153

we know baseline accuracy is 62%

### 2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [20]:
tree1 = DecisionTreeClassifier(max_depth=3, random_state=123)

In [21]:

tree1 = tree1.fit(X_train, y_train)

visualize the decision tree

In [22]:
dot1 = export_graphviz(tree1, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot1) 

graph.render('titanic_decision_tree', view=True)

'titanic_decision_tree.pdf'

In [23]:
y_pred = tree1.predict(X_train)
y_pred[0:5]

array([0, 0, 0, 1, 1])

In [24]:
y_pred_proba = tree1.predict_proba(X_train)
y_pred_proba[0:5]

array([[0.62222222, 0.37777778],
       [0.62222222, 0.37777778],
       [0.89285714, 0.10714286],
       [0.14814815, 0.85185185],
       [0.        , 1.        ]])

### 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

Checking Accuracy

In [25]:
print('Accuracy of Decision Tree 1 classifier on training set: {:.2f}'
      .format(tree1.score(X_train, y_train)))

Accuracy of Decision Tree 1 classifier on training set: 0.82


Confusion Matrix

In [26]:
confusion_matrix(y_train, y_pred)

array([[274,  33],
       [ 56, 135]])

In [27]:
y_train.value_counts()

0    307
1    191
Name: survived, dtype: int64

In [28]:
labels = sorted(y_train.unique())
print('Actual on the left, predicted on the top')
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Actual on the left, predicted on the top


Unnamed: 0,0,1
0,274,33
1,56,135


Classification Report

In [29]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.89      0.86       307
           1       0.80      0.71      0.75       191

    accuracy                           0.82       498
   macro avg       0.82      0.80      0.81       498
weighted avg       0.82      0.82      0.82       498



Make classifcation report in dataframe

In [30]:
class_report = classification_report(y_train, y_pred, output_dict=(True))
print("Tree1 depth")
pd.DataFrame(class_report)

Tree1 depth


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.830303,0.803571,0.821285,0.816937,0.820051
recall,0.892508,0.706806,0.821285,0.799657,0.821285
f1-score,0.860283,0.752089,0.821285,0.806186,0.818787
support,307.0,191.0,0.821285,498.0,498.0


### 4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.


Let's say nobody is survived in our positive case  (Not Survived = Positive)

In [31]:
TP = 265
FP = 58
FN = 42
TN = 133
ALL = TP + FP + FN + TN

accuracy = (TP + TN)/ALL
print(f"Accuracy: {accuracy}")

true_positive_rate = TP/(TP+FN)
print(f"True Positive Rate: {true_positive_rate}")

false_positive_rate = FP/(FP+TN)
print(f"False Positive Rate: {false_positive_rate}")

true_negative_rate = TN/(TN+FP)
print(f"True Negative Rate: {true_negative_rate}")

false_negative_rate = FN/(FN+TP)
print(f"False Negative Rate: {false_negative_rate}")

precision = TP/(TP+FP)
print(f"Precision: {precision}")

recall = TP/(TP+FN)
print(f"Recall: {recall}")

f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score}")

support_pos = TP + FN
print(f"Support (0): {support_pos}")

support_neg = FP + TN
print(f"Support (1): {support_neg}")

Accuracy: 0.7991967871485943
True Positive Rate: 0.8631921824104235
False Positive Rate: 0.3036649214659686
True Negative Rate: 0.6963350785340314
False Negative Rate: 0.13680781758957655
Precision: 0.8204334365325078
Recall: 0.8631921824104235
F1 Score: 0.8412698412698413
Support (0): 307
Support (1): 191


### 5. Run through steps 2-4 using a different max_depth value.

In [32]:
for i in range(2, 11):
    
    tree = DecisionTreeClassifier(max_depth=i, random_state=123)
  
    tree = tree.fit(X_train, y_train)
  
    y_pred = tree.predict(X_train)
  
    report = classification_report(y_train, y_pred, output_dict=True)
    print(f"Tree with max depth of {i}")
    print(pd.DataFrame(report))
    print()

Tree with max depth of 2
                    0           1  accuracy   macro avg  weighted avg
precision    0.820433    0.760000  0.799197    0.790217      0.797255
recall       0.863192    0.696335  0.799197    0.779764      0.799197
f1-score     0.841270    0.726776  0.799197    0.784023      0.797358
support    307.000000  191.000000  0.799197  498.000000    498.000000

Tree with max depth of 3
                    0           1  accuracy   macro avg  weighted avg
precision    0.830303    0.803571  0.821285    0.816937      0.820051
recall       0.892508    0.706806  0.821285    0.799657      0.821285
f1-score     0.860283    0.752089  0.821285    0.806186      0.818787
support    307.000000  191.000000  0.821285  498.000000    498.000000

Tree with max depth of 4
                    0           1  accuracy   macro avg  weighted avg
precision    0.815642    0.892857  0.837349    0.854250      0.845257
recall       0.951140    0.654450  0.837349    0.802795      0.837349
f1-score     

Which model to use? Creating a dataframe with themax_depth, train_accuracy, validate_accuracy, difference.

In [33]:
metrics = []  

In [34]:
for i in range(1, 11):
    
    tree = DecisionTreeClassifier(max_depth=i, random_state=123)  
    
    tree = tree.fit(X_train, y_train)    
    
    in_sample_accuracy = tree.score(X_train, y_train)
    
    out_sample_accuracy = tree.score(x_validate, y_validate)
    
    output = {'max_depth': i, 'train_accuracy': in_sample_accuracy, 'validate_accuracy': out_sample_accuracy}
    
    metrics.append(output)
    
tree_df = pd.DataFrame(metrics)
tree_df["difference"] = tree_df.train_accuracy - tree_df.validate_accuracy

tree_df

Unnamed: 0,max_depth,train_accuracy,validate_accuracy,difference
0,1,0.799197,0.761682,0.037515
1,2,0.799197,0.761682,0.037515
2,3,0.821285,0.775701,0.045584
3,4,0.837349,0.761682,0.075667
4,5,0.849398,0.761682,0.087715
5,6,0.871486,0.738318,0.133168
6,7,0.893574,0.752336,0.141238
7,8,0.913655,0.733645,0.18001
8,9,0.935743,0.724299,0.211444
9,10,0.947791,0.728972,0.218819


In order to avoid over-fitting, set a threshhold.

In [35]:
threshold = 0.10  #threshold set for amount of overfit that is tolerated

models = []
metrics = []

for i in range(1, 11):
    tree = DecisionTreeClassifier(max_depth=i, random_state=123)
    #^^^ creates the model
    
    tree = tree.fit(X_train, y_train)   #fit model to train data and only TRAIN data
    
    in_sample_accuracy = tree.score(X_train, y_train)
    out_sample_accuracy = tree.score(x_validate, y_validate)
    #^^^evaluates the models performance on train data first
    
    difference = in_sample_accuracy - out_sample_accuracy
    #^^calculates the difference in accuracy
    
    if difference > threshold:
        break
    #^^adds conditions to check the accuracy vs the threshold
    
    output = {
        'max_depth': i,
        'train_accuracy': in_sample_accuracy,
        'validate_accuracy': out_sample_accuracy,
        'difference': difference}
    #^^^formats the output for each models performance o train and validate
    
    metrics.append(output)
    
    models.append(output)
    
model_df = pd.DataFrame(metrics)
model_df["difference"] = tree_df.train_accuracy - tree_df.validate_accuracy


model_df.head()

Unnamed: 0,max_depth,train_accuracy,validate_accuracy,difference
0,1,0.799197,0.761682,0.037515
1,2,0.799197,0.761682,0.037515
2,3,0.821285,0.775701,0.045584
3,4,0.837349,0.761682,0.075667
4,5,0.849398,0.761682,0.087715



### 6. Which model performs better on your in-sample data?


max_depth 5 Model

### 6. Which model performs best on your out-of-sample data, the validate set?

max_depth 3 Model

## Random Forest - Exercises
### Continue working in your model file with titanic data to do the following:

### 1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [36]:
rf = RandomForestClassifier(min_samples_leaf=1, max_depth=10, random_state=5)
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, random_state=5)

In [37]:
rf.feature_importances_

array([0.22797024, 0.0835123 , 0.05146076, 0.03771296, 0.23002317,
       0.01885045, 0.31209281, 0.01244536, 0.02593195])

In [38]:
y_pred_rf_train = rf.predict(X_train)

### 2. Evaluate your results using the model score, confusion matrix, and classification report.


In [39]:
rf.score(X_train, y_train)

0.9678714859437751

In [40]:
confusion_matrix(y_train, y_pred_rf_train)

array([[307,   0],
       [ 16, 175]])

In [41]:
print(classification_report(y_train, y_pred_rf_train))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97       307
           1       1.00      0.92      0.96       191

    accuracy                           0.97       498
   macro avg       0.98      0.96      0.97       498
weighted avg       0.97      0.97      0.97       498



### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [42]:
# If not-survived is our positive case
TP = 285
FP = 68
FN = 22
TN = 123
ALL = TP + FP + FN + TN

accuracy = (TP + TN)/ALL
print(f"Accuracy: {accuracy}")

true_positive_rate = TP/(TP+FN)
print(f"True Positive Rate: {true_positive_rate}")

false_positive_rate = FP/(FP+TN)
print(f"False Positive Rate: {false_positive_rate}")

true_negative_rate = TN/(TN+FP)
print(f"True Negative Rate: {true_negative_rate}")

false_negative_rate = FN/(FN+TP)
print(f"False Negative Rate: {false_negative_rate}")

precision = TP/(TP+FP)
print(f"Precision: {precision}")

recall = TP/(TP+FN)
print(f"Recall: {recall}")

f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score}")

support_pos = TP + FN
print(f"Support (0): {support_pos}")

support_neg = FP + TN
print(f"Support (1): {support_neg}")

Accuracy: 0.8192771084337349
True Positive Rate: 0.9283387622149837
False Positive Rate: 0.35602094240837695
True Negative Rate: 0.643979057591623
False Negative Rate: 0.07166123778501629
Precision: 0.8073654390934845
Recall: 0.9283387622149837
F1 Score: 0.8636363636363636
Support (0): 307
Support (1): 191


### 4. Run through steps increasing your min_samples_leaf and decreasing your max_depth.

In [43]:
max_depth = 16

for i in range(1, max_depth):
    # Create Model
    depth = max_depth - i
    n = i
    forest = RandomForestClassifier(max_depth=depth, min_samples_leaf=n, random_state=123)

    # Fit the model (on train and only train)
    forest = forest.fit(X_train, y_train)

    # Use the model
    # We'll evaluate the model's performance on train, first
    y_pred = forest.predict(X_train)

    # Produce the classification report on the actual y values and this model's predicted y values
    report = classification_report(y_train, y_pred, output_dict=True)
    print(f"Tree with max depth of {i}")
    print(pd.DataFrame(report))
    print()

Tree with max depth of 1
               0      1  accuracy  macro avg  weighted avg
precision    1.0    1.0       1.0        1.0           1.0
recall       1.0    1.0       1.0        1.0           1.0
f1-score     1.0    1.0       1.0        1.0           1.0
support    307.0  191.0       1.0      498.0         498.0

Tree with max depth of 2
                    0           1  accuracy   macro avg  weighted avg
precision    0.912121    0.964286  0.929719    0.938203      0.932128
recall       0.980456    0.848168  0.929719    0.914312      0.929719
f1-score     0.945055    0.902507  0.929719    0.923781      0.928736
support    307.000000  191.000000  0.929719  498.000000    498.000000

Tree with max depth of 3
                    0           1  accuracy   macro avg  weighted avg
precision    0.898507    0.963190  0.919679    0.930849      0.923315
recall       0.980456    0.821990  0.919679    0.901223      0.919679
f1-score     0.937695    0.887006  0.919679    0.912350      0.91825

In [44]:
metrics = []
max_depth = 16

for i in range(1, max_depth):
    # Create model
    depth = max_depth - i
    n = i
    forest = RandomForestClassifier(max_depth=depth, min_samples_leaf=n, random_state=123)

    # Fit the model (on train and only train)
    forest = forest.fit(X_train, y_train)

    # Use the model
    # We'll evaluate the model's performance on train, first
    in_sample_accuracy = forest.score(X_train, y_train)
    
    out_of_sample_accuracy = forest.score(x_validate, y_validate)

    output = {
        "min_samples_per_leaf": n,
        "max_depth": depth,
        "train_accuracy": in_sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy
    }
    
    metrics.append(output)
    
df = pd.DataFrame(metrics)
df["difference"] = df.train_accuracy - df.validate_accuracy
df

Unnamed: 0,min_samples_per_leaf,max_depth,train_accuracy,validate_accuracy,difference
0,1,15,1.0,0.771028,0.228972
1,2,14,0.929719,0.775701,0.154018
2,3,13,0.919679,0.78972,0.129959
3,4,12,0.87751,0.799065,0.078445
4,5,11,0.873494,0.78972,0.083774
5,6,10,0.863454,0.785047,0.078407
6,7,9,0.853414,0.785047,0.068367
7,8,8,0.851406,0.785047,0.066359
8,9,7,0.841365,0.775701,0.065665
9,10,6,0.833333,0.780374,0.05296


### 5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

In [45]:
metrics = []
max_depth = 16

for i in range(1, max_depth):
    # Create model
    depth = max_depth - i
    n = i
    forest = RandomForestClassifier(max_depth=depth, min_samples_leaf=n, random_state=123)

    # Fit the model (on train and only train)
    forest = forest.fit(X_train, y_train)

    # Use the model
    # We'll evaluate the model's performance on train, first
    in_sample_accuracy = forest.score(X_train, y_train)
    
    out_of_sample_accuracy = forest.score(x_validate, y_validate)

    output = {
        "min_samples_per_leaf": n,
        "max_depth": depth,
        "train_accuracy": in_sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy
    }
    
    metrics.append(output)
    
df = pd.DataFrame(metrics)
df["difference"] = df.train_accuracy - df.validate_accuracy
df

Unnamed: 0,min_samples_per_leaf,max_depth,train_accuracy,validate_accuracy,difference
0,1,15,1.0,0.771028,0.228972
1,2,14,0.929719,0.775701,0.154018
2,3,13,0.919679,0.78972,0.129959
3,4,12,0.87751,0.799065,0.078445
4,5,11,0.873494,0.78972,0.083774
5,6,10,0.863454,0.785047,0.078407
6,7,9,0.853414,0.785047,0.068367
7,8,8,0.851406,0.785047,0.066359
8,9,7,0.841365,0.775701,0.065665
9,10,6,0.833333,0.780374,0.05296


### After making a few models, which one has the best performance (or closest metrics) on both train and validate?



#### min_samples_per_leaf	max_depth	train_accuracy	validate_accuracy	difference
#### 4	                    12	        0.877510	    0.799065	        0.078445


## Exercises - KNN

### Continue working in your model file with the titanic dataset.

### 1. Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [46]:
knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
knn.fit(X_train, y_train)



KNeighborsClassifier(n_jobs=-1)

In [47]:
y_pred_knn_train = knn.predict(X_train)

### 2. Evaluate your results using the model score, confusion matrix, and classification report.

In [48]:
knn.score(X_train, y_train)

0.7409638554216867

In [49]:
confusion_matrix(y_train, y_pred_knn_train)

array([[270,  37],
       [ 92,  99]])

In [50]:

print(classification_report(y_train, y_pred_knn_train))

              precision    recall  f1-score   support

           0       0.75      0.88      0.81       307
           1       0.73      0.52      0.61       191

    accuracy                           0.74       498
   macro avg       0.74      0.70      0.71       498
weighted avg       0.74      0.74      0.73       498



### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [51]:
# If not-survived is our positive case
TP = 270
FP = 37
FN = 92
TN = 99
ALL = TP + FP + FN + TN

accuracy = (TP + TN)/ALL
print(f"Accuracy: {accuracy}")

true_positive_rate = TP/(TP+FN)
print(f"True Positive Rate: {true_positive_rate}")

false_positive_rate = FP/(FP+TN)
print(f"False Positive Rate: {false_positive_rate}")

true_negative_rate = TN/(TN+FP)
print(f"True Negative Rate: {true_negative_rate}")

false_negative_rate = FN/(FN+TP)
print(f"False Negative Rate: {false_negative_rate}")

precision = TP/(TP+FP)
print(f"Precision: {precision}")

recall = TP/(TP+FN)
print(f"Recall: {recall}")

f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score}")

support_pos = TP + FN
print(f"Support (0): {support_pos}")

support_neg = FP + TN
print(f"Support (1): {support_neg}")

Accuracy: 0.7409638554216867
True Positive Rate: 0.7458563535911602
False Positive Rate: 0.27205882352941174
True Negative Rate: 0.7279411764705882
False Negative Rate: 0.2541436464088398
Precision: 0.8794788273615635
Recall: 0.7458563535911602
F1 Score: 0.8071748878923767
Support (0): 362
Support (1): 136


In [52]:
support = train["survived"].value_counts()
support

0    307
1    191
Name: survived, dtype: int64

### 4. Run through steps 2-4 setting k to 10

In [53]:
knn1 = KNeighborsClassifier(n_neighbors=10, n_jobs=-1)
knn1.fit(X_train, y_train)

KNeighborsClassifier(n_jobs=-1, n_neighbors=10)

In [54]:
y_pred_knn1_train = knn1.predict(X_train)

In [55]:

knn1.score(X_train, y_train)

0.7068273092369478

In [56]:
confusion_matrix(y_train, y_pred_knn1_train)

array([[282,  25],
       [121,  70]])

In [57]:
print(classification_report(y_train, y_pred_knn1_train))

              precision    recall  f1-score   support

           0       0.70      0.92      0.79       307
           1       0.74      0.37      0.49       191

    accuracy                           0.71       498
   macro avg       0.72      0.64      0.64       498
weighted avg       0.71      0.71      0.68       498



### 5. Run through setps 2-4 setting k to 20

In [58]:
knn2 = KNeighborsClassifier(n_neighbors=20, n_jobs=-1)
knn2.fit(X_train, y_train)

KNeighborsClassifier(n_jobs=-1, n_neighbors=20)

In [59]:
y_pred_knn2_train = knn2.predict(X_train)

In [60]:
knn2.score(X_train, y_train)

0.6767068273092369

In [61]:
confusion_matrix(y_train, y_pred_knn2_train)

array([[283,  24],
       [137,  54]])

In [62]:
print(classification_report(y_train, y_pred_knn2_train))

              precision    recall  f1-score   support

           0       0.67      0.92      0.78       307
           1       0.69      0.28      0.40       191

    accuracy                           0.68       498
   macro avg       0.68      0.60      0.59       498
weighted avg       0.68      0.68      0.63       498



### 6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?



I noticed K-10 & k-11 performs better as they are on the border lines on the close to overfitting.

### 7. Which model performs best on our out-of-sample data from validate?

In [63]:
metrics = []

# loop through different values of k
for k in range(1, 21):
            
    # define the thing
    knn = KNeighborsClassifier(n_neighbors=k)
    
    # fit the thing (remmeber only fit on training data)
    knn.fit(X_train, y_train)
    
    # use the thing (calculate accuracy)
    train_accuracy = knn.score(X_train, y_train)
    validate_accuracy = knn.score(x_validate, y_validate)
    
    output = {
        "k": k,
        "train_accuracy": train_accuracy,
        "validate_accuracy": validate_accuracy
    }
    
    metrics.append(output)


df = pd.DataFrame(metrics)
df["difference"] = df.train_accuracy - df.validate_accuracy
df

Unnamed: 0,k,train_accuracy,validate_accuracy,difference
0,1,1.0,0.560748,0.439252
1,2,0.785141,0.607477,0.177664
2,3,0.811245,0.616822,0.194423
3,4,0.73494,0.593458,0.141482
4,5,0.740964,0.570093,0.17087
5,6,0.708835,0.593458,0.115377
6,7,0.718876,0.593458,0.125418
7,8,0.706827,0.626168,0.080659
8,9,0.722892,0.607477,0.115415
9,10,0.706827,0.598131,0.108696


K - 20 performed best since it is not overfitting to the train data.

## Exercises - Logistic Regression

### In these exercises, we'll continue working with the titanic dataset and building logistic regression models. Throughout this exercise, be sure you are training, evaluation, and comparing models on the train and validate datasets. The test dataset should only be used for your final model.

### For all of the models you create, choose a threshold that optimizes for accuracy.

### Do your work for these exercises in either a notebook or a python script named model within your classification-exercises repository. Add, commit, and push your work.

In [79]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import sklearn.linear_model

# ignore warnings
import warnings
warnings.filterwarnings("ignore")


import matplotlib.pyplot as plt
import seaborn as sns


from acquire import get_titanic_data
from prepare import split_data


### 1. Create a model that includes age in addition to fare and pclass. Does this model perform better than your baseline?

In [80]:
df = get_titanic_data()

In [81]:
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


Prepare data

In [82]:
avg_age = df.age.mean()
df.age = df.age.fillna(avg_age)
df["is_female"] = (df.sex == "female").astype('int')
dummy_df = pd.get_dummies(df[['embark_town']], dummy_na=False, drop_first=True)
df = pd.concat([df, dummy_df], axis=1)
df = df.drop(columns=["passenger_id", "deck", "class", "embarked", "sex", "embark_town"])


In [83]:
df.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,is_female,embark_town_Queenstown,embark_town_Southampton
0,0,3,22.0,1,0,7.25,0,0,0,1
1,1,1,38.0,1,0,71.2833,0,1,0,0
2,1,3,26.0,0,0,7.925,1,1,0,1
3,1,1,35.0,1,0,53.1,0,1,0,1
4,0,3,35.0,0,0,8.05,1,0,0,1


In [84]:
df.isna().sum()

survived                   0
pclass                     0
age                        0
sibsp                      0
parch                      0
fare                       0
alone                      0
is_female                  0
embark_town_Queenstown     0
embark_town_Southampton    0
dtype: int64

In [86]:
logr = LogisticRegression(penalty="l1", max_iter=500, solver="liblinear", random_state=5, C=1.0)
logr.fit(X_train, y_train)
logr.score(X_train, y_train)

0.8152610441767069

### 2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

Sex already included in my model.

### 3. Try out other combinations of features and models.

In [87]:
logr1 = LogisticRegression(penalty="l2", max_iter=500, solver="lbfgs", random_state=5, C=0.5)
logr1.fit(X_train, y_train)
logr1.score(X_train, y_train)
y_pred_lr1_train = logr1.predict(X_train)
print(classification_report(y_train, y_pred_lr1_train))

              precision    recall  f1-score   support

           0       0.83      0.88      0.85       307
           1       0.78      0.70      0.74       191

    accuracy                           0.81       498
   macro avg       0.80      0.79      0.80       498
weighted avg       0.81      0.81      0.81       498



In [89]:
logr2 = LogisticRegression(penalty="l1", max_iter=500, solver="liblinear", random_state=5, C=0.25)
logr2.fit(X_train, y_train)
logr2.score(X_train, y_train)



0.8072289156626506

### 4. Use you best 3 models to predict and evaluate on your validate sample.

In [90]:
y_pred_lr_val = logr.predict(X_val)
print(classification_report(y_val, y_pred_lr_val))

NameError: name 'X_val' is not defined

### 5. Choose your best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?


In [92]:
y_pred_lr1_test = logr1.predict(X_test)
print(classification_report(y_test, y_pred_lr1_test))

              precision    recall  f1-score   support

           0       0.81      0.87      0.84       110
           1       0.77      0.68      0.72        69

    accuracy                           0.80       179
   macro avg       0.79      0.78      0.78       179
weighted avg       0.80      0.80      0.80       179



 It is slightly worse than validate, but roughly same as train.