In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings("ignore")

from acquire import get_titanic_data
from prepare import prep_titanic

In [29]:
# Using the titanic data, in your classification-exercises repository, create a notebook, model.ipynb where you will do the following:
df = get_titanic_data()
df = prep_titanic(df)

In [30]:
df

Unnamed: 0.1,sex_male,embark_town_Queenstown,embark_town_Southampton,Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone
0,1,0,1,0,0,3,male,1,0,7.2500,Southampton,0
1,0,0,0,1,1,1,female,1,0,71.2833,Cherbourg,0
2,0,0,1,2,1,3,female,0,0,7.9250,Southampton,1
3,0,0,1,3,1,1,female,1,0,53.1000,Southampton,0
4,1,0,1,4,0,3,male,0,0,8.0500,Southampton,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,1,0,1,886,0,2,male,0,0,13.0000,Southampton,1
887,0,0,1,887,1,1,female,0,0,30.0000,Southampton,1
888,0,0,1,888,0,3,female,1,2,23.4500,Southampton,0
889,1,0,0,889,1,1,male,0,0,30.0000,Cherbourg,1


In [37]:
# What is your baseline prediction? What is your baseline accuracy? 
# remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode).
#  When you make those predictions, what is your accuracy? This is your baseline accuracy.

train_validate, test = train_test_split(df, test_size=.2, 
                                            random_state=123, 
                                            stratify=df.survived)
train, validate = train_test_split(train_validate, test_size=.3,
                                                   random_state=123,
                                                   stratify=train_validate.survived)
train.shape, validate.shape, test.shape


((498, 12), (214, 12), (179, 12))

In [39]:
# drop non-numerical columns
drops = ['sex', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton', 'Unnamed: 0']
[dset.drop(columns=drops, inplace=True) for dset in [train, validate, test]]


[None, None, None]

In [40]:
train.head()

Unnamed: 0,sex_male,survived,pclass,sibsp,parch,fare,alone
583,1,0,1,0,0,40.125,1
165,1,1,3,0,2,20.525,0
50,1,0,3,4,1,39.6875,0
259,0,1,2,0,1,26.0,0
306,0,1,1,0,0,110.8833,1


In [76]:
validate.head()

Unnamed: 0,sex_male,survived,pclass,sibsp,parch,fare,alone
610,0,0,3,1,5,31.275,0
424,1,0,3,1,1,20.2125,0
568,1,0,3,0,0,7.2292,1
334,0,1,1,1,0,133.65,0
101,1,0,3,0,0,7.8958,1


In [41]:
# baseline prediction -- not survived(0)
train.survived.value_counts()


0    307
1    191
Name: survived, dtype: int64

In [42]:
# baseline accuracy is 61.6%
(train.survived == 0).mean()

0.6164658634538153

In [79]:
# Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)
clf = DecisionTreeClassifier()

In [80]:
#Setting targets
X_train, y_train = train.drop(columns='survived'), train.survived
X_validate, y_validate = validate.drop(columns='survived'), validate.survived
X_test, y_test = test.drop(columns='survived'), test.survived

In [81]:
clf = clf.fit(X_train, y_train)


In [82]:
# Evaluate your in-sample results using the model score, confusion matrix, and classification report.
#Model Score
print("Model Score:", clf.score(X_train, y_train))
y_pred = clf.predict(X_train)

Model Score: 0.9417670682730924


In [83]:
#Confusion Matrix
print("Confusion Matrix")
conf = confusion_matrix(y_train, y_pred)
conf


Confusion Matrix


array([[303,   4],
       [ 25, 166]])

In [84]:
#Classification Report
print("Classification Report")
report = pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))
print(report)

Classification Report
                    0           1  accuracy   macro avg  weighted avg
precision    0.923780    0.976471  0.941767    0.950126      0.943989
recall       0.986971    0.869110  0.941767    0.928040      0.941767
f1-score     0.954331    0.919668  0.941767    0.936999      0.941036
support    307.000000  191.000000  0.941767  498.000000    498.000000


In [85]:
# Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print (f'true positive rate: {tpr:.3} \nfalse positive rate: {fpr: .3} \ntrue negative rate: {tnr: .3} \nfalse negative rate; {fnr: .3}')


true positive rate: 0.869 
false positive rate:  0.013 
true negative rate:  0.987 
false negative rate;  0.131


In [86]:
# Run through steps 2-4 using a different max_depth value.
clf1 = DecisionTreeClassifier(max_depth=3)
clf1 = clf1.fit(X_train, y_train)

print("Score:", clf1.score(X_train, y_train))
y_pred_1 = clf1.predict(X_train)

print("Confusion Matrix")
conf = confusion_matrix(y_train, y_pred_1)
print(conf)

print("Classification Report")
report = pd.DataFrame(classification_report(y_train, y_pred_1, output_dict=True))
print(report)

Score: 0.8232931726907631
Confusion Matrix
[[276  31]
 [ 57 134]]
Classification Report
                    0           1  accuracy   macro avg  weighted avg
precision    0.828829    0.812121  0.823293    0.820475      0.822421
recall       0.899023    0.701571  0.823293    0.800297      0.823293
f1-score     0.862500    0.752809  0.823293    0.807654      0.820430
support    307.000000  191.000000  0.823293  498.000000    498.000000


In [None]:
# Which model performs better on your in-sample data?
# Model 1 performs better on the in-sample data with accuracy of 94.18%. Model 2's accuracy score is 82.33% with depth level of 3

In [91]:
# Which model performs best on your out-of-sample data, the validate set?
# model 1
y_val_pred = clf.predict(X_validate)
# model 2
y_val_pred_1 = clf1.predict(X_validate)

0.32710280373831774

In [88]:
val_score = clf.score(X_validate, y_validate)
val_score_1 = clf1.score(X_validate, y_validate)
val_score, val_score_1
# there is a drop in model 1. This suggests over-fit in the training dataset, and the model 2 is higher than model 1.

(0.7757009345794392, 0.7850467289719626)

In [None]:
# Work through these same exercises using the Telco dataset.
# Experiment with this model on other datasets with a higher number of output classes.

In [None]:
# RANDOM FOREST
# Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) 
# setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.
rf = RandomForestClassifier()
# Evaluate your results using the model score, confusion matrix, and classification report.

# Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate,
#  false negative rate, precision, recall, f1-score, and support.

# Run through steps increasing your min_samples_leaf and decreasing your max_depth.

# What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

# After making a few models, which one has the best performance (or closest metrics) on both train and validate?

In [None]:
# Continue working in your model file with the titanic dataset.

# Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

# Evaluate your results using the model score, confusion matrix, and classification report.

# Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, 
# precision, recall, f1-score, and support.

# Run through steps 2-4 setting k to 10

# Run through setps 2-4 setting k to 20

# What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

# Which model performs best on our out-of-sample data from validate?

In [None]:
# In these exercises, we'll continue working with the titanic dataset and building logistic regression models. 
# Throughout this exercise, be sure you are training, evaluation, and comparing models on the train and validate datasets. 
# The test dataset should only be used for your final model.

# For all of the models you create, choose a threshold that optimizes for accuracy.

# Do your work for these exercises in either a notebook or a python script named model within your classification-exercises repository. 
# Add, commit, and push your work.

# Create a model that includes age in addition to fare and pclass. Does this model perform better than your baseline?

# Include sex in your model as well. 
# Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

# Try out other combinations of features and models.

# Use you best 3 models to predict and evaluate on your validate sample.

# Choose you best model from the validation performation, and evaluate it on the test dataset. 
# How do the performance metrics compare to validate? to train?