### Use the titanic data

In [1]:
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

import acquire
import prepare

### Acquire the data

In [2]:
#Load the Titanic data through the project-local acquire module and preview the first rows
titanic = acquire.get_titanic_data()
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


### Prepare the data

In [3]:
#Run the raw frame through prepare.prep_titanic and preview the result.
#NOTE(review): this rebinds `titanic`, so re-running the cell feeds the already-prepared
#frame back into prep_titanic -- re-run the acquire cell first to start from the raw data.
titanic = prepare.prep_titanic(titanic)
titanic.head()

Unnamed: 0,survived,pclass,num_sib_and_sp,num_par_and_ch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,1,0,7.25,0,1,0,1
1,1,1,1,0,71.2833,0,0,0,0
2,1,3,0,0,7.925,1,0,0,1
3,1,1,1,0,53.1,0,0,0,1
4,0,3,0,0,8.05,1,1,0,1


In [4]:
#Split the data into train, validate, and test subsets
#NOTE(review): 'survived' is presumably the column the split is stratified on -- confirm in prepare.py
train, validate, test = prepare.train_validate_test_split(titanic, 'survived')

In [5]:
#Separate the features (X) from the target (y) in each subset
X_train = train.drop(columns = 'survived')
y_train = train['survived']

X_validate = validate.drop(columns = 'survived')
y_validate = validate['survived']

X_test = test.drop(columns = 'survived')
y_test = test['survived']

### What is your baseline prediction? What is your baseline accuracy? Remember: your baseline prediction for a classification problem is predicting the most prevalent class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [6]:
#The target is `survived`; count each class in the training data to find the most common one (the baseline)
train['survived'].value_counts()

0    307
1    191
Name: survived, dtype: int64

In [7]:
#Most passengers in the training data did not survive, so the baseline always predicts 0.
#strategy = 'most_frequent' learns that class from y_train during fit instead of having it
#typed in by hand, so the baseline stays correct if the data or the split changes.
baseline = DummyClassifier(strategy = 'most_frequent')
baseline.fit(X_train, y_train)

#Now get the baseline accuracy (scored on the validate set)
baseline.score(X_validate, y_validate)

0.616822429906542

### Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [26]:
#Build a decision tree limited to depth 5 and train it on the training sample
#(fit returns the fitted estimator, so construction and fitting can be chained)
model1 = DecisionTreeClassifier(max_depth = 5, random_state = 123).fit(X_train, y_train)

#In-sample predictions
model1_preds = model1.predict(X_train)

### Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [9]:
#Model score: accuracy of model1 on the training sample
model1.score(X_train, y_train)

0.8373493975903614

In [27]:
#Confusion matrix for model1's in-sample predictions
#index = actual
#columns = predictions
pd.DataFrame(confusion_matrix(y_true = y_train, y_pred = model1_preds))

Unnamed: 0,0,1
0,303,4
1,77,114


In [28]:
#Classification report for model1's in-sample predictions
#(the report is a multi-line string, so it is printed rather than displayed)
print(classification_report(y_true = y_train, y_pred = model1_preds))

              precision    recall  f1-score   support

           0       0.80      0.99      0.88       307
           1       0.97      0.60      0.74       191

    accuracy                           0.84       498
   macro avg       0.88      0.79      0.81       498
weighted avg       0.86      0.84      0.83       498



### Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [17]:
#Precision, recall, f1-score, and support are listed above.
#Take the four counts from the confusion matrix rather than typing them in by hand;
#for binary labels, ravel() returns them in the order tn, fp, fn, tp.
#Predictions are recomputed here so the counts always describe the training sample,
#no matter which cell last assigned model1_preds.
tn_1, fp_1, fn_1, tp_1 = confusion_matrix(y_train, model1.predict(X_train)).ravel().tolist()

#Remaining metrics asked for above
pd.Series({
    'accuracy': (tp_1 + tn_1) / (tp_1 + tn_1 + fp_1 + fn_1),
    'true positive rate': tp_1 / (tp_1 + fn_1),
    'false positive rate': fp_1 / (fp_1 + tn_1),
    'true negative rate': tn_1 / (tn_1 + fp_1),
    'false negative rate': fn_1 / (fn_1 + tp_1),
})

### Run through steps 2-4 using a different max_depth value.

In [29]:
#Same recipe as model1 but with a deeper tree (max_depth = 9), trained on the training sample
#(fit returns the fitted estimator, so construction and fitting can be chained)
model2 = DecisionTreeClassifier(max_depth = 9, random_state = 123).fit(X_train, y_train)

#In-sample predictions
model2_preds = model2.predict(X_train)

In [30]:
#Model score: accuracy of model2 on the training sample
model2.score(X_train, y_train)

0.9096385542168675

In [21]:
#Confusion matrix for model2's in-sample predictions
#index = actual
#columns = predictions
pd.DataFrame(confusion_matrix(y_true = y_train, y_pred = model2_preds))

Unnamed: 0,0,1
0,298,9
1,36,155


In [22]:
#Classification report for model2's in-sample predictions
#(the report is a multi-line string, so it is printed rather than displayed)
print(classification_report(y_true = y_train, y_pred = model2_preds))

              precision    recall  f1-score   support

           0       0.89      0.97      0.93       307
           1       0.95      0.81      0.87       191

    accuracy                           0.91       498
   macro avg       0.92      0.89      0.90       498
weighted avg       0.91      0.91      0.91       498



In [23]:
#Calculate tp, tn, fp, fn
#Take the four counts from the confusion matrix rather than typing them in by hand;
#for binary labels, ravel() returns them in the order tn, fp, fn, tp.
#Predictions are recomputed here so the counts always describe the training sample,
#no matter which cell last assigned model2_preds.
tn, fp, fn, tp = confusion_matrix(y_train, model2.predict(X_train)).ravel().tolist()

#Accuracy and the four rates for model2 on the training sample
pd.Series({
    'accuracy': (tp + tn) / (tp + tn + fp + fn),
    'true positive rate': tp / (tp + fn),
    'false positive rate': fp / (fp + tn),
    'true negative rate': tn / (tn + fp),
    'false negative rate': fn / (fn + tp),
})

### Which model performs better on your in-sample data?

model2 (max_depth = 9) performs better on the in-sample data: its training accuracy is about 91%, versus about 84% for model1 (max_depth = 5).

### Which model performs best on your out-of-sample data, the validate set?

In [24]:
#Compare overall accuracy on the validate (out-of-sample) set

#For model1
model1.score(X_validate, y_validate)

0.7570093457943925

In [25]:
#For model2: overall accuracy on the validate set
model2.score(X_validate, y_validate)

0.7616822429906542

In [31]:
#Check classification reports on the validate set

#For model1
#NOTE(review): this rebinds model1_preds, which held the training-sample predictions until now;
#cells above that pair model1_preds with y_train must not be re-run after this one.
model1_preds = model1.predict(X_validate)
print(classification_report(y_true = y_validate, y_pred = model1_preds))

              precision    recall  f1-score   support

           0       0.74      0.94      0.83       132
           1       0.83      0.46      0.59        82

    accuracy                           0.76       214
   macro avg       0.78      0.70      0.71       214
weighted avg       0.77      0.76      0.74       214



In [32]:
#For model2
#NOTE(review): this rebinds model2_preds, which held the training-sample predictions until now;
#cells above that pair model2_preds with y_train must not be re-run after this one.
model2_preds = model2.predict(X_validate)
print(classification_report(y_true = y_validate, y_pred = model2_preds))

              precision    recall  f1-score   support

           0       0.78      0.86      0.82       132
           1       0.72      0.61      0.66        82

    accuracy                           0.76       214
   macro avg       0.75      0.73      0.74       214
weighted avg       0.76      0.76      0.76       214



In [33]:
#Check confusion matrices on the validate set
#index = actual
#columns = predictions

#For model1 (model1_preds holds the validate predictions at this point)
pd.DataFrame(confusion_matrix(y_true = y_validate, y_pred = model1_preds))

Unnamed: 0,0,1
0,124,8
1,44,38


In [34]:
#For model2 (model2_preds holds the validate predictions at this point)
#index = actual
#columns = predictions
pd.DataFrame(confusion_matrix(y_true = y_validate, y_pred = model2_preds))

Unnamed: 0,0,1
0,113,19
1,32,50


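While accuracy is about the same for both models (roughly 76% on the validate set), their precision and recall for survivor predictions were quite different: model1 has the higher precision (0.83 vs. 0.72), while model2 has the higher recall (0.61 vs. 0.46). Based on these numbers, and assuming it's more important to identify those who survived than those who did not, model2 seems to be the better performer. Note that both models score noticeably lower here than on the training data (84% and 91%), which suggests some overfitting, especially for model2.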