# Modeling

## Decision Tree Exercises

In [21]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.impute import SimpleImputer

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import acquire as aq

### Titanic Data Set

In [2]:
titanic_df = aq.get_titanic_data()
titanic_df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [3]:
cols_to_drop = ['deck', 'embarked', 'class']
titanic_df = titanic_df.drop(columns=cols_to_drop)
titanic_df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,Southampton,1


In [4]:
dummy_df = pd.get_dummies(titanic_df[['sex','embark_town']], dummy_na=False)
dummy_df.head()

Unnamed: 0,sex_female,sex_male,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
0,0,1,0,0,1
1,1,0,1,0,0
2,1,0,0,0,1
3,1,0,0,0,1
4,0,1,0,0,1


In [5]:
titanic_df = pd.concat([titanic_df, dummy_df], axis=1)
titanic_df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone,sex_female,sex_male,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
0,0,0,3,male,22.0,1,0,7.25,Southampton,0,0,1,0,0,1
1,1,1,1,female,38.0,1,0,71.2833,Cherbourg,0,1,0,1,0,0
2,2,1,3,female,26.0,0,0,7.925,Southampton,1,1,0,0,0,1
3,3,1,1,female,35.0,1,0,53.1,Southampton,0,1,0,0,0,1
4,4,0,3,male,35.0,0,0,8.05,Southampton,1,0,1,0,0,1


In [12]:
cols_to_drop = ['sex', 'embark_town']
titanic_df = titanic_df.drop(columns=cols_to_drop)
titanic_df.head()

Unnamed: 0,passenger_id,survived,pclass,age,sibsp,parch,fare,alone,sex_female,sex_male,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
0,0,0,3,22.0,1,0,7.25,0,0,1,0,0,1
1,1,1,1,38.0,1,0,71.2833,0,1,0,1,0,0
2,2,1,3,26.0,0,0,7.925,1,1,0,0,0,1
3,3,1,1,35.0,1,0,53.1,0,1,0,0,0,1
4,4,0,3,35.0,0,0,8.05,1,0,1,0,0,1


In [19]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   passenger_id             891 non-null    int64  
 1   survived                 891 non-null    int64  
 2   pclass                   891 non-null    int64  
 3   age                      714 non-null    float64
 4   sibsp                    891 non-null    int64  
 5   parch                    891 non-null    int64  
 6   fare                     891 non-null    float64
 7   alone                    891 non-null    int64  
 8   sex_female               891 non-null    uint8  
 9   sex_male                 891 non-null    uint8  
 10  embark_town_Cherbourg    891 non-null    uint8  
 11  embark_town_Queenstown   891 non-null    uint8  
 12  embark_town_Southampton  891 non-null    uint8  
dtypes: float64(2), int64(6), uint8(5)
memory usage: 67.0 KB


In [25]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(titanic_df)
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   passenger_id             891 non-null    int64  
 1   survived                 891 non-null    int64  
 2   pclass                   891 non-null    int64  
 3   age                      714 non-null    float64
 4   sibsp                    891 non-null    int64  
 5   parch                    891 non-null    int64  
 6   fare                     891 non-null    float64
 7   alone                    891 non-null    int64  
 8   sex_female               891 non-null    uint8  
 9   sex_male                 891 non-null    uint8  
 10  embark_town_Cherbourg    891 non-null    uint8  
 11  embark_town_Queenstown   891 non-null    uint8  
 12  embark_town_Southampton  891 non-null    uint8  
dtypes: float64(2), int64(6), uint8(5)
memory usage: 67.0 KB


In [26]:
titanic_df.head()

Unnamed: 0,passenger_id,survived,pclass,age,sibsp,parch,fare,alone,sex_female,sex_male,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
0,0,0,3,22.0,1,0,7.25,0,0,1,0,0,1
1,1,1,1,38.0,1,0,71.2833,0,1,0,1,0,0
2,2,1,3,26.0,0,0,7.925,1,1,0,0,0,1
3,3,1,1,35.0,1,0,53.1,0,1,0,0,0,1
4,4,0,3,35.0,0,0,8.05,1,0,1,0,0,1


In [13]:
train, test = train_test_split(titanic_df, test_size=.2, random_state=123, stratify=titanic_df.survived)
train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=train.survived)
print(f'train -> {train.shape}')
print(f'validate -> {validate.shape}')
print(f'test -> {test.shape}')

train -> (498, 13)
validate -> (214, 13)
test -> (179, 13)


In [14]:
X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

#### 1. What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.



In [15]:
titanic_df.survived.value_counts()

0    549
1    342
Name: survived, dtype: int64

In [16]:
549 / (549 + 342)

0.6161616161616161

#### 2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)



In [17]:
clf = DecisionTreeClassifier(max_depth=3, random_state=123)

In [18]:
clf = clf.fit(X_train, y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

#### 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.



#### 4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.



#### 5. Run through steps 2-4 using a different max_depth value.



#### 6. Which model performs better on your in-sample data?



#### 7. Which model performs best on your out-of-sample data, the validate set?