In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings("ignore")

from acquire import get_titanic_data
from prepare import prep_titanic

In [3]:
# Using the titanic data, in your classification-exercises repository, create a notebook, model.ipynb where you will do the following:
df = get_titanic_data()
df = prep_titanic(df)

In [4]:
df

Unnamed: 0.1,sex_male,embark_town_Queenstown,embark_town_Southampton,Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone
0,1,0,1,0,0,3,male,1,0,7.2500,Southampton,0
1,0,0,0,1,1,1,female,1,0,71.2833,Cherbourg,0
2,0,0,1,2,1,3,female,0,0,7.9250,Southampton,1
3,0,0,1,3,1,1,female,1,0,53.1000,Southampton,0
4,1,0,1,4,0,3,male,0,0,8.0500,Southampton,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,1,0,1,886,0,2,male,0,0,13.0000,Southampton,1
887,0,0,1,887,1,1,female,0,0,30.0000,Southampton,1
888,0,0,1,888,0,3,female,1,2,23.4500,Southampton,0
889,1,0,0,889,1,1,male,0,0,30.0000,Cherbourg,1


In [5]:
# What is your baseline prediction? What is your baseline accuracy? 
# remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode).
#  When you make those predictions, what is your accuracy? This is your baseline accuracy.

train_validate, test = train_test_split(df, test_size=.2, 
                                            random_state=123, 
                                            stratify=df.survived)
train, validate = train_test_split(train_validate, test_size=.3,
                                                   random_state=123,
                                                   stratify=train_validate.survived)
train.shape, validate.shape, test.shape


((498, 12), (214, 12), (179, 12))

In [6]:
# drop non-numerical columns
drops = ['sex', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton', 'Unnamed: 0']
[dset.drop(columns=drops, inplace=True) for dset in [train, validate, test]]


[None, None, None]

In [7]:
train.head()

Unnamed: 0,sex_male,survived,pclass,sibsp,parch,fare,alone
583,1,0,1,0,0,40.125,1
165,1,1,3,0,2,20.525,0
50,1,0,3,4,1,39.6875,0
259,0,1,2,0,1,26.0,0
306,0,1,1,0,0,110.8833,1


In [8]:
validate.head()

Unnamed: 0,sex_male,survived,pclass,sibsp,parch,fare,alone
610,0,0,3,1,5,31.275,0
424,1,0,3,1,1,20.2125,0
568,1,0,3,0,0,7.2292,1
334,0,1,1,1,0,133.65,0
101,1,0,3,0,0,7.8958,1


In [9]:
# baseline prediction -- not survived(0)
train.survived.value_counts()


0    307
1    191
Name: survived, dtype: int64

In [10]:
# baseline accuracy is 61.6%
(train.survived == 0).mean()

0.6164658634538153

In [11]:
# Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)
clf = DecisionTreeClassifier()

In [12]:
#Setting targets
X_train, y_train = train.drop(columns='survived'), train.survived
X_validate, y_validate = validate.drop(columns='survived'), validate.survived
X_test, y_test = test.drop(columns='survived'), test.survived

In [13]:
clf = clf.fit(X_train, y_train)


In [14]:
# Evaluate your in-sample results using the model score, confusion matrix, and classification report.
#Model Score
print("Model Score:", clf.score(X_train, y_train))
y_pred = clf.predict(X_train)

Model Score: 0.9417670682730924


In [15]:
#Confusion Matrix
print("Confusion Matrix")
conf = confusion_matrix(y_train, y_pred)
conf


Confusion Matrix


array([[303,   4],
       [ 25, 166]])

In [16]:
#Classification Report
print("Classification Report")
report = pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))
print(report)

Classification Report
                    0           1  accuracy   macro avg  weighted avg
precision    0.923780    0.976471  0.941767    0.950126      0.943989
recall       0.986971    0.869110  0.941767    0.928040      0.941767
f1-score     0.954331    0.919668  0.941767    0.936999      0.941036
support    307.000000  191.000000  0.941767  498.000000    498.000000


In [17]:
# Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()
print (f'true positive rate: {tpr:.3} \nfalse positive rate: {fpr: .3} \ntrue negative rate: {tnr: .3} \nfalse negative rate; {fnr: .3}')


true positive rate: 0.869 
false positive rate:  0.013 
true negative rate:  0.987 
false negative rate;  0.131


In [18]:
# Run through steps 2-4 using a different max_depth value.
clf1 = DecisionTreeClassifier(max_depth=3)
clf1 = clf1.fit(X_train, y_train)

print("Score:", clf1.score(X_train, y_train))
y_pred_1 = clf1.predict(X_train)

print("Confusion Matrix")
conf = confusion_matrix(y_train, y_pred_1)
print(conf)

print("Classification Report")
report = pd.DataFrame(classification_report(y_train, y_pred_1, output_dict=True))
print(report)

Score: 0.8232931726907631
Confusion Matrix
[[276  31]
 [ 57 134]]
Classification Report
                    0           1  accuracy   macro avg  weighted avg
precision    0.828829    0.812121  0.823293    0.820475      0.822421
recall       0.899023    0.701571  0.823293    0.800297      0.823293
f1-score     0.862500    0.752809  0.823293    0.807654      0.820430
support    307.000000  191.000000  0.823293  498.000000    498.000000


In [19]:
# Which model performs better on your in-sample data?
# Model 1 performs better on the in-sample data with accuracy of 94.18%. Model 2's accuracy score is 82.33% with depth level of 3

In [20]:
# Which model performs best on your out-of-sample data, the validate set?
# model 1
y_val_pred = clf.predict(X_validate)
# model 2
y_val_pred_1 = clf1.predict(X_validate)

In [21]:
val_score = clf.score(X_validate, y_validate)
val_score_1 = clf1.score(X_validate, y_validate)
val_score, val_score_1
# there is a drop in model 1. This suggests over-fit in the training dataset, and the model 2 is higher than model 1.

(0.7850467289719626, 0.7850467289719626)

In [22]:
# Work through these same exercises using the Telco dataset.
# Experiment with this model on other datasets with a higher number of output classes.

In [23]:
# RANDOM FOREST
# Continue working in your model file with titanic data to do the following:

# Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) 
# setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.
rf = RandomForestClassifier(max_depth=10, min_samples_leaf=1, random_state=123)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_train)


In [24]:
# Evaluate your results using the model score, confusion matrix, and classification report.
print('Model Score')
rf_score = rf.score(X_train, y_train)
rf_score

Model Score


0.9397590361445783

In [25]:
print('Confusion Matrix')
conf = confusion_matrix(y_train, y_pred)
conf

Confusion Matrix


array([[301,   6],
       [ 24, 167]])

In [26]:
report = pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))

In [27]:
# Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate,
#  false negative rate, precision, recall, f1-score, and support.
tpr = conf[1][1] / conf[1].sum()
fpr = conf[0][1] / conf[0].sum()
tnr = conf[0][0] / conf[0].sum()
fnr = conf[1][0] / conf[1].sum()


In [28]:
print(f'The accuracy for the model is {rf_score:.4}')
print (f'true positive rate: {tpr:.3} \nfalse positive rate: {fpr: .3} \ntrue negative rate: {tnr: .3} \nfalse negative rate; {fnr: .3}')
report


The accuracy for the model is 0.9398
true positive rate: 0.874 
false positive rate:  0.0195 
true negative rate:  0.98 
false negative rate;  0.126


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.926154,0.965318,0.939759,0.945736,0.941175
recall,0.980456,0.874346,0.939759,0.927401,0.939759
f1-score,0.952532,0.917582,0.939759,0.935057,0.939127
support,307.0,191.0,0.939759,498.0,498.0


In [29]:
# Run through steps increasing your min_samples_leaf and decreasing your max_depth.
rf1 = RandomForestClassifier(min_samples_leaf=3, max_depth=3, random_state=123)
rf1.fit(X_train, y_train)
y_pred1 = rf1.predict(X_train)

In [30]:
print('Model_1 Score')
rf_score1 = rf1.score(X_train, y_train)
rf_score1

Model_1 Score


0.8293172690763052

In [31]:
print('Confusion Matrix')
conf1 = confusion_matrix(y_train, y_pred1)
conf1

Confusion Matrix


array([[284,  23],
       [ 62, 129]])

In [32]:
report1 = pd.DataFrame(classification_report(y_train, y_pred1, output_dict=True))

In [33]:
tpr = conf1[1][1] / conf1[1].sum()
fpr = conf1[0][1] / conf1[0].sum()
tnr = conf1[0][0] / conf1[0].sum()
fnr = conf1[1][0] / conf1[1].sum()
print(f'The accuracy for the model_1 is {rf_score1:.4}')
print (f'true positive rate: {tpr:.3} \nfalse positive rate: {fpr: .3} \ntrue negative rate: {tnr: .3} \nfalse negative rate; {fnr: .3}')
report1

The accuracy for the model_1 is 0.8293
true positive rate: 0.675 
false positive rate:  0.0749 
true negative rate:  0.925 
false negative rate;  0.325


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.820809,0.848684,0.829317,0.834747,0.8315
recall,0.925081,0.675393,0.829317,0.800237,0.829317
f1-score,0.869832,0.752187,0.829317,0.811009,0.824711
support,307.0,191.0,0.829317,498.0,498.0


In [34]:
# What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?
# model 1 performed worse in terms of accuracy than the first model because it has lower max depth and higher min-sample.
# This is due to lower max depth covering less depth of classfication level and higher min-sample putting limitation on the number of samples,
# making it less flexible to classify the samples.

In [35]:
# After making a few models, which one has the best performance (or closest metrics) on both train and validate?
print("Model in-sample score:", rf.score(X_train, y_train))
print("Model out-of-sample score:", rf.score(X_validate, y_validate))

print("Model 1 in-sample score:", rf1.score(X_train, y_train))
print("Model 1 out-of-sample score:", rf1.score(X_validate, y_validate))
#The out of sample accuracy for both models are identical while the model 1 performed worse than than the model on in -sample accuracy.

Model in-sample score: 0.9397590361445783
Model out-of-sample score: 0.794392523364486
Model 1 in-sample score: 0.8293172690763052
Model 1 out-of-sample score: 0.794392523364486


In [36]:
# Continue working in your model file with the titanic dataset.

# Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_train)

In [37]:
# Evaluate your results using the model score, confusion matrix, and classification report.
print('KNN accuracy')
score = knn.score(X_train, y_train)
score

KNN accuracy


0.8052208835341366

In [38]:
print('Confusion Matrix')
knn_conf = confusion_matrix(y_train, y_pred)
knn_conf

Confusion Matrix


array([[259,  48],
       [ 49, 142]])

In [39]:
print(pd.DataFrame(classification_report(y_train, y_pred, output_dict=True)))

                    0           1  accuracy   macro avg  weighted avg
precision    0.840909    0.747368  0.805221    0.794139      0.805033
recall       0.843648    0.743455  0.805221    0.793552      0.805221
f1-score     0.842276    0.745407  0.805221    0.793842      0.805124
support    307.000000  191.000000  0.805221  498.000000    498.000000


In [40]:
# Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, 
# precision, recall, f1-score, and support.
tpr = knn_conf[1][1] / knn_conf[1].sum()
fpr = knn_conf[0][1] / knn_conf[0].sum()
tnr = knn_conf[0][0] / knn_conf[0].sum()
fnr = knn_conf[1][0] / knn_conf[1].sum()
print(f'The accuracy Score for the model is {score:.4}')
print (f'true positive rate: {tpr:.3} \nfalse positive rate: {fpr: .3} \ntrue negative rate: {tnr: .3} \nfalse negative rate; {fnr: .3}')

The accuracy Score for the model is 0.8052
true positive rate: 0.743 
false positive rate:  0.156 
true negative rate:  0.844 
false negative rate;  0.257


In [41]:
# Run through steps 2-4 setting k to 10
knn_10 = KNeighborsClassifier(n_neighbors=10)
knn_10 = knn_10.fit(X_train, y_train)
y_pred10 = knn_10.predict(X_train)

print("Accuracy Score:", knn_10.score(X_train, y_train))
print("Confusion Matrix\n", confusion_matrix(y_train, y_pred10))
print(pd.DataFrame(classification_report(y_train, y_pred10, output_dict=True)))


Accuracy Score: 0.7911646586345381
Confusion Matrix
 [[262  45]
 [ 59 132]]
                    0           1  accuracy   macro avg  weighted avg
precision    0.816199    0.745763  0.791165    0.780981      0.789185
recall       0.853420    0.691099  0.791165    0.772260      0.791165
f1-score     0.834395    0.717391  0.791165    0.775893      0.789520
support    307.000000  191.000000  0.791165  498.000000    498.000000


In [42]:
# Run through setps 2-4 setting k to 20
knn_20 = KNeighborsClassifier(n_neighbors=20)
knn_20 = knn_20.fit(X_train, y_train)
y_pred20 = knn_20.predict(X_train)

print("Accuracy Score:", knn_20.score(X_train, y_train))
print("Confusion Matrix\n", confusion_matrix(y_train, y_pred20))
print(pd.DataFrame(classification_report(y_train, y_pred20, output_dict=True)))


Accuracy Score: 0.7409638554216867
Confusion Matrix
 [[256  51]
 [ 78 113]]
                    0           1  accuracy   macro avg  weighted avg
precision    0.766467    0.689024  0.740964    0.727746      0.736765
recall       0.833876    0.591623  0.740964    0.712750      0.740964
f1-score     0.798752    0.636620  0.740964    0.717686      0.736569
support    307.000000  191.000000  0.740964  498.000000    498.000000


In [43]:
# What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?
#  The higher the n-neighbors, the lesser the accuracy score. The first model performs the best as the default n-neighbor is 5.

In [44]:
# Which model performs best on our out-of-sample data from validate?
print("knn in-sample score:", knn.score(X_train, y_train))
print("knn out-of-sample score:", knn.score(X_validate, y_validate), "\n")

print("knn_10 in-sample score:", knn_10.score(X_train, y_train))
print("knn_10 out-of-sample score:", knn_10.score(X_validate, y_validate), "\n")

print("knn_20 in-sample score:", knn_20.score(X_train, y_train))
print("knn_20 out-of-sample score:", knn_20.score(X_validate, y_validate))
# The first model performs best on the out-of-sample data from validate.

knn in-sample score: 0.8052208835341366
knn out-of-sample score: 0.7429906542056075 

knn_10 in-sample score: 0.7911646586345381
knn_10 out-of-sample score: 0.7149532710280374 

knn_20 in-sample score: 0.7409638554216867
knn_20 out-of-sample score: 0.6682242990654206


In [54]:
# In these exercises, we'll continue working with the titanic dataset and building logistic regression models. 
# Throughout this exercise, be sure you are training, evaluation, and comparing models on the train and validate datasets. 
# The test dataset should only be used for your final model.
df = get_titanic_data()


Unnamed: 0,sex_male,survived,pclass,sibsp,parch,fare,alone
583,1,0,1,0,0,40.1250,1
165,1,1,3,0,2,20.5250,0
50,1,0,3,4,1,39.6875,0
259,0,1,2,0,1,26.0000,0
306,0,1,1,0,0,110.8833,1
...,...,...,...,...,...,...,...
313,1,0,3,0,0,7.8958,1
636,1,0,3,0,0,7.9250,1
222,1,0,3,0,0,8.0500,1
485,0,0,3,3,1,25.4667,0


In [None]:
# For all of the models you create, choose a threshold that optimizes for accuracy.
def prep_titanic(df):
    df.drop_duplicates(inplace=True)
    df = df.drop(columns=['deck', 'embarked', 'class','passenger_id'])
    df['embark_town'] = df.embark_town.fillna(value='Southampton')
    cat_col = [col for col in df.columns if df[col].dtypes == 'object']
    dummy_df = pd.get_dummies(df[cat_col], dummy_na=False, drop_first = [True,True])
    df = pd.concat([dummy_df, df], axis =1)
    return df
df = prep_titanic(df)


In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   sex_male                 891 non-null    uint8  
 1   embark_town_Queenstown   891 non-null    uint8  
 2   embark_town_Southampton  891 non-null    uint8  
 3   Unnamed: 0               891 non-null    int64  
 4   survived                 891 non-null    int64  
 5   pclass                   891 non-null    int64  
 6   sex                      891 non-null    object 
 7   age                      714 non-null    float64
 8   sibsp                    891 non-null    int64  
 9   parch                    891 non-null    int64  
 10  fare                     891 non-null    float64
 11  embark_town              891 non-null    object 
 12  alone                    891 non-null    int64  
dtypes: float64(2), int64(6), object(2), uint8(3)
memory usage: 79.2+ KB


In [74]:
df = df[df.age.notna()]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   sex_male                 714 non-null    uint8  
 1   embark_town_Queenstown   714 non-null    uint8  
 2   embark_town_Southampton  714 non-null    uint8  
 3   Unnamed: 0               714 non-null    int64  
 4   survived                 714 non-null    int64  
 5   pclass                   714 non-null    int64  
 6   sex                      714 non-null    object 
 7   age                      714 non-null    float64
 8   sibsp                    714 non-null    int64  
 9   parch                    714 non-null    int64  
 10  fare                     714 non-null    float64
 11  embark_town              714 non-null    object 
 12  alone                    714 non-null    int64  
dtypes: float64(2), int64(6), object(2), uint8(3)
memory usage: 63.5+ KB


In [75]:

train_validate, test = train_test_split(df, test_size=.2, 
                                            random_state=123, 
                                            stratify=df.survived)
train, validate = train_test_split(train_validate, test_size=.3,
                                                   random_state=123,
                                                   stratify=train_validate.survived)
train.shape, validate.shape, test.shape


((399, 13), (172, 13), (143, 13))

In [76]:
# Do your work for these exercises in either a notebook or a python script named model within your classification-exercises repository. 
# Add, commit, and push your work.
drops = ['embark_town', 'embark_town_Queenstown', 'embark_town_Southampton', 'Unnamed: 0']
[dset.drop(columns=drops, inplace=True) for dset in [train, validate, test]]
# Create a model that includes age in addition to fare and pclass. Does this model perform better than your baseline?
x_cols= ['pclass','age','fare']
y_cols = ['survived']
train

Unnamed: 0,sex_male,survived,pclass,sex,age,sibsp,parch,fare,alone
652,1,0,3,male,21.00,0,0,8.4333,1
813,0,0,3,female,6.00,4,2,31.2750,0
194,0,1,1,female,44.00,0,0,27.7208,1
417,0,1,2,female,18.00,0,2,13.0000,0
460,1,1,1,male,48.00,0,0,26.5500,1
...,...,...,...,...,...,...,...,...,...
856,0,1,1,female,45.00,1,1,164.8667,0
644,0,1,3,female,0.75,2,1,19.2583,0
523,0,1,1,female,44.00,0,1,57.9792,0
842,0,1,1,female,30.00,0,0,31.0000,1


In [77]:
X_train, y_train = train[x_cols], train[y_cols]
X_validate, y_validate = validate[x_cols], validate[y_cols]
X_test, y_test = train[x_cols], train[y_cols]


In [78]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 399 entries, 652 to 834
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   pclass  399 non-null    int64  
 1   age     399 non-null    float64
 2   fare    399 non-null    float64
dtypes: float64(2), int64(1)
memory usage: 12.5 KB


In [79]:
#baseline accuracy
train['baseline'] = 0

In [80]:
(train.baseline == train.survived).mean()

0.5939849624060151

In [82]:
logit =LogisticRegression(C=1.0 , random_state=123)
logit.fit(X_train, y_train)

LogisticRegression(random_state=123)

In [83]:
print(f'training score: {logit.score(X_train, y_train):.2%}')
print(f'validate score: {logit.score(X_validate, y_validate):.2%}')

training score: 69.92%
validate score: 69.19%


In [86]:
# Include sex in your model as well. 
# Note that you'll need to encode or create a dummy variable of this feature before including it in a model.
x_cols= ['sex_male', 'pclass','age','fare']
y_col = 'survived'

X_train, y_train = train[x_cols], train[y_col]
X_validate, y_validate = validate[x_cols], validate[y_col]
X_test, y_test = test[x_cols], test[y_col]

In [87]:
logit2 =LogisticRegression(C=1.0 , random_state=123)
logit2.fit(X_train, y_train)
print(f'training score: {logit2.score(X_train, y_train):.2%}')
print(f'validate score: {logit2.score(X_validate, y_validate):.2%}')

training score: 78.70%
validate score: 76.74%


In [88]:
# Try out other combinations of features and models.
# Use you best 3 models to predict and evaluate on your validate sample.
x_cols= ['sex_male', 'pclass','age','fare', 'alone']
y_col = 'survived'

X_train, y_train = train[x_cols], train[y_col]
X_validate, y_validate = validate[x_cols], validate[y_col]
X_test, y_test = test[x_cols], test[y_col]

logit3 =LogisticRegression(C=1.0 , random_state=123)
logit3.fit(X_train, y_train)
print(f'training score: {logit3.score(X_train, y_train):.2%}')
print(f'validate score: {logit3.score(X_validate, y_validate):.2%}')

training score: 79.70%
validate score: 79.07%


In [94]:
x_cols= ['sex_male', 'pclass','age','fare', 'sibsp']
y_col = 'survived'

X_train, y_train = train[x_cols], train[y_col]
X_validate, y_validate = validate[x_cols], validate[y_col]
X_test, y_test = test[x_cols], test[y_col]

logit4 =LogisticRegression(C=1.0 , random_state=123)
logit4.fit(X_train, y_train)
print(f'training score: {logit4.score(X_train, y_train):.2%}')
print(f'validate score: {logit4.score(X_validate, y_validate):.2%}')

training score: 81.45%
validate score: 78.49%


In [96]:
# Choose you best model from the validation performation, and evaluate it on the test dataset. 
#logit3 is the best model.
print('test score: {:.2f}'.format(logit3.score(X_test, y_test)))


test score: 0.83


In [None]:
# How do the performance metrics compare to validate? to train?
# the training score and validate score had some differences with training scores overfitting. However, the test score is at 83%, and 
# this is better than the baseline accuracy and the training/validate accuracy.