# Decision Tree Exercises

#### Using Titanic Data
- Remember: your baseline prediction for a classification problem is predicting the most prevalent class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.tree import export_graphviz
from sklearn.metrics import confusion_matrix
import graphviz
from graphviz import Graph
import acquire
import prepare

In [None]:
# Acquire Data
df = acquire.get_titanic()

In [None]:
# Prepare and split the raw data in a single cell so that `train`, `validate`
# and `test` exist after Restart Kernel -> Run All (every later cell reads them).
# The original string columns are dropped because their encoded dummy columns
# (e.g. sex_male) are used as model features instead.
# NOTE(review): assumes prepare.prep_titanic(df) returns (train, validate, test),
# as it is called in the Logistic Regression section of this notebook — confirm.
train, validate, test = (
    split.drop(columns=['sex', 'embark_town'])
    for split in prepare.prep_titanic(df)
)

In [None]:
train

In [None]:
train.info()

## <font color = 'red'> 1a) What is your baseline prediction?

In [None]:
# Separate each split into its feature matrix (X, every column except the
# target) and its target series (y, the `survived` column).

X_train, y_train = train.drop(columns='survived'), train['survived']

X_validate, y_validate = validate.drop(columns='survived'), validate['survived']

X_test, y_test = test.drop(columns='survived'), test['survived']

In [None]:
# Find most often used case in survived
y_train.value_counts() #most frequent is not_survived

In [None]:
# The mode (most frequent class) is a great baseline prediction.
# .mode() returns a Series, so take its first entry to get the class label itself.
baseline = y_train.mode().iloc[0]

# Produce a boolean Series with True representing a match between the baseline
# prediction and reality. Comparing against `baseline` instead of a hard-coded 0
# keeps this correct if the majority class ever changes.
matches_baseline_prediction = (y_train == baseline)
matches_baseline_prediction.head()

In [None]:
### Baseline Prediction to 0 or Not_Survived

## <font color = 'red'> 1b) What is your baseline accuracy?

In [None]:

baseline_accuracy = matches_baseline_prediction.mean()
print(f"Baseline accuracy: {round(baseline_accuracy, 2)}")### ??? Removed column for baseline because it threw off the validation

### Baseline accuracy is 61.6%

## <font color = 'red'>2) Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [None]:
X_train.info()

In [None]:
X_train.shape

In [None]:
X_validate.shape

In [None]:
# Create the Decision Tree Object
clf = DecisionTreeClassifier(max_depth=3, random_state=123)
clf

In [None]:
# Fit the Model
clf = clf.fit(X_train, y_train)
clf

In [None]:
dot_data = export_graphviz(clf, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True)

In [None]:
# make prediction on train obeservations

y_pred = clf.predict(X_train)
y_pred[0:5]

In [None]:
# provide probabiblity on train observations
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:5]

## <font color = 'red' >3) Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [None]:
# Computer Accuracy Score
clf.score(X_train, y_train)

In [None]:
# Create Confusion Matrix
confusion_matrix(y_train, y_pred)

In [None]:
y_train.value_counts()

In [None]:
labels = sorted(y_train.unique())
print('Actual on the left, predicted on the top')
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

In [None]:
# Create Classification Report
print(classification_report(y_train, y_pred))

## <font color = 'red'>4) Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support

In [None]:
# Compute accuracy, TPR, FPR, TNR, FNR, precision, recall, f1-score and support.
# The counts are read from the confusion matrix of the current predictions rather
# than typed in by hand, so they cannot go stale when the split or model changes.
# Convention used here: class 0 (did not survive) is treated as the "positive"
# class, so with labels=[0, 1] (actual on rows, predicted on columns) the matrix
# unpacks as [[TP, FN], [FP, TN]].
TP, FN, FP, TN = confusion_matrix(y_train, y_pred, labels=[0, 1]).ravel().tolist()
ALL = TP + TN + FP + FN

accuracy = (TP + TN) / ALL
TPR = TP / (TP + FN)
TNR = TN / (TN + FP)
FNR = FN / (FN + TP)
FPR = FP / (FP + TN)
precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_score = 2 * (precision * recall) / (precision + recall)
support_pos = TP + FN  # number of actual class-0 observations
support_neg = TN + FP  # number of actual class-1 observations


print(f'Accuracy: {accuracy}')
print(f'True Positive Rate: {TPR}')
print(f'True Negative Rate: {TNR}')
print(f'False Positive Rate: {FPR}')
print(f'False Negative Rate: {FNR}')
print(f'Precision: {precision}')
print(f'recall: {recall}')
# f1-score was computed but never reported; the exercise asks for it
print(f'F1 Score: {f1_score}')
print(f'support_pos: {support_pos}')
print(f'support_neg: {support_neg}')


## <font color = 'red'>5) Run through steps 2-4 using a different max_depth value.

### <font color = 'red'> Using Max_Depth 4

In [None]:
# Create the Decision Tree Object increasing max_depth to 4
clf = DecisionTreeClassifier(max_depth=4, random_state=123)
# Fit the Model
clf = clf.fit(X_train, y_train)

In [None]:
# make prediction on train obeservations
y_pred = clf.predict(X_train)

In [None]:
# find probability of train observations
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:5]

In [None]:
# Compute Accuracy
clf.score(X_train, y_train)

In [None]:
# Create Confusion Matrix
confusion_matrix(y_train, y_pred)

In [None]:
y_train.value_counts()

In [None]:
labels = sorted(y_train.unique())
print('Actual on the left, predicted on the top')
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

In [None]:
# Create classification report to find precision, reall, f1-score, support
print(classification_report(y_train, y_pred))

In [None]:
# Find true positive rate, false positive rate, true negative rate, false negative rate.
# Counts come from the confusion matrix of the current predictions instead of
# hand-typed numbers, so they always match the model fitted above.
# Class 0 (did not survive) is treated as the "positive" class, so with
# labels=[0, 1] (actual on rows, predicted on columns) the matrix unpacks as
# [[TP, FN], [FP, TN]].
TP, FN, FP, TN = confusion_matrix(y_train, y_pred, labels=[0, 1]).ravel().tolist()

TPR = TP / (TP + FN)
TNR = TN / (TN + FP)
FNR = FN / (FN + TP)
FPR = FP / (FP + TN)


print(f'True Positive Rate: {TPR}')
print(f'True Negative Rate: {TNR}')
print(f'False Positive Rate: {FPR}')
print(f'False Negative Rate: {FNR}')

### <font color = 'red'> Using Max_Depth 5

In [None]:
# Create the Decision Tree Object increasing max_depth to 5
clf = DecisionTreeClassifier(max_depth=5, random_state=123)
# Fit the Model
clf = clf.fit(X_train, y_train)
# make prediction on train obeservations
y_pred = clf.predict(X_train)
print(classification_report(y_train, y_pred))

In [None]:
# Compute Accuracy
clf.score(X_train, y_train)

In [None]:
labels = sorted(y_train.unique())
print('Actual on the left, predicted on the top')
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

In [None]:
# Find true positive rate, false positive rate, true negative rate, false negative rate.
# Counts come from the confusion matrix of the current predictions instead of
# hand-typed numbers, so they always match the model fitted above.
# Class 0 (did not survive) is treated as the "positive" class, so with
# labels=[0, 1] (actual on rows, predicted on columns) the matrix unpacks as
# [[TP, FN], [FP, TN]].
TP, FN, FP, TN = confusion_matrix(y_train, y_pred, labels=[0, 1]).ravel().tolist()

TPR = TP / (TP + FN)
TNR = TN / (TN + FP)
FNR = FN / (FN + TP)
FPR = FP / (FP + TN)


print(f'True Positive Rate: {TPR}')
print(f'True Negative Rate: {TNR}')
print(f'False Positive Rate: {FPR}')
print(f'False Negative Rate: {FNR}')

### <font color = 'red'> Using Max_Depth 6

In [None]:
# Create the Decision Tree Object increasing max_depth to 6
clf = DecisionTreeClassifier(max_depth=6, random_state=123)
# Fit the Model
clf = clf.fit(X_train, y_train)
# make prediction on train obeservations
y_pred = clf.predict(X_train)
print(classification_report(y_train, y_pred))

In [None]:
# Compute Accuracy
clf.score(X_train, y_train)

In [None]:
labels = sorted(y_train.unique())
print('Actual on the left, predicted on the top')
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

In [None]:
# Find true positive rate, false positive rate, true negative rate, false negative rate.
# Counts come from the confusion matrix of the current predictions instead of
# hand-typed numbers, so they always match the model fitted above.
# Class 0 (did not survive) is treated as the "positive" class, so with
# labels=[0, 1] (actual on rows, predicted on columns) the matrix unpacks as
# [[TP, FN], [FP, TN]].
TP, FN, FP, TN = confusion_matrix(y_train, y_pred, labels=[0, 1]).ravel().tolist()

TPR = TP / (TP + FN)
TNR = TN / (TN + FP)
FNR = FN / (FN + TP)
FPR = FP / (FP + TN)


print(f'True Positive Rate: {TPR}')
print(f'True Negative Rate: {TNR}')
print(f'False Positive Rate: {FPR}')
print(f'False Negative Rate: {FNR}')

In [None]:
for depth in range(2, 20):
    # Build a tree capped at this max_depth and fit it on the training data
    clf = DecisionTreeClassifier(max_depth=depth, random_state=123).fit(X_train, y_train)

    # In-sample predictions from the tree just fitted
    y_pred = clf.predict(X_train)

    # Classification report (as a dict) comparing actual vs. predicted labels,
    # shown as a DataFrame under a header naming the depth
    report = classification_report(y_train, y_pred, output_dict=True)
    print(f"Tree with max depth of {depth}")
    print(pd.DataFrame(report))
    print()

## <font color='red'>6) Which model performs better on your in-sample data?

In [None]:
# Model using Max Depth 6 because it has the best accuracy

## <font color = 'red'>7) Which model performs best on your out-of-sample data, the validate set?

In [None]:
# Create the Decision Tree Object with max_depth of 3
clf = DecisionTreeClassifier(max_depth=3, random_state=123)
# Fit the Model (on train only)
clf = clf.fit(X_train, y_train)
# Make predictions on the validate (out-of-sample) observations
y_pred = clf.predict(X_validate)

# Accuracy of the train-fitted model on the validate split
print(clf.score(X_validate, y_validate))


In [None]:
X_train.shape

In [None]:
X_validate.shape

In [None]:
# Create the Decision Tree Object with max_depth of 4
clf = DecisionTreeClassifier(max_depth=4, random_state=123)
# Fit the Model (on train only)
clf = clf.fit(X_train, y_train)
# Predict on validate with THIS model. Without this line the report below would
# reuse whatever `y_pred` a previously run cell left behind and describe the
# wrong model.
y_pred = clf.predict(X_validate)

print(clf.score(X_validate, y_validate))
print(classification_report(y_validate, y_pred))

In [None]:
# Create the Decision Tree Object with max_depth of 5
clf = DecisionTreeClassifier(max_depth=5, random_state=123)
# Fit the Model (on train only)
clf = clf.fit(X_train, y_train)
# Predict on validate with THIS model. Without this line the report below would
# reuse whatever `y_pred` a previously run cell left behind and describe the
# wrong model.
y_pred = clf.predict(X_validate)

print(clf.score(X_validate, y_validate))
print(classification_report(y_validate, y_pred))

In [None]:
# Create the Decision Tree Object with max_depth of 6
clf = DecisionTreeClassifier(max_depth=6, random_state=123)
# Fit the Model (on train only)
clf = clf.fit(X_train, y_train)
# Predict on validate with THIS model. Without this line the report below would
# reuse whatever `y_pred` a previously run cell left behind and describe the
# wrong model.
y_pred = clf.predict(X_validate)

print(clf.score(X_validate, y_validate))
print(classification_report(y_validate, y_pred))

In [None]:
# Sweep max_depth and record in-sample (train) vs. out-of-sample (validate)
# accuracy for each tree so the two can be compared side by side.
metrics = []

for depth in range(2, 25):
    # Build the tree and fit it on train (and only train)
    clf = DecisionTreeClassifier(max_depth=depth, random_state=123).fit(X_train, y_train)

    # Score the same fitted tree on train first, then on validate
    metrics.append({
        "max_depth": depth,
        "train_accuracy": clf.score(X_train, y_train),
        "validate_accuracy": clf.score(X_validate, y_validate),
    })

# One row per depth; `difference` is the train-minus-validate accuracy gap
df = pd.DataFrame(metrics)
df["difference"] = df["train_accuracy"] - df["validate_accuracy"]
df

### Model with Max_Depth 6 has the best F1-Scores

# <font color='red'> Random Forest

##  <font color='red'> 1) Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [None]:
# Create the Model's Object
rf = RandomForestClassifier(max_depth=10,
                           random_state=123,
                           min_samples_leaf=1)
rf

In [None]:
#Fit the Data to the model
rf = rf.fit(X_train,y_train)

In [None]:
# Get feature (column) importances, labelled with their column names and sorted
# so the most important feature is listed first. A bare array cannot be read
# without knowing the column order.
# (The original run noted sex_male, at about 48%, as the most important feature.)
print(pd.Series(rf.feature_importances_, index=X_train.columns).sort_values(ascending=False))

In [None]:
# Make Predictions
y_pred = rf.predict(X_train)
y_pred

In [None]:
# Find Probabability of each Prediction
y_pred_proba = rf.predict_proba(X_train)
y_pred_proba

##  <font color='red'> 2) Evaluate your results using the model score, confusion matrix, and classification report.

In [None]:
# Find Model Score for Accuracy
rf.score(X_train,y_train)

In [None]:
# Make Confusion Matrix
labels = sorted(y_train.unique())
print('Actual on the left, predicted on the top')
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

In [None]:
print(classification_report(y_train,y_pred))

In [None]:
# Produce the classification report on the actual y values and this model's predicted y values
report = classification_report(y_train, y_pred, output_dict=True)
pd.DataFrame(report)

##  <font color='red'> 3) Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [None]:
# Counts are read from the confusion matrix of the current random-forest
# predictions rather than typed in by hand, so they cannot go stale when the
# split or model changes.
# Class 0 (did not survive) is treated as the "positive" class, so with
# labels=[0, 1] (actual on rows, predicted on columns) the matrix unpacks as
# [[TP, FN], [FP, TN]].
TP, FN, FP, TN = confusion_matrix(y_train, y_pred, labels=[0, 1]).ravel().tolist()
ALL = TP + FP + FN + TN

accuracy = (TP + TN)/ALL
print(f"Accuracy: {accuracy}")

true_positive_rate = TP/(TP+FN)
print(f"True Positive Rate: {true_positive_rate}")

false_positive_rate = FP/(FP+TN)
print(f"False Positive Rate: {false_positive_rate}")

true_negative_rate = TN/(TN+FP)
print(f"True Negative Rate: {true_negative_rate}")

false_negative_rate = FN/(FN+TP)
print(f"False Negative Rate: {false_negative_rate}")

precision = TP/(TP+FP)
print(f"Precision: {precision}")

recall = TP/(TP+FN)
print(f"Recall: {recall}")

f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score}")

# Support = number of actual observations in each class
support_pos = TP + FN
print(f"Support (0): {support_pos}")

support_neg = FP + TN
print(f"Support (1): {support_neg}")

##  <font color='red'> 4) Run through steps increasing your min_samples_leaf and decreasing your max_depth

In [None]:
for i in range(1,11):
    rf = RandomForestClassifier(max_depth=11-i,
                           random_state=123,
                           min_samples_leaf=i)
    rf = rf.fit(X_train,y_train)
    y_pred = rf.predict(X_train)
    report = classification_report(y_train, y_pred, output_dict=True)
    print(f"Random Forest with max_depth of {11-i} and min_sample_leaf {i}")
    print(pd.DataFrame(report))
    print()

##  <font color='red'> 5a) What are the differences in the evaluation metrics? 

???

##  <font color='red'> 5b) Which performs better on your in-sample data? Why?

Random Forest with max_depth of 8 and min_sample_leaf 3 performs best. ???

##  <font color='red'> 6) After making a few models, which one has the best performance (or closest metrics) on both train and validate?

In [None]:
# compare in-sample to out-of-sample
metrics = []

for i in range(1,11):
    # Make the model
    rf = RandomForestClassifier(max_depth=11-i,
                           random_state=123,
                           min_samples_leaf=i)

    # Fit the model (on train and only train)
    rf = rf.fit(X_train, y_train)
    
    # Use the model
    # We'll evaluate the model's performance on train, first
    in_sample_accuracy = rf.score(X_train, y_train)
    
    out_of_sample_accuracy = rf.score(X_validate, y_validate)

    output = {
        "max_depth": 11-i,
        "min_sample": i,
        "train_accuracy": in_sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy
    }
    
    metrics.append(output)
    
df = pd.DataFrame(metrics)
df["difference"] = df.train_accuracy - df.validate_accuracy
df

### model with max_depth = 2, min_sample = 9

# <font color = 'red'> K-Nearest Neighbor

## <font color = 'red'> 1) Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [None]:
# create model object
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')

In [None]:
# fit data to model object
knn.fit(X_train, y_train)

In [None]:
# create prediction array based on train data set using model
y_pred = knn.predict(X_train)
y_pred[0:5]

In [None]:
# See class of data. 0=Did Not Surviv, 1 = Survived
knn.classes_

In [None]:
# Predicted class probabilities for the train observations; show the first five rows.
# NOTE(review): columns presumably follow the order of knn.classes_ (0 = did not
# survive, 1 = survived) — confirm against the scikit-learn docs.
y_pred_proba = knn.predict_proba(X_train)
y_pred_proba[0:5]

## <font color = 'red'> 2) Evaluate your results using the model score, confusion matrix, and classification report.

In [None]:
# Get accuracy Score for KNN. 83% accurate
knn.score(X_train, y_train)

In [None]:
# Make Confusion Matrix
labels = sorted(y_train.unique())
print('Actual on the left, predicted on the top')
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

In [None]:
print(classification_report(y_train, y_pred))

## <font color = 'red'> 3) Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [None]:
# Counts are read from the confusion matrix of the current KNN predictions
# rather than typed in by hand, so they cannot go stale when the split or model
# changes.
# Class 0 (did not survive) is treated as the "positive" class, so with
# labels=[0, 1] (actual on rows, predicted on columns) the matrix unpacks as
# [[TP, FN], [FP, TN]].
TP, FN, FP, TN = confusion_matrix(y_train, y_pred, labels=[0, 1]).ravel().tolist()

ALL = TP + FP + FN + TN

accuracy = (TP + TN)/ALL
print(f"Accuracy: {accuracy}")

true_positive_rate = TP/(TP+FN)
print(f"True Positive Rate: {true_positive_rate}")

false_positive_rate = FP/(FP+TN)
print(f"False Positive Rate: {false_positive_rate}")

true_negative_rate = TN/(TN+FP)
print(f"True Negative Rate: {true_negative_rate}")

false_negative_rate = FN/(FN+TP)
print(f"False Negative Rate: {false_negative_rate}")

precision = TP/(TP+FP)
print(f"Precision: {precision}")

recall = TP/(TP+FN)
print(f"Recall: {recall}")

f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score}")

# Support = number of actual observations in each class
support_pos = TP + FN
print(f"Support (0): {support_pos}")

support_neg = FP + TN
print(f"Support (1): {support_neg}")

## <font color = 'red'> 4) Run through steps 2-4 setting k to 10

In [None]:
# create model object based on k = 10
knn = KNeighborsClassifier(n_neighbors=10, weights='uniform')
# fit data to model object
knn.fit(X_train, y_train)
# Create Prediction Array
y_pred = knn.predict(X_train)

print(knn.score(X_train, y_train))
print(classification_report(y_train, y_pred))

## <font color = 'red'> 5) Run through steps 2-4 setting k to 20

In [None]:
# create model object based on k = 20
knn = KNeighborsClassifier(n_neighbors=20, weights='uniform')
# fit data to model object
knn.fit(X_train, y_train)
# Create Prediction Array
y_pred = knn.predict(X_train)

print(knn.score(X_train, y_train))
print(classification_report(y_train, y_pred))

## <font color = 'red'> 6) What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

### Model with K=10 provides 81% Accuracy vs Model with K=20 provides 82% Accuracy. 

## <font color = 'red'> 7) Which model performs best on our out-of-sample data from validate?

### Model with K=10

In [None]:
# create model object based on k = 10
knn = KNeighborsClassifier(n_neighbors=10, weights='uniform')
# fit data to model object (train only)
knn.fit(X_train, y_train)
# Predict on validate: this cell evaluates out-of-sample performance, so the
# predictions must come from the validate split rather than train.
y_pred = knn.predict(X_validate)

print(knn.score(X_validate, y_validate))
# Per-class precision/recall on validate, matching the in-sample cells
print(classification_report(y_validate, y_pred))

### Model with K=20

In [None]:
# create model object based on k = 20
knn = KNeighborsClassifier(n_neighbors=20, weights='uniform')
# fit data to model object (train only)
knn.fit(X_train, y_train)
# Predict on validate: this cell evaluates out-of-sample performance, so the
# predictions must come from the validate split rather than train.
y_pred = knn.predict(X_validate)

print(knn.score(X_validate, y_validate))
# Per-class precision/recall on validate, matching the in-sample cells
print(classification_report(y_validate, y_pred))

### Model with K=10 has better accuracy because K=20 includes too much noise.

In [None]:
# compare in-sample to out-of-sample
metrics = []

for i in range(1,21):
    # Make the model
    knn = KNeighborsClassifier(n_neighbors=i, weights='uniform')

    # Fit the model (on train and only train)
    knn = knn.fit(X_train, y_train)
    
    # Use the model
    # We'll evaluate the model's performance on train, first
    in_sample_accuracy = knn.score(X_train, y_train)
    
    out_of_sample_accuracy = knn.score(X_validate, y_validate)

    output = {
        "K_Neighbor": i,
        "train_accuracy": in_sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy
    }
    
    metrics.append(output)
    
df = pd.DataFrame(metrics)
df["difference"] = df.train_accuracy - df.validate_accuracy
df

In [None]:
df.difference.abs().idxmin()

In [None]:
# Plot out-of-sample accuracy for each k.
# Choosing k is hyperparameter tuning, so it is scored on the validate split.
# The test split stays untouched until a final model is chosen; otherwise the
# test score no longer measures performance on unseen data.
k_range = range(1, 21)
scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors = k)
    knn.fit(X_train, y_train)
    scores.append(knn.score(X_validate, y_validate))
plt.figure()
plt.xlabel('k')
plt.ylabel('validate accuracy')
plt.scatter(k_range, scores)
plt.xticks([0,5,10,15,20])
plt.show()

In [None]:
# id columns
train.columns

In [None]:
# Reduce features
x_cols = ['pclass','alone', 'sex_male']
y_col = 'survived'

X_train, y_train = train[x_cols], train[y_col]
X_validate, y_validate = validate[x_cols], validate[y_col]
X_test, y_test = test[x_cols], test[y_col]

In [None]:
# compare in-sample to out-of-sample
metrics = []

for i in range(1,21):
    # Make the model
    knn = KNeighborsClassifier(n_neighbors=i, weights='uniform')

    # Fit the model (on train and only train)
    knn = knn.fit(X_train, y_train)
    
    # Use the model
    # We'll evaluate the model's performance on train, first
    in_sample_accuracy = knn.score(X_train, y_train)
    
    out_of_sample_accuracy = knn.score(X_validate, y_validate)

    output = {
        "K_Neighbor": i,
        "train_accuracy": in_sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy
    }
    
    metrics.append(output)
    
df = pd.DataFrame(metrics)
df["difference"] = df.train_accuracy - df.validate_accuracy
df

In [None]:
df.difference.abs().idxmin()

In [None]:
metrics = []

# loop through different values of k
for k in range(1, 21):
            
    # define the thing
    knn = KNeighborsClassifier(n_neighbors=k)
    
    # fit the thing (remmeber only fit on training data)
    knn.fit(X_train, y_train)
    
    # use the thing (calculate accuracy)
    train_accuracy = knn.score(X_train, y_train)
    validate_accuracy = knn.score(X_validate, y_validate)
    
    output = {
        "k": k,
        "train_accuracy": train_accuracy,
        "validate_accuracy": validate_accuracy
    }
    
    metrics.append(output)

# make a dataframe
results = pd.DataFrame(metrics)

# plot the data
results.set_index('k').plot(figsize = (16,9))
plt.ylabel('Accuracy')
plt.xticks(np.arange(0,21,1))
plt.grid()

In [None]:
# Reduce features to pclass, alone and sex_male, and rebuild X/y for every split.
# NOTE(review): this cell repeats the earlier "Reduce features" cell verbatim;
# re-running it is harmless but it looks like a leftover duplicate — confirm and
# remove one copy.
x_cols = ['pclass','alone', 'sex_male']
y_col = 'survived'

X_train, y_train = train[x_cols], train[y_col]
X_validate, y_validate = validate[x_cols], validate[y_col]
X_test, y_test = test[x_cols], test[y_col]

In [None]:
metrics = []

# loop through different values of k
for k in range(1, 21):
            
    # define the thing
    knn = KNeighborsClassifier(n_neighbors=k)
    
    # fit the thing (remmeber only fit on training data)
    knn.fit(X_train, y_train)
    
    # use the thing (calculate accuracy)
    train_accuracy = knn.score(X_train, y_train)
    validate_accuracy = knn.score(X_validate, y_validate)
    
    output = {
        "k": k,
        "train_accuracy": train_accuracy,
        "validate_accuracy": validate_accuracy
    }
    
    metrics.append(output)

# make a dataframe
results = pd.DataFrame(metrics)

# plot the data
results.set_index('k').plot(figsize = (16,9))
plt.ylabel('Accuracy')
plt.xticks(np.arange(0,21,1))
plt.grid()

# <font color='red'> Logistic Regression

## <font color = 'red'> 1) Create a model that includes age in addition to fare and pclass. Does this model perform better than your baseline?

In [2]:
df = acquire.get_titanic()
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [3]:
train, validate, test = prepare.prep_titanic(df)
train.shape, validate.shape, test.shape

((498, 12), (214, 12), (179, 12))

In [4]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 179 entries, 561 to 53
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   survived                 179 non-null    int64  
 1   pclass                   179 non-null    int64  
 2   sex                      179 non-null    object 
 3   age                      179 non-null    float64
 4   sibsp                    179 non-null    int64  
 5   parch                    179 non-null    int64  
 6   fare                     179 non-null    float64
 7   embark_town              179 non-null    object 
 8   alone                    179 non-null    int64  
 9   sex_male                 179 non-null    uint8  
 10  embark_town_Queenstown   179 non-null    uint8  
 11  embark_town_Southampton  179 non-null    uint8  
dtypes: float64(2), int64(5), object(2), uint8(3)
memory usage: 14.5+ KB


In [5]:
train = train.drop(columns=['sex','embark_town'])
validate = validate.drop(columns=['sex','embark_town'])
test = test.drop(columns=['sex','embark_town'])

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 498 entries, 583 to 744
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   survived                 498 non-null    int64  
 1   pclass                   498 non-null    int64  
 2   age                      498 non-null    float64
 3   sibsp                    498 non-null    int64  
 4   parch                    498 non-null    int64  
 5   fare                     498 non-null    float64
 6   alone                    498 non-null    int64  
 7   sex_male                 498 non-null    uint8  
 8   embark_town_Queenstown   498 non-null    uint8  
 9   embark_town_Southampton  498 non-null    uint8  
dtypes: float64(2), int64(5), uint8(3)
memory usage: 32.6 KB


In [7]:
# Create X & y version of train, where y is a series with just the target variable and X are all the features. 

X_train = train.drop(columns='survived')
y_train = train.survived

X_validate = validate.drop(columns='survived')
y_validate = validate.survived

X_test = test.drop(columns='survived')
y_test = test.survived

In [8]:
# id most frequent value for creating baseline
y_train.mode()

0    0
dtype: int64

In [9]:
# create baseline with 0 value
baseline = (y_train == 0).mean()
baseline

0.6164658634538153

In [10]:
# 3 features
logit1 = LogisticRegression(random_state=123)

In [11]:
features = ['age', 'fare', 'pclass']

In [12]:
logit1.fit(X_train[features],y_train)

LogisticRegression(random_state=123)

In [14]:
print('Coefficient: \n', logit1.coef_)
print('Intercept: \n', logit1.intercept_)

Coefficient: 
 [[-0.03063266  0.00141012 -0.94966047]]
Intercept: 
 [2.52857789]


In [15]:
y_pred = logit1.predict(X_train[features])
y_pred[0:5]

array([1, 0, 0, 0, 1])

In [16]:
y_proba = logit1.predict_proba(X_train[features])
y_proba[0:5]

array([[0.36988206, 0.63011794],
       [0.63810638, 0.36189362],
       [0.61748053, 0.38251947],
       [0.70385285, 0.29614715],
       [0.30445826, 0.69554174]])

In [17]:
print(confusion_matrix(y_train, y_pred))

[[266  41]
 [107  84]]


In [18]:
print(classification_report(y_train,y_pred))

              precision    recall  f1-score   support

           0       0.71      0.87      0.78       307
           1       0.67      0.44      0.53       191

    accuracy                           0.70       498
   macro avg       0.69      0.65      0.66       498
weighted avg       0.70      0.70      0.69       498



## <font color = 'red'> 2) Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [19]:
features = ['age', 'pclass', 'fare', 'sex_male']

In [20]:
# 4 features 
logit2 = LogisticRegression(random_state=123)
logit2.fit(X_train[features], y_train)
print(logit2.score(X_train[features], y_train))

0.8132530120481928


## <font color = 'red'> 3) Try out other combinations of features and models.

In [21]:
# all features
logit3 = LogisticRegression(random_state=123)
logit3.fit(X_train, y_train)
print(logit3.score(X_train, y_train))

0.8132530120481928


In [22]:
# 4 features with class_weight=balanced
logit4 = LogisticRegression(random_state=123, class_weight='balanced')
logit4.fit(X_train[features], y_train)
print(logit4.score(X_train[features], y_train).mean())

0.8012048192771084


In [23]:
# all features with class_weight=balanced
logit5 = LogisticRegression(random_state=123, class_weight='balanced')
logit5.fit(X_train, y_train)
print(logit5.score(X_train, y_train).mean())

0.8072289156626506


In [32]:
# 4 features with C=.0001
# NOTE(review): the sibling models logit7-logit9 use C=.001 — confirm which
# value was intended here.
logit6 = LogisticRegression(random_state=123, C=.0001)
logit6.fit(X_train[features], y_train)
print(logit6.score(X_train[features], y_train).mean())

0.6445783132530121


In [33]:
# all feature with c=.001
logit7 = LogisticRegression(random_state=123, C=.001)
logit7.fit(X_train, y_train)
print(logit7.score(X_train, y_train).mean())

0.6485943775100401


In [36]:
# 4 features with class_weight='balanced' and C=.001
logit8 = LogisticRegression(random_state=123, class_weight='balanced',C=.001)
logit8.fit(X_train[features], y_train)
print(logit8.score(X_train[features], y_train).mean())

0.6847389558232931


In [37]:
# all feature with class_weight='balanced and c=.001
logit9 = LogisticRegression(random_state=123, class_weight='balanced', C=.001)
logit9.fit(X_train, y_train)
print(logit9.score(X_train, y_train).mean())

0.6847389558232931


## <font color = 'red'> 4) Use your best 3 models to predict and evaluate on your validate sample.

In [None]:
# logit2,logit3, and logit5 has the highest accuracy

In [40]:
#logit2 with 4 features
y_pred2 = logit2.predict(X_validate[features])
print(classification_report(y_validate, y_pred2))

              precision    recall  f1-score   support

           0       0.80      0.84      0.82       132
           1       0.72      0.67      0.70        82

    accuracy                           0.78       214
   macro avg       0.76      0.76      0.76       214
weighted avg       0.77      0.78      0.77       214



In [41]:
# logit3 with all features
y_pred3 = logit3.predict(X_validate)
print(classification_report(y_validate, y_pred3))

              precision    recall  f1-score   support

           0       0.80      0.86      0.82       132
           1       0.74      0.65      0.69        82

    accuracy                           0.78       214
   macro avg       0.77      0.75      0.76       214
weighted avg       0.77      0.78      0.77       214



In [42]:
# logi5 with all features and class_weight='balanced'
y_pred5 = logit5.predict(X_validate)
print(classification_report(y_validate, y_pred5))

              precision    recall  f1-score   support

           0       0.82      0.82      0.82       132
           1       0.71      0.71      0.71        82

    accuracy                           0.78       214
   macro avg       0.76      0.76      0.76       214
weighted avg       0.78      0.78      0.78       214



## <font color = 'red'> 5) Choose your best model from the validation performance, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?

In [44]:
# logit2 with 4 features performed the best
y_pred2 = logit2.predict(X_test[features])
print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

           0       0.83      0.86      0.84       110
           1       0.77      0.71      0.74        69

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179



In [None]:
# train: .81
# validate: .78
# test: .80