In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix, accuracy_score, precision_score, recall_score
from pydataset import data
import prepare
import acquire
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')


In these exercises, we'll continue working with the titanic dataset and building logistic regression models. Throughout this exercise, be sure you are training, evaluation, and comparing models on the train and validate datasets. The test dataset should only be used for your final model.

For all of the models you create, choose a threshold that optimizes for accuracy.

In [4]:
df = acquire.new_titanic_data()

In [5]:
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [6]:
train, validate, test = prepare.prep_titanic_data(df)

In [7]:
train.shape, validate.shape, test.shape

((498, 12), (214, 12), (179, 12))

In [8]:
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
583,0,1,male,36.0,0,0,40.125,Cherbourg,1,1,0,0
165,1,3,male,9.0,0,2,20.525,Southampton,0,1,0,1
50,0,3,male,7.0,4,1,39.6875,Southampton,0,1,0,1
259,1,2,female,50.0,0,1,26.0,Southampton,0,0,0,1
306,1,1,female,29.678105,0,0,110.8833,Cherbourg,1,0,0,0


In [9]:
train = train.drop(columns=['sex', 'embark_town'])

In [10]:
validate = validate.drop(columns=['sex', 'embark_town'])

In [11]:
test = test.drop(columns=['sex', 'embark_town'])

In [12]:
X_train = train.drop(columns='survived')
y_train = train['survived']

X_validate = validate.drop(columns='survived')
y_validate = validate['survived']

X_test = test.drop(columns='survived')
y_test = test['survived']

In [13]:
def establish_baseline(y_train):
    #  establish the value we will predict for all observations
    baseline_prediction = y_train.mode()

    # create a series of predictions with that value, 
    # the same length as our training set
    y_train_pred = pd.Series([0]*len(y_train))

    # compute accuracy of baseline
    cm = confusion_matrix(y_train, y_train_pred)
    tn, fp, fn, tp = cm.ravel()

    accuracy = (tp+tn)/(tn+fp+fn+tp)
    return accuracy

In [14]:
establish_baseline(y_train)

0.6164658634538153

Create a model that includes only age, fare, and pclass. Does this model perform better than your baseline?

In [50]:
X_train2 = train.drop(columns=['survived','sibsp', 'parch', 'alone', 'sex_male',
                               'embark_town_Queenstown', 'embark_town_Southampton'])
y_train2 = train['survived']

X_validate2 = validate.drop(columns=['survived','sibsp', 'parch', 'alone', 'sex_male',
                                     'embark_town_Queenstown', 'embark_town_Southampton'])
y_validate2 = validate['survived']

X_test2 = test.drop(columns=['survived','sibsp', 'parch', 'alone', 'sex_male',
                             'embark_town_Queenstown', 'embark_town_Southampton'])
y_test2 = test['survived']

In [67]:
seed = 42

logit2 = LogisticRegression(random_state=seed)

In [68]:
logit2

LogisticRegression(random_state=42)

In [69]:
logit2.fit(X_train2, y_train2)

LogisticRegression(random_state=42)

In [70]:
logit2.score(X_train2, y_train2)

0.7028112449799196

In [71]:
logit2.score(X_validate2, y_validate2)


0.7102803738317757

In [113]:
y_train_pred2 = logit2.predict(X_validate2)

In [114]:
y_val_pred2 = logit2.predict(X_train2)

In [None]:
    train_acc = logit2.score(X_train2, y_train2)
    val_acc = logit2.score(X_validate2, y_validate2)

Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [61]:
X_train3 = train.drop(columns=['survived','sibsp', 'parch', 'alone',
                               'embark_town_Queenstown', 'embark_town_Southampton'])
y_train3 = train['survived']

X_validate3 = validate.drop(columns=['survived','sibsp', 'parch', 'alone',
                                     'embark_town_Queenstown', 'embark_town_Southampton'])
y_validate3 = validate['survived']

X_test3 = test.drop(columns=['survived','sibsp', 'parch', 'alone',
                             'embark_town_Queenstown', 'embark_town_Southampton'])
y_test3 = test['survived']

In [72]:
seed = 42

logit3 = LogisticRegression(random_state=seed)

In [73]:
logit3

LogisticRegression(random_state=42)

In [74]:
logit3.fit(X_train3, y_train3)

LogisticRegression(random_state=42)

In [75]:
logit3.score(X_train3, y_train3)

0.8132530120481928

In [76]:
logit3.score(X_validate3, y_validate3)

0.7757009345794392

In [115]:
y_train_pred3 = logit3.predict(X_validate3)

In [116]:
y_val_pred3 = logit3.predict(X_train3)

Try out other combinations of features and models.

In [77]:
X_train4 = train.drop(columns=['survived','sibsp', 'parch', 'alone',
                               'embark_town_Queenstown', 'embark_town_Southampton'])
y_train4 = train['survived']

X_validate4 = validate.drop(columns=['survived','sibsp', 'parch', 'alone',
                                     'embark_town_Queenstown', 'embark_town_Southampton'])
y_validate4 = validate['survived']

X_test4 = test.drop(columns=['survived','sibsp', 'parch', 'alone',
                             'embark_town_Queenstown', 'embark_town_Southampton'])
y_test4 = test['survived']

In [84]:
logit4 = LogisticRegression(C=.1, class_weight={0:1, 1:99}, random_state=42, intercept_scaling=1, solver='lbfgs')


In [85]:
logit4

LogisticRegression(C=0.1, class_weight={0: 1, 1: 99}, random_state=42)

In [86]:
logit4.fit(X_train4, y_train4)

LogisticRegression(C=0.1, class_weight={0: 1, 1: 99}, random_state=42)

In [87]:
logit4.score(X_train4, y_train4)

0.38353413654618473

In [88]:
logit4.score(X_validate4, y_validate4)

0.38317757009345793

In [117]:
y_train_pred4 = logit4.predict(X_validate4)

In [118]:
y_val_pred4 = logit4.predict(X_train4)

In [89]:
X_train5 = train.drop(columns=['survived','sibsp', 'parch', 'alone',
                               'embark_town_Queenstown', 'embark_town_Southampton'])
y_train5 = train['survived']

X_validate5 = validate.drop(columns=['survived','sibsp', 'parch', 'alone',
                                     'embark_town_Queenstown', 'embark_town_Southampton'])
y_validate5 = validate['survived']

X_test5 = test.drop(columns=['survived','sibsp', 'parch', 'alone',
                             'embark_town_Queenstown', 'embark_town_Southampton'])
y_test5 = test['survived']

In [95]:
logit5 = LogisticRegression(penalty='l1', random_state=seed,
                            solver='liblinear', max_iter=200)

In [96]:
logit5

LogisticRegression(max_iter=200, penalty='l1', random_state=42,
                   solver='liblinear')

In [100]:
logit5.fit(X_train5, y_train5)

LogisticRegression(max_iter=200, penalty='l1', random_state=42,
                   solver='liblinear')

In [101]:
logit5.score(X_train5, y_train5)

0.8012048192771084

In [102]:
logit5.score(X_validate5, y_validate5)

0.7710280373831776

In [119]:
y_train_pred5 = logit5.predict(X_validate5)

In [120]:
y_val_pred5 = logit5.predict(X_train5)

Use your best 3 models to predict and evaluate on your validate sample.

In [132]:
def logit_fit_predict(i,X_train2, y_train2, X_validate2):
    # create the object

    logit = LogisticRegression(random_state=seed)

    # create the model
    logit2.fit(X_train2, y_train2)

    # run the model
    y_train_pred2 = logit2.predict(X_train2)
    y_val_pred2 = logit2.predict(X_validate2)
    
    return logit2, y_train_pred2, y_val_pred2

def evaluate_clf(model, X, y, y_pred):
    # model score
    accuracy = model.score(X, y)

    print(f'Accuracy: {accuracy}')

    # confusion matrix
    cm = confusion_matrix(y, y_pred)
    print('Confusion Matrix')
    print(pd.DataFrame(cm, index=['Actual 0', 'Actual 1'], 
                       columns=['Pred 0', 'Pred 1']))

    # classification report
    print('Classification Report')
    print(pd.DataFrame(classification_report(y, y_pred,
                                             output_dict=True)))
    # confusion matrix metrics
    print_cm_metrics(cm)
    
    return accuracy

In [134]:
metrics = []

for i in range(1):
    logit2, y_train_pred2, y_val_pred2 = logit_fit_predict(i,X_train2, 
                                                           y_train2, 
                                                           X_validate2)
    train_acc = logit2.score(X_train2, y_train2)
    val_acc = logit2.score(X_validate2, y_validate2)
    
    output = {
              "train_accuracy": train_acc,
              "validate_accuracy": val_acc
    }

    metrics.append(output)
    
eval_df = pd.DataFrame(metrics)
eval_df['difference'] = eval_df['train_accuracy'] - eval_df['validate_accuracy']

eval_df

Unnamed: 0,train_accuracy,validate_accuracy,difference
0,0.702811,0.71028,-0.007469


In [137]:
metrics = []

for i in range(1):
    logit3, y_train_pred3, y_val_pred3 = logit_fit_predict(i,X_train3, 
                                                           y_train3, 
                                                           X_validate3)
    train_acc = logit3.score(X_train3, y_train3)
    val_acc = logit3.score(X_validate3, y_validate3)
    
    output = {
              "train_accuracy": train_acc,
              "validate_accuracy": val_acc
    }

    metrics.append(output)
    
eval_df = pd.DataFrame(metrics)
eval_df['difference'] = eval_df['train_accuracy'] - eval_df['validate_accuracy']

eval_df

Unnamed: 0,train_accuracy,validate_accuracy,difference
0,0.813253,0.775701,0.037552


In [138]:
metrics = []

for i in range(1):
    logit5, y_train_pred5, y_val_pred5 = logit_fit_predict(i,X_train5, 
                                                           y_train5, 
                                                           X_validate5)
    train_acc = logit5.score(X_train5, y_train5)
    val_acc = logit5.score(X_validate5, y_validate5)
    
    output = {
              "train_accuracy": train_acc,
              "validate_accuracy": val_acc
    }

    metrics.append(output)
    
eval_df = pd.DataFrame(metrics)
eval_df['difference'] = eval_df['train_accuracy'] - eval_df['validate_accuracy']

eval_df

Unnamed: 0,train_accuracy,validate_accuracy,difference
0,0.813253,0.775701,0.037552


Choose your best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?