In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score


import warnings
warnings.filterwarnings('ignore')

from acquire import get_titanic_data
from prepare import prep_titanic


# Exercise
In this exercise, we'll continue working with the titanic dataset and building logistic regression models. Throughout this exercise, be sure you are training, evaluation, and comparing models on the train and validate datasets. The test dataset should only be used for your final model.

For all of the models you create, choose a threshold that optimizes for accuracy.

Do your work for these exercises in either a notebook or a python script named model within your classification directory.

In [2]:
train, test = prep_titanic(get_titanic_data())

In [3]:
train, validate = train_test_split(train, random_state=123, train_size=.8)

In [4]:
validate.shape

(143, 16)

In [5]:
X_train = train[['pclass', 'fare']]
y_train = train[['survived']]

In [6]:
model = LogisticRegression(random_state=123).fit(X_train, y_train)

In [7]:
pd.DataFrame(model.predict_proba(X_train), columns=model.classes_)


Unnamed: 0,0,1
0,0.575003,0.424997
1,0.739567,0.260433
2,0.710199,0.289801
3,0.737715,0.262285
4,0.739381,0.260619
...,...,...
564,0.739483,0.260517
565,0.575341,0.424659
566,0.739248,0.260752
567,0.739838,0.260162


In [8]:
train['yhat'] = model.predict(X_train)
train['p_survived'] = model.predict_proba(X_train)[:, 1]

In [9]:
accuracy_score(train.survived, train.yhat)

0.6783831282952548

In [10]:
recall_score(train.survived, train.yhat)

0.40625

In [11]:
t = .65
train['yhat'] = train.p_survived > t

accuracy_score(train.survived, train.yhat), precision_score(train.survived, train.yhat), recall_score(train.survived, train.yhat)

(0.6590509666080844, 0.7272727272727273, 0.21428571428571427)

#### my validate test

In [12]:
# validate test
val_X = validate[['pclass', 'fare']]
val_y = validate.survived

t = .65
probs = model.predict(val_X)
yhat = (probs > t).astype(int)
actual = val_y

accuracy_score(actual, yhat)

0.6503496503496503

### exercise 1
Create another model that includes age in addition to fare and pclass. Does this model perform better than your previous one?

In [13]:
X_train = train[['pclass', 'fare', 'age']]
y_train = train.survived
val_X = validate[['pclass', 'fare', 'age']]
val_y = validate.survived

In [14]:
model = LogisticRegression(random_state=123).fit(X_train, y_train)
train['yhat'] = model.predict(X_train)
train['p_survived'] = model.predict_proba(X_train)[:, 1]

In [15]:
accuracy_score(train.survived, train.yhat)

0.7047451669595782

In [16]:
t = .65
train['yhat'] = train.p_survived > t

accuracy_score(train.survived, train.yhat), precision_score(train.survived, train.yhat), recall_score(train.survived, train.yhat)

(0.680140597539543, 0.8088235294117647, 0.24553571428571427)

#### validate test for pclass fare & age

In [17]:
t = .65
probs = model.predict(val_X)
yhat = (probs > t).astype(int)
actual = val_y

accuracy_score(actual, yhat)

0.6643356643356644

### exercise 2
Include sex in your model as well. Note that you'll need to encode this feature before including it in a model.

In [18]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,class,embark_town,alone,embarked C,embarked Q,embarked S,age_scaled,fare_scaled,yhat,p_survived
535,535,1,2,female,7.0,0,2,26.25,Second,Southampton,0,0.0,0.0,1.0,0.082684,0.051237,False,0.603909
573,573,1,3,female,29.832908,0,0,7.75,Third,Queenstown,1,0.0,1.0,0.0,0.369602,0.015127,False,0.239684
736,736,0,3,female,48.0,1,3,34.375,Third,Southampton,0,0.0,0.0,1.0,0.597889,0.067096,False,0.173592
713,713,0,3,male,29.0,0,0,9.4833,Third,Southampton,1,0.0,0.0,1.0,0.359135,0.01851,False,0.246009
528,528,0,3,male,39.0,0,0,7.925,Third,Southampton,1,0.0,0.0,1.0,0.484795,0.015469,False,0.193076


In [19]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

def encode(train, test, col_name):

    encoded_values = sorted(list(train[col_name].unique()))

    # Integer Encoding
    int_encoder = LabelEncoder()
    train.encoded = int_encoder.fit_transform(train[col_name])
    test.encoded = int_encoder.transform(test[col_name])

    # create 2D np arrays of the encoded variable (in train and test)
    train_array = np.array(train.encoded).reshape(len(train.encoded),1)
    test_array = np.array(test.encoded).reshape(len(test.encoded),1)

    # One Hot Encoding
    ohe = OneHotEncoder(sparse=False, categories='auto')
    train_ohe = ohe.fit_transform(train_array)
    test_ohe = ohe.transform(test_array)

    # Turn the array of new values into a data frame with columns names being the values
    # and index matching that of train/test
    # then merge the new dataframe with the existing train/test dataframe
    train_encoded = pd.DataFrame(data=train_ohe,
                            columns=encoded_values, index=train.index)
    train = train.join(train_encoded)

    test_encoded = pd.DataFrame(data=test_ohe,
                               columns=encoded_values, index=test.index)
    test = test.join(test_encoded)

    return train, test



In [20]:
train, validate = encode(train, validate, 'sex')

In [21]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,class,embark_town,alone,embarked C,embarked Q,embarked S,age_scaled,fare_scaled,yhat,p_survived,female,male
535,535,1,2,female,7.0,0,2,26.25,Second,Southampton,0,0.0,0.0,1.0,0.082684,0.051237,False,0.603909,1.0,0.0
573,573,1,3,female,29.832908,0,0,7.75,Third,Queenstown,1,0.0,1.0,0.0,0.369602,0.015127,False,0.239684,1.0,0.0
736,736,0,3,female,48.0,1,3,34.375,Third,Southampton,0,0.0,0.0,1.0,0.597889,0.067096,False,0.173592,1.0,0.0
713,713,0,3,male,29.0,0,0,9.4833,Third,Southampton,1,0.0,0.0,1.0,0.359135,0.01851,False,0.246009,0.0,1.0
528,528,0,3,male,39.0,0,0,7.925,Third,Southampton,1,0.0,0.0,1.0,0.484795,0.015469,False,0.193076,0.0,1.0


In [22]:
X_train = train[['pclass', 'fare', 'age', 'female', 'male']]
y_train = train.survived
val_X = validate[['pclass', 'fare', 'age', 'female', 'male']]
val_y = validate.survived

In [23]:
model = LogisticRegression(random_state=123).fit(X_train, y_train)
train['yhat'] = model.predict(X_train)
train['p_survived'] = model.predict_proba(X_train)[:, 1]

In [24]:
accuracy_score(train.survived, train.yhat)

0.7820738137082601

In [25]:
t = .65
train['yhat'] = train.p_survived > t

accuracy_score(train.survived, train.yhat), precision_score(train.survived, train.yhat), recall_score(train.survived, train.yhat)

(0.7768014059753954, 0.8943089430894309, 0.49107142857142855)

#### Validate test for pclass fare age and sex

In [26]:
t = .65
probs = model.predict(val_X)
yhat = (probs > t).astype(int)
actual = val_y

accuracy_score(actual, yhat)

0.8041958041958042

### exercise 3
Try out other combinations of features and models.

In [27]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,class,embark_town,alone,embarked C,embarked Q,embarked S,age_scaled,fare_scaled,yhat,p_survived,female,male
535,535,1,2,female,7.0,0,2,26.25,Second,Southampton,0,0.0,0.0,1.0,0.082684,0.051237,True,0.855826,1.0,0.0
573,573,1,3,female,29.832908,0,0,7.75,Third,Queenstown,1,0.0,1.0,0.0,0.369602,0.015127,False,0.570351,1.0,0.0
736,736,0,3,female,48.0,1,3,34.375,Third,Southampton,0,0.0,0.0,1.0,0.597889,0.067096,False,0.479066,1.0,0.0
713,713,0,3,male,29.0,0,0,9.4833,Third,Southampton,1,0.0,0.0,1.0,0.359135,0.01851,False,0.109269,0.0,1.0
528,528,0,3,male,39.0,0,0,7.925,Third,Southampton,1,0.0,0.0,1.0,0.484795,0.015469,False,0.089064,0.0,1.0


In [28]:
train, validate = encode(train, validate, 'class')

In [29]:
train, validate = encode(train, validate, 'embark_town')

In [30]:
train

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,class,embark_town,...,yhat,p_survived,female,male,First,Second,Third,Cherbourg,Queenstown,Southampton
535,535,1,2,female,7.000000,0,2,26.2500,Second,Southampton,...,True,0.855826,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
573,573,1,3,female,29.832908,0,0,7.7500,Third,Queenstown,...,False,0.570351,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
736,736,0,3,female,48.000000,1,3,34.3750,Third,Southampton,...,False,0.479066,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
713,713,0,3,male,29.000000,0,0,9.4833,Third,Southampton,...,False,0.109269,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
528,528,0,3,male,39.000000,0,0,7.9250,Third,Southampton,...,False,0.089064,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
653,653,1,3,female,29.832908,0,0,7.8292,Third,Queenstown,...,False,0.570380,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
20,20,0,2,male,35.000000,0,0,26.0000,Second,Southampton,...,False,0.222559,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
589,589,0,3,male,29.832908,0,0,8.0500,Third,Southampton,...,False,0.107252,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
514,514,0,3,male,24.000000,0,0,7.4958,Third,Southampton,...,False,0.120361,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


In [31]:
X_train = train[['pclass', 'fare', 'age', 'female', 'male', 'First', 'Second', 'Third']]
y_train = train.survived
val_X = validate[['pclass', 'fare', 'age', 'female', 'male', 'First', 'Second', 'Third']]
val_y = validate.survived

model = LogisticRegression(random_state=123).fit(X_train, y_train)
train['yhat'] = model.predict(X_train)
train['p_survived'] = model.predict_proba(X_train)[:, 1]

In [32]:
accuracy_score(train.survived, train.yhat)

0.7943760984182777

In [33]:
t = .65
train['yhat'] = train.p_survived > t

accuracy_score(train.survived, train.yhat), precision_score(train.survived, train.yhat), recall_score(train.survived, train.yhat)

(0.7820738137082601, 0.8968253968253969, 0.5044642857142857)

#### validating

In [34]:
t = .65
probs = model.predict(val_X)
yhat = (probs > t).astype(int)
actual = val_y

accuracy_score(actual, yhat)

0.8041958041958042

### exercise 4
Choose you best model and evaluate it on the test dataset. Is it overfit?

In [35]:
train, test = prep_titanic(get_titanic_data())

In [36]:
train, test = encode(train, test, 'sex')

In [37]:
test.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,class,embark_town,alone,embarked C,embarked Q,embarked S,age_scaled,fare_scaled,female,male
172,172,1,3,female,1.0,1,1,11.1333,Third,Southampton,0,0.0,0.0,1.0,0.007288,0.021731,1.0,0.0
524,524,0,3,male,29.832908,0,0,7.2292,Third,Cherbourg,1,1.0,0.0,0.0,0.369602,0.01411,0.0,1.0
452,452,0,1,male,30.0,0,0,27.75,First,Cherbourg,1,1.0,0.0,0.0,0.371701,0.054164,0.0,1.0
170,170,0,1,male,61.0,0,0,33.5,First,Southampton,1,0.0,0.0,1.0,0.761247,0.065388,0.0,1.0
620,620,0,3,male,27.0,1,0,14.4542,Third,Cherbourg,0,1.0,0.0,0.0,0.334004,0.028213,0.0,1.0


In [38]:
X_test = test[['pclass', 'fare', 'age', 'female', 'male']]
y_test = test.survived
X_train = train[['pclass', 'fare', 'age', 'female', 'male']]
y_train = train.survived

In [39]:
model = LogisticRegression(random_state=123).fit(X_train, y_train)
train['yhat'] = model.predict(X_train)
train['p_survived'] = model.predict_proba(X_train)[:, 1]

In [40]:
accuracy_score(train.survived, train.yhat)

0.7865168539325843

In [41]:
t = .65
train['yhat'] = train.p_survived > t

accuracy_score(train.survived, train.yhat), precision_score(train.survived, train.yhat), recall_score(train.survived, train.yhat)

(0.7780898876404494, 0.8650306748466258, 0.5090252707581228)

#### my test data set

In [47]:
print('Accuracy of Logistic Regression classifier on test set: {:.4f}'
     .format(model.score(X_test, y_test)))

Accuracy of Logistic Regression classifier on test set: 0.7989


### Bonus 1:
How do different strategies for handling the missing values in the age column affect model performance?

### Bonus 2: 
How do different strategies for encoding sex affect model performance?

### Bonus 3: 
scikit-learn's LogisticRegression classifier is actually applying a regularization penalty to the coefficients by default. This penalty causes the magnitude of the coefficients in the resulting model to be smaller than they otherwise would be. This value can be modified with the C hyper parameter. Small values of C correspond to a larger penalty, and large values of C correspond to a smaller penalty.

Try out the following values for C and note how the coefficients and the model's performance on both the dataset it was trained on and on the validate split are affected.

$C$ =.01, .1, 1, 10, 100, 1000

### Bonus Bonus: 
how does scaling the data interact with your choice of C?