In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression


import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


from pydataset import data

import acquire
import prepare

In [2]:
df = acquire.get_titanic_data()
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [4]:
df.dropna()
df.age = df.age.fillna(df.age.mean())
dummy_df = pd.get_dummies(df[['sex','embark_town', 'class']], dummy_na=False, drop_first=[True, True])
df = pd.concat([df, dummy_df], axis=1)
df.head()

train, validate, test = prepare.split(df, stratify_by='survived')
X_train = train.drop(columns=["survived"])
y_train = train.survived

X_validate = validate.drop(columns=["survived"])
y_validate = validate.survived

X_test = test.drop(columns=["survived"])
y_test = test.survived

In [5]:
X_train.head()

Unnamed: 0,passenger_id,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,...,sex_male,embark_town_Queenstown,embark_town_Southampton,class_Second,class_Third,sex_male.1,embark_town_Queenstown.1,embark_town_Southampton.1,class_Second.1,class_Third.1
583,583,1,male,36.0,0,0,40.125,C,First,A,...,1,0,0,0,0,1,0,0,0,0
165,165,3,male,9.0,0,2,20.525,S,Third,,...,1,0,1,0,1,1,0,1,0,1
50,50,3,male,7.0,4,1,39.6875,S,Third,,...,1,0,1,0,1,1,0,1,0,1
259,259,2,female,50.0,0,1,26.0,S,Second,,...,0,0,1,1,0,0,0,1,1,0
306,306,1,female,29.699118,0,0,110.8833,C,First,,...,0,0,0,0,0,0,0,0,0,0


In [6]:
X_train = X_train.drop(columns=['sex', 'class', 'deck', 'embark_town', 'embarked'])

In [None]:
X_train.head()

In [None]:
X_train.info()

In [None]:
y_train[:5]

In [None]:
#Create a model that includes age in addition to fare and pclass. 
#Does this model perform better than your baseline?



In [7]:
train.shape, validate.shape, test.shape

((498, 23), (214, 23), (179, 23))

In [8]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,...,sex_male,embark_town_Queenstown,embark_town_Southampton,class_Second,class_Third,sex_male.1,embark_town_Queenstown.1,embark_town_Southampton.1,class_Second.1,class_Third.1
583,583,0,1,male,36.0,0,0,40.125,C,First,...,1,0,0,0,0,1,0,0,0,0
165,165,1,3,male,9.0,0,2,20.525,S,Third,...,1,0,1,0,1,1,0,1,0,1
50,50,0,3,male,7.0,4,1,39.6875,S,Third,...,1,0,1,0,1,1,0,1,0,1
259,259,1,2,female,50.0,0,1,26.0,S,Second,...,0,0,1,1,0,0,0,1,1,0
306,306,1,1,female,29.699118,0,0,110.8833,C,First,...,0,0,0,0,0,0,0,0,0,0


In [13]:
baseline_accuracy = (train.survived == 0).mean()
round(baseline_accuracy, 2)

0.62

In [9]:
logit = LogisticRegression(random_state=123)

features = ["age", "pclass", "fare"]

In [10]:
logit.fit(X_train[features], y_train)

LogisticRegression(random_state=123)

In [11]:
y_pred = logit.predict(X_train[features])

In [14]:
print("Baseline is", round(baseline_accuracy, 2))
print("Logistic Regression using age, pclass, and fare features")
print('Accuracy of Logistic Regression on training set: {:.2f}'
     .format(logit.score(X_train[features], y_train)))


Baseline is 0.62
Logistic Regression using age, pclass, and fare features
Accuracy of Logistic Regression on training set: 0.70


In [19]:
logit1 = LogisticRegression(random_state=123)

features = ["age", "pclass", "fare", "sex_male"]

In [20]:
logit1.fit(X_train[features], y_train)

LogisticRegression(random_state=123)

In [None]:
logit2.fit(X_train, y_train)

In [21]:
y_pred = logit1.predict(X_train[features])

In [22]:
print("Logistic Regression using age, pclass, fare, and gender features")
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit1.score(X_train[features], y_train)))

Logistic Regression using age, pclass, fare, and gender features
Accuracy of Logistic Regression classifier on training set: 0.81


In [24]:
logit2 = LogisticRegression(random_state=123)

logit2.fit(X_train, y_train)

y_pred = logit2.predict(X_train)

print("Model used on all features")
print('Accuracy of Logistic Regression on train set: {:.2f}'
     .format(logit2.score(X_train, y_train)))

Model used on all features
Accuracy of Logistic Regression on train set: 0.81


In [25]:
logit3 = LogisticRegression(random_state=123, class_weight='balanced')

logit3.fit(X_train, y_train)

y_pred = logit3.predict(X_train)

accuracy = logit3.score(X_train, y_train)

print("All Features and we're setting the class_weight hyperparameter")
print(f'Accuracy of Logistic Regression classifier on training set: {accuracy:.2}')

All Features and we're setting the class_weight hyperparameter
Accuracy of Logistic Regression classifier on training set: 0.76


In [26]:
features = ["age"]
logit4 = LogisticRegression(random_state=123)
logit4.fit(X_train[features], y_train)
y_pred = logit4.predict(X_train[features])
accuracy = logit4.score(X_train[features], y_train)

print("All Features and we're setting the class_weight hyperparameter")
print(f'Accuracy of Logistic Regression classifier on training set: {accuracy:.2}')

All Features and we're setting the class_weight hyperparameter
Accuracy of Logistic Regression classifier on training set: 0.62


In [27]:
features = ["pclass"]
logit5 = LogisticRegression(random_state=123)
logit5.fit(X_train[features], y_train)
y_pred = logit5.predict(X_train[features])
accuracy = logit5.score(X_train[features], y_train)

print("All Features and we're setting the class_weight hyperparameter")
print(f'Accuracy of Logistic Regression classifier on training set: {accuracy:.2}')

All Features and we're setting the class_weight hyperparameter
Accuracy of Logistic Regression classifier on training set: 0.67


In [29]:
features = ["age", "pclass", "fare", "sex_male"]
y_pred = logit1.predict(X_validate[features])

print('Logit1 model using age, pclass, fare, and sex_male as the features')
print(classification_report(y_validate, y_pred))

Logit1 model using age, pclass, fare, and sex_male as the features
              precision    recall  f1-score   support

           0       0.80      0.83      0.82       132
           1       0.71      0.67      0.69        82

    accuracy                           0.77       214
   macro avg       0.76      0.75      0.75       214
weighted avg       0.77      0.77      0.77       214



In [31]:
y_pred = logit3.predict(X_validate)

print("Logit3 model using all features, class_weight='balanced', and all other hyperparameters as default")
print(classification_report(y_validate, y_pred))

ValueError: could not convert string to float: 'female'

In [32]:
y_pred_proba = logit3.predict_proba(X_train)


y_pred_proba = pd.DataFrame(y_pred_proba, columns = ['not-survived', 'survived'])
y_pred_proba.head()

Unnamed: 0,not-survived,survived
0,0.672895,0.327105
1,0.907516,0.092484
2,0.923024,0.076976
3,0.126171,0.873829
4,0.075871,0.924129


In [33]:
# threshold 0.3
t = 0.3

y_pred = (y_pred_proba.survived > t).astype(int)
y_pred.head()

0    1
1    0
2    0
3    1
4    1
Name: survived, dtype: int64

In [34]:
# classification report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.60      0.71       307
           1       0.57      0.86      0.69       191

    accuracy                           0.70       498
   macro avg       0.72      0.73      0.70       498
weighted avg       0.76      0.70      0.70       498

