<!-- 

project introduction:

This Jupyter notebook implements a machine learning workflow to predict
whether passengers survived the Titanic disaster using a Logistic
Regression model. The exercise uses the Titanic dataset from Kaggle,
which includes passenger information such as class, sex, age, and
fare, to train and evaluate a classifier.

-->

In [61]:
import pandas as pd

In [62]:
# read the csv files into pandas data frames. the goal is to train a classifier
# that uses passenger data to predict whether or not each passenger survived,
# then compare the predictions against a source of truth to measure the
# accuracy of the model and potentially make some improvements

# train.csv contains passenger features and survival outcomes (Survived), used to train
# the model. test.csv contains features only, used to predict survival for submission.
data = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

# test_passenger_ids is used at the very end of the exercise, once we have a
# prediction for each row of test. it is saved here, before cleaning, because
# clean() drops the PassengerId column from the frame
test_passenger_ids = test["PassengerId"]

In [63]:
# clean the data and limit to the "features" (column names) we want to use
# in the Logistic Regression classifier below.
def clean(data):
    """Return a copy of ``data`` reduced to model features with gaps filled.

    Steps, in order:

    1. Drop Ticket, Cabin, Name and PassengerId, which are not used as
       features. (Cabin could become useful if cabin locations on the ship
       were known, e.g. distance to the lifeboats.)
    2. Fill missing values in SibSp, Parch, Fare and Age with the median
       of that column, computed from the frame passed in.
    3. Fill missing Embarked values with the placeholder category "U"
       (unknown).

    The frame passed in is not modified; a new frame is returned.
    A KeyError is raised if any of the columns named above is absent.
    """
    unused_columns = ["Ticket", "Cabin", "Name", "PassengerId"]
    numeric_columns = ["SibSp", "Parch", "Fare", "Age"]

    trimmed = data.drop(columns=unused_columns)

    # one median per numeric column. median imputation keeps every row
    # usable, at the cost of slightly reducing the spread of the column
    medians = {name: trimmed[name].median() for name in numeric_columns}
    filled = trimmed.fillna(value=medians)

    # Embarked is categorical, so it gets a placeholder label instead of
    # a median
    filled["Embarked"] = filled["Embarked"].fillna("U")
    return filled

# run both frames through the same clean() function so they receive identical
# preprocessing. note that each call fills gaps with the medians of the frame
# it is given, so test is filled with its own medians rather than the
# training medians
data = clean(data)
test = clean(test)

# preview the first five rows of the cleaned training data
data.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [64]:
from sklearn import preprocessing

# LogisticRegression requires all-numeric inputs, so the text columns Sex and
# Embarked are converted to integer codes with a LabelEncoder.
le = preprocessing.LabelEncoder()

cols = ["Sex", "Embarked"]

# encode Sex (e.g., female=0, male=1) and Embarked (e.g., C=0, Q=1, S=2, U=3).
for col in cols:
    # fit ONCE per column, on the labels of both frames, then transform each
    # frame with that single fitted mapping. fitting a separate encoder on
    # each frame only lines up when both frames contain exactly the same set
    # of labels: if test had no "C" rows, "Q" would be coded 0 in test but 1
    # in data, and the model would silently read the wrong category.
    # fitting on the union also means transform() cannot fail on a label
    # that appears in only one of the two frames.
    le.fit(pd.concat([data[col], test[col]], ignore_index=True))
    data[col] = le.transform(data[col])
    test[col] = le.transform(test[col])
    # le.classes_ lists the labels the encoder found, in code order:
    # position 0 is encoded as 0, position 1 as 1, and so on
    print(le.classes_)

# check out the first 10 rows of the transformed data
data.head(10)


['female' 'male']
['C' 'Q' 'S']


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2
5,0,3,1,28.0,0,0,8.4583,1
6,0,1,1,54.0,0,0,51.8625,2
7,0,3,1,2.0,3,1,21.075,2
8,1,3,0,27.0,0,2,11.1333,2
9,1,2,0,14.0,1,0,30.0708,0


In [65]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Separate the frame into features (X) and target (y).

# X is every column except Survived; drop() returns a new frame, so data
# itself keeps the column
X = data.drop(columns="Survived")
# y is the Survived column, the value the model learns to predict
y = data["Survived"]

# Hold out 20% of the rows (test_size=0.2) as a validation set and train on
# the remaining 80%. Fixing random_state=42 makes the shuffle, and therefore
# the split, reproducible from run to run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [66]:
# This is the heart of the model. `clf` is a conventional abbreviation for
# "classifier". fit() receives the training features (X_train) and,
# separately, the matching Survived labels (y_train); it optimizes one
# coefficient per feature plus an intercept so that the model separates
# those who survived from those who did not.
clf = LogisticRegression(random_state=0, max_iter=1000)
clf.fit(X_train, y_train)

# Pair every feature name with its learned coefficient. In the recorded run
# the largest coefficient by magnitude belongs to "Sex", at about -2.6; with
# female encoded as 0 and male as 1, the negative sign means being male
# lowers the predicted odds of survival. NOTE: the features are not scaled,
# so coefficient magnitudes are not directly comparable across columns.
print(
    "Feature Coefficients:",
    {name: weight for name, weight in zip(X.columns, clf.coef_[0])},
)

# The intercept is the constant (bias) term added to the weighted sum of the
# features, i.e. the model's log-odds of survival when every feature is 0.
print("Intercept:", clf.intercept_)

Feature Coefficients: {'Pclass': np.float64(-0.9314377443658908), 'Sex': np.float64(-2.5987985416779416), 'Age': np.float64(-0.0304056126976557), 'SibSp': np.float64(-0.2946358116063582), 'Parch': np.float64(-0.11276422694591964), 'Fare': np.float64(0.0025629336135515324), 'Embarked': np.float64(-0.21266513877158785)}
Intercept: [4.54950335]


In [67]:
from sklearn.metrics import accuracy_score

# Ask the fitted classifier for a class label for every validation row:
# 1 (survived) when the model considers survival the more likely outcome,
# otherwise 0 (died).
predictions = clf.predict(X_val)

# Accuracy is the fraction of predictions that match the true labels in
# y_val. The recorded run gives about 0.81, i.e. roughly 81% of the
# validation passengers were classified correctly.
accuracy_score(y_true=y_val, y_pred=predictions)

0.8100558659217877

In [68]:
# Predict survival for every row of the test frame. clf.predict returns one
# class label per row, 1 (survived) or 0 (died), which becomes the submission.
# test has been through the same clean() and label-encoding steps as the
# training data, so it is expected to hold the same feature columns as X
submission_preds = clf.predict(test)

In [69]:
# Assemble the submission table: one row per test passenger.
#   PassengerId - the ids saved from test.csv in the data-loading cell,
#                 before clean() dropped that column
#   Survived    - the model's prediction for that passenger,
#                 0 is "died" and 1 is "survived"
# Both inputs are arrays of the same length and in the same row order.
df = pd.DataFrame(
    data={
        "PassengerId": test_passenger_ids.to_numpy(),
        "Survived": submission_preds,
    }
)

In [70]:
# export the submission data frame to submission.csv in the current working
# directory. index=False keeps the data frame's row index out of the file,
# so the csv contains only the PassengerId and Survived columns
df.to_csv("submission.csv", index=False)