<!-- 

project introduction:

this jupyter notebook implements a machine learning workflow to predict
whether passengers survived the Titanic disaster using a Logistic
Regression model. The exercise uses the Titanic dataset from kaggle, 
which includes passenger information such as class, sex, age, and
fare, to train and evaluate a classifier.  

-->

In [1]:
import pandas as pd

In [2]:
# read the csv files and convert them into pandas data frames that we can work with
# to train a classifier that uses passenger data to make predictions about whether 
# or not each passenger survived. we will compare the predictions against a source
# of truth to determine the accuracy of the model and potentially make some improvements

# train.csv contains passenger features and survival outcomes (Survived), used to train
# the model. test.csv contains features only, used to predict survival for submission.
data = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

# test_ids is used at the very end of the exercise once we have a prediction
# for each row. we'll use the passenger ids to indicate the results of the model
test_passenger_ids = test["PassengerId"]

In [3]:
# clean the data and limit to the "features" (column names) we want to use
# in the Logistic Regression classifier below.
def clean(data):
    
    # drop the following columns, as they will not be used in classifying
    # whether a passenger lived or died. cabin would actually be a good
    # thing to consider, if we could know where the cabin was located on
    # the ship. for example, do cabins closer to lifeboats, or further away
    # from point of impact (etc) correlate with survival rates?
    data = data.drop(["Ticket", "PassengerId"], axis=1)
    
    # these numerical columns have some missing values. let's list them
    # and clean them up in the `for` loop below
    # cols = [ "Parch"]

    # if the cell value is empty, fill it with that column's median
    # value. i have mixed feelings about replacing na values with 
    # median. i guess it wouldn't throw off or skew the Logistic 
    # Regression classifier significantly, but it seems like it 
    # would reduce the variability of the data. is it enough 
    # to be a problem?
    # for col in cols:
    #     data.fillna({col: data[col].median()}, inplace=True)
    
    data['AgeBin'] = pd.cut(data['Age'], bins=[0, 12, 18, 35, 60, 120], labels=[0, 1, 2, 3, 4])
    
    # an alternative to filling the age column's empty rows with 
    # the median is to add a new category and fill na rows with it
    data['AgeBin'] = data['AgeBin'].cat.add_categories(['Missing']).fillna('Missing')
    data = data.drop(['Age'], axis=1)
    
    # fill na with S because that is the most commonly embarked location. 
    # adding an Unknown value here would just increase the number 
    # of possible categories this is handled  # outside of the for 
    # loop because Embarked is categorical, unlike the numerical columns above
    data['Embarked'] = data["Embarked"].fillna("S")
    
    # create a new column in the data that accounts for the size of 
    # the passengers family. add one at the end to include the 
    # passenger
    data["FamilyCount"] = data["SibSp"] + data["Parch"] + 1
    data['FamilyCat'] = pd.cut(
        data['FamilyCount'], 
        bins=[0, 1, 4, 11], 
        # alone, small large. use numbers here instead of words
        labels=[0, 1, 2]
    )
    data = data.drop(['SibSp', 'Parch', 'FamilyCount'], axis=1)
    
    # fare has a very wide range of 0-500+, which can skew the 
    # Logistic Regression model, as it assumes linear relationships.
    # binning into equally sized groups reduces sensitivity to outliers
    # and captures which fare ranges may correlate with survival
    data['FareBin'] = pd.qcut(data['Fare'], 4, labels=[0, 1, 2, 3])
    data = data.drop(['Fare'], axis=1)
    
    # extract any substring of the Name column that ends with `.`
    # this is useful because, to an extent, we can infer social 
    # status and age from the title.
    data['Title'] = data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    
    # handle some of the edge cases
    data['Title'] = data['Title'].replace(
        ['Lady', 'Countess', 'Capt', 'Col', 'Don', 
         'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 
        # replace uncommon values with a single value, so 
        # as to allow the model to focus on more common values
        'Rare'
    )
    data['Title'] = data['Title'].replace(['Mlle', 'Ms'], 'Miss')
    data['Title'] = data['Title'].replace('Mme', 'Mrs')
    
    data['HasCabin'] = data['Cabin'].notnull().astype(int)
    
    data = data.drop("Cabin", axis=1)
    data = data.drop('Name', axis=1)
    
    return data

# use and reuse the clean function to clean the data for both sets
data = clean(data)
test = clean(test)

# preview the cleaned data
data.head(5)

Unnamed: 0,Survived,Pclass,Sex,Embarked,AgeBin,FamilyCat,FareBin,Title,HasCabin
0,0,3,male,S,2,1,0,Mr,0
1,1,1,female,C,3,1,3,Mrs,1
2,1,3,female,S,2,0,1,Miss,0
3,1,1,female,S,2,1,3,Mrs,1
4,0,3,male,S,2,0,1,Mr,0


In [4]:
from sklearn import preprocessing

# instantiate a LabelEncoder to use on Sex and Embarked.
# this will assign 0 to female and a 1 to male. need 
# to do this  because LogisticRegression requires all 
# numeric inputs
le = preprocessing.LabelEncoder()

cols = ["Sex", "Embarked", "Title"]

# for col in cols:
#     le.fit(data[col])
#     data[col] = le.transform(data[col])
#     test[col] = le.transform(test[col])
# encode Sex (e.g., female=0, male=1) and Embarked (e.g., C=0, Q=1, S=2, U=3).
for col in cols:
    data[col] = le.fit_transform(data[col])
    test[col] = le.fit_transform(test[col])
    # le.classes_ shows the unique values that the encoder has 
    # identified and will transform into more readable values
    # that will be used in the modeling
    print(le.classes_)

# check out the first 10 rows of the transformed data
data.head(10)

['female' 'male']
['C' 'Q' 'S']
['Master' 'Miss' 'Mr' 'Mrs' 'Rare']


Unnamed: 0,Survived,Pclass,Sex,Embarked,AgeBin,FamilyCat,FareBin,Title,HasCabin
0,0,3,1,2,2,1,0,2,0
1,1,1,0,0,3,1,3,3,1
2,1,3,0,2,2,0,1,1,0
3,1,1,0,2,2,1,3,3,1
4,0,3,1,2,2,0,1,2,0
5,0,3,1,1,Missing,0,1,2,0
6,0,1,1,2,3,0,3,2,1
7,0,3,1,2,0,2,2,0,0
8,1,3,0,2,2,1,1,3,0
9,1,2,0,0,1,1,2,3,0


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# extract Survived as the target (y) and other columns as features (X)

# make a copy of the survived column off of the data frame
y = data["Survived"]
# after making the copy, drop the column before we pass it into the training
X = data.drop("Survived", axis=1)

# divide the data into two parts: one to train the model (80%) and 
# another to test its predictions (20%). this split is important to 
# check how well the model works on new, unseen data, ensuring it 
# doesn’t just memorize the training data. can't use the complete dataset 
# for both training and testing would overestimate the model’s 
# performance, as it would already know the answers. test_size=0.2 
# means 20% is used for testing, and random_state=42 keeps the split 
# consistent each time the code runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# this seems to be the real ML part of the model. LogisticRegression takes in
# the training data, which includes the `Survived` column. this is important
# because we need to start to draw conclusions from the characteristics of 
# those who survived and those who did not. the `clf` name is a convention that
# is an abbreviation for "classifier"

# train the model to learn which passenger traits (like sex or class) predict 
# survival. `.fit` uses the training data to figure out how much each trait
# matters for predicting whether someone survived or died, preparing the 
# model to make predictions later. random_state=0 keeps results consistent, 
# and max_iter=1000 ensures the model finishes learning.
clf = LogisticRegression(random_state=0, max_iter=1000).fit(X_train, y_train)

# printing the feature coefficients shows that the strongest indicator of
# survival was "Sex", which had a coefficient of 2.6. 
print("Feature Coefficients:", dict(zip(X.columns, clf.coef_[0])))

# todo: look into what intercept is
print("Intercept:", clf.intercept_)

ValueError: could not convert string to float: 'Missing'

In [None]:
# predict survival on the validation set. clf.predict computes probabilities and
# outputs 1 (survived) if >= 0.5, or 0 (died)
predictions = clf.predict(X_val)
from sklearn.metrics import accuracy_score

# calculate accuracy by comparing predictions to true labels (y_val). 81% accuracy
# means 81% of predictions are correct
accuracy_score(y_val, predictions)

  return np.asarray(ret, dtype=dtype)


0.7877094972067039

In [None]:
# predict survival for the test set. clf.predict outputs 1 (survived) if probability
# >= 0.5, else 0 (died), for submission
submission_preds = clf.predict(test)

  return np.asarray(ret, dtype=dtype)


In [None]:
# create a data frame with PassengerId and Survived columns.
# test_passenger_ids.values is a list of passenger ids that was
# declared in the first codeblock at the top of this notebook.
# submission_preds is a list of 0s and 1s indicating the results of
# the predictive model. 0 is "died" and 1 is "survived"
df = pd.DataFrame({ "PassengerId": test_passenger_ids.values, "Survived": submission_preds })

In [None]:
# export the data frame into a csv. 
# index=False excludes index values from being added
# to each row
df.to_csv("submission.csv", index=False)